update
This commit is contained in:
parent
c96be82ffa
commit
6e0febdc41
|
@ -71,7 +71,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 39,
|
||||
"execution_count": 4,
|
||||
"id": "aaf64d60-bf92-470c-8210-d09abd6a653e",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -108,7 +108,7 @@
|
|||
" 'bdc2324-data/1/1type_ofs.csv']"
|
||||
]
|
||||
},
|
||||
"execution_count": 39,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -119,7 +119,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 5,
|
||||
"id": "0cb92854-903b-4efd-ac1b-197e29f044b4",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -143,7 +143,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 25,
|
||||
"id": "dd6a3518-b752-4a1e-b77b-9e03e853c3ed",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -151,7 +151,7 @@
|
|||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/tmp/ipykernel_9792/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
|
||||
"/tmp/ipykernel_683/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
|
||||
" df = pd.read_csv(file_in)\n"
|
||||
]
|
||||
}
|
||||
|
@ -172,6 +172,717 @@
|
|||
" globals()[nom_dataframe] = df"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f01e4530-1a61-49cb-a6b0-aa188cf1c0e0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## customersplus.csv"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"id": "a01f993a-0f9f-4aed-bd23-bcdec9041bb3",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||||
"RangeIndex: 151866 entries, 0 to 151865\n",
|
||||
"Data columns (total 29 columns):\n",
|
||||
" # Column Non-Null Count Dtype \n",
|
||||
"--- ------ -------------- ----- \n",
|
||||
" 0 id 151866 non-null int64 \n",
|
||||
" 1 birthdate 5437 non-null object \n",
|
||||
" 2 street_id 151866 non-null int64 \n",
|
||||
" 3 civility 0 non-null float64\n",
|
||||
" 4 is_partner 151866 non-null bool \n",
|
||||
" 5 deleted_at 0 non-null float64\n",
|
||||
" 6 gender 151866 non-null int64 \n",
|
||||
" 7 is_email_true 151866 non-null bool \n",
|
||||
" 8 opt_in 151866 non-null bool \n",
|
||||
" 9 structure_id 18114 non-null float64\n",
|
||||
" 10 note 906 non-null object \n",
|
||||
" 11 profession 6206 non-null object \n",
|
||||
" 12 language 1092 non-null object \n",
|
||||
" 13 mcp_contact_id 98901 non-null float64\n",
|
||||
" 14 last_buying_date 73422 non-null object \n",
|
||||
" 15 max_price 73422 non-null float64\n",
|
||||
" 16 ticket_sum 151866 non-null int64 \n",
|
||||
" 17 average_price 138746 non-null float64\n",
|
||||
" 18 fidelity 151866 non-null int64 \n",
|
||||
" 19 average_purchase_delay 73422 non-null float64\n",
|
||||
" 20 average_price_basket 73422 non-null float64\n",
|
||||
" 21 average_ticket_basket 73422 non-null float64\n",
|
||||
" 22 total_price 86542 non-null float64\n",
|
||||
" 23 purchase_count 151866 non-null int64 \n",
|
||||
" 24 first_buying_date 73422 non-null object \n",
|
||||
" 25 last_visiting_date 0 non-null float64\n",
|
||||
" 26 country 143575 non-null object \n",
|
||||
" 27 age 5437 non-null float64\n",
|
||||
" 28 tenant_id 151866 non-null int64 \n",
|
||||
"dtypes: bool(3), float64(12), int64(7), object(7)\n",
|
||||
"memory usage: 30.6+ MB\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
# DataFrame.info() prints its report to stdout and returns None, so wrapping
# the call in pd.DataFrame(...) only produced an empty frame. Call it for its
# printed output alone.
df1_customersplus.info()
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 31,
|
||||
"id": "45e82fc0-ba17-497b-9818-8be2bdc49d22",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
def info_colonnes_dataframe(df):
    """
    Summarise every column of a DataFrame: name, dtype and share of missing values.

    Parameters:
    - df: DataFrame
        The frame to describe. It is only read, never modified.

    Returns:
    - DataFrame
        One row per column of ``df``, in the same order, with the columns
        ``Nom_colonne`` (column label), ``Type_colonne`` (dtype as a string)
        and ``Taux_NA`` (percentage of missing values, 0-100).
        The three columns are always present, even when ``df`` has no column.
    """
    colonnes_resultat = ['Nom_colonne', 'Type_colonne', 'Taux_NA']

    # items() yields (label, Series) pairs; isna().mean() is the fraction of
    # missing values, scaled here to a percentage.
    lignes = [
        {
            'Nom_colonne': nom_colonne,
            'Type_colonne': str(serie.dtype),
            'Taux_NA': serie.isna().mean() * 100,
        }
        for nom_colonne, serie in df.items()
    ]

    # Passing the schema explicitly keeps the expected columns when `lignes`
    # is empty (a DataFrame built from [] alone would have no columns at all).
    return pd.DataFrame(lignes, columns=colonnes_resultat)
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 35,
|
||||
"id": "d237be96-8c86-4a91-b7a1-487e87a16c3d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
def cleaning_date(df, column_name):
    """
    Convert one column of ``df`` to UTC datetimes, parsing values as ISO8601.

    Parameters:
    - df: DataFrame
        The frame holding the column to convert. It is modified in place.
    - column_name: str
        Name of the column to convert.

    Returns:
    - DataFrame
        The same ``df`` object, with ``column_name`` replaced by the parsed values.
    """
    parsed = pd.to_datetime(df[column_name], format='ISO8601', utc=True)
    df[column_name] = parsed
    return df
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 32,
|
||||
"id": "4bcdb081-c34f-4d51-b93f-abbb6fa49c5e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"a = info_colonnes_dataframe(df1_customersplus)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 33,
|
||||
"id": "319c814f-0956-4a92-9c0a-c6b9f53b04b5",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>Nom_colonne</th>\n",
|
||||
" <th>Type_colonne</th>\n",
|
||||
" <th>Taux_NA</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>id</td>\n",
|
||||
" <td>int64</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>lastname</td>\n",
|
||||
" <td>object</td>\n",
|
||||
" <td>43.461341</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>firstname</td>\n",
|
||||
" <td>object</td>\n",
|
||||
" <td>44.995588</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>birthdate</td>\n",
|
||||
" <td>object</td>\n",
|
||||
" <td>96.419870</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>email</td>\n",
|
||||
" <td>object</td>\n",
|
||||
" <td>8.622075</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>5</th>\n",
|
||||
" <td>street_id</td>\n",
|
||||
" <td>int64</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>6</th>\n",
|
||||
" <td>created_at</td>\n",
|
||||
" <td>object</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>7</th>\n",
|
||||
" <td>updated_at</td>\n",
|
||||
" <td>object</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>8</th>\n",
|
||||
" <td>civility</td>\n",
|
||||
" <td>float64</td>\n",
|
||||
" <td>100.000000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>9</th>\n",
|
||||
" <td>is_partner</td>\n",
|
||||
" <td>bool</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>10</th>\n",
|
||||
" <td>extra</td>\n",
|
||||
" <td>float64</td>\n",
|
||||
" <td>100.000000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>11</th>\n",
|
||||
" <td>deleted_at</td>\n",
|
||||
" <td>float64</td>\n",
|
||||
" <td>100.000000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>12</th>\n",
|
||||
" <td>reference</td>\n",
|
||||
" <td>float64</td>\n",
|
||||
" <td>100.000000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>13</th>\n",
|
||||
" <td>gender</td>\n",
|
||||
" <td>int64</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>14</th>\n",
|
||||
" <td>is_email_true</td>\n",
|
||||
" <td>bool</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>15</th>\n",
|
||||
" <td>extra_field</td>\n",
|
||||
" <td>float64</td>\n",
|
||||
" <td>100.000000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>16</th>\n",
|
||||
" <td>identifier</td>\n",
|
||||
" <td>object</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>17</th>\n",
|
||||
" <td>opt_in</td>\n",
|
||||
" <td>bool</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>18</th>\n",
|
||||
" <td>structure_id</td>\n",
|
||||
" <td>float64</td>\n",
|
||||
" <td>88.072380</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>19</th>\n",
|
||||
" <td>note</td>\n",
|
||||
" <td>object</td>\n",
|
||||
" <td>99.403421</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>20</th>\n",
|
||||
" <td>profession</td>\n",
|
||||
" <td>object</td>\n",
|
||||
" <td>95.913503</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>21</th>\n",
|
||||
" <td>language</td>\n",
|
||||
" <td>object</td>\n",
|
||||
" <td>99.280945</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>22</th>\n",
|
||||
" <td>mcp_contact_id</td>\n",
|
||||
" <td>float64</td>\n",
|
||||
" <td>34.876141</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>23</th>\n",
|
||||
" <td>need_reload</td>\n",
|
||||
" <td>bool</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>24</th>\n",
|
||||
" <td>last_buying_date</td>\n",
|
||||
" <td>object</td>\n",
|
||||
" <td>51.653431</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>25</th>\n",
|
||||
" <td>max_price</td>\n",
|
||||
" <td>float64</td>\n",
|
||||
" <td>51.653431</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>26</th>\n",
|
||||
" <td>ticket_sum</td>\n",
|
||||
" <td>int64</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>27</th>\n",
|
||||
" <td>average_price</td>\n",
|
||||
" <td>float64</td>\n",
|
||||
" <td>8.639195</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>28</th>\n",
|
||||
" <td>fidelity</td>\n",
|
||||
" <td>int64</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>29</th>\n",
|
||||
" <td>average_purchase_delay</td>\n",
|
||||
" <td>float64</td>\n",
|
||||
" <td>51.653431</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>30</th>\n",
|
||||
" <td>average_price_basket</td>\n",
|
||||
" <td>float64</td>\n",
|
||||
" <td>51.653431</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>31</th>\n",
|
||||
" <td>average_ticket_basket</td>\n",
|
||||
" <td>float64</td>\n",
|
||||
" <td>51.653431</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>32</th>\n",
|
||||
" <td>total_price</td>\n",
|
||||
" <td>float64</td>\n",
|
||||
" <td>43.014236</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>33</th>\n",
|
||||
" <td>preferred_category</td>\n",
|
||||
" <td>float64</td>\n",
|
||||
" <td>100.000000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>34</th>\n",
|
||||
" <td>preferred_supplier</td>\n",
|
||||
" <td>float64</td>\n",
|
||||
" <td>100.000000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>35</th>\n",
|
||||
" <td>preferred_formula</td>\n",
|
||||
" <td>float64</td>\n",
|
||||
" <td>100.000000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>36</th>\n",
|
||||
" <td>purchase_count</td>\n",
|
||||
" <td>int64</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>37</th>\n",
|
||||
" <td>first_buying_date</td>\n",
|
||||
" <td>object</td>\n",
|
||||
" <td>51.653431</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>38</th>\n",
|
||||
" <td>last_visiting_date</td>\n",
|
||||
" <td>float64</td>\n",
|
||||
" <td>100.000000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>39</th>\n",
|
||||
" <td>zipcode</td>\n",
|
||||
" <td>object</td>\n",
|
||||
" <td>71.176564</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>40</th>\n",
|
||||
" <td>country</td>\n",
|
||||
" <td>object</td>\n",
|
||||
" <td>5.459418</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>41</th>\n",
|
||||
" <td>age</td>\n",
|
||||
" <td>float64</td>\n",
|
||||
" <td>96.419870</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>42</th>\n",
|
||||
" <td>tenant_id</td>\n",
|
||||
" <td>int64</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" Nom_colonne Type_colonne Taux_NA\n",
|
||||
"0 id int64 0.000000\n",
|
||||
"1 lastname object 43.461341\n",
|
||||
"2 firstname object 44.995588\n",
|
||||
"3 birthdate object 96.419870\n",
|
||||
"4 email object 8.622075\n",
|
||||
"5 street_id int64 0.000000\n",
|
||||
"6 created_at object 0.000000\n",
|
||||
"7 updated_at object 0.000000\n",
|
||||
"8 civility float64 100.000000\n",
|
||||
"9 is_partner bool 0.000000\n",
|
||||
"10 extra float64 100.000000\n",
|
||||
"11 deleted_at float64 100.000000\n",
|
||||
"12 reference float64 100.000000\n",
|
||||
"13 gender int64 0.000000\n",
|
||||
"14 is_email_true bool 0.000000\n",
|
||||
"15 extra_field float64 100.000000\n",
|
||||
"16 identifier object 0.000000\n",
|
||||
"17 opt_in bool 0.000000\n",
|
||||
"18 structure_id float64 88.072380\n",
|
||||
"19 note object 99.403421\n",
|
||||
"20 profession object 95.913503\n",
|
||||
"21 language object 99.280945\n",
|
||||
"22 mcp_contact_id float64 34.876141\n",
|
||||
"23 need_reload bool 0.000000\n",
|
||||
"24 last_buying_date object 51.653431\n",
|
||||
"25 max_price float64 51.653431\n",
|
||||
"26 ticket_sum int64 0.000000\n",
|
||||
"27 average_price float64 8.639195\n",
|
||||
"28 fidelity int64 0.000000\n",
|
||||
"29 average_purchase_delay float64 51.653431\n",
|
||||
"30 average_price_basket float64 51.653431\n",
|
||||
"31 average_ticket_basket float64 51.653431\n",
|
||||
"32 total_price float64 43.014236\n",
|
||||
"33 preferred_category float64 100.000000\n",
|
||||
"34 preferred_supplier float64 100.000000\n",
|
||||
"35 preferred_formula float64 100.000000\n",
|
||||
"36 purchase_count int64 0.000000\n",
|
||||
"37 first_buying_date object 51.653431\n",
|
||||
"38 last_visiting_date float64 100.000000\n",
|
||||
"39 zipcode object 71.176564\n",
|
||||
"40 country object 5.459418\n",
|
||||
"41 age float64 96.419870\n",
|
||||
"42 tenant_id int64 0.000000"
|
||||
]
|
||||
},
|
||||
"execution_count": 33,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"a"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"id": "e54a1170-2b10-4b22-8241-e7f5ec3fce75",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
# Percentage of missing values per column, as a one-column DataFrame.
a = (df1_customersplus.isna().sum() / len(df1_customersplus) * 100).to_frame()
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 40,
|
||||
"id": "5c997ff6-251b-4e7f-8946-a8b722f5e97f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>customer_id</th>\n",
|
||||
" <th>birthdate</th>\n",
|
||||
" <th>street_id</th>\n",
|
||||
" <th>is_partner</th>\n",
|
||||
" <th>gender</th>\n",
|
||||
" <th>is_email_true</th>\n",
|
||||
" <th>opt_in</th>\n",
|
||||
" <th>structure_id</th>\n",
|
||||
" <th>note</th>\n",
|
||||
" <th>profession</th>\n",
|
||||
" <th>...</th>\n",
|
||||
" <th>fidelity</th>\n",
|
||||
" <th>average_purchase_delay</th>\n",
|
||||
" <th>average_price_basket</th>\n",
|
||||
" <th>average_ticket_basket</th>\n",
|
||||
" <th>total_price</th>\n",
|
||||
" <th>purchase_count</th>\n",
|
||||
" <th>first_buying_date</th>\n",
|
||||
" <th>country</th>\n",
|
||||
" <th>age</th>\n",
|
||||
" <th>tenant_id</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>12751</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>NaT</td>\n",
|
||||
" <td>fr</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>1311</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>12825</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>NaT</td>\n",
|
||||
" <td>fr</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>1311</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>11261</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>NaT</td>\n",
|
||||
" <td>fr</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>1311</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>13071</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>NaT</td>\n",
|
||||
" <td>fr</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>1311</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>653061</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>10</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>NaT</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>1311</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>5 rows × 26 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" customer_id birthdate street_id is_partner gender is_email_true \\\n",
|
||||
"0 12751 NaN 2 False 1 True \n",
|
||||
"1 12825 NaN 2 False 2 True \n",
|
||||
"2 11261 NaN 2 False 1 True \n",
|
||||
"3 13071 NaN 2 False 2 True \n",
|
||||
"4 653061 NaN 10 False 2 True \n",
|
||||
"\n",
|
||||
" opt_in structure_id note profession ... fidelity average_purchase_delay \\\n",
|
||||
"0 True NaN NaN NaN ... 0 NaN \n",
|
||||
"1 True NaN NaN NaN ... 0 NaN \n",
|
||||
"2 True NaN NaN NaN ... 0 NaN \n",
|
||||
"3 True NaN NaN NaN ... 0 NaN \n",
|
||||
"4 False NaN NaN NaN ... 0 NaN \n",
|
||||
"\n",
|
||||
" average_price_basket average_ticket_basket total_price purchase_count \\\n",
|
||||
"0 NaN NaN NaN 0 \n",
|
||||
"1 NaN NaN NaN 0 \n",
|
||||
"2 NaN NaN NaN 0 \n",
|
||||
"3 NaN NaN NaN 0 \n",
|
||||
"4 NaN NaN NaN 0 \n",
|
||||
"\n",
|
||||
" first_buying_date country age tenant_id \n",
|
||||
"0 NaT fr NaN 1311 \n",
|
||||
"1 NaT fr NaN 1311 \n",
|
||||
"2 NaT fr NaN 1311 \n",
|
||||
"3 NaT fr NaN 1311 \n",
|
||||
"4 NaT NaN NaN 1311 \n",
|
||||
"\n",
|
||||
"[5 rows x 26 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 40,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
# Variable selection for customersplus: parse the purchase dates, drop the
# columns that are not kept for the analysis, and rename the key.
df1_customersplus_clean = df1_customersplus.copy()

# cleaning_date converts the column in place on the copy.
cleaning_date(df1_customersplus_clean, 'first_buying_date')
# NOTE(review): the original parsed 'last_visiting_date', which is dropped just
# below, and left 'last_buying_date' as raw strings. Parsing the retained
# column looks like the intent; confirm against downstream use.
cleaning_date(df1_customersplus_clean, 'last_buying_date')

# drop/rename return new frames, so re-running this cell gives the same result.
df1_customersplus_clean = (
    df1_customersplus_clean
    .drop(columns=[
        'lastname', 'firstname', 'email', 'civility', 'note',
        'created_at', 'updated_at', 'deleted_at',
        'extra', 'reference', 'extra_field', 'identifier', 'need_reload',
        'preferred_category', 'preferred_supplier', 'preferred_formula',
        'zipcode', 'last_visiting_date',
    ])
    .rename(columns={'id': 'customer_id'})
)

# Show a preview so the cell output reflects the code above.
df1_customersplus_clean.head()
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e908f516-2a74-45d6-8492-7dcdc3afbe1f",
|
||||
|
@ -534,16 +1245,14 @@
|
|||
],
|
||||
"source": [
|
||||
# Variable selection for tickets: keep the listed columns and rename the key.
# The previous version assigned the result of drop(..., inplace=True), which is
# None, so the following .rename raised AttributeError; its column list and the
# 'customer_id' rename had been copied from the customers cell.
df1_tickets_clean = (
    df1_tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]
    .rename(columns={'id': 'ticket_id'})
)
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "78453f3c-4f89-44ed-a6c6-2a7443b72b52",
|
||||
"metadata": {
|
||||
"jp-MarkdownHeadingCollapsed": true
|
||||
},
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## suppliers.csv"
|
||||
]
|
||||
|
@ -1739,31 +2448,6 @@
|
|||
"# Utilisation de fonctions"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 50,
|
||||
"id": "d237be96-8c86-4a91-b7a1-487e87a16c3d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
def cleaning_date(df, column_name):
    """
    Parse the given column of ``df`` as ISO8601 datetimes converted to UTC.

    Parameters:
    - df: DataFrame
        The frame containing the column to parse. It is modified in place.
    - column_name: str
        Name of the column to parse.

    Returns:
    - DataFrame
        The same ``df`` object, with the parsed column stored back under ``column_name``.
    """
    raw_values = df[column_name]
    df[column_name] = pd.to_datetime(raw_values, utc=True, format='ISO8601')
    return df
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 51,
|
||||
|
|
Loading…
Reference in New Issue
Block a user