This commit is contained in:
Antoine JOUBREL 2024-02-05 21:04:02 +00:00
parent c96be82ffa
commit 6e0febdc41

View File

@ -71,7 +71,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 39, "execution_count": 4,
"id": "aaf64d60-bf92-470c-8210-d09abd6a653e", "id": "aaf64d60-bf92-470c-8210-d09abd6a653e",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -108,7 +108,7 @@
" 'bdc2324-data/1/1type_ofs.csv']" " 'bdc2324-data/1/1type_ofs.csv']"
] ]
}, },
"execution_count": 39, "execution_count": 4,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -119,7 +119,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 5,
"id": "0cb92854-903b-4efd-ac1b-197e29f044b4", "id": "0cb92854-903b-4efd-ac1b-197e29f044b4",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -143,7 +143,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 25,
"id": "dd6a3518-b752-4a1e-b77b-9e03e853c3ed", "id": "dd6a3518-b752-4a1e-b77b-9e03e853c3ed",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -151,7 +151,7 @@
"name": "stderr", "name": "stderr",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"/tmp/ipykernel_9792/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n", "/tmp/ipykernel_683/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" df = pd.read_csv(file_in)\n" " df = pd.read_csv(file_in)\n"
] ]
} }
@ -172,6 +172,717 @@
" globals()[nom_dataframe] = df" " globals()[nom_dataframe] = df"
] ]
}, },
{
"cell_type": "markdown",
"id": "f01e4530-1a61-49cb-a6b0-aa188cf1c0e0",
"metadata": {},
"source": [
"## customersplus.csv"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "a01f993a-0f9f-4aed-bd23-bcdec9041bb3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 151866 entries, 0 to 151865\n",
"Data columns (total 29 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 151866 non-null int64 \n",
" 1 birthdate 5437 non-null object \n",
" 2 street_id 151866 non-null int64 \n",
" 3 civility 0 non-null float64\n",
" 4 is_partner 151866 non-null bool \n",
" 5 deleted_at 0 non-null float64\n",
" 6 gender 151866 non-null int64 \n",
" 7 is_email_true 151866 non-null bool \n",
" 8 opt_in 151866 non-null bool \n",
" 9 structure_id 18114 non-null float64\n",
" 10 note 906 non-null object \n",
" 11 profession 6206 non-null object \n",
" 12 language 1092 non-null object \n",
" 13 mcp_contact_id 98901 non-null float64\n",
" 14 last_buying_date 73422 non-null object \n",
" 15 max_price 73422 non-null float64\n",
" 16 ticket_sum 151866 non-null int64 \n",
" 17 average_price 138746 non-null float64\n",
" 18 fidelity 151866 non-null int64 \n",
" 19 average_purchase_delay 73422 non-null float64\n",
" 20 average_price_basket 73422 non-null float64\n",
" 21 average_ticket_basket 73422 non-null float64\n",
" 22 total_price 86542 non-null float64\n",
" 23 purchase_count 151866 non-null int64 \n",
" 24 first_buying_date 73422 non-null object \n",
" 25 last_visiting_date 0 non-null float64\n",
" 26 country 143575 non-null object \n",
" 27 age 5437 non-null float64\n",
" 28 tenant_id 151866 non-null int64 \n",
"dtypes: bool(3), float64(12), int64(7), object(7)\n",
"memory usage: 30.6+ MB\n"
]
}
],
"source": [
"a = pd.DataFrame(df1_customersplus.info())"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "45e82fc0-ba17-497b-9818-8be2bdc49d22",
"metadata": {},
"outputs": [],
"source": [
"def info_colonnes_dataframe(df):\n",
" # Créer une liste pour stocker les informations sur chaque colonne\n",
" infos_colonnes = []\n",
"\n",
" # Parcourir les colonnes du DataFrame\n",
" for nom_colonne, serie in df.items(): # Utiliser items() au lieu de iteritems()\n",
" # Calculer le taux de valeurs manquantes\n",
" taux_na = serie.isna().mean() * 100\n",
"\n",
" # Ajouter les informations à la liste\n",
" infos_colonnes.append({\n",
" 'Nom_colonne': nom_colonne,\n",
" 'Type_colonne': str(serie.dtype),\n",
" 'Taux_NA': taux_na\n",
" })\n",
"\n",
" # Créer une nouvelle DataFrame à partir de la liste d'informations\n",
" df_infos_colonnes = pd.DataFrame(infos_colonnes)\n",
"\n",
" return df_infos_colonnes"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "d237be96-8c86-4a91-b7a1-487e87a16c3d",
"metadata": {},
"outputs": [],
"source": [
"def cleaning_date(df, column_name):\n",
" \"\"\"\n",
" Nettoie la colonne spécifiée du DataFrame en convertissant les valeurs en datetime avec le format ISO8601.\n",
"\n",
" Parameters:\n",
" - df: DataFrame\n",
" Le DataFrame contenant la colonne à nettoyer.\n",
" - column_name: str\n",
" Le nom de la colonne à nettoyer.\n",
"\n",
" Returns:\n",
" - DataFrame\n",
" Le DataFrame modifié avec la colonne nettoyée.\n",
" \"\"\"\n",
" df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "4bcdb081-c34f-4d51-b93f-abbb6fa49c5e",
"metadata": {},
"outputs": [],
"source": [
"a = info_colonnes_dataframe(df1_customersplus)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "319c814f-0956-4a92-9c0a-c6b9f53b04b5",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Nom_colonne</th>\n",
" <th>Type_colonne</th>\n",
" <th>Taux_NA</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>id</td>\n",
" <td>int64</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>lastname</td>\n",
" <td>object</td>\n",
" <td>43.461341</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>firstname</td>\n",
" <td>object</td>\n",
" <td>44.995588</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>birthdate</td>\n",
" <td>object</td>\n",
" <td>96.419870</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>email</td>\n",
" <td>object</td>\n",
" <td>8.622075</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>street_id</td>\n",
" <td>int64</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>created_at</td>\n",
" <td>object</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>updated_at</td>\n",
" <td>object</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>civility</td>\n",
" <td>float64</td>\n",
" <td>100.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>is_partner</td>\n",
" <td>bool</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>extra</td>\n",
" <td>float64</td>\n",
" <td>100.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>deleted_at</td>\n",
" <td>float64</td>\n",
" <td>100.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>reference</td>\n",
" <td>float64</td>\n",
" <td>100.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>gender</td>\n",
" <td>int64</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>is_email_true</td>\n",
" <td>bool</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>extra_field</td>\n",
" <td>float64</td>\n",
" <td>100.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>identifier</td>\n",
" <td>object</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>opt_in</td>\n",
" <td>bool</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>structure_id</td>\n",
" <td>float64</td>\n",
" <td>88.072380</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>note</td>\n",
" <td>object</td>\n",
" <td>99.403421</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>profession</td>\n",
" <td>object</td>\n",
" <td>95.913503</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>language</td>\n",
" <td>object</td>\n",
" <td>99.280945</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>mcp_contact_id</td>\n",
" <td>float64</td>\n",
" <td>34.876141</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>need_reload</td>\n",
" <td>bool</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>last_buying_date</td>\n",
" <td>object</td>\n",
" <td>51.653431</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>max_price</td>\n",
" <td>float64</td>\n",
" <td>51.653431</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>ticket_sum</td>\n",
" <td>int64</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>average_price</td>\n",
" <td>float64</td>\n",
" <td>8.639195</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>fidelity</td>\n",
" <td>int64</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>average_purchase_delay</td>\n",
" <td>float64</td>\n",
" <td>51.653431</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>average_price_basket</td>\n",
" <td>float64</td>\n",
" <td>51.653431</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>average_ticket_basket</td>\n",
" <td>float64</td>\n",
" <td>51.653431</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32</th>\n",
" <td>total_price</td>\n",
" <td>float64</td>\n",
" <td>43.014236</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>preferred_category</td>\n",
" <td>float64</td>\n",
" <td>100.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>34</th>\n",
" <td>preferred_supplier</td>\n",
" <td>float64</td>\n",
" <td>100.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>35</th>\n",
" <td>preferred_formula</td>\n",
" <td>float64</td>\n",
" <td>100.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>36</th>\n",
" <td>purchase_count</td>\n",
" <td>int64</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>37</th>\n",
" <td>first_buying_date</td>\n",
" <td>object</td>\n",
" <td>51.653431</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38</th>\n",
" <td>last_visiting_date</td>\n",
" <td>float64</td>\n",
" <td>100.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>39</th>\n",
" <td>zipcode</td>\n",
" <td>object</td>\n",
" <td>71.176564</td>\n",
" </tr>\n",
" <tr>\n",
" <th>40</th>\n",
" <td>country</td>\n",
" <td>object</td>\n",
" <td>5.459418</td>\n",
" </tr>\n",
" <tr>\n",
" <th>41</th>\n",
" <td>age</td>\n",
" <td>float64</td>\n",
" <td>96.419870</td>\n",
" </tr>\n",
" <tr>\n",
" <th>42</th>\n",
" <td>tenant_id</td>\n",
" <td>int64</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Nom_colonne Type_colonne Taux_NA\n",
"0 id int64 0.000000\n",
"1 lastname object 43.461341\n",
"2 firstname object 44.995588\n",
"3 birthdate object 96.419870\n",
"4 email object 8.622075\n",
"5 street_id int64 0.000000\n",
"6 created_at object 0.000000\n",
"7 updated_at object 0.000000\n",
"8 civility float64 100.000000\n",
"9 is_partner bool 0.000000\n",
"10 extra float64 100.000000\n",
"11 deleted_at float64 100.000000\n",
"12 reference float64 100.000000\n",
"13 gender int64 0.000000\n",
"14 is_email_true bool 0.000000\n",
"15 extra_field float64 100.000000\n",
"16 identifier object 0.000000\n",
"17 opt_in bool 0.000000\n",
"18 structure_id float64 88.072380\n",
"19 note object 99.403421\n",
"20 profession object 95.913503\n",
"21 language object 99.280945\n",
"22 mcp_contact_id float64 34.876141\n",
"23 need_reload bool 0.000000\n",
"24 last_buying_date object 51.653431\n",
"25 max_price float64 51.653431\n",
"26 ticket_sum int64 0.000000\n",
"27 average_price float64 8.639195\n",
"28 fidelity int64 0.000000\n",
"29 average_purchase_delay float64 51.653431\n",
"30 average_price_basket float64 51.653431\n",
"31 average_ticket_basket float64 51.653431\n",
"32 total_price float64 43.014236\n",
"33 preferred_category float64 100.000000\n",
"34 preferred_supplier float64 100.000000\n",
"35 preferred_formula float64 100.000000\n",
"36 purchase_count int64 0.000000\n",
"37 first_buying_date object 51.653431\n",
"38 last_visiting_date float64 100.000000\n",
"39 zipcode object 71.176564\n",
"40 country object 5.459418\n",
"41 age float64 96.419870\n",
"42 tenant_id int64 0.000000"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "e54a1170-2b10-4b22-8241-e7f5ec3fce75",
"metadata": {},
"outputs": [],
"source": [
"a = pd.DataFrame(df1_customersplus.isna().sum()/len(df1_customersplus)*100)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "5c997ff6-251b-4e7f-8946-a8b722f5e97f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>customer_id</th>\n",
" <th>birthdate</th>\n",
" <th>street_id</th>\n",
" <th>is_partner</th>\n",
" <th>gender</th>\n",
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
" <th>structure_id</th>\n",
" <th>note</th>\n",
" <th>profession</th>\n",
" <th>...</th>\n",
" <th>fidelity</th>\n",
" <th>average_purchase_delay</th>\n",
" <th>average_price_basket</th>\n",
" <th>average_ticket_basket</th>\n",
" <th>total_price</th>\n",
" <th>purchase_count</th>\n",
" <th>first_buying_date</th>\n",
" <th>country</th>\n",
" <th>age</th>\n",
" <th>tenant_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>12751</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaT</td>\n",
" <td>fr</td>\n",
" <td>NaN</td>\n",
" <td>1311</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>12825</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>False</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaT</td>\n",
" <td>fr</td>\n",
" <td>NaN</td>\n",
" <td>1311</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>11261</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaT</td>\n",
" <td>fr</td>\n",
" <td>NaN</td>\n",
" <td>1311</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>13071</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>False</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaT</td>\n",
" <td>fr</td>\n",
" <td>NaN</td>\n",
" <td>1311</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>653061</td>\n",
" <td>NaN</td>\n",
" <td>10</td>\n",
" <td>False</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1311</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 26 columns</p>\n",
"</div>"
],
"text/plain": [
" customer_id birthdate street_id is_partner gender is_email_true \\\n",
"0 12751 NaN 2 False 1 True \n",
"1 12825 NaN 2 False 2 True \n",
"2 11261 NaN 2 False 1 True \n",
"3 13071 NaN 2 False 2 True \n",
"4 653061 NaN 10 False 2 True \n",
"\n",
" opt_in structure_id note profession ... fidelity average_purchase_delay \\\n",
"0 True NaN NaN NaN ... 0 NaN \n",
"1 True NaN NaN NaN ... 0 NaN \n",
"2 True NaN NaN NaN ... 0 NaN \n",
"3 True NaN NaN NaN ... 0 NaN \n",
"4 False NaN NaN NaN ... 0 NaN \n",
"\n",
" average_price_basket average_ticket_basket total_price purchase_count \\\n",
"0 NaN NaN NaN 0 \n",
"1 NaN NaN NaN 0 \n",
"2 NaN NaN NaN 0 \n",
"3 NaN NaN NaN 0 \n",
"4 NaN NaN NaN 0 \n",
"\n",
" first_buying_date country age tenant_id \n",
"0 NaT fr NaN 1311 \n",
"1 NaT fr NaN 1311 \n",
"2 NaT fr NaN 1311 \n",
"3 NaT fr NaN 1311 \n",
"4 NaT NaN NaN 1311 \n",
"\n",
"[5 rows x 26 columns]"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Selection des variables\n",
"df1_customersplus_clean = df1_customersplus.copy()\n",
"\n",
"cleaning_date(df1_customersplus_clean, 'first_buying_date')\n",
"cleaning_date(df1_customersplus_clean, 'last_visiting_date')\n",
"\n",
"df1_customersplus_clean.drop(['lastname', 'firstname', 'email', 'civility', 'note', 'created_at', 'updated_at', 'deleted_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode', 'last_visiting_date'], axis = 1, inplace=True)\n",
"df1_customersplus_clean.rename(columns = {'id' : 'customer_id'}, inplace = True)\n",
"\n"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"id": "e908f516-2a74-45d6-8492-7dcdc3afbe1f", "id": "e908f516-2a74-45d6-8492-7dcdc3afbe1f",
@ -534,16 +1245,14 @@
], ],
"source": [ "source": [
"# Selection des variables\n", "# Selection des variables\n",
"df1_tickets_clean = df1_tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]\n", "df1_tickets_clean = df1_tickets.drop(['lastname', 'firstname', 'email', 'created_at', 'updated_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode'], axis = 1, inplace=True)\n",
"df1_tickets_clean.rename(columns = {'id' : 'ticket_id'}, inplace = True)" "df1_tickets_clean.rename(columns = {'id' : 'customer_id'}, inplace = True)"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"id": "78453f3c-4f89-44ed-a6c6-2a7443b72b52", "id": "78453f3c-4f89-44ed-a6c6-2a7443b72b52",
"metadata": { "metadata": {},
"jp-MarkdownHeadingCollapsed": true
},
"source": [ "source": [
"## suppliers.csv" "## suppliers.csv"
] ]
@ -1739,31 +2448,6 @@
"# Utilisation de fonctions" "# Utilisation de fonctions"
] ]
}, },
{
"cell_type": "code",
"execution_count": 50,
"id": "d237be96-8c86-4a91-b7a1-487e87a16c3d",
"metadata": {},
"outputs": [],
"source": [
"def cleaning_date(df, column_name):\n",
" \"\"\"\n",
" Nettoie la colonne spécifiée du DataFrame en convertissant les valeurs en datetime avec le format ISO8601.\n",
"\n",
" Parameters:\n",
" - df: DataFrame\n",
" Le DataFrame contenant la colonne à nettoyer.\n",
" - column_name: str\n",
" Le nom de la colonne à nettoyer.\n",
"\n",
" Returns:\n",
" - DataFrame\n",
" Le DataFrame modifié avec la colonne nettoyée.\n",
" \"\"\"\n",
" df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
" return df"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 51, "execution_count": 51,