BDC-team-1/Exploration_billet_AJ.ipynb

4064 lines
145 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"id": "56b3d44e-1e3f-4726-9916-0f9af107860e",
"metadata": {},
"source": [
"# Business Data Challenge - Team 1"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "15103481-8d74-404c-aa09-7601fe7730da",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import os\n",
"import s3fs\n",
"import re"
]
},
{
"cell_type": "markdown",
"id": "c3bb0d13-34b2-4e1c-9985-468cd87c5a0e",
"metadata": {},
"source": [
"Configuration de l'accès aux données"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "5d83bb1a-d341-446e-91f6-1c428607f6d4",
"metadata": {},
"outputs": [],
"source": [
"# Create filesystem object\n",
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})"
]
},
{
"cell_type": "markdown",
"id": "f99da24f-0d93-4618-92bc-3ba81dc0445c",
"metadata": {},
"source": [
"# Exemple sur Company 1"
]
},
{
"cell_type": "markdown",
"id": "9d74b68f-ba07-4a15-9a27-dae931762d70",
"metadata": {},
"source": [
"## Chargement données"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "699664b9-eee4-4f8d-a207-e524526560c5",
"metadata": {},
"outputs": [],
"source": [
"BUCKET = \"bdc2324-data/1\"\n",
"liste_database = fs.ls(BUCKET)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "aaf64d60-bf92-470c-8210-d09abd6a653e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['bdc2324-data/1/1campaign_stats.csv',\n",
" 'bdc2324-data/1/1campaigns.csv',\n",
" 'bdc2324-data/1/1categories.csv',\n",
" 'bdc2324-data/1/1countries.csv',\n",
" 'bdc2324-data/1/1currencies.csv',\n",
" 'bdc2324-data/1/1customer_target_mappings.csv',\n",
" 'bdc2324-data/1/1customersplus.csv',\n",
" 'bdc2324-data/1/1event_types.csv',\n",
" 'bdc2324-data/1/1events.csv',\n",
" 'bdc2324-data/1/1facilities.csv',\n",
" 'bdc2324-data/1/1link_stats.csv',\n",
" 'bdc2324-data/1/1pricing_formulas.csv',\n",
" 'bdc2324-data/1/1product_packs.csv',\n",
" 'bdc2324-data/1/1products.csv',\n",
" 'bdc2324-data/1/1products_groups.csv',\n",
" 'bdc2324-data/1/1purchases.csv',\n",
" 'bdc2324-data/1/1representation_category_capacities.csv',\n",
" 'bdc2324-data/1/1representations.csv',\n",
" 'bdc2324-data/1/1seasons.csv',\n",
" 'bdc2324-data/1/1structure_tag_mappings.csv',\n",
" 'bdc2324-data/1/1suppliers.csv',\n",
" 'bdc2324-data/1/1tags.csv',\n",
" 'bdc2324-data/1/1target_types.csv',\n",
" 'bdc2324-data/1/1targets.csv',\n",
" 'bdc2324-data/1/1tickets.csv',\n",
" 'bdc2324-data/1/1type_of_categories.csv',\n",
" 'bdc2324-data/1/1type_of_pricing_formulas.csv',\n",
" 'bdc2324-data/1/1type_ofs.csv']"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"liste_database"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "0cb92854-903b-4efd-ac1b-197e29f044b4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['bdc2324-data/1/1purchases.csv', 'bdc2324-data/1/1suppliers.csv', 'bdc2324-data/1/1tickets.csv', 'bdc2324-data/1/1type_ofs.csv']\n"
]
}
],
"source": [
"liste_database_select = ['suppliers', 'ticket', 'purchase', 'consumption', 'type_ofs']\n",
"\n",
"# Filtrer la liste pour les éléments contenant au moins un élément de la liste à tester\n",
"liste_database_filtered = [element for element in liste_database if any(element_part in element for element_part in liste_database_select)]\n",
"\n",
"# Afficher le résultat\n",
"print(liste_database_filtered)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "dd6a3518-b752-4a1e-b77b-9e03e853c3ed",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_15285/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" df = pd.read_csv(file_in)\n"
]
}
],
"source": [
"# loop to create dataframes from liste\n",
"files_path = liste_database\n",
"\n",
"client_number = files_path[0].split(\"/\")[1]\n",
"df_prefix = \"df\" + str(client_number) + \"_\"\n",
"\n",
"for i in range(len(files_path)) :\n",
" current_path = files_path[i]\n",
" with fs.open(current_path, mode=\"rb\") as file_in:\n",
" df = pd.read_csv(file_in)\n",
" # the pattern of the name is df1xxx\n",
" nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n",
" globals()[nom_dataframe] = df"
]
},
{
"cell_type": "markdown",
"id": "f01e4530-1a61-49cb-a6b0-aa188cf1c0e0",
"metadata": {},
"source": [
"## customersplus.csv"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "a01f993a-0f9f-4aed-bd23-bcdec9041bb3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 151866 entries, 0 to 151865\n",
"Data columns (total 29 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 151866 non-null int64 \n",
" 1 birthdate 5437 non-null object \n",
" 2 street_id 151866 non-null int64 \n",
" 3 civility 0 non-null float64\n",
" 4 is_partner 151866 non-null bool \n",
" 5 deleted_at 0 non-null float64\n",
" 6 gender 151866 non-null int64 \n",
" 7 is_email_true 151866 non-null bool \n",
" 8 opt_in 151866 non-null bool \n",
" 9 structure_id 18114 non-null float64\n",
" 10 note 906 non-null object \n",
" 11 profession 6206 non-null object \n",
" 12 language 1092 non-null object \n",
" 13 mcp_contact_id 98901 non-null float64\n",
" 14 last_buying_date 73422 non-null object \n",
" 15 max_price 73422 non-null float64\n",
" 16 ticket_sum 151866 non-null int64 \n",
" 17 average_price 138746 non-null float64\n",
" 18 fidelity 151866 non-null int64 \n",
" 19 average_purchase_delay 73422 non-null float64\n",
" 20 average_price_basket 73422 non-null float64\n",
" 21 average_ticket_basket 73422 non-null float64\n",
" 22 total_price 86542 non-null float64\n",
" 23 purchase_count 151866 non-null int64 \n",
" 24 first_buying_date 73422 non-null object \n",
" 25 last_visiting_date 0 non-null float64\n",
" 26 country 143575 non-null object \n",
" 27 age 5437 non-null float64\n",
" 28 tenant_id 151866 non-null int64 \n",
"dtypes: bool(3), float64(12), int64(7), object(7)\n",
"memory usage: 30.6+ MB\n"
]
}
],
"source": [
"a = pd.DataFrame(df1_customersplus.info())"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "45e82fc0-ba17-497b-9818-8be2bdc49d22",
"metadata": {},
"outputs": [],
"source": [
"def info_colonnes_dataframe(df):\n",
" # Créer une liste pour stocker les informations sur chaque colonne\n",
" infos_colonnes = []\n",
"\n",
" # Parcourir les colonnes du DataFrame\n",
" for nom_colonne, serie in df.items(): # Utiliser items() au lieu de iteritems()\n",
" # Calculer le taux de valeurs manquantes\n",
" taux_na = serie.isna().mean() * 100\n",
"\n",
" # Ajouter les informations à la liste\n",
" infos_colonnes.append({\n",
" 'Nom_colonne': nom_colonne,\n",
" 'Type_colonne': str(serie.dtype),\n",
" 'Taux_NA': taux_na\n",
" })\n",
"\n",
" # Créer une nouvelle DataFrame à partir de la liste d'informations\n",
" df_infos_colonnes = pd.DataFrame(infos_colonnes)\n",
"\n",
" return df_infos_colonnes"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "d237be96-8c86-4a91-b7a1-487e87a16c3d",
"metadata": {},
"outputs": [],
"source": [
"def cleaning_date(df, column_name):\n",
" \"\"\"\n",
" Nettoie la colonne spécifiée du DataFrame en convertissant les valeurs en datetime avec le format ISO8601.\n",
"\n",
" Parameters:\n",
" - df: DataFrame\n",
" Le DataFrame contenant la colonne à nettoyer.\n",
" - column_name: str\n",
" Le nom de la colonne à nettoyer.\n",
"\n",
" Returns:\n",
" - DataFrame\n",
" Le DataFrame modifié avec la colonne nettoyée.\n",
" \"\"\"\n",
" df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "4bcdb081-c34f-4d51-b93f-abbb6fa49c5e",
"metadata": {},
"outputs": [],
"source": [
"a = info_colonnes_dataframe(df1_customersplus)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "319c814f-0956-4a92-9c0a-c6b9f53b04b5",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Nom_colonne</th>\n",
" <th>Type_colonne</th>\n",
" <th>Taux_NA</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>id</td>\n",
" <td>int64</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>lastname</td>\n",
" <td>object</td>\n",
" <td>43.461341</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>firstname</td>\n",
" <td>object</td>\n",
" <td>44.995588</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>birthdate</td>\n",
" <td>object</td>\n",
" <td>96.419870</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>email</td>\n",
" <td>object</td>\n",
" <td>8.622075</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>street_id</td>\n",
" <td>int64</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>created_at</td>\n",
" <td>object</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>updated_at</td>\n",
" <td>object</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>civility</td>\n",
" <td>float64</td>\n",
" <td>100.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>is_partner</td>\n",
" <td>bool</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>extra</td>\n",
" <td>float64</td>\n",
" <td>100.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>deleted_at</td>\n",
" <td>float64</td>\n",
" <td>100.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>reference</td>\n",
" <td>float64</td>\n",
" <td>100.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>gender</td>\n",
" <td>int64</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>is_email_true</td>\n",
" <td>bool</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>extra_field</td>\n",
" <td>float64</td>\n",
" <td>100.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>identifier</td>\n",
" <td>object</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>opt_in</td>\n",
" <td>bool</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>structure_id</td>\n",
" <td>float64</td>\n",
" <td>88.072380</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>note</td>\n",
" <td>object</td>\n",
" <td>99.403421</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>profession</td>\n",
" <td>object</td>\n",
" <td>95.913503</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>language</td>\n",
" <td>object</td>\n",
" <td>99.280945</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>mcp_contact_id</td>\n",
" <td>float64</td>\n",
" <td>34.876141</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>need_reload</td>\n",
" <td>bool</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>last_buying_date</td>\n",
" <td>object</td>\n",
" <td>51.653431</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>max_price</td>\n",
" <td>float64</td>\n",
" <td>51.653431</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>ticket_sum</td>\n",
" <td>int64</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>average_price</td>\n",
" <td>float64</td>\n",
" <td>8.639195</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>fidelity</td>\n",
" <td>int64</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>average_purchase_delay</td>\n",
" <td>float64</td>\n",
" <td>51.653431</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>average_price_basket</td>\n",
" <td>float64</td>\n",
" <td>51.653431</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>average_ticket_basket</td>\n",
" <td>float64</td>\n",
" <td>51.653431</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32</th>\n",
" <td>total_price</td>\n",
" <td>float64</td>\n",
" <td>43.014236</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>preferred_category</td>\n",
" <td>float64</td>\n",
" <td>100.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>34</th>\n",
" <td>preferred_supplier</td>\n",
" <td>float64</td>\n",
" <td>100.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>35</th>\n",
" <td>preferred_formula</td>\n",
" <td>float64</td>\n",
" <td>100.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>36</th>\n",
" <td>purchase_count</td>\n",
" <td>int64</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>37</th>\n",
" <td>first_buying_date</td>\n",
" <td>object</td>\n",
" <td>51.653431</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38</th>\n",
" <td>last_visiting_date</td>\n",
" <td>float64</td>\n",
" <td>100.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>39</th>\n",
" <td>zipcode</td>\n",
" <td>object</td>\n",
" <td>71.176564</td>\n",
" </tr>\n",
" <tr>\n",
" <th>40</th>\n",
" <td>country</td>\n",
" <td>object</td>\n",
" <td>5.459418</td>\n",
" </tr>\n",
" <tr>\n",
" <th>41</th>\n",
" <td>age</td>\n",
" <td>float64</td>\n",
" <td>96.419870</td>\n",
" </tr>\n",
" <tr>\n",
" <th>42</th>\n",
" <td>tenant_id</td>\n",
" <td>int64</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Nom_colonne Type_colonne Taux_NA\n",
"0 id int64 0.000000\n",
"1 lastname object 43.461341\n",
"2 firstname object 44.995588\n",
"3 birthdate object 96.419870\n",
"4 email object 8.622075\n",
"5 street_id int64 0.000000\n",
"6 created_at object 0.000000\n",
"7 updated_at object 0.000000\n",
"8 civility float64 100.000000\n",
"9 is_partner bool 0.000000\n",
"10 extra float64 100.000000\n",
"11 deleted_at float64 100.000000\n",
"12 reference float64 100.000000\n",
"13 gender int64 0.000000\n",
"14 is_email_true bool 0.000000\n",
"15 extra_field float64 100.000000\n",
"16 identifier object 0.000000\n",
"17 opt_in bool 0.000000\n",
"18 structure_id float64 88.072380\n",
"19 note object 99.403421\n",
"20 profession object 95.913503\n",
"21 language object 99.280945\n",
"22 mcp_contact_id float64 34.876141\n",
"23 need_reload bool 0.000000\n",
"24 last_buying_date object 51.653431\n",
"25 max_price float64 51.653431\n",
"26 ticket_sum int64 0.000000\n",
"27 average_price float64 8.639195\n",
"28 fidelity int64 0.000000\n",
"29 average_purchase_delay float64 51.653431\n",
"30 average_price_basket float64 51.653431\n",
"31 average_ticket_basket float64 51.653431\n",
"32 total_price float64 43.014236\n",
"33 preferred_category float64 100.000000\n",
"34 preferred_supplier float64 100.000000\n",
"35 preferred_formula float64 100.000000\n",
"36 purchase_count int64 0.000000\n",
"37 first_buying_date object 51.653431\n",
"38 last_visiting_date float64 100.000000\n",
"39 zipcode object 71.176564\n",
"40 country object 5.459418\n",
"41 age float64 96.419870\n",
"42 tenant_id int64 0.000000"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "e54a1170-2b10-4b22-8241-e7f5ec3fce75",
"metadata": {},
"outputs": [],
"source": [
"a = pd.DataFrame(df1_customersplus.isna().sum()/len(df1_customersplus)*100)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "5c997ff6-251b-4e7f-8946-a8b722f5e97f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>customer_id</th>\n",
" <th>birthdate</th>\n",
" <th>street_id</th>\n",
" <th>is_partner</th>\n",
" <th>gender</th>\n",
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
" <th>structure_id</th>\n",
" <th>note</th>\n",
" <th>profession</th>\n",
" <th>...</th>\n",
" <th>fidelity</th>\n",
" <th>average_purchase_delay</th>\n",
" <th>average_price_basket</th>\n",
" <th>average_ticket_basket</th>\n",
" <th>total_price</th>\n",
" <th>purchase_count</th>\n",
" <th>first_buying_date</th>\n",
" <th>country</th>\n",
" <th>age</th>\n",
" <th>tenant_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>12751</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaT</td>\n",
" <td>fr</td>\n",
" <td>NaN</td>\n",
" <td>1311</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>12825</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>False</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaT</td>\n",
" <td>fr</td>\n",
" <td>NaN</td>\n",
" <td>1311</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>11261</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaT</td>\n",
" <td>fr</td>\n",
" <td>NaN</td>\n",
" <td>1311</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>13071</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>False</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaT</td>\n",
" <td>fr</td>\n",
" <td>NaN</td>\n",
" <td>1311</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>653061</td>\n",
" <td>NaN</td>\n",
" <td>10</td>\n",
" <td>False</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1311</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 26 columns</p>\n",
"</div>"
],
"text/plain": [
" customer_id birthdate street_id is_partner gender is_email_true \\\n",
"0 12751 NaN 2 False 1 True \n",
"1 12825 NaN 2 False 2 True \n",
"2 11261 NaN 2 False 1 True \n",
"3 13071 NaN 2 False 2 True \n",
"4 653061 NaN 10 False 2 True \n",
"\n",
" opt_in structure_id note profession ... fidelity average_purchase_delay \\\n",
"0 True NaN NaN NaN ... 0 NaN \n",
"1 True NaN NaN NaN ... 0 NaN \n",
"2 True NaN NaN NaN ... 0 NaN \n",
"3 True NaN NaN NaN ... 0 NaN \n",
"4 False NaN NaN NaN ... 0 NaN \n",
"\n",
" average_price_basket average_ticket_basket total_price purchase_count \\\n",
"0 NaN NaN NaN 0 \n",
"1 NaN NaN NaN 0 \n",
"2 NaN NaN NaN 0 \n",
"3 NaN NaN NaN 0 \n",
"4 NaN NaN NaN 0 \n",
"\n",
" first_buying_date country age tenant_id \n",
"0 NaT fr NaN 1311 \n",
"1 NaT fr NaN 1311 \n",
"2 NaT fr NaN 1311 \n",
"3 NaT fr NaN 1311 \n",
"4 NaT NaN NaN 1311 \n",
"\n",
"[5 rows x 26 columns]"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Selection des variables\n",
"df1_customersplus_clean = df1_customersplus.copy()\n",
"\n",
"cleaning_date(df1_customersplus_clean, 'first_buying_date')\n",
"cleaning_date(df1_customersplus_clean, 'last_visiting_date')\n",
"\n",
"df1_customersplus_clean.drop(['lastname', 'firstname', 'email', 'civility', 'note', 'created_at', 'updated_at', 'deleted_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode', 'last_visiting_date'], axis = 1, inplace=True)\n",
"df1_customersplus_clean.rename(columns = {'id' : 'customer_id'}, inplace = True)\n",
"\n"
]
},
{
"cell_type": "markdown",
"id": "e908f516-2a74-45d6-8492-7dcdc3afbe1f",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## tickets.csv"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "14f4158e-c9c0-4beb-826a-5e0f949434a4",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>number</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>purchase_id</th>\n",
" <th>product_id</th>\n",
" <th>is_from_subscription</th>\n",
" <th>type_of</th>\n",
" <th>supplier_id</th>\n",
" <th>barcode</th>\n",
" <th>identifier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>13070859</td>\n",
" <td>13593002661288</td>\n",
" <td>2021-12-28 20:47:10.320641+01:00</td>\n",
" <td>2022-02-14 18:46:53.614229+01:00</td>\n",
" <td>5107462</td>\n",
" <td>225251</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>NaN</td>\n",
" <td>b6ad7fc36f33b5e05f58c7fca06688a6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>13070860</td>\n",
" <td>13593002661399</td>\n",
" <td>2021-12-28 20:47:10.321037+01:00</td>\n",
" <td>2022-02-14 18:46:53.614761+01:00</td>\n",
" <td>5107462</td>\n",
" <td>224914</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>NaN</td>\n",
" <td>b0903af480266f27802fe5c38c277c9e</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>13070861</td>\n",
" <td>13593002661419</td>\n",
" <td>2021-12-28 20:47:10.321629+01:00</td>\n",
" <td>2022-02-14 18:46:53.615521+01:00</td>\n",
" <td>5107462</td>\n",
" <td>224914</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>NaN</td>\n",
" <td>64ca12b7e26a65b90335c0702ea0faba</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>13070862</td>\n",
" <td>13593002661508</td>\n",
" <td>2021-12-28 20:47:10.322029+01:00</td>\n",
" <td>2022-02-14 18:46:53.616000+01:00</td>\n",
" <td>5107462</td>\n",
" <td>224914</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>NaN</td>\n",
" <td>5ac2f8150aa9f3a6b1599df08cc2f0c7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>13070863</td>\n",
" <td>13593002661689</td>\n",
" <td>2021-12-28 20:47:10.322449+01:00</td>\n",
" <td>2022-02-14 18:46:53.616447+01:00</td>\n",
" <td>5107462</td>\n",
" <td>224914</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>NaN</td>\n",
" <td>dfe30081bae020d12094279926136b9c</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1826667</th>\n",
" <td>20662815</td>\n",
" <td>13593016154390</td>\n",
" <td>2023-11-09 07:51:34.935983+01:00</td>\n",
" <td>2023-11-09 07:51:34.935983+01:00</td>\n",
" <td>8007697</td>\n",
" <td>405689</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>NaN</td>\n",
" <td>dba9aa428f843b79ae69dfacfe8fc579</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1826668</th>\n",
" <td>20662816</td>\n",
" <td>13593016154501</td>\n",
" <td>2023-11-09 07:51:34.937038+01:00</td>\n",
" <td>2023-11-09 07:51:34.937038+01:00</td>\n",
" <td>8007698</td>\n",
" <td>403658</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>NaN</td>\n",
" <td>93f1fcfc6ba4fa68f92eb4b4a619fcf0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1826669</th>\n",
" <td>20662817</td>\n",
" <td>13593016154680</td>\n",
" <td>2023-11-09 07:51:34.938224+01:00</td>\n",
" <td>2023-11-09 07:51:34.938224+01:00</td>\n",
" <td>8007698</td>\n",
" <td>403658</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>NaN</td>\n",
" <td>c8bbbd25df2c158767ceef42c3237f23</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1826670</th>\n",
" <td>20662818</td>\n",
" <td>13593016154899</td>\n",
" <td>2023-11-09 07:51:34.939328+01:00</td>\n",
" <td>2023-11-09 07:51:34.939328+01:00</td>\n",
" <td>8007699</td>\n",
" <td>403658</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>NaN</td>\n",
" <td>738f0a8b5088b5056bc3b32eff2dca1f</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1826671</th>\n",
" <td>20662819</td>\n",
" <td>13593016154988</td>\n",
" <td>2023-11-09 07:51:34.940680+01:00</td>\n",
" <td>2023-11-09 07:51:34.940680+01:00</td>\n",
" <td>8007699</td>\n",
" <td>403658</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>NaN</td>\n",
" <td>4c5a6195434377380b4e6ae63b2e9cf6</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1826672 rows × 11 columns</p>\n",
"</div>"
],
"text/plain": [
" id number created_at \\\n",
"0 13070859 13593002661288 2021-12-28 20:47:10.320641+01:00 \n",
"1 13070860 13593002661399 2021-12-28 20:47:10.321037+01:00 \n",
"2 13070861 13593002661419 2021-12-28 20:47:10.321629+01:00 \n",
"3 13070862 13593002661508 2021-12-28 20:47:10.322029+01:00 \n",
"4 13070863 13593002661689 2021-12-28 20:47:10.322449+01:00 \n",
"... ... ... ... \n",
"1826667 20662815 13593016154390 2023-11-09 07:51:34.935983+01:00 \n",
"1826668 20662816 13593016154501 2023-11-09 07:51:34.937038+01:00 \n",
"1826669 20662817 13593016154680 2023-11-09 07:51:34.938224+01:00 \n",
"1826670 20662818 13593016154899 2023-11-09 07:51:34.939328+01:00 \n",
"1826671 20662819 13593016154988 2023-11-09 07:51:34.940680+01:00 \n",
"\n",
" updated_at purchase_id product_id \\\n",
"0 2022-02-14 18:46:53.614229+01:00 5107462 225251 \n",
"1 2022-02-14 18:46:53.614761+01:00 5107462 224914 \n",
"2 2022-02-14 18:46:53.615521+01:00 5107462 224914 \n",
"3 2022-02-14 18:46:53.616000+01:00 5107462 224914 \n",
"4 2022-02-14 18:46:53.616447+01:00 5107462 224914 \n",
"... ... ... ... \n",
"1826667 2023-11-09 07:51:34.935983+01:00 8007697 405689 \n",
"1826668 2023-11-09 07:51:34.937038+01:00 8007698 403658 \n",
"1826669 2023-11-09 07:51:34.938224+01:00 8007698 403658 \n",
"1826670 2023-11-09 07:51:34.939328+01:00 8007699 403658 \n",
"1826671 2023-11-09 07:51:34.940680+01:00 8007699 403658 \n",
"\n",
" is_from_subscription type_of supplier_id barcode \\\n",
"0 False 1 3 NaN \n",
"1 False 1 3 NaN \n",
"2 False 1 3 NaN \n",
"3 False 1 3 NaN \n",
"4 False 1 3 NaN \n",
"... ... ... ... ... \n",
"1826667 False 1 3 NaN \n",
"1826668 False 1 3 NaN \n",
"1826669 False 1 3 NaN \n",
"1826670 False 1 3 NaN \n",
"1826671 False 1 3 NaN \n",
"\n",
" identifier \n",
"0 b6ad7fc36f33b5e05f58c7fca06688a6 \n",
"1 b0903af480266f27802fe5c38c277c9e \n",
"2 64ca12b7e26a65b90335c0702ea0faba \n",
"3 5ac2f8150aa9f3a6b1599df08cc2f0c7 \n",
"4 dfe30081bae020d12094279926136b9c \n",
"... ... \n",
"1826667 dba9aa428f843b79ae69dfacfe8fc579 \n",
"1826668 93f1fcfc6ba4fa68f92eb4b4a619fcf0 \n",
"1826669 c8bbbd25df2c158767ceef42c3237f23 \n",
"1826670 738f0a8b5088b5056bc3b32eff2dca1f \n",
"1826671 4c5a6195434377380b4e6ae63b2e9cf6 \n",
"\n",
"[1826672 rows x 11 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1_tickets"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "f3c35394-b586-4ae4-b5ab-b03bb01bb618",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 1826672 entries, 0 to 1826671\n",
"Data columns (total 11 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 id int64 \n",
" 1 number object \n",
" 2 created_at object \n",
" 3 updated_at object \n",
" 4 purchase_id int64 \n",
" 5 product_id int64 \n",
" 6 is_from_subscription bool \n",
" 7 type_of int64 \n",
" 8 supplier_id int64 \n",
" 9 barcode float64\n",
" 10 identifier object \n",
"dtypes: bool(1), float64(1), int64(5), object(4)\n",
"memory usage: 141.1+ MB\n"
]
}
],
"source": [
"df1_tickets.info()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "c1b42769-03c7-4785-92ce-5e1e6b41908d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"id 0.0\n",
"number 0.0\n",
"created_at 0.0\n",
"updated_at 0.0\n",
"purchase_id 0.0\n",
"product_id 0.0\n",
"is_from_subscription 0.0\n",
"type_of 0.0\n",
"supplier_id 0.0\n",
"barcode 100.0\n",
"identifier 0.0\n",
"dtype: float64"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1_tickets.isna().sum()/len(df1_tickets)*100"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "42896791-2d93-4725-a50b-6c7cbe535ec7",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_619/232847087.py:3: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df1_tickets_clean.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n"
]
}
],
"source": [
"# Selection des variables\n",
"df1_tickets_clean = df1_tickets.drop(['lastname', 'firstname', 'email', 'created_at', 'updated_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode'], axis = 1, inplace=True)\n",
"df1_tickets_clean.rename(columns = {'id' : 'customer_id'}, inplace = True)"
]
},
{
"cell_type": "markdown",
"id": "78453f3c-4f89-44ed-a6c6-2a7443b72b52",
"metadata": {},
"source": [
"## suppliers.csv"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "2e0dada0-9457-484c-aa55-77e44613ecca",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>manually_added</th>\n",
" <th>label</th>\n",
" <th>itr</th>\n",
" <th>updated_at</th>\n",
" <th>created_at</th>\n",
" <th>commission</th>\n",
" <th>identifier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1617</td>\n",
" <td>j4 administration</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2021-07-29 09:21:37.325772+02:00</td>\n",
" <td>2021-07-29 09:21:37.325772+02:00</td>\n",
" <td>NaN</td>\n",
" <td>5958b2a060ac3e31678b438892a1bd2e</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>8</td>\n",
" <td>non défini</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2020-09-03 13:16:35.329062+02:00</td>\n",
" <td>2020-09-03 13:16:35.329062+02:00</td>\n",
" <td>NaN</td>\n",
" <td>52ff3466787b4d538407372e5f7afe0f</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>4</td>\n",
" <td>vad</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2020-09-03 13:11:23.896992+02:00</td>\n",
" <td>2020-09-03 13:11:23.896992+02:00</td>\n",
" <td>NaN</td>\n",
" <td>1225483c97b36018cab2bea14ab78ea6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>fort saint jean</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2020-09-03 13:11:23.833073+02:00</td>\n",
" <td>2020-09-03 13:11:23.833073+02:00</td>\n",
" <td>NaN</td>\n",
" <td>001b9b4a524fe407150b8235b304d4ec</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2</td>\n",
" <td>j4</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2020-09-03 13:11:23.888993+02:00</td>\n",
" <td>2020-09-03 13:11:23.888993+02:00</td>\n",
" <td>NaN</td>\n",
" <td>6a0cf6edf20060344b465706b61719aa</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>5</td>\n",
" <td>revendeur</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2020-09-03 13:11:23.900987+02:00</td>\n",
" <td>2020-09-03 13:11:23.900987+02:00</td>\n",
" <td>NaN</td>\n",
" <td>931239d4acb6214d7e5c98edecfb4916</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>3</td>\n",
" <td>vente en ligne</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2020-09-03 13:11:23.893097+02:00</td>\n",
" <td>2020-09-03 13:11:23.893097+02:00</td>\n",
" <td>NaN</td>\n",
" <td>bde8f2ccff510df8572d3214d86b837d</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>6</td>\n",
" <td>ccr</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2020-09-03 13:11:23.904974+02:00</td>\n",
" <td>2020-09-03 13:11:23.904974+02:00</td>\n",
" <td>NaN</td>\n",
" <td>b48ec279411f7dbbb68393c61a9724d9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>7</td>\n",
" <td>dab</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2020-09-03 13:11:23.908970+02:00</td>\n",
" <td>2020-09-03 13:11:23.908970+02:00</td>\n",
" <td>NaN</td>\n",
" <td>11c6d471fa4e354e62e684d293694202</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id name manually_added label itr \\\n",
"0 1617 j4 administration False NaN NaN \n",
"1 8 non défini False NaN NaN \n",
"2 4 vad False NaN NaN \n",
"3 1 fort saint jean False NaN NaN \n",
"4 2 j4 False NaN NaN \n",
"5 5 revendeur False NaN NaN \n",
"6 3 vente en ligne False NaN NaN \n",
"7 6 ccr False NaN NaN \n",
"8 7 dab False NaN NaN \n",
"\n",
" updated_at created_at \\\n",
"0 2021-07-29 09:21:37.325772+02:00 2021-07-29 09:21:37.325772+02:00 \n",
"1 2020-09-03 13:16:35.329062+02:00 2020-09-03 13:16:35.329062+02:00 \n",
"2 2020-09-03 13:11:23.896992+02:00 2020-09-03 13:11:23.896992+02:00 \n",
"3 2020-09-03 13:11:23.833073+02:00 2020-09-03 13:11:23.833073+02:00 \n",
"4 2020-09-03 13:11:23.888993+02:00 2020-09-03 13:11:23.888993+02:00 \n",
"5 2020-09-03 13:11:23.900987+02:00 2020-09-03 13:11:23.900987+02:00 \n",
"6 2020-09-03 13:11:23.893097+02:00 2020-09-03 13:11:23.893097+02:00 \n",
"7 2020-09-03 13:11:23.904974+02:00 2020-09-03 13:11:23.904974+02:00 \n",
"8 2020-09-03 13:11:23.908970+02:00 2020-09-03 13:11:23.908970+02:00 \n",
"\n",
" commission identifier \n",
"0 NaN 5958b2a060ac3e31678b438892a1bd2e \n",
"1 NaN 52ff3466787b4d538407372e5f7afe0f \n",
"2 NaN 1225483c97b36018cab2bea14ab78ea6 \n",
"3 NaN 001b9b4a524fe407150b8235b304d4ec \n",
"4 NaN 6a0cf6edf20060344b465706b61719aa \n",
"5 NaN 931239d4acb6214d7e5c98edecfb4916 \n",
"6 NaN bde8f2ccff510df8572d3214d86b837d \n",
"7 NaN b48ec279411f7dbbb68393c61a9724d9 \n",
"8 NaN 11c6d471fa4e354e62e684d293694202 "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1_suppliers"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "b583be02-ab60-4e14-9325-0204f203a1af",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 9 entries, 0 to 8\n",
"Data columns (total 9 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 9 non-null int64 \n",
" 1 name 9 non-null object \n",
" 2 manually_added 9 non-null bool \n",
" 3 label 0 non-null float64\n",
" 4 itr 0 non-null float64\n",
" 5 updated_at 9 non-null object \n",
" 6 created_at 9 non-null object \n",
" 7 commission 0 non-null float64\n",
" 8 identifier 9 non-null object \n",
"dtypes: bool(1), float64(3), int64(1), object(4)\n",
"memory usage: 713.0+ bytes\n"
]
}
],
"source": [
"df1_suppliers.info()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "6d7f338e-e4d3-422b-9cdc-dec967c0b28e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"id 0.0\n",
"name 0.0\n",
"manually_added 0.0\n",
"label 100.0\n",
"itr 100.0\n",
"updated_at 0.0\n",
"created_at 0.0\n",
"commission 100.0\n",
"identifier 0.0\n",
"dtype: float64"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1_suppliers.isna().sum()/len(df1_suppliers)*100"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "3c645ab7-16bf-4054-9ae2-15a8c32e29c6",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_619/302783287.py:3: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df1_suppliers_clean.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n"
]
}
],
"source": [
"# Selection des variables\n",
"df1_suppliers_clean = df1_suppliers[['id', 'name']]\n",
"df1_suppliers_clean.rename(columns = {'name' : 'supplier_name'}, inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "4de7e2e2-6da4-4618-8444-b524399c5493",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>supplier_name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1617</td>\n",
" <td>j4 administration</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>8</td>\n",
" <td>non défini</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>4</td>\n",
" <td>vad</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>fort saint jean</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2</td>\n",
" <td>j4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>5</td>\n",
" <td>revendeur</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>3</td>\n",
" <td>vente en ligne</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>6</td>\n",
" <td>ccr</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>7</td>\n",
" <td>dab</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id supplier_name\n",
"0 1617 j4 administration\n",
"1 8 non défini\n",
"2 4 vad\n",
"3 1 fort saint jean\n",
"4 2 j4\n",
"5 5 revendeur\n",
"6 3 vente en ligne\n",
"7 6 ccr\n",
"8 7 dab"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1_suppliers_clean"
]
},
{
"cell_type": "markdown",
"id": "0a6df975-c7fc-45bc-92af-a0bdab17d795",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## type_ofs.csv"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "a02f6594-3e91-4e87-bbb6-649c28d4f7e9",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>children</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>identifier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Atelier</td>\n",
" <td>pricing_formula</td>\n",
" <td>2021-01-05 11:55:51.188106+01:00</td>\n",
" <td>2021-01-05 11:55:51.188106+01:00</td>\n",
" <td>623ec4067827558b28972cf39fe81ee7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>Billet en nombre</td>\n",
" <td>pricing_formula</td>\n",
" <td>2021-01-11 12:13:19.286301+01:00</td>\n",
" <td>2021-01-11 12:13:19.286301+01:00</td>\n",
" <td>a53d313a97296ee37caa066dbfe7a45c</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Groupe</td>\n",
" <td>pricing_formula</td>\n",
" <td>2021-01-11 12:19:22.842917+01:00</td>\n",
" <td>2021-01-11 12:19:22.842917+01:00</td>\n",
" <td>1ab143efc3b85acbbc752fe8eb2b0b86</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>Revendeur</td>\n",
" <td>pricing_formula</td>\n",
" <td>2021-01-12 12:34:20.481236+01:00</td>\n",
" <td>2021-01-12 12:34:20.481236+01:00</td>\n",
" <td>8b332723366a07e1eef5f1c92f9ae067</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>Cinéma scolaire</td>\n",
" <td>pricing_formula</td>\n",
" <td>2021-01-25 19:16:05.141719+01:00</td>\n",
" <td>2021-01-25 19:16:05.141719+01:00</td>\n",
" <td>a12e62cb4c4f47e7406bd8fbff2bfe30</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>6</td>\n",
" <td>Musée famille</td>\n",
" <td>pricing_formula</td>\n",
" <td>2021-01-25 19:23:06.692627+01:00</td>\n",
" <td>2021-01-25 19:23:06.692627+01:00</td>\n",
" <td>1ec6c19283111ccb3ed67f52d414470e</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>7</td>\n",
" <td>Spectacle famille</td>\n",
" <td>pricing_formula</td>\n",
" <td>2021-01-25 19:28:21.390016+01:00</td>\n",
" <td>2021-01-25 19:28:21.390016+01:00</td>\n",
" <td>05e2104f1b74ced229c06847d6e91938</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>8</td>\n",
" <td>Masterclass</td>\n",
" <td>pricing_formula</td>\n",
" <td>2021-01-25 19:31:05.076904+01:00</td>\n",
" <td>2021-01-25 19:31:05.076904+01:00</td>\n",
" <td>9cc946edfb25e11b4282f58db16e6ae9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>9</td>\n",
" <td>Spectacle</td>\n",
" <td>pricing_formula</td>\n",
" <td>2021-01-25 19:38:41.260535+01:00</td>\n",
" <td>2021-01-25 19:38:41.260535+01:00</td>\n",
" <td>d88321c347f0e0ab101184cdf25c94bf</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>10</td>\n",
" <td>Cinema</td>\n",
" <td>pricing_formula</td>\n",
" <td>2021-02-05 11:12:31.932576+01:00</td>\n",
" <td>2021-02-05 11:12:31.932576+01:00</td>\n",
" <td>0870fef2bfcd5b30a12e4f5c7f4aaba7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>11</td>\n",
" <td>Musee</td>\n",
" <td>pricing_formula</td>\n",
" <td>2021-02-05 11:52:05.468207+01:00</td>\n",
" <td>2021-02-05 11:52:05.468207+01:00</td>\n",
" <td>8ba8934454cc62c7cdb3eb6e1b39df0c</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>12</td>\n",
" <td>Tarifs plein</td>\n",
" <td>category</td>\n",
" <td>2023-03-13 11:31:50.528331+01:00</td>\n",
" <td>2023-03-13 11:31:50.528331+01:00</td>\n",
" <td>a6969df76efc15d157be48e87a7bcf9a</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id name children created_at \\\n",
"0 1 Atelier pricing_formula 2021-01-05 11:55:51.188106+01:00 \n",
"1 2 Billet en nombre pricing_formula 2021-01-11 12:13:19.286301+01:00 \n",
"2 3 Groupe pricing_formula 2021-01-11 12:19:22.842917+01:00 \n",
"3 4 Revendeur pricing_formula 2021-01-12 12:34:20.481236+01:00 \n",
"4 5 Cinéma scolaire pricing_formula 2021-01-25 19:16:05.141719+01:00 \n",
"5 6 Musée famille pricing_formula 2021-01-25 19:23:06.692627+01:00 \n",
"6 7 Spectacle famille pricing_formula 2021-01-25 19:28:21.390016+01:00 \n",
"7 8 Masterclass pricing_formula 2021-01-25 19:31:05.076904+01:00 \n",
"8 9 Spectacle pricing_formula 2021-01-25 19:38:41.260535+01:00 \n",
"9 10 Cinema pricing_formula 2021-02-05 11:12:31.932576+01:00 \n",
"10 11 Musee pricing_formula 2021-02-05 11:52:05.468207+01:00 \n",
"11 12 Tarifs plein category 2023-03-13 11:31:50.528331+01:00 \n",
"\n",
" updated_at identifier \n",
"0 2021-01-05 11:55:51.188106+01:00 623ec4067827558b28972cf39fe81ee7 \n",
"1 2021-01-11 12:13:19.286301+01:00 a53d313a97296ee37caa066dbfe7a45c \n",
"2 2021-01-11 12:19:22.842917+01:00 1ab143efc3b85acbbc752fe8eb2b0b86 \n",
"3 2021-01-12 12:34:20.481236+01:00 8b332723366a07e1eef5f1c92f9ae067 \n",
"4 2021-01-25 19:16:05.141719+01:00 a12e62cb4c4f47e7406bd8fbff2bfe30 \n",
"5 2021-01-25 19:23:06.692627+01:00 1ec6c19283111ccb3ed67f52d414470e \n",
"6 2021-01-25 19:28:21.390016+01:00 05e2104f1b74ced229c06847d6e91938 \n",
"7 2021-01-25 19:31:05.076904+01:00 9cc946edfb25e11b4282f58db16e6ae9 \n",
"8 2021-01-25 19:38:41.260535+01:00 d88321c347f0e0ab101184cdf25c94bf \n",
"9 2021-02-05 11:12:31.932576+01:00 0870fef2bfcd5b30a12e4f5c7f4aaba7 \n",
"10 2021-02-05 11:52:05.468207+01:00 8ba8934454cc62c7cdb3eb6e1b39df0c \n",
"11 2023-03-13 11:31:50.528331+01:00 a6969df76efc15d157be48e87a7bcf9a "
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1_type_ofs"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "e9c8d32b-22f4-4581-8af7-31cc1c31fa0e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 12 entries, 0 to 11\n",
"Data columns (total 6 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 12 non-null int64 \n",
" 1 name 12 non-null object\n",
" 2 children 12 non-null object\n",
" 3 created_at 12 non-null object\n",
" 4 updated_at 12 non-null object\n",
" 5 identifier 12 non-null object\n",
"dtypes: int64(1), object(5)\n",
"memory usage: 704.0+ bytes\n"
]
}
],
"source": [
"df1_type_ofs.info()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "cbb5e614-1fe5-4da0-bca0-8a242e0885da",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_619/81842251.py:3: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df1_type_ofs_clean.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)\n"
]
}
],
"source": [
"# Selection des variables\n",
"df1_type_ofs_clean = df1_type_ofs[['id', 'name', 'children']]\n",
"df1_type_ofs_clean.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)"
]
},
{
"cell_type": "markdown",
"id": "676a9869-9a8b-4cd2-8b1c-0644b5229c72",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## purchases.csv"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "f8d36b72-f8e7-45e5-b4fa-e0803493fd3c",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>purchase_date</th>\n",
" <th>customer_id</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>number</th>\n",
" <th>identifier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>5145662</td>\n",
" <td>2019-07-17 11:17:53+02:00</td>\n",
" <td>6632</td>\n",
" <td>2021-12-28 20:48:51.569237+01:00</td>\n",
" <td>2021-12-28 20:48:51.569237+01:00</td>\n",
" <td>fa80c83b29a268b45728c910a8afcf79</td>\n",
" <td>82877c41df26f832eb823a83acd1a172</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>4941642</td>\n",
" <td>2018-10-31 11:59:00+01:00</td>\n",
" <td>1</td>\n",
" <td>2021-12-28 20:31:48.196681+01:00</td>\n",
" <td>2022-03-03 17:52:21.958861+01:00</td>\n",
" <td>597b6c06adfe6acc539b29b657b80da0</td>\n",
" <td>e7102ebe65526c427245533ebabe66e5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>5088860</td>\n",
" <td>2018-10-31 12:45:12+01:00</td>\n",
" <td>1</td>\n",
" <td>2021-12-28 20:46:34.703542+01:00</td>\n",
" <td>2021-12-28 20:46:34.703542+01:00</td>\n",
" <td>4a7f6baaf9be6a99e3fead7f7e981fa8</td>\n",
" <td>af75c4ae53d1b6957875538355b162e1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>5088862</td>\n",
" <td>2018-10-31 13:07:12+01:00</td>\n",
" <td>1</td>\n",
" <td>2021-12-28 20:46:34.704773+01:00</td>\n",
" <td>2021-12-28 20:46:34.704773+01:00</td>\n",
" <td>1d83dfad44b73070d1c6d5875d0edd2d</td>\n",
" <td>4b2fe34659b177209b07270ae1043b40</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5088863</td>\n",
" <td>2018-10-31 13:08:50+01:00</td>\n",
" <td>1</td>\n",
" <td>2021-12-28 20:46:34.705453+01:00</td>\n",
" <td>2021-12-28 20:46:34.705453+01:00</td>\n",
" <td>7bfe2bc9c1670c973d0960e3fd408cf8</td>\n",
" <td>b115f04a99b94df9e4a32185844f0998</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>742245</th>\n",
" <td>8007695</td>\n",
" <td>2023-11-08 17:51:19+01:00</td>\n",
" <td>1256133</td>\n",
" <td>2023-11-09 07:51:33.920187+01:00</td>\n",
" <td>2023-11-09 07:51:33.920187+01:00</td>\n",
" <td>99ad774dedbad43feb73514765d2f0ba</td>\n",
" <td>d68558180b4bf2e8a945724843655775</td>\n",
" </tr>\n",
" <tr>\n",
" <th>742246</th>\n",
" <td>8007696</td>\n",
" <td>2023-11-08 18:17:51+01:00</td>\n",
" <td>1256134</td>\n",
" <td>2023-11-09 07:51:33.921967+01:00</td>\n",
" <td>2023-11-09 07:51:33.921967+01:00</td>\n",
" <td>c1511614c511c5f95980172690179102</td>\n",
" <td>f5102d910a7731091f239ad7b0df35b4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>742247</th>\n",
" <td>8007697</td>\n",
" <td>2023-11-08 18:23:54+01:00</td>\n",
" <td>1256135</td>\n",
" <td>2023-11-09 07:51:33.923034+01:00</td>\n",
" <td>2023-11-09 07:51:33.923034+01:00</td>\n",
" <td>33b64b39cc53428b4f17d65ff5b93104</td>\n",
" <td>e2b917626be60cc2c3207cc037fe69e4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>742248</th>\n",
" <td>8007698</td>\n",
" <td>2023-11-08 19:32:18+01:00</td>\n",
" <td>1256136</td>\n",
" <td>2023-11-09 07:51:33.924135+01:00</td>\n",
" <td>2023-11-09 07:51:33.924135+01:00</td>\n",
" <td>9ae0b129e704b3d9c093ce9c7c4e5039</td>\n",
" <td>5bfa23236c31f8562c3a0233c1b53b31</td>\n",
" </tr>\n",
" <tr>\n",
" <th>742249</th>\n",
" <td>8007699</td>\n",
" <td>2023-11-08 20:30:28+01:00</td>\n",
" <td>1256137</td>\n",
" <td>2023-11-09 07:51:33.925382+01:00</td>\n",
" <td>2023-11-09 07:51:33.925382+01:00</td>\n",
" <td>d31ced089c2b1f90479257a4686f9306</td>\n",
" <td>d86b1e0de3ff01eaf04fbcd031ac5fef</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>742250 rows × 7 columns</p>\n",
"</div>"
],
"text/plain": [
" id purchase_date customer_id \\\n",
"0 5145662 2019-07-17 11:17:53+02:00 6632 \n",
"1 4941642 2018-10-31 11:59:00+01:00 1 \n",
"2 5088860 2018-10-31 12:45:12+01:00 1 \n",
"3 5088862 2018-10-31 13:07:12+01:00 1 \n",
"4 5088863 2018-10-31 13:08:50+01:00 1 \n",
"... ... ... ... \n",
"742245 8007695 2023-11-08 17:51:19+01:00 1256133 \n",
"742246 8007696 2023-11-08 18:17:51+01:00 1256134 \n",
"742247 8007697 2023-11-08 18:23:54+01:00 1256135 \n",
"742248 8007698 2023-11-08 19:32:18+01:00 1256136 \n",
"742249 8007699 2023-11-08 20:30:28+01:00 1256137 \n",
"\n",
" created_at updated_at \\\n",
"0 2021-12-28 20:48:51.569237+01:00 2021-12-28 20:48:51.569237+01:00 \n",
"1 2021-12-28 20:31:48.196681+01:00 2022-03-03 17:52:21.958861+01:00 \n",
"2 2021-12-28 20:46:34.703542+01:00 2021-12-28 20:46:34.703542+01:00 \n",
"3 2021-12-28 20:46:34.704773+01:00 2021-12-28 20:46:34.704773+01:00 \n",
"4 2021-12-28 20:46:34.705453+01:00 2021-12-28 20:46:34.705453+01:00 \n",
"... ... ... \n",
"742245 2023-11-09 07:51:33.920187+01:00 2023-11-09 07:51:33.920187+01:00 \n",
"742246 2023-11-09 07:51:33.921967+01:00 2023-11-09 07:51:33.921967+01:00 \n",
"742247 2023-11-09 07:51:33.923034+01:00 2023-11-09 07:51:33.923034+01:00 \n",
"742248 2023-11-09 07:51:33.924135+01:00 2023-11-09 07:51:33.924135+01:00 \n",
"742249 2023-11-09 07:51:33.925382+01:00 2023-11-09 07:51:33.925382+01:00 \n",
"\n",
" number identifier \n",
"0 fa80c83b29a268b45728c910a8afcf79 82877c41df26f832eb823a83acd1a172 \n",
"1 597b6c06adfe6acc539b29b657b80da0 e7102ebe65526c427245533ebabe66e5 \n",
"2 4a7f6baaf9be6a99e3fead7f7e981fa8 af75c4ae53d1b6957875538355b162e1 \n",
"3 1d83dfad44b73070d1c6d5875d0edd2d 4b2fe34659b177209b07270ae1043b40 \n",
"4 7bfe2bc9c1670c973d0960e3fd408cf8 b115f04a99b94df9e4a32185844f0998 \n",
"... ... ... \n",
"742245 99ad774dedbad43feb73514765d2f0ba d68558180b4bf2e8a945724843655775 \n",
"742246 c1511614c511c5f95980172690179102 f5102d910a7731091f239ad7b0df35b4 \n",
"742247 33b64b39cc53428b4f17d65ff5b93104 e2b917626be60cc2c3207cc037fe69e4 \n",
"742248 9ae0b129e704b3d9c093ce9c7c4e5039 5bfa23236c31f8562c3a0233c1b53b31 \n",
"742249 d31ced089c2b1f90479257a4686f9306 d86b1e0de3ff01eaf04fbcd031ac5fef \n",
"\n",
"[742250 rows x 7 columns]"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1_purchases"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "3f266a9d-6eee-4b27-b6cc-d401bc2fa0b8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 742250 entries, 0 to 742249\n",
"Data columns (total 7 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 742250 non-null int64 \n",
" 1 purchase_date 742250 non-null object\n",
" 2 customer_id 742250 non-null int64 \n",
" 3 created_at 742250 non-null object\n",
" 4 updated_at 742250 non-null object\n",
" 5 number 742250 non-null object\n",
" 6 identifier 742250 non-null object\n",
"dtypes: int64(2), object(5)\n",
"memory usage: 39.6+ MB\n"
]
}
],
"source": [
"df1_purchases.info()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "8b24ccbc-ccf0-4722-8cd9-8ee8aa90d1fd",
"metadata": {},
"outputs": [],
"source": [
"# Nettoyage purchase_date\n",
"df1_purchases['purchase_date'] = pd.to_datetime(df1_purchases['purchase_date'], utc = True)\n",
"df1_purchases['purchase_date'] = pd.to_datetime(df1_purchases['purchase_date'], format = 'ISO8601')"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "27d18584-228f-4698-85d6-4d23151ea5ed",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 742250 entries, 0 to 742249\n",
"Data columns (total 7 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 742250 non-null int64 \n",
" 1 purchase_date 742250 non-null datetime64[ns, UTC]\n",
" 2 customer_id 742250 non-null int64 \n",
" 3 created_at 742250 non-null object \n",
" 4 updated_at 742250 non-null object \n",
" 5 number 742250 non-null object \n",
" 6 identifier 742250 non-null object \n",
"dtypes: datetime64[ns, UTC](1), int64(2), object(4)\n",
"memory usage: 39.6+ MB\n"
]
}
],
"source": [
"df1_purchases.info()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "ea22e3a2-2b25-481d-8ebc-194e11a06cd9",
"metadata": {},
"outputs": [],
"source": [
"# Selection des variables\n",
"df1_purchases_clean = df1_purchases[['id', 'purchase_date', 'customer_id']]"
]
},
{
"cell_type": "markdown",
"id": "53227600-c1c5-48aa-9f5d-db5a23a8a22a",
"metadata": {},
"source": [
"## Fusion de l'ensemble des données billétiques"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "e0b8b47a-b321-4a79-823c-36a131a78ac7",
"metadata": {},
"outputs": [],
"source": [
"# Fusion avec fournisseurs\n",
"df1_ticket_information = pd.merge(df1_tickets_clean, df1_suppliers_clean, left_on = 'supplier_id', right_on = 'id', how = 'inner')\n",
"df1_ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)\n",
"\n",
"# Fusion avec type de tickets\n",
"df1_ticket_information = pd.merge(df1_ticket_information, df1_type_ofs_clean, left_on = 'type_of', right_on = 'id', how = 'inner')\n",
"df1_ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)\n",
"\n",
"# Fusion avec achats\n",
"df1_ticket_information = pd.merge(df1_ticket_information, df1_purchases_clean, left_on = 'purchase_id', right_on = 'id', how = 'inner')\n",
"df1_ticket_information.drop(['purchase_id', 'id'], axis = 1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "7572e6e7-f28d-43ba-b045-b9fa09e68e1d",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ticket_id</th>\n",
" <th>product_id</th>\n",
" <th>is_from_subscription</th>\n",
" <th>supplier_name</th>\n",
" <th>type_of_ticket_name</th>\n",
" <th>children</th>\n",
" <th>purchase_date</th>\n",
" <th>customer_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>13070859</td>\n",
" <td>225251</td>\n",
" <td>False</td>\n",
" <td>vente en ligne</td>\n",
" <td>Atelier</td>\n",
" <td>pricing_formula</td>\n",
" <td>2018-12-28 14:47:50+00:00</td>\n",
" <td>48187</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>13070860</td>\n",
" <td>224914</td>\n",
" <td>False</td>\n",
" <td>vente en ligne</td>\n",
" <td>Atelier</td>\n",
" <td>pricing_formula</td>\n",
" <td>2018-12-28 14:47:50+00:00</td>\n",
" <td>48187</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>13070861</td>\n",
" <td>224914</td>\n",
" <td>False</td>\n",
" <td>vente en ligne</td>\n",
" <td>Atelier</td>\n",
" <td>pricing_formula</td>\n",
" <td>2018-12-28 14:47:50+00:00</td>\n",
" <td>48187</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>13070862</td>\n",
" <td>224914</td>\n",
" <td>False</td>\n",
" <td>vente en ligne</td>\n",
" <td>Atelier</td>\n",
" <td>pricing_formula</td>\n",
" <td>2018-12-28 14:47:50+00:00</td>\n",
" <td>48187</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>13070863</td>\n",
" <td>224914</td>\n",
" <td>False</td>\n",
" <td>vente en ligne</td>\n",
" <td>Atelier</td>\n",
" <td>pricing_formula</td>\n",
" <td>2018-12-28 14:47:50+00:00</td>\n",
" <td>48187</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1826667</th>\n",
" <td>20662815</td>\n",
" <td>405689</td>\n",
" <td>False</td>\n",
" <td>vente en ligne</td>\n",
" <td>Atelier</td>\n",
" <td>pricing_formula</td>\n",
" <td>2023-11-08 17:23:54+00:00</td>\n",
" <td>1256135</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1826668</th>\n",
" <td>20662816</td>\n",
" <td>403658</td>\n",
" <td>False</td>\n",
" <td>vente en ligne</td>\n",
" <td>Atelier</td>\n",
" <td>pricing_formula</td>\n",
" <td>2023-11-08 18:32:18+00:00</td>\n",
" <td>1256136</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1826669</th>\n",
" <td>20662817</td>\n",
" <td>403658</td>\n",
" <td>False</td>\n",
" <td>vente en ligne</td>\n",
" <td>Atelier</td>\n",
" <td>pricing_formula</td>\n",
" <td>2023-11-08 18:32:18+00:00</td>\n",
" <td>1256136</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1826670</th>\n",
" <td>20662818</td>\n",
" <td>403658</td>\n",
" <td>False</td>\n",
" <td>vente en ligne</td>\n",
" <td>Atelier</td>\n",
" <td>pricing_formula</td>\n",
" <td>2023-11-08 19:30:28+00:00</td>\n",
" <td>1256137</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1826671</th>\n",
" <td>20662819</td>\n",
" <td>403658</td>\n",
" <td>False</td>\n",
" <td>vente en ligne</td>\n",
" <td>Atelier</td>\n",
" <td>pricing_formula</td>\n",
" <td>2023-11-08 19:30:28+00:00</td>\n",
" <td>1256137</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1826672 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" ticket_id product_id is_from_subscription supplier_name \\\n",
"0 13070859 225251 False vente en ligne \n",
"1 13070860 224914 False vente en ligne \n",
"2 13070861 224914 False vente en ligne \n",
"3 13070862 224914 False vente en ligne \n",
"4 13070863 224914 False vente en ligne \n",
"... ... ... ... ... \n",
"1826667 20662815 405689 False vente en ligne \n",
"1826668 20662816 403658 False vente en ligne \n",
"1826669 20662817 403658 False vente en ligne \n",
"1826670 20662818 403658 False vente en ligne \n",
"1826671 20662819 403658 False vente en ligne \n",
"\n",
" type_of_ticket_name children purchase_date \\\n",
"0 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n",
"1 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n",
"2 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n",
"3 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n",
"4 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n",
"... ... ... ... \n",
"1826667 Atelier pricing_formula 2023-11-08 17:23:54+00:00 \n",
"1826668 Atelier pricing_formula 2023-11-08 18:32:18+00:00 \n",
"1826669 Atelier pricing_formula 2023-11-08 18:32:18+00:00 \n",
"1826670 Atelier pricing_formula 2023-11-08 19:30:28+00:00 \n",
"1826671 Atelier pricing_formula 2023-11-08 19:30:28+00:00 \n",
"\n",
" customer_id \n",
"0 48187 \n",
"1 48187 \n",
"2 48187 \n",
"3 48187 \n",
"4 48187 \n",
"... ... \n",
"1826667 1256135 \n",
"1826668 1256136 \n",
"1826669 1256136 \n",
"1826670 1256137 \n",
"1826671 1256137 \n",
"\n",
"[1826672 rows x 8 columns]"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1_ticket_information"
]
},
{
"cell_type": "markdown",
"id": "ad2d0059-76d3-44b9-b0eb-0b0ca4d4ba75",
"metadata": {},
"source": [
"# Utilisation de fonctions"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "c1afe322-ff41-4760-819e-0195fed5b27d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 20 entries, 0 to 19\n",
"Data columns (total 2 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 opened_at 8 non-null object \n",
" 1 opened_at_clean 8 non-null datetime64[ns, UTC]\n",
"dtypes: datetime64[ns, UTC](1), object(1)\n",
"memory usage: 448.0+ bytes\n"
]
}
],
"source": [
"# Créer un DataFrame exemple\n",
"df_not_clean = df1_campaign_stats[['opened_at']].head(20)\n",
"\n",
"# Appliquer la fonction pour nettoyer la colonne 'purchase_date' de manière vectorisée\n",
"df_clean = cleaning_date(df_not_clean, 'opened_at')\n",
"df_clean.rename(columns = {'opened_at' : 'opened_at_clean'}, inplace = True)\n",
"\n",
"test = pd.concat([df1_campaign_stats[['opened_at']].head(20), df_clean], axis=1)\n",
"\n",
"test.info()"
]
},
{
"cell_type": "markdown",
"id": "27ecf058-23eb-4018-abbd-68c4ebe7c786",
"metadata": {},
"source": [
"## Nettoyage, selection et fusion"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "d887898c-6a21-41ed-901d-4d6fdbca5372",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ticket_id</th>\n",
" <th>product_id</th>\n",
" <th>is_from_subscription</th>\n",
" <th>type_of</th>\n",
" <th>supplier_name</th>\n",
" <th>purchase_date</th>\n",
" <th>customer_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>13070859</td>\n",
" <td>225251</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>vente en ligne</td>\n",
" <td>2018-12-28 14:47:50+00:00</td>\n",
" <td>48187</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>13070860</td>\n",
" <td>224914</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>vente en ligne</td>\n",
" <td>2018-12-28 14:47:50+00:00</td>\n",
" <td>48187</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>13070861</td>\n",
" <td>224914</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>vente en ligne</td>\n",
" <td>2018-12-28 14:47:50+00:00</td>\n",
" <td>48187</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>13070862</td>\n",
" <td>224914</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>vente en ligne</td>\n",
" <td>2018-12-28 14:47:50+00:00</td>\n",
" <td>48187</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>13070863</td>\n",
" <td>224914</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>vente en ligne</td>\n",
" <td>2018-12-28 14:47:50+00:00</td>\n",
" <td>48187</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1826667</th>\n",
" <td>20662815</td>\n",
" <td>405689</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>vente en ligne</td>\n",
" <td>2023-11-08 17:23:54+00:00</td>\n",
" <td>1256135</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1826668</th>\n",
" <td>20662816</td>\n",
" <td>403658</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>vente en ligne</td>\n",
" <td>2023-11-08 18:32:18+00:00</td>\n",
" <td>1256136</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1826669</th>\n",
" <td>20662817</td>\n",
" <td>403658</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>vente en ligne</td>\n",
" <td>2023-11-08 18:32:18+00:00</td>\n",
" <td>1256136</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1826670</th>\n",
" <td>20662818</td>\n",
" <td>403658</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>vente en ligne</td>\n",
" <td>2023-11-08 19:30:28+00:00</td>\n",
" <td>1256137</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1826671</th>\n",
" <td>20662819</td>\n",
" <td>403658</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>vente en ligne</td>\n",
" <td>2023-11-08 19:30:28+00:00</td>\n",
" <td>1256137</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1826672 rows × 7 columns</p>\n",
"</div>"
],
"text/plain": [
" ticket_id product_id is_from_subscription type_of supplier_name \\\n",
"0 13070859 225251 False 1 vente en ligne \n",
"1 13070860 224914 False 1 vente en ligne \n",
"2 13070861 224914 False 1 vente en ligne \n",
"3 13070862 224914 False 1 vente en ligne \n",
"4 13070863 224914 False 1 vente en ligne \n",
"... ... ... ... ... ... \n",
"1826667 20662815 405689 False 1 vente en ligne \n",
"1826668 20662816 403658 False 1 vente en ligne \n",
"1826669 20662817 403658 False 1 vente en ligne \n",
"1826670 20662818 403658 False 1 vente en ligne \n",
"1826671 20662819 403658 False 1 vente en ligne \n",
"\n",
" purchase_date customer_id \n",
"0 2018-12-28 14:47:50+00:00 48187 \n",
"1 2018-12-28 14:47:50+00:00 48187 \n",
"2 2018-12-28 14:47:50+00:00 48187 \n",
"3 2018-12-28 14:47:50+00:00 48187 \n",
"4 2018-12-28 14:47:50+00:00 48187 \n",
"... ... ... \n",
"1826667 2023-11-08 17:23:54+00:00 1256135 \n",
"1826668 2023-11-08 18:32:18+00:00 1256136 \n",
"1826669 2023-11-08 18:32:18+00:00 1256136 \n",
"1826670 2023-11-08 19:30:28+00:00 1256137 \n",
"1826671 2023-11-08 19:30:28+00:00 1256137 \n",
"\n",
"[1826672 rows x 7 columns]"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1_ticket_information"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "ac9a6373-c1c6-46b5-873b-dc22f17bcbdb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 1826672 entries, 0 to 1826671\n",
"Data columns (total 7 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 ticket_id int64 \n",
" 1 product_id int64 \n",
" 2 is_from_subscription bool \n",
" 3 type_of int64 \n",
" 4 supplier_name object \n",
" 5 purchase_date datetime64[ns, UTC]\n",
" 6 customer_id int64 \n",
"dtypes: bool(1), datetime64[ns, UTC](1), int64(4), object(1)\n",
"memory usage: 85.4+ MB\n"
]
}
],
"source": [
"df1_ticket_information.info()"
]
},
{
"cell_type": "markdown",
"id": "b1719943-89eb-4ba0-a107-2f96d5d01ec9",
"metadata": {},
"source": [
"# Customer information"
]
},
{
"cell_type": "markdown",
"id": "a2132ee2-3f22-45fd-b65b-72689c8b672c",
"metadata": {},
"source": [
"## Target area"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "da5d4708-7147-4cc8-8686-52d4bcba5a7a",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_15285/2625134041.py:3: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df1_targets_clean.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n"
]
}
],
"source": [
"# Target.csv cleaning\n",
"df1_targets_clean = df1_targets[[\"id\", \"target_type_id\", \"name\"]]\n",
"df1_targets_clean.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n",
"\n",
"# target_type cleaning\n",
"df1_target_types_clean = df1_target_types[[\"id\",\"is_import\",\"name\"]].add_prefix(\"target_type_\")\n",
"\n",
"#customer_target_mappings cleaning\n",
"df1_customer_target_mappings_clean = df1_customer_target_mappings[[\"id\", \"customer_id\", \"target_id\"]]\n",
"\n",
"# Merge target et target_type\n",
"df1_targets_full = pd.merge(df1_targets_clean, df1_target_types_clean, left_on='target_type_id', right_on='target_type_id', how='inner')\n",
"df1_targets_full.drop(['target_type_id'], axis = 1, inplace=True)\n",
"\n",
"# Merge\n",
"df1_targets_full = pd.merge(df1_customer_target_mappings_clean, df1_targets_full, left_on='target_id', right_on='target_id', how='inner')\n",
"df1_targets_full.drop(['target_id'], axis = 1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "b4fa5fe3-ce8e-4b0a-af94-fb468d241bad",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"id 5.080902\n",
"dtype: float64"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1_targets_test = df1_targets_full[['id', 'customer_id']].groupby(['customer_id']).count()\n",
"len(df1_targets_test[df1_targets_test['id'] > 1]) / len(df1_targets_test)\n",
"\n",
"# 99,6% des 151 000 client visés sont catégorisés plusieurs fois et en moyenne 5 fois... \n",
"df1_targets_test.mean()\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "c74746de-0bf4-4b83-9a75-f1d3183abf1c",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>customer_id</th>\n",
" <th>target_name</th>\n",
" <th>target_type_is_import</th>\n",
" <th>target_type_name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1184824</td>\n",
" <td>645400</td>\n",
" <td>DDCP PROMO Réseau livres</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>210571</td>\n",
" <td>2412</td>\n",
" <td>DDCP PROMO Réseau livres</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>210572</td>\n",
" <td>4536</td>\n",
" <td>DDCP PROMO Réseau livres</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>210573</td>\n",
" <td>6736</td>\n",
" <td>DDCP PROMO Réseau livres</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>210574</td>\n",
" <td>38210</td>\n",
" <td>DDCP PROMO Réseau livres</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id customer_id target_name target_type_is_import \\\n",
"0 1184824 645400 DDCP PROMO Réseau livres False \n",
"1 210571 2412 DDCP PROMO Réseau livres False \n",
"2 210572 4536 DDCP PROMO Réseau livres False \n",
"3 210573 6736 DDCP PROMO Réseau livres False \n",
"4 210574 38210 DDCP PROMO Réseau livres False \n",
"\n",
" target_type_name \n",
"0 manual_static_filter \n",
"1 manual_static_filter \n",
"2 manual_static_filter \n",
"3 manual_static_filter \n",
"4 manual_static_filter "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1_targets_full.head()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "47c55fa0-b2f3-46f9-9abf-c4ab66bd9fcb",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n",
"[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...\n",
"[nltk_data] Package wordnet is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Catégorisation des target_name\n",
"import pandas as pd\n",
"import nltk\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.corpus import stopwords\n",
"from nltk.stem import WordNetLemmatizer\n",
"from nltk.probability import FreqDist\n",
"\n",
"# Téléchargement des ressources nécessaires\n",
"nltk.download('punkt')\n",
"nltk.download('stopwords')\n",
"nltk.download('wordnet')\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "8af1aeb9-ebdd-4286-a14c-3b7d801ea172",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mots les plus fréquents:\n",
"consentement: 550777\n",
"optin: 463579\n",
"jeune: 155103\n",
"public: 155103\n",
"mediation: 150001\n"
]
}
],
"source": [
"# Définition des fonctions de tokenisation, suppression des mots vides et lemmatisation\n",
"def preprocess_text(texte):\n",
" # Concaténation des éléments de la liste en une seule chaîne de caractères\n",
" texte_concat = ' '.join(texte)\n",
" \n",
" # Tokenisation des mots\n",
" tokens = word_tokenize(texte_concat.lower())\n",
" \n",
" # Suppression des mots vides (stopwords)\n",
" stop_words = set(stopwords.words('french'))\n",
" filtered_tokens = [word for word in tokens if word not in stop_words]\n",
" \n",
" # Lemmatisation des mots\n",
" lemmatizer = WordNetLemmatizer()\n",
" lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]\n",
" \n",
" return lemmatized_tokens\n",
"\n",
"\n",
"# Appliquer le prétraitement à la colonne de texte\n",
"df1_targets_full['target_name_tokened'] = df1_targets_full['target_name'].apply(preprocess_text)\n",
"\n",
"# Concaténer les listes de mots pour obtenir une liste de tous les mots dans le corpus\n",
"all_words = [word for tokens in df1_targets_full['target_name_tokened'] for word in tokens]\n",
"\n",
"# Calculer la fréquence des mots\n",
"freq_dist = FreqDist(all_words)\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "ceb069e5-76c9-46e4-9ea7-8c16eb4ed3cd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mots les plus fréquents:\n",
"consentement: 550777\n",
"optin: 463579\n",
"jeune: 155103\n",
"public: 155103\n",
"mediation: 150001\n",
"specialisee: 150001\n",
"b2c: 143432\n",
"optout: 97683\n",
"newsletter: 56022\n",
"(: 46084\n",
"): 46084\n",
"inscrits: 42296\n",
"nl: 42294\n",
"générale: 41037\n",
"generale: 40950\n"
]
}
],
"source": [
"# Affichage des mots les plus fréquents\n",
"print(\"Mots les plus fréquents:\")\n",
"for mot, freq in freq_dist.most_common(15):\n",
" print(f\"{mot}: {freq}\")"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "8bffef87-542e-4775-bc7c-2c0323fda581",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" texte \\\n",
"0 Le chat noir mange une souris. \n",
"1 Le chien blanc aboie. \n",
"\n",
" texte_preprocessed \n",
"0 [e, h, a, o, i, r, a, g, e, u, e, o, u, r, i, .] \n",
"1 [e, h, i, e, b, a, a, b, o, i, e, .] \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n",
"[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...\n",
"[nltk_data] Package wordnet is already up-to-date!\n"
]
}
],
"source": [
"import pandas as pd\n",
"import nltk\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.corpus import stopwords\n",
"from nltk.stem import WordNetLemmatizer\n",
"\n",
"# Téléchargement des ressources nécessaires\n",
"nltk.download('punkt')\n",
"nltk.download('stopwords')\n",
"nltk.download('wordnet')\n",
"\n",
"# Création de la DataFrame d'exemple\n",
"data = {'texte': [\"Le chat noir mange une souris.\", \"Le chien blanc aboie.\"]}\n",
"df = pd.DataFrame(data)\n",
"\n",
"# Fonction pour prétraiter le texte\n",
"def preprocess_text(texte):\n",
" # Concaténation des éléments de la liste en une seule chaîne de caractères\n",
" texte_concat = ' '.join(texte)\n",
" \n",
" # Tokenisation des mots\n",
" tokens = word_tokenize(texte_concat.lower())\n",
" \n",
" # Suppression des mots vides (stopwords)\n",
" stop_words = set(stopwords.words('french'))\n",
" filtered_tokens = [word for word in tokens if word not in stop_words]\n",
" \n",
" # Lemmatisation des mots\n",
" lemmatizer = WordNetLemmatizer()\n",
" lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]\n",
" \n",
" return lemmatized_tokens\n",
"\n",
"# Appliquer la fonction de prétraitement à la colonne de texte\n",
"df['texte_preprocessed'] = df['texte'].apply(preprocess_text)\n",
"\n",
"# Afficher le résultat\n",
"print(df)\n"
]
},
{
"cell_type": "markdown",
"id": "2f665824-a026-4acd-8358-b408a61854b4",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## Campaign area"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "5d05203c-ea30-4208-a29f-fef7737c672e",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_9792/1967867975.py:15: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
"/tmp/ipykernel_9792/1967867975.py:15: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
"/tmp/ipykernel_9792/1967867975.py:15: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n"
]
}
],
"source": [
"# campaign_stats cleaning \n",
"df1_campaign_stats_clean = df1_campaign_stats[[\"id\", \"campaign_id\", \"customer_id\", \"opened_at\", \"sent_at\", \"delivered_at\"]]\n",
"cleaning_date(df1_campaign_stats_clean, 'opened_at')\n",
"cleaning_date(df1_campaign_stats_clean, 'sent_at')\n",
"cleaning_date(df1_campaign_stats_clean, 'delivered_at')\n",
"\n",
"# campaigns cleaning\n",
"df1_campaigns_clean = df1_campaigns[[\"id\", \"name\", \"service_id\", \"sent_at\"]].add_prefix(\"campaign_\")\n",
"cleaning_date(df1_campaigns_clean, 'campaign_sent_at')\n",
"\n",
"# Merge \n",
"df1_campaigns_full = pd.merge(df1_campaign_stats_clean, df1_campaigns_clean, on = \"campaign_id\", how = \"left\")\n",
"df1_campaigns_full.drop(['campaign_id'], axis = 1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "8ac634cf-2a30-4ccc-a34d-0fd401a49aaa",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 6214808 entries, 0 to 6214807\n",
"Data columns (total 8 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 id int64 \n",
" 1 customer_id int64 \n",
" 2 opened_at datetime64[ns, UTC]\n",
" 3 sent_at datetime64[ns, UTC]\n",
" 4 delivered_at datetime64[ns, UTC]\n",
" 5 campaign_name object \n",
" 6 campaign_service_id int64 \n",
" 7 campaign_sent_at datetime64[ns, UTC]\n",
"dtypes: datetime64[ns, UTC](4), int64(3), object(1)\n",
"memory usage: 379.3+ MB\n"
]
}
],
"source": [
"df1_campaigns_full.info()"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "7d22cdd5-2060-4922-8e04-27b613d4ee27",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>customer_id</th>\n",
" <th>opened_at</th>\n",
" <th>sent_at</th>\n",
" <th>delivered_at</th>\n",
" <th>campaign_name</th>\n",
" <th>campaign_service_id</th>\n",
" <th>campaign_sent_at</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>19793</td>\n",
" <td>112597</td>\n",
" <td>NaT</td>\n",
" <td>2021-03-28 16:01:09+00:00</td>\n",
" <td>2021-03-28 16:24:18+00:00</td>\n",
" <td>Le Mucem chez vous, gardons le lien #22</td>\n",
" <td>404</td>\n",
" <td>2021-03-27 23:00:00+00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>14211</td>\n",
" <td>113666</td>\n",
" <td>NaT</td>\n",
" <td>2021-03-28 16:01:09+00:00</td>\n",
" <td>2021-03-28 16:21:02+00:00</td>\n",
" <td>Le Mucem chez vous, gardons le lien #22</td>\n",
" <td>404</td>\n",
" <td>2021-03-27 23:00:00+00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>13150</td>\n",
" <td>280561</td>\n",
" <td>NaT</td>\n",
" <td>2021-03-28 16:00:59+00:00</td>\n",
" <td>2021-03-28 16:08:45+00:00</td>\n",
" <td>Le Mucem chez vous, gardons le lien #22</td>\n",
" <td>404</td>\n",
" <td>2021-03-27 23:00:00+00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>7073</td>\n",
" <td>101007</td>\n",
" <td>2021-03-28 18:11:06+00:00</td>\n",
" <td>2021-03-28 16:00:59+00:00</td>\n",
" <td>2021-03-28 16:09:47+00:00</td>\n",
" <td>Le Mucem chez vous, gardons le lien #22</td>\n",
" <td>404</td>\n",
" <td>2021-03-27 23:00:00+00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5175</td>\n",
" <td>103972</td>\n",
" <td>NaT</td>\n",
" <td>2021-03-28 16:01:06+00:00</td>\n",
" <td>2021-03-28 16:05:03+00:00</td>\n",
" <td>Le Mucem chez vous, gardons le lien #22</td>\n",
" <td>404</td>\n",
" <td>2021-03-27 23:00:00+00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6214803</th>\n",
" <td>8302994</td>\n",
" <td>266155</td>\n",
" <td>2023-10-23 09:43:25+00:00</td>\n",
" <td>2023-10-23 09:32:33+00:00</td>\n",
" <td>2023-10-23 09:32:34+00:00</td>\n",
" <td>dre_nov_2023</td>\n",
" <td>1318</td>\n",
" <td>2023-10-23 09:31:17+00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6214804</th>\n",
" <td>8303307</td>\n",
" <td>21355</td>\n",
" <td>2023-10-23 09:44:02+00:00</td>\n",
" <td>2023-10-23 09:32:49+00:00</td>\n",
" <td>2023-10-23 09:32:49+00:00</td>\n",
" <td>dre_nov_2023</td>\n",
" <td>1318</td>\n",
" <td>2023-10-23 09:31:17+00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6214805</th>\n",
" <td>8304346</td>\n",
" <td>21849</td>\n",
" <td>2023-10-23 09:45:52+00:00</td>\n",
" <td>2023-10-23 09:33:28+00:00</td>\n",
" <td>2023-10-23 09:33:29+00:00</td>\n",
" <td>dre_nov_2023</td>\n",
" <td>1318</td>\n",
" <td>2023-10-23 09:31:17+00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6214806</th>\n",
" <td>8302037</td>\n",
" <td>667789</td>\n",
" <td>2023-10-23 09:47:32+00:00</td>\n",
" <td>2023-10-23 09:31:53+00:00</td>\n",
" <td>2023-10-23 09:31:54+00:00</td>\n",
" <td>dre_nov_2023</td>\n",
" <td>1318</td>\n",
" <td>2023-10-23 09:31:17+00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6214807</th>\n",
" <td>8304939</td>\n",
" <td>294154</td>\n",
" <td>NaT</td>\n",
" <td>2023-10-23 09:33:54+00:00</td>\n",
" <td>2023-10-23 09:33:55+00:00</td>\n",
" <td>dre_nov_2023</td>\n",
" <td>1318</td>\n",
" <td>2023-10-23 09:31:17+00:00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>6214808 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" id customer_id opened_at \\\n",
"0 19793 112597 NaT \n",
"1 14211 113666 NaT \n",
"2 13150 280561 NaT \n",
"3 7073 101007 2021-03-28 18:11:06+00:00 \n",
"4 5175 103972 NaT \n",
"... ... ... ... \n",
"6214803 8302994 266155 2023-10-23 09:43:25+00:00 \n",
"6214804 8303307 21355 2023-10-23 09:44:02+00:00 \n",
"6214805 8304346 21849 2023-10-23 09:45:52+00:00 \n",
"6214806 8302037 667789 2023-10-23 09:47:32+00:00 \n",
"6214807 8304939 294154 NaT \n",
"\n",
" sent_at delivered_at \\\n",
"0 2021-03-28 16:01:09+00:00 2021-03-28 16:24:18+00:00 \n",
"1 2021-03-28 16:01:09+00:00 2021-03-28 16:21:02+00:00 \n",
"2 2021-03-28 16:00:59+00:00 2021-03-28 16:08:45+00:00 \n",
"3 2021-03-28 16:00:59+00:00 2021-03-28 16:09:47+00:00 \n",
"4 2021-03-28 16:01:06+00:00 2021-03-28 16:05:03+00:00 \n",
"... ... ... \n",
"6214803 2023-10-23 09:32:33+00:00 2023-10-23 09:32:34+00:00 \n",
"6214804 2023-10-23 09:32:49+00:00 2023-10-23 09:32:49+00:00 \n",
"6214805 2023-10-23 09:33:28+00:00 2023-10-23 09:33:29+00:00 \n",
"6214806 2023-10-23 09:31:53+00:00 2023-10-23 09:31:54+00:00 \n",
"6214807 2023-10-23 09:33:54+00:00 2023-10-23 09:33:55+00:00 \n",
"\n",
" campaign_name campaign_service_id \\\n",
"0 Le Mucem chez vous, gardons le lien #22 404 \n",
"1 Le Mucem chez vous, gardons le lien #22 404 \n",
"2 Le Mucem chez vous, gardons le lien #22 404 \n",
"3 Le Mucem chez vous, gardons le lien #22 404 \n",
"4 Le Mucem chez vous, gardons le lien #22 404 \n",
"... ... ... \n",
"6214803 dre_nov_2023 1318 \n",
"6214804 dre_nov_2023 1318 \n",
"6214805 dre_nov_2023 1318 \n",
"6214806 dre_nov_2023 1318 \n",
"6214807 dre_nov_2023 1318 \n",
"\n",
" campaign_sent_at \n",
"0 2021-03-27 23:00:00+00:00 \n",
"1 2021-03-27 23:00:00+00:00 \n",
"2 2021-03-27 23:00:00+00:00 \n",
"3 2021-03-27 23:00:00+00:00 \n",
"4 2021-03-27 23:00:00+00:00 \n",
"... ... \n",
"6214803 2023-10-23 09:31:17+00:00 \n",
"6214804 2023-10-23 09:31:17+00:00 \n",
"6214805 2023-10-23 09:31:17+00:00 \n",
"6214806 2023-10-23 09:31:17+00:00 \n",
"6214807 2023-10-23 09:31:17+00:00 \n",
"\n",
"[6214808 rows x 8 columns]"
]
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1_campaigns_information"
]
},
{
"cell_type": "markdown",
"id": "0a5b24f0-4bca-4cde-a6ba-eb130b38cac4",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## Link area"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "bc63bc4e-6cc1-4d35-9635-faf55339e186",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>service_id</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>process_id</th>\n",
" <th>report_url</th>\n",
" <th>category</th>\n",
" <th>to_be_synced</th>\n",
" <th>identifier</th>\n",
" <th>sent_at</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1319613</td>\n",
" <td>newsletter enseignants janvier 2022</td>\n",
" <td>721</td>\n",
" <td>2022-01-14 16:06:42.586321+01:00</td>\n",
" <td>2022-02-03 14:17:27.112963+01:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" <td>aba3b6fd5d186d28e06ff97135cade7f</td>\n",
" <td>2022-01-14 00:00:00+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1319586</td>\n",
" <td>lsf_janvier_2022</td>\n",
" <td>717</td>\n",
" <td>2022-01-07 11:30:35.315895+01:00</td>\n",
" <td>2022-02-03 14:17:27.116171+01:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" <td>788d986905533aba051261497ecffcbb</td>\n",
" <td>2022-01-07 00:00:00+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1319282</td>\n",
" <td>Invitation à déjeuner au Mucem | Vernissage « ...</td>\n",
" <td>591</td>\n",
" <td>2021-09-28 12:50:24.448752+02:00</td>\n",
" <td>2022-02-03 14:17:27.119582+01:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" <td>3493894fa4ea036cfc6433c3e2ee63b0</td>\n",
" <td>2021-09-28 00:00:00+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1319283</td>\n",
" <td>Vacances de la Toussaint - centres des loisirs</td>\n",
" <td>590</td>\n",
" <td>2021-09-28 18:01:04.692073+02:00</td>\n",
" <td>2022-02-03 14:17:27.124408+01:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" <td>08b255a5d42b89b0585260b6f2360bdd</td>\n",
" <td>2021-09-28 00:00:00+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1319636</td>\n",
" <td>ddcp_promo_md_livemag</td>\n",
" <td>730</td>\n",
" <td>2022-01-27 18:00:41.053069+01:00</td>\n",
" <td>2022-02-03 14:17:27.127607+01:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" <td>d5cfead94f5350c12c322b5b664544c1</td>\n",
" <td>2022-01-27 00:00:00+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>952</th>\n",
" <td>1320072</td>\n",
" <td>dre_gaza0106</td>\n",
" <td>881</td>\n",
" <td>2022-05-26 09:01:35.523639+02:00</td>\n",
" <td>2022-12-02 17:51:22.614046+01:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" <td>7504adad8bb96320eb3afdd4df6e1f60</td>\n",
" <td>2022-05-26 00:00:00+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>953</th>\n",
" <td>661398</td>\n",
" <td>DDCP Plan Bis 4 - Marketing direct - MJ5C</td>\n",
" <td>183</td>\n",
" <td>2021-06-18 10:30:01.259578+02:00</td>\n",
" <td>2021-09-24 11:56:09.082785+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" <td>cedebb6e872f539bef8c3f919874e9d7</td>\n",
" <td>2020-07-27 00:00:00+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>954</th>\n",
" <td>1320487</td>\n",
" <td>Invitation portes ouvertes amitiés</td>\n",
" <td>988</td>\n",
" <td>2022-09-29 18:01:33.834090+02:00</td>\n",
" <td>2022-12-02 17:51:23.258324+01:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" <td>9908279ebbf1f9b250ba689db6a0222b</td>\n",
" <td>2022-09-29 00:00:00+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>955</th>\n",
" <td>906903</td>\n",
" <td>DDCP PROMO La méditerranée des philosophes #3 ...</td>\n",
" <td>310</td>\n",
" <td>2021-07-19 14:07:16.177390+02:00</td>\n",
" <td>2021-09-24 11:56:09.086101+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" <td>06eb61b839a0cefee4967c67ccb099dc</td>\n",
" <td>2020-12-23 00:00:00+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>956</th>\n",
" <td>579313</td>\n",
" <td>ddcp_promo_automation_manuel_pre_visit</td>\n",
" <td>481</td>\n",
" <td>2021-06-08 17:38:54.041310+02:00</td>\n",
" <td>2021-09-24 11:56:09.089394+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" <td>9461cce28ebe3e76fb4b931c35a169b0</td>\n",
" <td>2021-06-08 00:00:00+02:00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>957 rows × 11 columns</p>\n",
"</div>"
],
"text/plain": [
" id name service_id \\\n",
"0 1319613 newsletter enseignants janvier 2022 721 \n",
"1 1319586 lsf_janvier_2022 717 \n",
"2 1319282 Invitation à déjeuner au Mucem | Vernissage « ... 591 \n",
"3 1319283 Vacances de la Toussaint - centres des loisirs 590 \n",
"4 1319636 ddcp_promo_md_livemag 730 \n",
".. ... ... ... \n",
"952 1320072 dre_gaza0106 881 \n",
"953 661398 DDCP Plan Bis 4 - Marketing direct - MJ5C 183 \n",
"954 1320487 Invitation portes ouvertes amitiés 988 \n",
"955 906903 DDCP PROMO La méditerranée des philosophes #3 ... 310 \n",
"956 579313 ddcp_promo_automation_manuel_pre_visit 481 \n",
"\n",
" created_at updated_at \\\n",
"0 2022-01-14 16:06:42.586321+01:00 2022-02-03 14:17:27.112963+01:00 \n",
"1 2022-01-07 11:30:35.315895+01:00 2022-02-03 14:17:27.116171+01:00 \n",
"2 2021-09-28 12:50:24.448752+02:00 2022-02-03 14:17:27.119582+01:00 \n",
"3 2021-09-28 18:01:04.692073+02:00 2022-02-03 14:17:27.124408+01:00 \n",
"4 2022-01-27 18:00:41.053069+01:00 2022-02-03 14:17:27.127607+01:00 \n",
".. ... ... \n",
"952 2022-05-26 09:01:35.523639+02:00 2022-12-02 17:51:22.614046+01:00 \n",
"953 2021-06-18 10:30:01.259578+02:00 2021-09-24 11:56:09.082785+02:00 \n",
"954 2022-09-29 18:01:33.834090+02:00 2022-12-02 17:51:23.258324+01:00 \n",
"955 2021-07-19 14:07:16.177390+02:00 2021-09-24 11:56:09.086101+02:00 \n",
"956 2021-06-08 17:38:54.041310+02:00 2021-09-24 11:56:09.089394+02:00 \n",
"\n",
" process_id report_url category to_be_synced \\\n",
"0 NaN NaN 0.0 False \n",
"1 NaN NaN 0.0 False \n",
"2 NaN NaN 0.0 False \n",
"3 NaN NaN 0.0 False \n",
"4 NaN NaN 0.0 False \n",
".. ... ... ... ... \n",
"952 NaN NaN 0.0 False \n",
"953 NaN NaN 0.0 False \n",
"954 NaN NaN 0.0 False \n",
"955 NaN NaN 0.0 False \n",
"956 NaN NaN 0.0 False \n",
"\n",
" identifier sent_at \n",
"0 aba3b6fd5d186d28e06ff97135cade7f 2022-01-14 00:00:00+01:00 \n",
"1 788d986905533aba051261497ecffcbb 2022-01-07 00:00:00+01:00 \n",
"2 3493894fa4ea036cfc6433c3e2ee63b0 2021-09-28 00:00:00+02:00 \n",
"3 08b255a5d42b89b0585260b6f2360bdd 2021-09-28 00:00:00+02:00 \n",
"4 d5cfead94f5350c12c322b5b664544c1 2022-01-27 00:00:00+01:00 \n",
".. ... ... \n",
"952 7504adad8bb96320eb3afdd4df6e1f60 2022-05-26 00:00:00+02:00 \n",
"953 cedebb6e872f539bef8c3f919874e9d7 2020-07-27 00:00:00+02:00 \n",
"954 9908279ebbf1f9b250ba689db6a0222b 2022-09-29 00:00:00+02:00 \n",
"955 06eb61b839a0cefee4967c67ccb099dc 2020-12-23 00:00:00+01:00 \n",
"956 9461cce28ebe3e76fb4b931c35a169b0 2021-06-08 00:00:00+02:00 \n",
"\n",
"[957 rows x 11 columns]"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1_campaigns"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "c19b321f-65f9-4d6c-8c1f-edb2eb9d70e7",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>clicked_at</th>\n",
" <th>link_id</th>\n",
" <th>customer_id</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2021-03-26 16:30:36+01:00</td>\n",
" <td>1</td>\n",
" <td>284033</td>\n",
" <td>2021-03-26 15:30:37.050161+01:00</td>\n",
" <td>2021-03-26 15:30:37.050161+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>2021-03-26 17:16:34+01:00</td>\n",
" <td>2</td>\n",
" <td>119768</td>\n",
" <td>2021-03-26 16:16:34.950871+01:00</td>\n",
" <td>2021-03-26 16:16:34.950871+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>272</td>\n",
" <td>2021-03-28 20:03:32+02:00</td>\n",
" <td>42</td>\n",
" <td>113105</td>\n",
" <td>2021-03-28 18:03:32.736394+02:00</td>\n",
" <td>2021-03-28 18:03:32.736394+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>2021-03-26 17:43:19+01:00</td>\n",
" <td>3</td>\n",
" <td>272280</td>\n",
" <td>2021-03-26 16:43:19.338321+01:00</td>\n",
" <td>2021-03-26 16:43:19.338321+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>2021-03-26 17:46:00+01:00</td>\n",
" <td>3</td>\n",
" <td>105095</td>\n",
" <td>2021-03-26 16:46:00.502945+01:00</td>\n",
" <td>2021-03-26 16:46:00.502945+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>151046</th>\n",
" <td>243553</td>\n",
" <td>2023-11-09 16:34:27+01:00</td>\n",
" <td>14666</td>\n",
" <td>998</td>\n",
" <td>2023-11-09 15:34:29.425425+01:00</td>\n",
" <td>2023-11-09 15:34:29.425425+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>151047</th>\n",
" <td>243554</td>\n",
" <td>2023-11-09 16:34:35+01:00</td>\n",
" <td>14670</td>\n",
" <td>998</td>\n",
" <td>2023-11-09 15:34:37.505505+01:00</td>\n",
" <td>2023-11-09 15:34:37.505505+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>151048</th>\n",
" <td>243559</td>\n",
" <td>2023-11-09 16:51:15+01:00</td>\n",
" <td>14686</td>\n",
" <td>82923</td>\n",
" <td>2023-11-09 15:51:17.439518+01:00</td>\n",
" <td>2023-11-09 15:51:17.439518+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>151049</th>\n",
" <td>243561</td>\n",
" <td>2023-11-09 16:59:42+01:00</td>\n",
" <td>14677</td>\n",
" <td>82923</td>\n",
" <td>2023-11-09 15:59:44.030922+01:00</td>\n",
" <td>2023-11-09 15:59:44.030922+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>151050</th>\n",
" <td>243564</td>\n",
" <td>2023-11-09 17:16:41+01:00</td>\n",
" <td>14691</td>\n",
" <td>1254355</td>\n",
" <td>2023-11-09 16:16:43.012932+01:00</td>\n",
" <td>2023-11-09 16:16:43.012932+01:00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>151051 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" id clicked_at link_id customer_id \\\n",
"0 1 2021-03-26 16:30:36+01:00 1 284033 \n",
"1 2 2021-03-26 17:16:34+01:00 2 119768 \n",
"2 272 2021-03-28 20:03:32+02:00 42 113105 \n",
"3 4 2021-03-26 17:43:19+01:00 3 272280 \n",
"4 5 2021-03-26 17:46:00+01:00 3 105095 \n",
"... ... ... ... ... \n",
"151046 243553 2023-11-09 16:34:27+01:00 14666 998 \n",
"151047 243554 2023-11-09 16:34:35+01:00 14670 998 \n",
"151048 243559 2023-11-09 16:51:15+01:00 14686 82923 \n",
"151049 243561 2023-11-09 16:59:42+01:00 14677 82923 \n",
"151050 243564 2023-11-09 17:16:41+01:00 14691 1254355 \n",
"\n",
" created_at updated_at \n",
"0 2021-03-26 15:30:37.050161+01:00 2021-03-26 15:30:37.050161+01:00 \n",
"1 2021-03-26 16:16:34.950871+01:00 2021-03-26 16:16:34.950871+01:00 \n",
"2 2021-03-28 18:03:32.736394+02:00 2021-03-28 18:03:32.736394+02:00 \n",
"3 2021-03-26 16:43:19.338321+01:00 2021-03-26 16:43:19.338321+01:00 \n",
"4 2021-03-26 16:46:00.502945+01:00 2021-03-26 16:46:00.502945+01:00 \n",
"... ... ... \n",
"151046 2023-11-09 15:34:29.425425+01:00 2023-11-09 15:34:29.425425+01:00 \n",
"151047 2023-11-09 15:34:37.505505+01:00 2023-11-09 15:34:37.505505+01:00 \n",
"151048 2023-11-09 15:51:17.439518+01:00 2023-11-09 15:51:17.439518+01:00 \n",
"151049 2023-11-09 15:59:44.030922+01:00 2023-11-09 15:59:44.030922+01:00 \n",
"151050 2023-11-09 16:16:43.012932+01:00 2023-11-09 16:16:43.012932+01:00 \n",
"\n",
"[151051 rows x 6 columns]"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1_link_stats"
]
},
{
"cell_type": "markdown",
"id": "96ea2523-38dc-47ef-a49e-2c2d9ad0b1c6",
"metadata": {},
"source": [
"## Exploration variables"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "aaa41688-ea7e-4dba-851c-1f0b0ec43c71",
"metadata": {},
"outputs": [],
"source": [
"# Fonction d'exploration pour suppliers.csv = label itr et commission inconnues\n",
"def suppliers_exploration(suppliers = None) : \n",
" \n",
" # Taux de NaN pour ces colonnes\n",
" label_na = suppliers['label'].isna().sum()/len(suppliers)*100\n",
" itr_na = suppliers['itr'].isna().sum()/len(suppliers)*100\n",
" commission_na = suppliers['commission'].isna().sum()/len(suppliers)*100\n",
"\n",
" suppliers_desc = pd.DataFrame({'nb_suppliers' : [suppliers['name'].nunique()],\n",
" 'label_na' : [label_na],\n",
" 'itr_na' : [itr_na],\n",
" 'commission_na' : [commission_na]})\n",
"\n",
" return suppliers_desc"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "2fecc2e1-113f-46ed-9065-0b9ee416166e",
"metadata": {},
"outputs": [],
"source": [
"df1_suppliers_desc = suppliers_exploration(suppliers = df1_suppliers)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "55f6170a-36fb-4efb-9810-f982883660cf",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_suppliers</th>\n",
" <th>label_na</th>\n",
" <th>itr_na</th>\n",
" <th>commission_na</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>9</td>\n",
" <td>100.0</td>\n",
" <td>100.0</td>\n",
" <td>100.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" nb_suppliers label_na itr_na commission_na\n",
"0 9 100.0 100.0 100.0"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1_suppliers_desc"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "0030fd02-09e3-42f5-9c83-290458a38c29",
"metadata": {},
"outputs": [],
"source": [
"BUCKET = \"bdc2324-data\"\n",
"liste_folders = fs.ls(BUCKET)\n",
"\n",
"liste_files = []\n",
"for company_folder in liste_folders : \n",
" liste_files.extend(fs.ls(company_folder))"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "6b1736d1-8fd7-4fcc-9431-b8bf0c7b4f2b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['bdc2324-data/1/1suppliers.csv', 'bdc2324-data/10/10suppliers.csv', 'bdc2324-data/101/101suppliers.csv', 'bdc2324-data/11/11suppliers.csv', 'bdc2324-data/12/12suppliers.csv', 'bdc2324-data/13/13suppliers.csv', 'bdc2324-data/14/14suppliers.csv', 'bdc2324-data/2/2suppliers.csv', 'bdc2324-data/3/3suppliers.csv', 'bdc2324-data/4/4suppliers.csv', 'bdc2324-data/5/5suppliers.csv', 'bdc2324-data/6/6suppliers.csv', 'bdc2324-data/7/7suppliers.csv', 'bdc2324-data/8/8suppliers.csv', 'bdc2324-data/9/9suppliers.csv']\n"
]
}
],
"source": [
"liste_database_select = ['suppliers']\n",
"\n",
"# Filtrer la liste pour les éléments contenant au moins un élément de la liste à tester\n",
"liste_suppliers = [element for element in liste_files if any(element_part in element for element_part in liste_database_select)]\n",
"\n",
"# Afficher le résultat\n",
"print(liste_suppliers)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "226b694b-0b00-4167-b69f-3178902254eb",
"metadata": {},
"outputs": [],
"source": [
"# loop to create dataframes from file 2\n",
"def database_loading(database_name = None):\n",
" files_path = database_name\n",
" \n",
" client_number = files_path[0].split(\"/\")[1]\n",
" df_prefix = \"df\" + str(client_number) + \"_\"\n",
" \n",
" for i in range(len(files_path)) :\n",
" current_path = files_path[i]\n",
" with fs.open(current_path, mode=\"rb\") as file_in:\n",
" df = pd.read_csv(file_in)\n",
" # the pattern of the name is df1xxx\n",
" nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n",
" globals()[nom_dataframe] = df\n",
"\n",
" "
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}