Ajout de fonctions pour nettoyage et exploration

This commit is contained in:
Antoine JOUBREL 2024-01-14 16:38:16 +00:00
parent 1852483b72
commit aa88c32722

View File

@ -103,7 +103,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_15896/4135596479.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
"/tmp/ipykernel_864/4135596479.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" df = pd.read_csv(file_in)\n"
]
}
@ -474,7 +474,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_15896/232847087.py:3: SettingWithCopyWarning: \n",
"/tmp/ipykernel_864/232847087.py:3: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
@ -756,7 +756,7 @@
},
{
"cell_type": "code",
"execution_count": 34,
"execution_count": 13,
"id": "3c645ab7-16bf-4054-9ae2-15a8c32e29c6",
"metadata": {},
"outputs": [
@ -764,7 +764,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_15896/302783287.py:3: SettingWithCopyWarning: \n",
"/tmp/ipykernel_864/302783287.py:3: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
@ -780,7 +780,7 @@
},
{
"cell_type": "code",
"execution_count": 35,
"execution_count": 14,
"id": "4de7e2e2-6da4-4618-8444-b524399c5493",
"metadata": {},
"outputs": [
@ -872,7 +872,7 @@
"8 7 dab"
]
},
"execution_count": 35,
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
@ -1108,7 +1108,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 17,
"id": "cbb5e614-1fe5-4da0-bca0-8a242e0885da",
"metadata": {},
"outputs": [
@ -1116,7 +1116,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_15896/81842251.py:3: SettingWithCopyWarning: \n",
"/tmp/ipykernel_864/81842251.py:3: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
@ -1140,7 +1140,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 18,
"id": "f8d36b72-f8e7-45e5-b4fa-e0803493fd3c",
"metadata": {
"scrolled": true
@ -1335,7 +1335,7 @@
"[742250 rows x 7 columns]"
]
},
"execution_count": 20,
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
@ -1346,7 +1346,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 19,
"id": "3f266a9d-6eee-4b27-b6cc-d401bc2fa0b8",
"metadata": {},
"outputs": [
@ -1377,7 +1377,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 20,
"id": "8b24ccbc-ccf0-4722-8cd9-8ee8aa90d1fd",
"metadata": {},
"outputs": [],
@ -1389,7 +1389,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 21,
"id": "27d18584-228f-4698-85d6-4d23151ea5ed",
"metadata": {},
"outputs": [
@ -1420,7 +1420,7 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 22,
"id": "ea22e3a2-2b25-481d-8ebc-194e11a06cd9",
"metadata": {},
"outputs": [],
@ -1439,7 +1439,7 @@
},
{
"cell_type": "code",
"execution_count": 38,
"execution_count": 23,
"id": "e0b8b47a-b321-4a79-823c-36a131a78ac7",
"metadata": {},
"outputs": [],
@ -1459,9 +1459,11 @@
},
{
"cell_type": "code",
"execution_count": 39,
"execution_count": 24,
"id": "7572e6e7-f28d-43ba-b045-b9fa09e68e1d",
"metadata": {},
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
@ -1664,7 +1666,7 @@
"[1826672 rows x 8 columns]"
]
},
"execution_count": 39,
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
@ -1673,13 +1675,479 @@
"df1_ticket_information"
]
},
{
"cell_type": "markdown",
"id": "ad2d0059-76d3-44b9-b0eb-0b0ca4d4ba75",
"metadata": {},
"source": [
"# Utilisation de fonctions"
]
},
{
"cell_type": "markdown",
"id": "27ecf058-23eb-4018-abbd-68c4ebe7c786",
"metadata": {},
"source": [
"## Nettoyage, selection et fusion"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 25,
"id": "b95464b1-26bc-4aac-84b4-45da83b92251",
"metadata": {},
"outputs": [],
"source": []
"source": [
"# Fonction de nettoyage et selection\n",
"def preprocessing_tickets_area(tickets = None, purchases = None, suppliers = None, type_ofs = None):\n",
" # Base des tickets\n",
" tickets = tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]\n",
" tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n",
"\n",
" # Base des fournisseurs\n",
" suppliers = suppliers[['id', 'name']]\n",
" suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n",
"\n",
" # Base des types de billets\n",
" type_ofs = type_ofs[['id', 'name', 'children']]\n",
" type_ofs.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)\n",
"\n",
" # Base des achats\n",
" # Nettoyage de la date d'achat\n",
" purchases['purchase_date'] = pd.to_datetime(purchases['purchase_date'], utc = True)\n",
" purchases['purchase_date'] = pd.to_datetime(purchases['purchase_date'], format = 'ISO8601')\n",
" # Selection des variables\n",
" purchases = purchases[['id', 'purchase_date', 'customer_id']]\n",
"\n",
" # Fusions \n",
" # Fusion avec fournisseurs\n",
" ticket_information = pd.merge(tickets, suppliers, left_on = 'supplier_id', right_on = 'id', how = 'inner')\n",
" ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)\n",
" \n",
" # Fusion avec type de tickets\n",
" ticket_information = pd.merge(ticket_information, type_ofs, left_on = 'type_of', right_on = 'id', how = 'inner')\n",
" ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)\n",
" \n",
" # Fusion avec achats\n",
" ticket_information = pd.merge(ticket_information, purchases, left_on = 'purchase_id', right_on = 'id', how = 'inner')\n",
" ticket_information.drop(['purchase_id', 'id'], axis = 1, inplace=True)\n",
"\n",
" return ticket_information"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "3e1d2ba7-ff4f-48eb-93a8-2bb648c70396",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_864/2452826288.py:5: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" tickets_clean.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n",
"/tmp/ipykernel_864/2452826288.py:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" suppliers_clean.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n",
"/tmp/ipykernel_864/2452826288.py:13: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" type_ofs_clean.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)\n"
]
}
],
"source": [
"df1_ticket_information_test = preprocessing_tickets_area(tickets = df1_tickets, purchases = df1_purchases, suppliers = df1_suppliers, type_ofs = df1_type_ofs)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "2877f3de-55d6-42d6-ad94-352d3e107862",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ticket_id</th>\n",
" <th>product_id</th>\n",
" <th>is_from_subscription</th>\n",
" <th>supplier_name</th>\n",
" <th>type_of_ticket_name</th>\n",
" <th>children</th>\n",
" <th>purchase_date</th>\n",
" <th>customer_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>13070859</td>\n",
" <td>225251</td>\n",
" <td>False</td>\n",
" <td>vente en ligne</td>\n",
" <td>Atelier</td>\n",
" <td>pricing_formula</td>\n",
" <td>2018-12-28 14:47:50+00:00</td>\n",
" <td>48187</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>13070860</td>\n",
" <td>224914</td>\n",
" <td>False</td>\n",
" <td>vente en ligne</td>\n",
" <td>Atelier</td>\n",
" <td>pricing_formula</td>\n",
" <td>2018-12-28 14:47:50+00:00</td>\n",
" <td>48187</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>13070861</td>\n",
" <td>224914</td>\n",
" <td>False</td>\n",
" <td>vente en ligne</td>\n",
" <td>Atelier</td>\n",
" <td>pricing_formula</td>\n",
" <td>2018-12-28 14:47:50+00:00</td>\n",
" <td>48187</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>13070862</td>\n",
" <td>224914</td>\n",
" <td>False</td>\n",
" <td>vente en ligne</td>\n",
" <td>Atelier</td>\n",
" <td>pricing_formula</td>\n",
" <td>2018-12-28 14:47:50+00:00</td>\n",
" <td>48187</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>13070863</td>\n",
" <td>224914</td>\n",
" <td>False</td>\n",
" <td>vente en ligne</td>\n",
" <td>Atelier</td>\n",
" <td>pricing_formula</td>\n",
" <td>2018-12-28 14:47:50+00:00</td>\n",
" <td>48187</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1826667</th>\n",
" <td>18643847</td>\n",
" <td>350454</td>\n",
" <td>False</td>\n",
" <td>vad</td>\n",
" <td>Billet en nombre</td>\n",
" <td>pricing_formula</td>\n",
" <td>2022-08-02 08:59:17+00:00</td>\n",
" <td>41</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1826668</th>\n",
" <td>19853111</td>\n",
" <td>383564</td>\n",
" <td>False</td>\n",
" <td>vad</td>\n",
" <td>Billet en nombre</td>\n",
" <td>pricing_formula</td>\n",
" <td>2022-11-04 14:25:42+00:00</td>\n",
" <td>62763</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1826669</th>\n",
" <td>19860514</td>\n",
" <td>383751</td>\n",
" <td>False</td>\n",
" <td>vad</td>\n",
" <td>Billet en nombre</td>\n",
" <td>pricing_formula</td>\n",
" <td>2022-11-18 10:47:26+00:00</td>\n",
" <td>1195566</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1826670</th>\n",
" <td>19860515</td>\n",
" <td>383751</td>\n",
" <td>False</td>\n",
" <td>vad</td>\n",
" <td>Billet en nombre</td>\n",
" <td>pricing_formula</td>\n",
" <td>2022-11-18 10:47:26+00:00</td>\n",
" <td>1195566</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1826671</th>\n",
" <td>19860516</td>\n",
" <td>383751</td>\n",
" <td>False</td>\n",
" <td>vad</td>\n",
" <td>Billet en nombre</td>\n",
" <td>pricing_formula</td>\n",
" <td>2022-11-18 10:47:26+00:00</td>\n",
" <td>1195566</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1826672 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" ticket_id product_id is_from_subscription supplier_name \\\n",
"0 13070859 225251 False vente en ligne \n",
"1 13070860 224914 False vente en ligne \n",
"2 13070861 224914 False vente en ligne \n",
"3 13070862 224914 False vente en ligne \n",
"4 13070863 224914 False vente en ligne \n",
"... ... ... ... ... \n",
"1826667 18643847 350454 False vad \n",
"1826668 19853111 383564 False vad \n",
"1826669 19860514 383751 False vad \n",
"1826670 19860515 383751 False vad \n",
"1826671 19860516 383751 False vad \n",
"\n",
" type_of_ticket_name children purchase_date \\\n",
"0 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n",
"1 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n",
"2 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n",
"3 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n",
"4 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n",
"... ... ... ... \n",
"1826667 Billet en nombre pricing_formula 2022-08-02 08:59:17+00:00 \n",
"1826668 Billet en nombre pricing_formula 2022-11-04 14:25:42+00:00 \n",
"1826669 Billet en nombre pricing_formula 2022-11-18 10:47:26+00:00 \n",
"1826670 Billet en nombre pricing_formula 2022-11-18 10:47:26+00:00 \n",
"1826671 Billet en nombre pricing_formula 2022-11-18 10:47:26+00:00 \n",
"\n",
" customer_id \n",
"0 48187 \n",
"1 48187 \n",
"2 48187 \n",
"3 48187 \n",
"4 48187 \n",
"... ... \n",
"1826667 41 \n",
"1826668 62763 \n",
"1826669 1195566 \n",
"1826670 1195566 \n",
"1826671 1195566 \n",
"\n",
"[1826672 rows x 8 columns]"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1_ticket_information"
]
},
{
"cell_type": "markdown",
"id": "96ea2523-38dc-47ef-a49e-2c2d9ad0b1c6",
"metadata": {},
"source": [
"## Exploration variables"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "aaa41688-ea7e-4dba-851c-1f0b0ec43c71",
"metadata": {},
"outputs": [],
"source": [
"# Fonction d'exploration pour suppliers.csv = label itr et commission inconnues\n",
"def suppliers_exploration(suppliers = None) : \n",
" \n",
" # Taux de NaN pour ces colonnes\n",
" label_na = suppliers['label'].isna().sum()/len(suppliers)*100\n",
" itr_na = suppliers['itr'].isna().sum()/len(suppliers)*100\n",
" commission_na = suppliers['commission'].isna().sum()/len(suppliers)*100\n",
"\n",
" suppliers_desc = pd.DataFrame({'nb_suppliers' : [suppliers['name'].nunique()],\n",
" 'label_na' : [label_na],\n",
" 'itr_na' : [itr_na],\n",
" 'commission_na' : [commission_na]})\n",
"\n",
" return suppliers_desc"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "2fecc2e1-113f-46ed-9065-0b9ee416166e",
"metadata": {},
"outputs": [],
"source": [
"df1_suppliers_desc = suppliers_exploration(suppliers = df1_suppliers)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "55f6170a-36fb-4efb-9810-f982883660cf",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_suppliers</th>\n",
" <th>label_na</th>\n",
" <th>itr_na</th>\n",
" <th>commission_na</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>9</td>\n",
" <td>100.0</td>\n",
" <td>100.0</td>\n",
" <td>100.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" nb_suppliers label_na itr_na commission_na\n",
"0 9 100.0 100.0 100.0"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1_suppliers_desc"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "0030fd02-09e3-42f5-9c83-290458a38c29",
"metadata": {},
"outputs": [],
"source": [
"BUCKET = \"bdc2324-data\"\n",
"liste_folders = fs.ls(BUCKET)\n",
"\n",
"liste_files = []\n",
"for company_folder in liste_folders : \n",
" liste_files.extend(fs.ls(company_folder))"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "6b1736d1-8fd7-4fcc-9431-b8bf0c7b4f2b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['bdc2324-data/1/1suppliers.csv', 'bdc2324-data/10/10suppliers.csv', 'bdc2324-data/101/101suppliers.csv', 'bdc2324-data/11/11suppliers.csv', 'bdc2324-data/12/12suppliers.csv', 'bdc2324-data/13/13suppliers.csv', 'bdc2324-data/14/14suppliers.csv', 'bdc2324-data/2/2suppliers.csv', 'bdc2324-data/3/3suppliers.csv', 'bdc2324-data/4/4suppliers.csv', 'bdc2324-data/5/5suppliers.csv', 'bdc2324-data/6/6suppliers.csv', 'bdc2324-data/7/7suppliers.csv', 'bdc2324-data/8/8suppliers.csv', 'bdc2324-data/9/9suppliers.csv']\n"
]
}
],
"source": [
"liste_database_select = ['suppliers']\n",
"\n",
"# Filtrer la liste pour les éléments contenant au moins un élément de la liste à tester\n",
"liste_suppliers = [element for element in liste_files if any(element_part in element for element_part in liste_database_select)]\n",
"\n",
"# Afficher le résultat\n",
"print(liste_suppliers)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "226b694b-0b00-4167-b69f-3178902254eb",
"metadata": {},
"outputs": [],
"source": [
"# loop to create dataframes from file 2\n",
"def database_loading(database_name = None):\n",
" files_path = database_name\n",
" \n",
" client_number = files_path[0].split(\"/\")[1]\n",
" df_prefix = \"df\" + str(client_number) + \"_\"\n",
" \n",
" for i in range(len(files_path)) :\n",
" current_path = files_path[i]\n",
" with fs.open(current_path, mode=\"rb\") as file_in:\n",
" df = pd.read_csv(file_in)\n",
" # the pattern of the name is df1xxx\n",
" nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n",
" globals()[nom_dataframe] = df\n",
"\n",
" "
]
}
],
"metadata": {