Ajout de fonctions pour nettoyage et exploration
This commit is contained in:
parent
1852483b72
commit
aa88c32722
|
@ -103,7 +103,7 @@
|
||||||
"name": "stderr",
|
"name": "stderr",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"/tmp/ipykernel_15896/4135596479.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
|
"/tmp/ipykernel_864/4135596479.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
|
||||||
" df = pd.read_csv(file_in)\n"
|
" df = pd.read_csv(file_in)\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
@ -474,7 +474,7 @@
|
||||||
"name": "stderr",
|
"name": "stderr",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"/tmp/ipykernel_15896/232847087.py:3: SettingWithCopyWarning: \n",
|
"/tmp/ipykernel_864/232847087.py:3: SettingWithCopyWarning: \n",
|
||||||
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
||||||
"\n",
|
"\n",
|
||||||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||||||
|
@ -756,7 +756,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 34,
|
"execution_count": 13,
|
||||||
"id": "3c645ab7-16bf-4054-9ae2-15a8c32e29c6",
|
"id": "3c645ab7-16bf-4054-9ae2-15a8c32e29c6",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
|
@ -764,7 +764,7 @@
|
||||||
"name": "stderr",
|
"name": "stderr",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"/tmp/ipykernel_15896/302783287.py:3: SettingWithCopyWarning: \n",
|
"/tmp/ipykernel_864/302783287.py:3: SettingWithCopyWarning: \n",
|
||||||
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
||||||
"\n",
|
"\n",
|
||||||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||||||
|
@ -780,7 +780,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 35,
|
"execution_count": 14,
|
||||||
"id": "4de7e2e2-6da4-4618-8444-b524399c5493",
|
"id": "4de7e2e2-6da4-4618-8444-b524399c5493",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
|
@ -872,7 +872,7 @@
|
||||||
"8 7 dab"
|
"8 7 dab"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 35,
|
"execution_count": 14,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
|
@ -1108,7 +1108,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 19,
|
"execution_count": 17,
|
||||||
"id": "cbb5e614-1fe5-4da0-bca0-8a242e0885da",
|
"id": "cbb5e614-1fe5-4da0-bca0-8a242e0885da",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
|
@ -1116,7 +1116,7 @@
|
||||||
"name": "stderr",
|
"name": "stderr",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"/tmp/ipykernel_15896/81842251.py:3: SettingWithCopyWarning: \n",
|
"/tmp/ipykernel_864/81842251.py:3: SettingWithCopyWarning: \n",
|
||||||
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
||||||
"\n",
|
"\n",
|
||||||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||||||
|
@ -1140,7 +1140,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 20,
|
"execution_count": 18,
|
||||||
"id": "f8d36b72-f8e7-45e5-b4fa-e0803493fd3c",
|
"id": "f8d36b72-f8e7-45e5-b4fa-e0803493fd3c",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"scrolled": true
|
"scrolled": true
|
||||||
|
@ -1335,7 +1335,7 @@
|
||||||
"[742250 rows x 7 columns]"
|
"[742250 rows x 7 columns]"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 20,
|
"execution_count": 18,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
|
@ -1346,7 +1346,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 21,
|
"execution_count": 19,
|
||||||
"id": "3f266a9d-6eee-4b27-b6cc-d401bc2fa0b8",
|
"id": "3f266a9d-6eee-4b27-b6cc-d401bc2fa0b8",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
|
@ -1377,7 +1377,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 22,
|
"execution_count": 20,
|
||||||
"id": "8b24ccbc-ccf0-4722-8cd9-8ee8aa90d1fd",
|
"id": "8b24ccbc-ccf0-4722-8cd9-8ee8aa90d1fd",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
|
@ -1389,7 +1389,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 23,
|
"execution_count": 21,
|
||||||
"id": "27d18584-228f-4698-85d6-4d23151ea5ed",
|
"id": "27d18584-228f-4698-85d6-4d23151ea5ed",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
|
@ -1420,7 +1420,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 24,
|
"execution_count": 22,
|
||||||
"id": "ea22e3a2-2b25-481d-8ebc-194e11a06cd9",
|
"id": "ea22e3a2-2b25-481d-8ebc-194e11a06cd9",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
|
@ -1439,7 +1439,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 38,
|
"execution_count": 23,
|
||||||
"id": "e0b8b47a-b321-4a79-823c-36a131a78ac7",
|
"id": "e0b8b47a-b321-4a79-823c-36a131a78ac7",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
|
@ -1459,9 +1459,11 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 39,
|
"execution_count": 24,
|
||||||
"id": "7572e6e7-f28d-43ba-b045-b9fa09e68e1d",
|
"id": "7572e6e7-f28d-43ba-b045-b9fa09e68e1d",
|
||||||
"metadata": {},
|
"metadata": {
|
||||||
|
"scrolled": true
|
||||||
|
},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
|
@ -1664,7 +1666,7 @@
|
||||||
"[1826672 rows x 8 columns]"
|
"[1826672 rows x 8 columns]"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 39,
|
"execution_count": 24,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
|
@ -1673,13 +1675,479 @@
|
||||||
"df1_ticket_information"
|
"df1_ticket_information"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "ad2d0059-76d3-44b9-b0eb-0b0ca4d4ba75",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Utilisation de fonctions"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "27ecf058-23eb-4018-abbd-68c4ebe7c786",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Nettoyage, sélection et fusion"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 25,
|
||||||
"id": "b95464b1-26bc-4aac-84b4-45da83b92251",
|
"id": "b95464b1-26bc-4aac-84b4-45da83b92251",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": []
|
"source": [
|
||||||
|
"# Fonction de nettoyage et selection\n",
|
||||||
|
"def preprocessing_tickets_area(tickets = None, purchases = None, suppliers = None, type_ofs = None):\n",
|
||||||
|
" # Base des tickets\n",
|
||||||
|
" tickets = tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]\n",
|
||||||
|
" tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n",
|
||||||
|
"\n",
|
||||||
|
" # Base des fournisseurs\n",
|
||||||
|
" suppliers = suppliers[['id', 'name']]\n",
|
||||||
|
" suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n",
|
||||||
|
"\n",
|
||||||
|
" # Base des types de billets\n",
|
||||||
|
" type_ofs = type_ofs[['id', 'name', 'children']]\n",
|
||||||
|
" type_ofs.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)\n",
|
||||||
|
"\n",
|
||||||
|
" # Base des achats\n",
|
||||||
|
" # Nettoyage de la date d'achat\n",
|
||||||
|
" purchases['purchase_date'] = pd.to_datetime(purchases['purchase_date'], utc = True)\n",
|
||||||
|
" purchases['purchase_date'] = pd.to_datetime(purchases['purchase_date'], format = 'ISO8601')\n",
|
||||||
|
" # Selection des variables\n",
|
||||||
|
" purchases = purchases[['id', 'purchase_date', 'customer_id']]\n",
|
||||||
|
"\n",
|
||||||
|
" # Fusions \n",
|
||||||
|
" # Fusion avec fournisseurs\n",
|
||||||
|
" ticket_information = pd.merge(tickets, suppliers, left_on = 'supplier_id', right_on = 'id', how = 'inner')\n",
|
||||||
|
" ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)\n",
|
||||||
|
" \n",
|
||||||
|
" # Fusion avec type de tickets\n",
|
||||||
|
" ticket_information = pd.merge(ticket_information, type_ofs, left_on = 'type_of', right_on = 'id', how = 'inner')\n",
|
||||||
|
" ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)\n",
|
||||||
|
" \n",
|
||||||
|
" # Fusion avec achats\n",
|
||||||
|
" ticket_information = pd.merge(ticket_information, purchases, left_on = 'purchase_id', right_on = 'id', how = 'inner')\n",
|
||||||
|
" ticket_information.drop(['purchase_id', 'id'], axis = 1, inplace=True)\n",
|
||||||
|
"\n",
|
||||||
|
" return ticket_information"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 28,
|
||||||
|
"id": "3e1d2ba7-ff4f-48eb-93a8-2bb648c70396",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"/tmp/ipykernel_864/2452826288.py:5: SettingWithCopyWarning: \n",
|
||||||
|
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
||||||
|
"\n",
|
||||||
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||||||
|
" tickets_clean.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n",
|
||||||
|
"/tmp/ipykernel_864/2452826288.py:9: SettingWithCopyWarning: \n",
|
||||||
|
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
||||||
|
"\n",
|
||||||
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||||||
|
" suppliers_clean.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n",
|
||||||
|
"/tmp/ipykernel_864/2452826288.py:13: SettingWithCopyWarning: \n",
|
||||||
|
"A value is trying to be set on a copy of a slice from a DataFrame\n",
|
||||||
|
"\n",
|
||||||
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||||||
|
" type_ofs_clean.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"df1_ticket_information_test = preprocessing_tickets_area(tickets = df1_tickets, purchases = df1_purchases, suppliers = df1_suppliers, type_ofs = df1_type_ofs)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 29,
|
||||||
|
"id": "2877f3de-55d6-42d6-ad94-352d3e107862",
|
||||||
|
"metadata": {
|
||||||
|
"scrolled": true
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>ticket_id</th>\n",
|
||||||
|
" <th>product_id</th>\n",
|
||||||
|
" <th>is_from_subscription</th>\n",
|
||||||
|
" <th>supplier_name</th>\n",
|
||||||
|
" <th>type_of_ticket_name</th>\n",
|
||||||
|
" <th>children</th>\n",
|
||||||
|
" <th>purchase_date</th>\n",
|
||||||
|
" <th>customer_id</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>13070859</td>\n",
|
||||||
|
" <td>225251</td>\n",
|
||||||
|
" <td>False</td>\n",
|
||||||
|
" <td>vente en ligne</td>\n",
|
||||||
|
" <td>Atelier</td>\n",
|
||||||
|
" <td>pricing_formula</td>\n",
|
||||||
|
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
||||||
|
" <td>48187</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>13070860</td>\n",
|
||||||
|
" <td>224914</td>\n",
|
||||||
|
" <td>False</td>\n",
|
||||||
|
" <td>vente en ligne</td>\n",
|
||||||
|
" <td>Atelier</td>\n",
|
||||||
|
" <td>pricing_formula</td>\n",
|
||||||
|
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
||||||
|
" <td>48187</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>13070861</td>\n",
|
||||||
|
" <td>224914</td>\n",
|
||||||
|
" <td>False</td>\n",
|
||||||
|
" <td>vente en ligne</td>\n",
|
||||||
|
" <td>Atelier</td>\n",
|
||||||
|
" <td>pricing_formula</td>\n",
|
||||||
|
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
||||||
|
" <td>48187</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>13070862</td>\n",
|
||||||
|
" <td>224914</td>\n",
|
||||||
|
" <td>False</td>\n",
|
||||||
|
" <td>vente en ligne</td>\n",
|
||||||
|
" <td>Atelier</td>\n",
|
||||||
|
" <td>pricing_formula</td>\n",
|
||||||
|
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
||||||
|
" <td>48187</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>13070863</td>\n",
|
||||||
|
" <td>224914</td>\n",
|
||||||
|
" <td>False</td>\n",
|
||||||
|
" <td>vente en ligne</td>\n",
|
||||||
|
" <td>Atelier</td>\n",
|
||||||
|
" <td>pricing_formula</td>\n",
|
||||||
|
" <td>2018-12-28 14:47:50+00:00</td>\n",
|
||||||
|
" <td>48187</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>...</th>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1826667</th>\n",
|
||||||
|
" <td>18643847</td>\n",
|
||||||
|
" <td>350454</td>\n",
|
||||||
|
" <td>False</td>\n",
|
||||||
|
" <td>vad</td>\n",
|
||||||
|
" <td>Billet en nombre</td>\n",
|
||||||
|
" <td>pricing_formula</td>\n",
|
||||||
|
" <td>2022-08-02 08:59:17+00:00</td>\n",
|
||||||
|
" <td>41</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1826668</th>\n",
|
||||||
|
" <td>19853111</td>\n",
|
||||||
|
" <td>383564</td>\n",
|
||||||
|
" <td>False</td>\n",
|
||||||
|
" <td>vad</td>\n",
|
||||||
|
" <td>Billet en nombre</td>\n",
|
||||||
|
" <td>pricing_formula</td>\n",
|
||||||
|
" <td>2022-11-04 14:25:42+00:00</td>\n",
|
||||||
|
" <td>62763</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1826669</th>\n",
|
||||||
|
" <td>19860514</td>\n",
|
||||||
|
" <td>383751</td>\n",
|
||||||
|
" <td>False</td>\n",
|
||||||
|
" <td>vad</td>\n",
|
||||||
|
" <td>Billet en nombre</td>\n",
|
||||||
|
" <td>pricing_formula</td>\n",
|
||||||
|
" <td>2022-11-18 10:47:26+00:00</td>\n",
|
||||||
|
" <td>1195566</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1826670</th>\n",
|
||||||
|
" <td>19860515</td>\n",
|
||||||
|
" <td>383751</td>\n",
|
||||||
|
" <td>False</td>\n",
|
||||||
|
" <td>vad</td>\n",
|
||||||
|
" <td>Billet en nombre</td>\n",
|
||||||
|
" <td>pricing_formula</td>\n",
|
||||||
|
" <td>2022-11-18 10:47:26+00:00</td>\n",
|
||||||
|
" <td>1195566</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1826671</th>\n",
|
||||||
|
" <td>19860516</td>\n",
|
||||||
|
" <td>383751</td>\n",
|
||||||
|
" <td>False</td>\n",
|
||||||
|
" <td>vad</td>\n",
|
||||||
|
" <td>Billet en nombre</td>\n",
|
||||||
|
" <td>pricing_formula</td>\n",
|
||||||
|
" <td>2022-11-18 10:47:26+00:00</td>\n",
|
||||||
|
" <td>1195566</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"<p>1826672 rows × 8 columns</p>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" ticket_id product_id is_from_subscription supplier_name \\\n",
|
||||||
|
"0 13070859 225251 False vente en ligne \n",
|
||||||
|
"1 13070860 224914 False vente en ligne \n",
|
||||||
|
"2 13070861 224914 False vente en ligne \n",
|
||||||
|
"3 13070862 224914 False vente en ligne \n",
|
||||||
|
"4 13070863 224914 False vente en ligne \n",
|
||||||
|
"... ... ... ... ... \n",
|
||||||
|
"1826667 18643847 350454 False vad \n",
|
||||||
|
"1826668 19853111 383564 False vad \n",
|
||||||
|
"1826669 19860514 383751 False vad \n",
|
||||||
|
"1826670 19860515 383751 False vad \n",
|
||||||
|
"1826671 19860516 383751 False vad \n",
|
||||||
|
"\n",
|
||||||
|
" type_of_ticket_name children purchase_date \\\n",
|
||||||
|
"0 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n",
|
||||||
|
"1 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n",
|
||||||
|
"2 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n",
|
||||||
|
"3 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n",
|
||||||
|
"4 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n",
|
||||||
|
"... ... ... ... \n",
|
||||||
|
"1826667 Billet en nombre pricing_formula 2022-08-02 08:59:17+00:00 \n",
|
||||||
|
"1826668 Billet en nombre pricing_formula 2022-11-04 14:25:42+00:00 \n",
|
||||||
|
"1826669 Billet en nombre pricing_formula 2022-11-18 10:47:26+00:00 \n",
|
||||||
|
"1826670 Billet en nombre pricing_formula 2022-11-18 10:47:26+00:00 \n",
|
||||||
|
"1826671 Billet en nombre pricing_formula 2022-11-18 10:47:26+00:00 \n",
|
||||||
|
"\n",
|
||||||
|
" customer_id \n",
|
||||||
|
"0 48187 \n",
|
||||||
|
"1 48187 \n",
|
||||||
|
"2 48187 \n",
|
||||||
|
"3 48187 \n",
|
||||||
|
"4 48187 \n",
|
||||||
|
"... ... \n",
|
||||||
|
"1826667 41 \n",
|
||||||
|
"1826668 62763 \n",
|
||||||
|
"1826669 1195566 \n",
|
||||||
|
"1826670 1195566 \n",
|
||||||
|
"1826671 1195566 \n",
|
||||||
|
"\n",
|
||||||
|
"[1826672 rows x 8 columns]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 29,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"df1_ticket_information"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "96ea2523-38dc-47ef-a49e-2c2d9ad0b1c6",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Exploration variables"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 41,
|
||||||
|
"id": "aaa41688-ea7e-4dba-851c-1f0b0ec43c71",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Fonction d'exploration pour suppliers.csv = label itr et commission inconnues\n",
|
||||||
|
"def suppliers_exploration(suppliers = None) : \n",
|
||||||
|
" \n",
|
||||||
|
" # Taux de NaN pour ces colonnes\n",
|
||||||
|
" label_na = suppliers['label'].isna().sum()/len(suppliers)*100\n",
|
||||||
|
" itr_na = suppliers['itr'].isna().sum()/len(suppliers)*100\n",
|
||||||
|
" commission_na = suppliers['commission'].isna().sum()/len(suppliers)*100\n",
|
||||||
|
"\n",
|
||||||
|
" suppliers_desc = pd.DataFrame({'nb_suppliers' : [suppliers['name'].nunique()],\n",
|
||||||
|
" 'label_na' : [label_na],\n",
|
||||||
|
" 'itr_na' : [itr_na],\n",
|
||||||
|
" 'commission_na' : [commission_na]})\n",
|
||||||
|
"\n",
|
||||||
|
" return suppliers_desc"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 42,
|
||||||
|
"id": "2fecc2e1-113f-46ed-9065-0b9ee416166e",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"df1_suppliers_desc = suppliers_exploration(suppliers = df1_suppliers)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 43,
|
||||||
|
"id": "55f6170a-36fb-4efb-9810-f982883660cf",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>nb_suppliers</th>\n",
|
||||||
|
" <th>label_na</th>\n",
|
||||||
|
" <th>itr_na</th>\n",
|
||||||
|
" <th>commission_na</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>9</td>\n",
|
||||||
|
" <td>100.0</td>\n",
|
||||||
|
" <td>100.0</td>\n",
|
||||||
|
" <td>100.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" nb_suppliers label_na itr_na commission_na\n",
|
||||||
|
"0 9 100.0 100.0 100.0"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 43,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"df1_suppliers_desc"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 47,
|
||||||
|
"id": "0030fd02-09e3-42f5-9c83-290458a38c29",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"BUCKET = \"bdc2324-data\"\n",
|
||||||
|
"liste_folders = fs.ls(BUCKET)\n",
|
||||||
|
"\n",
|
||||||
|
"liste_files = []\n",
|
||||||
|
"for company_folder in liste_folders : \n",
|
||||||
|
" liste_files.extend(fs.ls(company_folder))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 51,
|
||||||
|
"id": "6b1736d1-8fd7-4fcc-9431-b8bf0c7b4f2b",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"['bdc2324-data/1/1suppliers.csv', 'bdc2324-data/10/10suppliers.csv', 'bdc2324-data/101/101suppliers.csv', 'bdc2324-data/11/11suppliers.csv', 'bdc2324-data/12/12suppliers.csv', 'bdc2324-data/13/13suppliers.csv', 'bdc2324-data/14/14suppliers.csv', 'bdc2324-data/2/2suppliers.csv', 'bdc2324-data/3/3suppliers.csv', 'bdc2324-data/4/4suppliers.csv', 'bdc2324-data/5/5suppliers.csv', 'bdc2324-data/6/6suppliers.csv', 'bdc2324-data/7/7suppliers.csv', 'bdc2324-data/8/8suppliers.csv', 'bdc2324-data/9/9suppliers.csv']\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"liste_database_select = ['suppliers']\n",
|
||||||
|
"\n",
|
||||||
|
"# Filtrer la liste pour les éléments contenant au moins un élément de la liste à tester\n",
|
||||||
|
"liste_suppliers = [element for element in liste_files if any(element_part in element for element_part in liste_database_select)]\n",
|
||||||
|
"\n",
|
||||||
|
"# Afficher le résultat\n",
|
||||||
|
"print(liste_suppliers)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "226b694b-0b00-4167-b69f-3178902254eb",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Load every CSV path listed in database_name into a global DataFrame named df<client_number>_<name>\n",
|
||||||
|
"def database_loading(database_name = None):\n",
|
||||||
|
" files_path = database_name\n",
|
||||||
|
" \n",
|
||||||
|
" client_number = files_path[0].split(\"/\")[1]\n",
|
||||||
|
" df_prefix = \"df\" + str(client_number) + \"_\"\n",
|
||||||
|
" \n",
|
||||||
|
" for i in range(len(files_path)) :\n",
|
||||||
|
" current_path = files_path[i]\n",
|
||||||
|
" with fs.open(current_path, mode=\"rb\") as file_in:\n",
|
||||||
|
" df = pd.read_csv(file_in)\n",
|
||||||
|
" # the pattern of the name is df1xxx\n",
|
||||||
|
" nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n",
|
||||||
|
" globals()[nom_dataframe] = df\n",
|
||||||
|
"\n",
|
||||||
|
" "
|
||||||
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
|
Loading…
Reference in New Issue
Block a user