diff --git a/0_Cleaning_and_merge.ipynb b/0_Cleaning_and_merge.ipynb
index ba13c22..20c5a03 100644
--- a/0_Cleaning_and_merge.ipynb
+++ b/0_Cleaning_and_merge.ipynb
@@ -10,7 +10,7 @@
},
{
"cell_type": "code",
- "execution_count": 80,
+ "execution_count": 1,
"id": "15103481-8d74-404c-aa09-7601fe7730da",
"metadata": {},
"outputs": [],
@@ -19,7 +19,8 @@
"import numpy as np\n",
"import os\n",
"import s3fs\n",
- "import re"
+ "import re\n",
+ "import warnings"
]
},
{
@@ -32,7 +33,7 @@
},
{
"cell_type": "code",
- "execution_count": 82,
+ "execution_count": 2,
"id": "5d83bb1a-d341-446e-91f6-1c428607f6d4",
"metadata": {},
"outputs": [],
@@ -42,6 +43,17 @@
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a9b84234-d5df-4c43-a9cd-80cfe2f1e34d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Silence every Python warning for the rest of the notebook (this also hides pandas warnings)\n",
+ "warnings.filterwarnings('ignore')"
+ ]
+ },
{
"cell_type": "markdown",
"id": "9cbd72c5-6f8e-4366-ab66-96c32c6e963a",
@@ -60,7 +72,7 @@
},
{
"cell_type": "code",
- "execution_count": 84,
+ "execution_count": 3,
"id": "699664b9-eee4-4f8d-a207-e524526560c5",
"metadata": {},
"outputs": [],
@@ -71,21 +83,13 @@
},
{
"cell_type": "code",
- "execution_count": 86,
+ "execution_count": 4,
"id": "dd6a3518-b752-4a1e-b77b-9e03e853c3ed",
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/tmp/ipykernel_1018/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
- " df = pd.read_csv(file_in)\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# loop to create dataframes from liste\n",
+ "\n",
"files_path = liste_database\n",
"\n",
"client_number = files_path[0].split(\"/\")[1]\n",
@@ -110,7 +114,7 @@
},
{
"cell_type": "code",
- "execution_count": 88,
+ "execution_count": 5,
"id": "d237be96-8c86-4a91-b7a1-487e87a16c3d",
"metadata": {},
"outputs": [],
@@ -151,7 +155,7 @@
},
{
"cell_type": "code",
- "execution_count": 90,
+ "execution_count": 6,
"id": "7e7b90ce-da54-4f00-bc34-64c543b0858f",
"metadata": {},
"outputs": [],
@@ -173,7 +177,7 @@
},
{
"cell_type": "code",
- "execution_count": 92,
+ "execution_count": 7,
"id": "03329e32-00a5-42c8-9470-75f7b6216ccd",
"metadata": {},
"outputs": [],
@@ -191,7 +195,7 @@
},
{
"cell_type": "code",
- "execution_count": 94,
+ "execution_count": 8,
"id": "b95464b1-26bc-4aac-84b4-45da83b92251",
"metadata": {},
"outputs": [],
@@ -205,6 +209,7 @@
" # Base des fournisseurs\n",
" suppliers = suppliers[['id', 'name']]\n",
" suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n",
+ " suppliers['supplier_name'] = suppliers['supplier_name'].fillna('')\n",
"\n",
" # Base des types de billets\n",
" type_ofs = type_ofs[['id', 'name', 'children']]\n",
@@ -234,39 +239,17 @@
},
{
"cell_type": "code",
- "execution_count": 96,
+ "execution_count": 9,
"id": "3e1d2ba7-ff4f-48eb-93a8-2bb648c70396",
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/tmp/ipykernel_1018/1591303091.py:5: SettingWithCopyWarning: \n",
- "A value is trying to be set on a copy of a slice from a DataFrame\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
- " tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n",
- "/tmp/ipykernel_1018/1591303091.py:9: SettingWithCopyWarning: \n",
- "A value is trying to be set on a copy of a slice from a DataFrame\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
- " suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n",
- "/tmp/ipykernel_1018/1591303091.py:13: SettingWithCopyWarning: \n",
- "A value is trying to be set on a copy of a slice from a DataFrame\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
- " type_ofs.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"df1_ticket_information = preprocessing_tickets_area(tickets = df1_tickets, purchases = df1_purchases, suppliers = df1_suppliers, type_ofs = df1_type_ofs)"
]
},
{
"cell_type": "code",
- "execution_count": 98,
+ "execution_count": 10,
"id": "4b18edfc-6450-4c6a-9e7b-ee5a5808c8c9",
"metadata": {},
"outputs": [
@@ -377,7 +360,7 @@
"4 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 "
]
},
- "execution_count": 98,
+ "execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@@ -396,7 +379,7 @@
},
{
"cell_type": "code",
- "execution_count": 100,
+ "execution_count": 11,
"id": "baed146a-9d3a-4397-a812-3d50c9a2f038",
"metadata": {},
"outputs": [],
@@ -425,185 +408,14 @@
},
{
"cell_type": "code",
- "execution_count": 102,
+ "execution_count": 12,
"id": "5fbfd88b-b94c-489c-9201-670e96e453e7",
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/tmp/ipykernel_1018/3848597476.py:4: SettingWithCopyWarning: \n",
- "A value is trying to be set on a copy of a slice from a DataFrame\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
- " targets.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"df1_target_information = preprocessing_target_area(targets = df1_targets, target_types = df1_target_types, customer_target_mappings = df1_customer_target_mappings)"
]
},
- {
- "cell_type": "code",
- "execution_count": 104,
- "id": "b4f05142-2a22-42ef-a60d-f23cc4b5cb09",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " customer_id \n",
- " \n",
- " \n",
- " target_name \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " consentement optin mediation specialisee \n",
- " 150000 \n",
- " \n",
- " \n",
- " consentement optin jeune public \n",
- " 149979 \n",
- " \n",
- " \n",
- " consentement optin b2c \n",
- " 108909 \n",
- " \n",
- " \n",
- " Arenametrix_bascule tel vers sib \n",
- " 35216 \n",
- " \n",
- " \n",
- " consentement optout b2c \n",
- " 34523 \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " customer_id\n",
- "target_name \n",
- "consentement optin mediation specialisee 150000\n",
- "consentement optin jeune public 149979\n",
- "consentement optin b2c 108909\n",
- "Arenametrix_bascule tel vers sib 35216\n",
- "consentement optout b2c 34523"
- ]
- },
- "execution_count": 104,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df1_target_information[['target_name', 'customer_id']].groupby('target_name').count().sort_values(by='customer_id', ascending=False).head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 106,
- "id": "4417ff51-f501-4ab9-a192-4ab75764a8ed",
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " customer_id \n",
- " \n",
- " \n",
- " target_name \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " Arenametrix_bascule tel vers sib \n",
- " 35216 \n",
- " \n",
- " \n",
- " Autres_interet_exposition \n",
- " 1021 \n",
- " \n",
- " \n",
- " COM Inscrits NL générale (historique) \n",
- " 23005 \n",
- " \n",
- " \n",
- " Contacts_prenomsdoubles \n",
- " 11643 \n",
- " \n",
- " \n",
- " DDCP MD Procès du Siècle \n",
- " 1684 \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " customer_id\n",
- "target_name \n",
- "Arenametrix_bascule tel vers sib 35216\n",
- "Autres_interet_exposition 1021\n",
- "COM Inscrits NL générale (historique) 23005\n",
- "Contacts_prenomsdoubles 11643\n",
- "DDCP MD Procès du Siècle 1684"
- ]
- },
- "execution_count": 106,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df1_target_information_reduced = df1_target_information[['target_name', 'customer_id']].groupby('target_name').count()\n",
- "df1_target_information_reduced[df1_target_information_reduced['customer_id'] >= 1000].head()"
- ]
- },
{
"cell_type": "markdown",
"id": "cdbb48b4-5e16-4ef4-8791-ed213d68d52f",
@@ -614,7 +426,7 @@
},
{
"cell_type": "code",
- "execution_count": 108,
+ "execution_count": 13,
"id": "d883cc7b-ac43-4485-b86f-eaf595fbad85",
"metadata": {},
"outputs": [],
@@ -639,42 +451,17 @@
},
{
"cell_type": "code",
- "execution_count": 110,
+ "execution_count": 14,
"id": "c8552dd6-52c5-4431-b43d-3cd6c578fd9f",
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/tmp/ipykernel_1018/1967867975.py:15: SettingWithCopyWarning: \n",
- "A value is trying to be set on a copy of a slice from a DataFrame.\n",
- "Try using .loc[row_indexer,col_indexer] = value instead\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
- " df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
- "/tmp/ipykernel_1018/1967867975.py:15: SettingWithCopyWarning: \n",
- "A value is trying to be set on a copy of a slice from a DataFrame.\n",
- "Try using .loc[row_indexer,col_indexer] = value instead\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
- " df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
- "/tmp/ipykernel_1018/1967867975.py:15: SettingWithCopyWarning: \n",
- "A value is trying to be set on a copy of a slice from a DataFrame.\n",
- "Try using .loc[row_indexer,col_indexer] = value instead\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
- " df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"df1_campaigns_information = preprocessing_campaigns_area(campaign_stats = df1_campaign_stats, campaigns = df1_campaigns)"
]
},
{
"cell_type": "code",
- "execution_count": 112,
+ "execution_count": 15,
"id": "c24457e7-3cad-451a-a65b-7373b656bd6e",
"metadata": {
"scrolled": true
@@ -794,7 +581,7 @@
"4 404 2021-03-27 23:00:00+00:00 "
]
},
- "execution_count": 112,
+ "execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
@@ -803,159 +590,12 @@
"df1_campaigns_information.head()"
]
},
- {
- "cell_type": "code",
- "execution_count": 114,
- "id": "e2c88552-b863-47a2-be23-8d2898fb28bc",
- "metadata": {},
- "outputs": [],
- "source": [
- "def campaigns_kpi_function(campaigns_information = None):\n",
- " # Nombre de campagnes de mails\n",
- " nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()\n",
- " nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)\n",
- " # Temps d'ouverture en min moyen \n",
- " campaigns_information['time_to_open'] = campaigns_information['opened_at'] - campaigns_information['delivered_at']\n",
- " time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()\n",
- "\n",
- " # Nombre de mail ouvert \n",
- " opened_campaign = campaigns_information[['customer_id', 'campaign_name', 'opened_at']]\n",
- " opened_campaign.dropna(subset=['opened_at'], inplace=True)\n",
- " opened_campaign = opened_campaign[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()\n",
- " opened_campaign.rename(columns = {'campaign_name' : 'nb_campaigns_opened' }, inplace = True)\n",
- "\n",
- " # Fusion des indicateurs\n",
- " campaigns_reduced = pd.merge(nb_campaigns, opened_campaign, on = 'customer_id', how = 'left')\n",
- " campaigns_reduced = pd.merge(campaigns_reduced, time_to_open, on = 'customer_id', how = 'left')\n",
- "\n",
- " # Remplir les NaN : nb_campaigns_opened\n",
- " campaigns_reduced['nb_campaigns_opened'].fillna(0, inplace=True)\n",
- "\n",
- " # Remplir les NaT : time_to_open (??)\n",
- "\n",
- " return campaigns_reduced\n",
- " "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 116,
- "id": "24537647-bc29-4777-9848-ac4120a4aa60",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/tmp/ipykernel_1018/3700263836.py:11: SettingWithCopyWarning: \n",
- "A value is trying to be set on a copy of a slice from a DataFrame\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
- " opened_campaign.dropna(subset=['opened_at'], inplace=True)\n"
- ]
- }
- ],
- "source": [
- "df1_campaigns_kpi = campaigns_kpi_function(campaigns_information = df1_campaigns_information) "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 118,
- "id": "6be2a9a6-056b-4e19-8c26-a18ba3df36b3",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " customer_id \n",
- " nb_campaigns \n",
- " nb_campaigns_opened \n",
- " time_to_open \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " 0 \n",
- " 2 \n",
- " 4 \n",
- " 0.0 \n",
- " NaT \n",
- " \n",
- " \n",
- " 1 \n",
- " 3 \n",
- " 222 \n",
- " 124.0 \n",
- " 1 days 00:28:30.169354838 \n",
- " \n",
- " \n",
- " 2 \n",
- " 4 \n",
- " 7 \n",
- " 7.0 \n",
- " 1 days 04:31:01.428571428 \n",
- " \n",
- " \n",
- " 3 \n",
- " 5 \n",
- " 4 \n",
- " 0.0 \n",
- " NaT \n",
- " \n",
- " \n",
- " 4 \n",
- " 6 \n",
- " 20 \n",
- " 0.0 \n",
- " NaT \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " customer_id nb_campaigns nb_campaigns_opened time_to_open\n",
- "0 2 4 0.0 NaT\n",
- "1 3 222 124.0 1 days 00:28:30.169354838\n",
- "2 4 7 7.0 1 days 04:31:01.428571428\n",
- "3 5 4 0.0 NaT\n",
- "4 6 20 0.0 NaT"
- ]
- },
- "execution_count": 118,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df1_campaigns_kpi.head()"
- ]
- },
{
"cell_type": "markdown",
"id": "56520a97-ede8-4920-a211-3b5b136af33d",
"metadata": {},
"source": [
- "## Create Products Table"
+ "## Product area"
]
},
{
@@ -968,7 +608,7 @@
},
{
"cell_type": "code",
- "execution_count": 120,
+ "execution_count": 16,
"id": "30488a40-1b38-4b9a-9d3b-26a0597c5e6d",
"metadata": {},
"outputs": [],
@@ -979,7 +619,7 @@
},
{
"cell_type": "code",
- "execution_count": 122,
+ "execution_count": 17,
"id": "607eb4b4-eed9-4b50-b823-f75c116dd37c",
"metadata": {},
"outputs": [],
@@ -1050,7 +690,7 @@
},
{
"cell_type": "code",
- "execution_count": 124,
+ "execution_count": 18,
"id": "350b09b9-451f-4d47-81fe-f34b892db027",
"metadata": {},
"outputs": [],
@@ -1138,7 +778,7 @@
},
{
"cell_type": "code",
- "execution_count": 126,
+ "execution_count": 19,
"id": "0fccc8ef-e575-4857-a401-94a7274394df",
"metadata": {},
"outputs": [
@@ -1291,7 +931,7 @@
"4 indiv entrées tp "
]
},
- "execution_count": 126,
+ "execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
@@ -1303,7 +943,7 @@
},
{
"cell_type": "code",
- "execution_count": 128,
+ "execution_count": 20,
"id": "779d8aaf-6668-4f66-8852-847304407ea3",
"metadata": {},
"outputs": [
@@ -1473,7 +1113,7 @@
"4 spectacle vivant mucem "
]
},
- "execution_count": 128,
+ "execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
@@ -1485,7 +1125,7 @@
},
{
"cell_type": "code",
- "execution_count": 130,
+ "execution_count": 21,
"id": "7714fa32-303b-4ea7-b174-3fd0fcab5af0",
"metadata": {},
"outputs": [
@@ -1584,7 +1224,7 @@
"4 37 383 269 1"
]
},
- "execution_count": 130,
+ "execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
@@ -1604,7 +1244,7 @@
},
{
"cell_type": "code",
- "execution_count": 132,
+ "execution_count": 22,
"id": "15a62ed6-35e4-4abc-aeef-a7daeec0a4ba",
"metadata": {},
"outputs": [],
@@ -1632,7 +1272,7 @@
},
{
"cell_type": "code",
- "execution_count": 134,
+ "execution_count": 23,
"id": "89dc9685-1de9-4ce3-a6c0-8d7f1931a951",
"metadata": {},
"outputs": [
@@ -1686,7 +1326,7 @@
" id_representation_cap \n",
" season_id \n",
" facility_id \n",
- " event_type_id \n",
+ " ... \n",
" event_type_key_id \n",
" facility_key_id \n",
" street_id \n",
@@ -1712,7 +1352,7 @@
" 8789 \n",
" 4 \n",
" 1 \n",
- " 2 \n",
+ " ... \n",
" 5 \n",
" 1 \n",
" 1 \n",
@@ -1736,7 +1376,7 @@
" 390 \n",
" 2 \n",
" 1 \n",
- " 2 \n",
+ " ... \n",
" 2 \n",
" 1 \n",
" 1 \n",
@@ -1760,7 +1400,7 @@
" 395 \n",
" 2 \n",
" 1 \n",
- " 2 \n",
+ " ... \n",
" 2 \n",
" 1 \n",
" 1 \n",
@@ -1784,7 +1424,7 @@
" 120199 \n",
" 1754 \n",
" 1 \n",
- " 2 \n",
+ " ... \n",
" 4 \n",
" 1 \n",
" 1 \n",
@@ -1808,7 +1448,7 @@
" 21 \n",
" 4 \n",
" 1 \n",
- " 3 \n",
+ " ... \n",
" 6 \n",
" 1 \n",
" 1 \n",
@@ -1822,6 +1462,7 @@
" \n",
" \n",
"\n",
+ "5 rows × 21 columns
\n",
""
],
"text/plain": [
@@ -1839,19 +1480,19 @@
"3 156773 1 12365 120199 \n",
"4 1175 1 8 21 \n",
"\n",
- " season_id facility_id event_type_id event_type_key_id facility_key_id \\\n",
- "0 4 1 2 5 1 \n",
- "1 2 1 2 2 1 \n",
- "2 2 1 2 2 1 \n",
- "3 1754 1 2 4 1 \n",
- "4 4 1 3 6 1 \n",
+ " season_id facility_id ... event_type_key_id facility_key_id street_id \\\n",
+ "0 4 1 ... 5 1 1 \n",
+ "1 2 1 ... 2 1 1 \n",
+ "2 2 1 ... 2 1 1 \n",
+ "3 1754 1 ... 4 1 1 \n",
+ "4 4 1 ... 6 1 1 \n",
"\n",
- " street_id amount is_full_price name_categories \\\n",
- "0 1 9.0 False indiv activité tr \n",
- "1 1 9.5 False indiv entrées tp \n",
- "2 1 11.5 False indiv entrées tp \n",
- "3 1 8.0 False indiv entrées tr \n",
- "4 1 8.5 False indiv entrées tp \n",
+ " amount is_full_price name_categories \\\n",
+ "0 9.0 False indiv activité tr \n",
+ "1 9.5 False indiv entrées tp \n",
+ "2 11.5 False indiv entrées tp \n",
+ "3 8.0 False indiv entrées tr \n",
+ "4 8.5 False indiv entrées tp \n",
"\n",
" name_events name_seasons \\\n",
"0 visite-jeu \"le classico des minots\" (1h30) 2017 \n",
@@ -1865,10 +1506,12 @@
"1 offre muséale individuel mucem \n",
"2 offre muséale individuel mucem \n",
"3 offre muséale individuel mucem \n",
- "4 non défini mucem "
+ "4 non défini mucem \n",
+ "\n",
+ "[5 rows x 21 columns]"
]
},
- "execution_count": 134,
+ "execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
@@ -1880,19 +1523,82 @@
},
{
"cell_type": "code",
- "execution_count": 136,
+ "execution_count": 24,
"id": "98f78cd5-b694-4cc6-b033-20170aa13e8d",
"metadata": {},
"outputs": [],
"source": [
"# Fusion liée au product\n",
- "df1_products_purchased = pd.merge(df1_ticket_information, products_global, left_on = 'product_id', right_on = 'id_products', how = 'inner')"
+ "df1_products_purchased = pd.merge(df1_ticket_information, products_global, left_on = 'product_id', right_on = 'id_products', how = 'inner')\n",
+ "\n",
+ "# Keep only the variables of interest\n",
+ "df1_products_purchased_reduced = df1_products_purchased[['ticket_id', 'customer_id', 'event_type_id', 'supplier_name', 'purchase_date', 'type_of_ticket_name', 'amount', 'children', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons']]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d7c3668a-c016-4bd0-837e-04af328ff14f",
+ "metadata": {},
+ "source": [
+ "# Construction des variables explicatives"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "314f1b7f-ae48-4c6f-8469-9ce879043243",
+ "metadata": {},
+ "source": [
+ "## KPI campaigns"
]
},
{
"cell_type": "code",
- "execution_count": 137,
- "id": "52db7bcb-3fb7-48e5-b612-4e22bdab4a94",
+ "execution_count": 25,
+ "id": "e2c88552-b863-47a2-be23-8d2898fb28bc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def campaigns_kpi_function(campaigns_information = None):\n",
+ "    \"\"\"Return one row per customer_id with nb_campaigns, nb_campaigns_opened and mean time_to_open.\"\"\"\n",
+ "    # Work on a copy so the caller's dataframe does not gain a 'time_to_open' column\n",
+ "    campaigns = campaigns_information.copy()\n",
+ "    campaigns['time_to_open'] = campaigns['opened_at'] - campaigns['delivered_at']\n",
+ "\n",
+ "    # Number of e-mail campaigns per customer (count of non-null campaign_name)\n",
+ "    nb_campaigns = campaigns.groupby('customer_id')['campaign_name'].count().reset_index()\n",
+ "    nb_campaigns = nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'})\n",
+ "\n",
+ "    # Mean delay between delivery and opening; stays NaT when the customer opened nothing\n",
+ "    time_to_open = campaigns[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()\n",
+ "    # Number of opened e-mails; dropna() returns a new frame instead of writing into a slice\n",
+ "    opened_campaign = campaigns.dropna(subset = ['opened_at'])\n",
+ "    opened_campaign = opened_campaign.groupby('customer_id')['campaign_name'].count().reset_index()\n",
+ "    opened_campaign = opened_campaign.rename(columns = {'campaign_name' : 'nb_campaigns_opened'})\n",
+ "\n",
+ "    # Merge the indicators; customers absent from opened_campaign get 0 opened e-mails.\n",
+ "    # Plain assignment replaces the chained inplace fillna, which is unreliable on recent pandas.\n",
+ "    campaigns_reduced = pd.merge(nb_campaigns, opened_campaign, on = 'customer_id', how = 'left')\n",
+ "    campaigns_reduced = pd.merge(campaigns_reduced, time_to_open, on = 'customer_id', how = 'left')\n",
+ "    campaigns_reduced['nb_campaigns_opened'] = campaigns_reduced['nb_campaigns_opened'].fillna(0)\n",
+ "    # TODO: decide how the remaining NaT values in time_to_open should be filled\n",
+ "    return campaigns_reduced\n",
+ "    "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "24537647-bc29-4777-9848-ac4120a4aa60",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df1_campaigns_kpi = campaigns_kpi_function(campaigns_information = df1_campaigns_information) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "6be2a9a6-056b-4e19-8c26-a18ba3df36b3",
"metadata": {},
"outputs": [
{
@@ -1916,260 +1622,68 @@
" \n",
" \n",
" \n",
- " ticket_id \n",
- " product_id \n",
- " is_from_subscription \n",
- " supplier_name \n",
- " type_of_ticket_name \n",
- " children \n",
- " purchase_date \n",
" customer_id \n",
- " id_products \n",
- " representation_id \n",
- " pricing_formula_id \n",
- " category_id \n",
- " products_group_id \n",
- " product_pack_id \n",
- " event_id \n",
- " id_representation_cap \n",
- " season_id \n",
- " facility_id \n",
- " event_type_id \n",
- " event_type_key_id \n",
- " facility_key_id \n",
- " street_id \n",
- " amount \n",
- " is_full_price \n",
- " name_categories \n",
- " name_events \n",
- " name_seasons \n",
- " name_event_types \n",
- " name_facilities \n",
+ " nb_campaigns \n",
+ " nb_campaigns_opened \n",
+ " time_to_open \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
- " 13070859 \n",
- " 225251 \n",
- " False \n",
- " vente en ligne \n",
- " Atelier \n",
- " pricing_formula \n",
- " 2018-12-28 14:47:50+00:00 \n",
- " 48187 \n",
- " 225251 \n",
- " 113676 \n",
- " 28 \n",
- " 13 \n",
- " 224768 \n",
- " 1 \n",
- " 197 \n",
- " 172742 \n",
- " 16 \n",
- " 1 \n",
+ " 2 \n",
" 4 \n",
- " 4 \n",
- " 1 \n",
- " 1 \n",
- " 8.0 \n",
- " False \n",
- " indiv prog enfant \n",
- " l'école des magiciens \n",
- " 2018 \n",
- " spectacle vivant \n",
- " mucem \n",
+ " 0.0 \n",
+ " NaT \n",
" \n",
" \n",
" 1 \n",
- " 13070855 \n",
- " 225251 \n",
- " False \n",
- " vente en ligne \n",
- " Atelier \n",
- " pricing_formula \n",
- " 2018-12-28 14:47:50+00:00 \n",
- " 48187 \n",
- " 225251 \n",
- " 113676 \n",
- " 28 \n",
- " 13 \n",
- " 224768 \n",
- " 1 \n",
- " 197 \n",
- " 172742 \n",
- " 16 \n",
- " 1 \n",
- " 4 \n",
- " 4 \n",
- " 1 \n",
- " 1 \n",
- " 8.0 \n",
- " False \n",
- " indiv prog enfant \n",
- " l'école des magiciens \n",
- " 2018 \n",
- " spectacle vivant \n",
- " mucem \n",
+ " 3 \n",
+ " 222 \n",
+ " 124.0 \n",
+ " 1 days 00:28:30.169354838 \n",
" \n",
" \n",
" 2 \n",
- " 13070856 \n",
- " 225251 \n",
- " False \n",
- " vente en ligne \n",
- " Atelier \n",
- " pricing_formula \n",
- " 2018-12-28 14:47:50+00:00 \n",
- " 48187 \n",
- " 225251 \n",
- " 113676 \n",
- " 28 \n",
- " 13 \n",
- " 224768 \n",
- " 1 \n",
- " 197 \n",
- " 172742 \n",
- " 16 \n",
- " 1 \n",
" 4 \n",
- " 4 \n",
- " 1 \n",
- " 1 \n",
- " 8.0 \n",
- " False \n",
- " indiv prog enfant \n",
- " l'école des magiciens \n",
- " 2018 \n",
- " spectacle vivant \n",
- " mucem \n",
+ " 7 \n",
+ " 7.0 \n",
+ " 1 days 04:31:01.428571428 \n",
" \n",
" \n",
" 3 \n",
- " 13070857 \n",
- " 225251 \n",
- " False \n",
- " vente en ligne \n",
- " Atelier \n",
- " pricing_formula \n",
- " 2018-12-28 14:47:50+00:00 \n",
- " 48187 \n",
- " 225251 \n",
- " 113676 \n",
- " 28 \n",
- " 13 \n",
- " 224768 \n",
- " 1 \n",
- " 197 \n",
- " 172742 \n",
- " 16 \n",
- " 1 \n",
+ " 5 \n",
" 4 \n",
- " 4 \n",
- " 1 \n",
- " 1 \n",
- " 8.0 \n",
- " False \n",
- " indiv prog enfant \n",
- " l'école des magiciens \n",
- " 2018 \n",
- " spectacle vivant \n",
- " mucem \n",
+ " 0.0 \n",
+ " NaT \n",
" \n",
" \n",
" 4 \n",
- " 13070858 \n",
- " 225251 \n",
- " False \n",
- " vente en ligne \n",
- " Atelier \n",
- " pricing_formula \n",
- " 2018-12-28 14:47:50+00:00 \n",
- " 48187 \n",
- " 225251 \n",
- " 113676 \n",
- " 28 \n",
- " 13 \n",
- " 224768 \n",
- " 1 \n",
- " 197 \n",
- " 172742 \n",
- " 16 \n",
- " 1 \n",
- " 4 \n",
- " 4 \n",
- " 1 \n",
- " 1 \n",
- " 8.0 \n",
- " False \n",
- " indiv prog enfant \n",
- " l'école des magiciens \n",
- " 2018 \n",
- " spectacle vivant \n",
- " mucem \n",
+ " 6 \n",
+ " 20 \n",
+ " 0.0 \n",
+ " NaT \n",
" \n",
" \n",
"\n",
""
],
"text/plain": [
- " ticket_id product_id is_from_subscription supplier_name \\\n",
- "0 13070859 225251 False vente en ligne \n",
- "1 13070855 225251 False vente en ligne \n",
- "2 13070856 225251 False vente en ligne \n",
- "3 13070857 225251 False vente en ligne \n",
- "4 13070858 225251 False vente en ligne \n",
- "\n",
- " type_of_ticket_name children purchase_date customer_id \\\n",
- "0 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 \n",
- "1 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 \n",
- "2 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 \n",
- "3 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 \n",
- "4 Atelier pricing_formula 2018-12-28 14:47:50+00:00 48187 \n",
- "\n",
- " id_products representation_id pricing_formula_id category_id \\\n",
- "0 225251 113676 28 13 \n",
- "1 225251 113676 28 13 \n",
- "2 225251 113676 28 13 \n",
- "3 225251 113676 28 13 \n",
- "4 225251 113676 28 13 \n",
- "\n",
- " products_group_id product_pack_id event_id id_representation_cap \\\n",
- "0 224768 1 197 172742 \n",
- "1 224768 1 197 172742 \n",
- "2 224768 1 197 172742 \n",
- "3 224768 1 197 172742 \n",
- "4 224768 1 197 172742 \n",
- "\n",
- " season_id facility_id event_type_id event_type_key_id facility_key_id \\\n",
- "0 16 1 4 4 1 \n",
- "1 16 1 4 4 1 \n",
- "2 16 1 4 4 1 \n",
- "3 16 1 4 4 1 \n",
- "4 16 1 4 4 1 \n",
- "\n",
- " street_id amount is_full_price name_categories name_events \\\n",
- "0 1 8.0 False indiv prog enfant l'école des magiciens \n",
- "1 1 8.0 False indiv prog enfant l'école des magiciens \n",
- "2 1 8.0 False indiv prog enfant l'école des magiciens \n",
- "3 1 8.0 False indiv prog enfant l'école des magiciens \n",
- "4 1 8.0 False indiv prog enfant l'école des magiciens \n",
- "\n",
- " name_seasons name_event_types name_facilities \n",
- "0 2018 spectacle vivant mucem \n",
- "1 2018 spectacle vivant mucem \n",
- "2 2018 spectacle vivant mucem \n",
- "3 2018 spectacle vivant mucem \n",
- "4 2018 spectacle vivant mucem "
+ " customer_id nb_campaigns nb_campaigns_opened time_to_open\n",
+ "0 2 4 0.0 NaT\n",
+ "1 3 222 124.0 1 days 00:28:30.169354838\n",
+ "2 4 7 7.0 1 days 04:31:01.428571428\n",
+ "3 5 4 0.0 NaT\n",
+ "4 6 20 0.0 NaT"
]
},
- "execution_count": 137,
+ "execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "df1_products_purchased.head()"
+ "df1_campaigns_kpi.head()"
]
},
{
@@ -2177,41 +1691,12 @@
"id": "d4dcfbe0-c6ce-497e-b75e-dc9e938801b2",
"metadata": {},
"source": [
- "### KPI tickets"
+ "## KPI tickets"
]
},
{
"cell_type": "code",
- "execution_count": 138,
- "id": "665a5925-9c0e-425a-8f11-c33a0a9ec444",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['ticket_id', 'product_id', 'is_from_subscription', 'supplier_name',\n",
- " 'type_of_ticket_name', 'children', 'purchase_date', 'customer_id',\n",
- " 'id_products', 'representation_id', 'pricing_formula_id', 'category_id',\n",
- " 'products_group_id', 'product_pack_id', 'event_id',\n",
- " 'id_representation_cap', 'season_id', 'facility_id', 'event_type_id',\n",
- " 'event_type_key_id', 'facility_key_id', 'street_id', 'amount',\n",
- " 'is_full_price', 'name_categories', 'name_events', 'name_seasons',\n",
- " 'name_event_types', 'name_facilities'],\n",
- " dtype='object')"
- ]
- },
- "execution_count": 138,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df1_products_purchased.columns"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 139,
+ "execution_count": 28,
"id": "b913a69e-3146-4919-b5f6-a6108532bffa",
"metadata": {},
"outputs": [
@@ -2222,29 +1707,110 @@
" 'offre muséale groupe'], dtype=object)"
]
},
- "execution_count": 139,
+ "execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "df1_products_purchased['name_event_types'].unique()"
+ "df1_products_purchased_reduced['name_event_types'].unique()"
]
},
{
"cell_type": "code",
- "execution_count": 140,
- "id": "e01e8cf9-1187-4a4b-993d-b7b4321cd8f0",
+ "execution_count": 29,
+ "id": "2bda0b97-b28b-4070-a57d-aeab0e2f7dfe",
"metadata": {},
"outputs": [],
"source": [
- "df1_products_purchased_reduced = df1_products_purchased[['ticket_id', 'customer_id', 'product_id', 'event_type_id', 'supplier_name', 'purchase_date', 'type_of_ticket_name', 'amount', 'children', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons']]"
+ "# Number of distinct event types attended by each customer\n",
+ "nb_event_types = df1_products_purchased_reduced[['customer_id', 'name_event_types']].groupby('customer_id').nunique()"
]
},
{
"cell_type": "code",
- "execution_count": 141,
- "id": "3d8b0875-b409-44ce-b688-d9d6758782d3",
+ "execution_count": 30,
+ "id": "043303fe-e90f-4689-a2a9-5d690555a045",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def tickets_kpi_function(tickets_information = None):\n",
+ "    \"\"\"Return ticket KPIs aggregated per (customer_id, event_type_id) pair.\"\"\"\n",
+ "    tickets_information_copy = tickets_information.copy()\n",
+ "    # Dummy: 1 when the supplier name contains an online-sales keyword ('vad' = vente à distance).\n",
+ "    # na=False treats a missing supplier name as offline instead of failing in astype(int).\n",
+ "    liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online']\n",
+ "    tickets_information_copy['vente_internet'] = tickets_information_copy['supplier_name'].str.contains('|'.join(liste_mots), case=False, na=False).astype(int)\n",
+ "\n",
+ "    # Number of tickets bought online per (customer, event type)\n",
+ "    prop_vente_internet = tickets_information_copy[tickets_information_copy['vente_internet'] == 1].groupby(['customer_id', 'event_type_id'])['ticket_id'].count().reset_index()\n",
+ "    prop_vente_internet.rename(columns = {'ticket_id' : 'nb_tickets_internet'}, inplace = True)\n",
+ "\n",
+ "    tickets_kpi = (tickets_information_copy[['event_type_id', 'customer_id', 'ticket_id','supplier_name', 'purchase_date', 'amount', 'vente_internet']]\n",
+ "                   .groupby(['customer_id', 'event_type_id'])\n",
+ "                   .agg({'ticket_id': 'count',\n",
+ "                         'amount' : 'sum',\n",
+ "                         'supplier_name': 'nunique',\n",
+ "                         'vente_internet' : 'max',\n",
+ "                         'purchase_date' : ['min', 'max']})\n",
+ "                   .reset_index()\n",
+ "                  )\n",
+ "\n",
+ "    # Flatten the two-level columns produced by agg, e.g. ('amount', 'sum') -> 'amount_sum'\n",
+ "    tickets_kpi.columns = tickets_kpi.columns.map('_'.join)\n",
+ "    tickets_kpi.rename(columns = {'ticket_id_count' : 'nb_tickets',\n",
+ "                                  'amount_sum' : 'total_amount',\n",
+ "                                  'supplier_name_nunique' : 'nb_suppliers',\n",
+ "                                  'customer_id_' : 'customer_id',\n",
+ "                                  'event_type_id_' : 'event_type_id'}, inplace = True)\n",
+ "\n",
+ "    tickets_kpi['time_between_purchase'] = tickets_kpi['purchase_date_max'] - tickets_kpi['purchase_date_min']\n",
+ "\n",
+ "    # Pairs with no online ticket are missing from prop_vente_internet: fill them with 0\n",
+ "    tickets_kpi = tickets_kpi.merge(prop_vente_internet, on = ['customer_id', 'event_type_id'], how = 'left')\n",
+ "    tickets_kpi['nb_tickets_internet'] = tickets_kpi['nb_tickets_internet'].fillna(0)\n",
+ "    return tickets_kpi\n",
+ "    "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "id": "5882234a-1ed5-4269-87a6-0d75613476e3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df1_tickets_kpi = tickets_kpi_function(tickets_information = df1_products_purchased_reduced)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "597b241e-a83d-4b7c-8ad7-eec50295dff2",
+ "metadata": {},
+ "source": [
+ "#### Exportation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "id": "a4a2311d-8a72-4030-afd5-218004d5d2a5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Exportation vers 'projet-bdc2324-team1'\n",
+ "BUCKET_OUT = \"projet-bdc2324-team1\"\n",
+ "FILE_KEY_OUT_S3 = \"0_Temp/Company 1 - Purchasing behaviour.csv\"\n",
+ "FILE_PATH_OUT_S3 = BUCKET_OUT + \"/\" + FILE_KEY_OUT_S3\n",
+ "\n",
+ "with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n",
+ " df1_tickets_kpi.to_csv(file_out, index = False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "id": "a7a452a6-cd5e-4c8b-b250-8a7d26e48fad",
"metadata": {},
"outputs": [
{
@@ -2268,161 +1834,123 @@
" \n",
" \n",
" \n",
- " ticket_id \n",
" customer_id \n",
- " product_id \n",
" event_type_id \n",
- " supplier_name \n",
- " purchase_date \n",
- " type_of_ticket_name \n",
- " amount \n",
- " children \n",
- " is_full_price \n",
- " name_event_types \n",
- " name_facilities \n",
- " name_categories \n",
- " name_events \n",
- " name_seasons \n",
+ " nb_tickets \n",
+ " total_amount \n",
+ " nb_suppliers \n",
+ " vente_internet_max \n",
+ " purchase_date_min \n",
+ " purchase_date_max \n",
+ " time_between_purchase \n",
+ " nb_tickets_internet \n",
" \n",
" \n",
" \n",
" \n",
- " 0 \n",
- " 13070859 \n",
- " 48187 \n",
- " 225251 \n",
- " 4 \n",
- " vente en ligne \n",
- " 2018-12-28 14:47:50+00:00 \n",
- " Atelier \n",
- " 8.0 \n",
- " pricing_formula \n",
- " False \n",
- " spectacle vivant \n",
- " mucem \n",
- " indiv prog enfant \n",
- " l'école des magiciens \n",
- " 2018 \n",
- " \n",
- " \n",
" 1 \n",
- " 13070855 \n",
- " 48187 \n",
- " 225251 \n",
+ " 1 \n",
" 4 \n",
- " vente en ligne \n",
- " 2018-12-28 14:47:50+00:00 \n",
- " Atelier \n",
- " 8.0 \n",
- " pricing_formula \n",
- " False \n",
- " spectacle vivant \n",
- " mucem \n",
- " indiv prog enfant \n",
- " l'école des magiciens \n",
- " 2018 \n",
+ " 453242 \n",
+ " 3248965.5 \n",
+ " 6 \n",
+ " 1 \n",
+ " 2013-09-23 14:45:01+00:00 \n",
+ " 2023-11-03 14:11:01+00:00 \n",
+ " 3692 days 23:26:00 \n",
+ " 2988.0 \n",
" \n",
" \n",
- " 2 \n",
- " 13070856 \n",
- " 48187 \n",
- " 225251 \n",
- " 4 \n",
- " vente en ligne \n",
- " 2018-12-28 14:47:50+00:00 \n",
- " Atelier \n",
- " 8.0 \n",
- " pricing_formula \n",
- " False \n",
- " spectacle vivant \n",
- " mucem \n",
- " indiv prog enfant \n",
- " l'école des magiciens \n",
- " 2018 \n",
+ " 0 \n",
+ " 1 \n",
+ " 2 \n",
+ " 384226 \n",
+ " 2686540.5 \n",
+ " 7 \n",
+ " 1 \n",
+ " 2014-12-03 14:55:37+00:00 \n",
+ " 2023-11-04 15:12:16+00:00 \n",
+ " 3258 days 00:16:39 \n",
+ " 51.0 \n",
" \n",
" \n",
" 3 \n",
- " 13070857 \n",
- " 48187 \n",
- " 225251 \n",
- " 4 \n",
- " vente en ligne \n",
- " 2018-12-28 14:47:50+00:00 \n",
- " Atelier \n",
- " 8.0 \n",
- " pricing_formula \n",
- " False \n",
- " spectacle vivant \n",
- " mucem \n",
- " indiv prog enfant \n",
- " l'école des magiciens \n",
- " 2018 \n",
+ " 1 \n",
+ " 6 \n",
+ " 217356 \n",
+ " 1435871.5 \n",
+ " 5 \n",
+ " 1 \n",
+ " 2017-01-01 02:20:08+00:00 \n",
+ " 2019-12-31 02:20:06+00:00 \n",
+ " 1093 days 23:59:58 \n",
+ " 5.0 \n",
" \n",
" \n",
- " 4 \n",
- " 13070858 \n",
- " 48187 \n",
- " 225251 \n",
- " 4 \n",
- " vente en ligne \n",
- " 2018-12-28 14:47:50+00:00 \n",
- " Atelier \n",
- " 8.0 \n",
- " pricing_formula \n",
- " False \n",
- " spectacle vivant \n",
- " mucem \n",
- " indiv prog enfant \n",
- " l'école des magiciens \n",
- " 2018 \n",
+ " 2 \n",
+ " 1 \n",
+ " 5 \n",
+ " 201750 \n",
+ " 1459190.0 \n",
+ " 6 \n",
+ " 1 \n",
+ " 2013-06-10 10:37:58+00:00 \n",
+ " 2023-11-08 15:59:45+00:00 \n",
+ " 3803 days 05:21:47 \n",
+ " 9.0 \n",
+ " \n",
+ " \n",
+ " 5032 \n",
+ " 6733 \n",
+ " 6 \n",
+ " 14208 \n",
+ " 0.0 \n",
+ " 3 \n",
+ " 1 \n",
+ " 2017-01-11 15:00:54+00:00 \n",
+ " 2019-11-27 09:47:06+00:00 \n",
+ " 1049 days 18:46:12 \n",
+ " 13497.0 \n",
" \n",
" \n",
"\n",
""
],
"text/plain": [
- " ticket_id customer_id product_id event_type_id supplier_name \\\n",
- "0 13070859 48187 225251 4 vente en ligne \n",
- "1 13070855 48187 225251 4 vente en ligne \n",
- "2 13070856 48187 225251 4 vente en ligne \n",
- "3 13070857 48187 225251 4 vente en ligne \n",
- "4 13070858 48187 225251 4 vente en ligne \n",
+ " customer_id event_type_id nb_tickets total_amount nb_suppliers \\\n",
+ "1 1 4 453242 3248965.5 6 \n",
+ "0 1 2 384226 2686540.5 7 \n",
+ "3 1 6 217356 1435871.5 5 \n",
+ "2 1 5 201750 1459190.0 6 \n",
+ "5032 6733 6 14208 0.0 3 \n",
"\n",
- " purchase_date type_of_ticket_name amount children \\\n",
- "0 2018-12-28 14:47:50+00:00 Atelier 8.0 pricing_formula \n",
- "1 2018-12-28 14:47:50+00:00 Atelier 8.0 pricing_formula \n",
- "2 2018-12-28 14:47:50+00:00 Atelier 8.0 pricing_formula \n",
- "3 2018-12-28 14:47:50+00:00 Atelier 8.0 pricing_formula \n",
- "4 2018-12-28 14:47:50+00:00 Atelier 8.0 pricing_formula \n",
+ " vente_internet_max purchase_date_min purchase_date_max \\\n",
+ "1 1 2013-09-23 14:45:01+00:00 2023-11-03 14:11:01+00:00 \n",
+ "0 1 2014-12-03 14:55:37+00:00 2023-11-04 15:12:16+00:00 \n",
+ "3 1 2017-01-01 02:20:08+00:00 2019-12-31 02:20:06+00:00 \n",
+ "2 1 2013-06-10 10:37:58+00:00 2023-11-08 15:59:45+00:00 \n",
+ "5032 1 2017-01-11 15:00:54+00:00 2019-11-27 09:47:06+00:00 \n",
"\n",
- " is_full_price name_event_types name_facilities name_categories \\\n",
- "0 False spectacle vivant mucem indiv prog enfant \n",
- "1 False spectacle vivant mucem indiv prog enfant \n",
- "2 False spectacle vivant mucem indiv prog enfant \n",
- "3 False spectacle vivant mucem indiv prog enfant \n",
- "4 False spectacle vivant mucem indiv prog enfant \n",
- "\n",
- " name_events name_seasons \n",
- "0 l'école des magiciens 2018 \n",
- "1 l'école des magiciens 2018 \n",
- "2 l'école des magiciens 2018 \n",
- "3 l'école des magiciens 2018 \n",
- "4 l'école des magiciens 2018 "
+ " time_between_purchase nb_tickets_internet \n",
+ "1 3692 days 23:26:00 2988.0 \n",
+ "0 3258 days 00:16:39 51.0 \n",
+ "3 1093 days 23:59:58 5.0 \n",
+ "2 3803 days 05:21:47 9.0 \n",
+ "5032 1049 days 18:46:12 13497.0 "
]
},
- "execution_count": 141,
+ "execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "# Importance des suppliers\n",
- "df1_products_purchased_reduced.head()"
+ "df1_tickets_kpi.sort_values(by='nb_tickets', ascending=False).head(5)"
]
},
{
"cell_type": "markdown",
- "id": "9354b283-9e00-4aa9-a017-d7dd11fdf745",
+ "id": "f1d7f7ba-361b-467d-b375-b09c149185f7",
"metadata": {},
"source": [
"## Alexis' work"
@@ -2430,8 +1958,8 @@
},
{
"cell_type": "code",
- "execution_count": 142,
- "id": "cfbeaf0b-64ea-4abf-b785-57e43e651108",
+ "execution_count": 34,
+ "id": "4ab1c0d2-0097-4669-b984-b6822c976740",
"metadata": {},
"outputs": [
{
@@ -2492,7 +2020,7 @@
"3 6 6.439463"
]
},
- "execution_count": 142,
+ "execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
@@ -2507,8 +2035,8 @@
},
{
"cell_type": "code",
- "execution_count": 143,
- "id": "0805e41f-bb43-46a2-ac65-1a379936b3d8",
+ "execution_count": 35,
+ "id": "a9c62b39-389e-4dac-89a6-ac8a59fea58a",
"metadata": {},
"outputs": [
{
@@ -2587,7 +2115,7 @@
"4 2 2 143 6.150659"
]
},
- "execution_count": 143,
+ "execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
@@ -2600,1123 +2128,10 @@
"nb_tickets.head()"
]
},
- {
- "cell_type": "code",
- "execution_count": 144,
- "id": "28fd3b8c-0caf-4d4e-9c39-9c1cd2bab126",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " customer_id \n",
- " birthdate \n",
- " street_id \n",
- " is_partner \n",
- " gender \n",
- " is_email_true \n",
- " opt_in \n",
- " structure_id \n",
- " profession \n",
- " language \n",
- " mcp_contact_id \n",
- " last_buying_date \n",
- " max_price \n",
- " ticket_sum \n",
- " average_price \n",
- " fidelity \n",
- " average_purchase_delay \n",
- " average_price_basket \n",
- " average_ticket_basket \n",
- " total_price \n",
- " purchase_count \n",
- " first_buying_date \n",
- " country \n",
- " age \n",
- " tenant_id \n",
- " nb_campaigns \n",
- " nb_campaigns_opened \n",
- " time_to_open \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " 0 \n",
- " 12751 \n",
- " NaN \n",
- " 2 \n",
- " False \n",
- " 1 \n",
- " True \n",
- " True \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " 0 \n",
- " 0.0 \n",
- " 0 \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " 0 \n",
- " NaT \n",
- " fr \n",
- " NaN \n",
- " 1311 \n",
- " NaN \n",
- " NaN \n",
- " NaT \n",
- " \n",
- " \n",
- " 1 \n",
- " 12825 \n",
- " NaN \n",
- " 2 \n",
- " False \n",
- " 2 \n",
- " True \n",
- " True \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " 0 \n",
- " 0.0 \n",
- " 0 \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " 0 \n",
- " NaT \n",
- " fr \n",
- " NaN \n",
- " 1311 \n",
- " NaN \n",
- " NaN \n",
- " NaT \n",
- " \n",
- " \n",
- " 2 \n",
- " 11261 \n",
- " NaN \n",
- " 2 \n",
- " False \n",
- " 1 \n",
- " True \n",
- " True \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " 0 \n",
- " 0.0 \n",
- " 0 \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " 0 \n",
- " NaT \n",
- " fr \n",
- " NaN \n",
- " 1311 \n",
- " NaN \n",
- " NaN \n",
- " NaT \n",
- " \n",
- " \n",
- " 3 \n",
- " 13071 \n",
- " NaN \n",
- " 2 \n",
- " False \n",
- " 2 \n",
- " True \n",
- " True \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " 0 \n",
- " 0.0 \n",
- " 0 \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " 0 \n",
- " NaT \n",
- " fr \n",
- " NaN \n",
- " 1311 \n",
- " NaN \n",
- " NaN \n",
- " NaT \n",
- " \n",
- " \n",
- " 4 \n",
- " 653061 \n",
- " NaN \n",
- " 10 \n",
- " False \n",
- " 2 \n",
- " True \n",
- " False \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " 0 \n",
- " 0.0 \n",
- " 0 \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " 0 \n",
- " NaT \n",
- " NaN \n",
- " NaN \n",
- " 1311 \n",
- " 80.0 \n",
- " 2.0 \n",
- " 0 days 19:53:02.500000 \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " customer_id birthdate street_id is_partner gender is_email_true \\\n",
- "0 12751 NaN 2 False 1 True \n",
- "1 12825 NaN 2 False 2 True \n",
- "2 11261 NaN 2 False 1 True \n",
- "3 13071 NaN 2 False 2 True \n",
- "4 653061 NaN 10 False 2 True \n",
- "\n",
- " opt_in structure_id profession language mcp_contact_id last_buying_date \\\n",
- "0 True NaN NaN NaN NaN NaN \n",
- "1 True NaN NaN NaN NaN NaN \n",
- "2 True NaN NaN NaN NaN NaN \n",
- "3 True NaN NaN NaN NaN NaN \n",
- "4 False NaN NaN NaN NaN NaN \n",
- "\n",
- " max_price ticket_sum average_price fidelity average_purchase_delay \\\n",
- "0 NaN 0 0.0 0 NaN \n",
- "1 NaN 0 0.0 0 NaN \n",
- "2 NaN 0 0.0 0 NaN \n",
- "3 NaN 0 0.0 0 NaN \n",
- "4 NaN 0 0.0 0 NaN \n",
- "\n",
- " average_price_basket average_ticket_basket total_price purchase_count \\\n",
- "0 NaN NaN NaN 0 \n",
- "1 NaN NaN NaN 0 \n",
- "2 NaN NaN NaN 0 \n",
- "3 NaN NaN NaN 0 \n",
- "4 NaN NaN NaN 0 \n",
- "\n",
- " first_buying_date country age tenant_id nb_campaigns \\\n",
- "0 NaT fr NaN 1311 NaN \n",
- "1 NaT fr NaN 1311 NaN \n",
- "2 NaT fr NaN 1311 NaN \n",
- "3 NaT fr NaN 1311 NaN \n",
- "4 NaT NaN NaN 1311 80.0 \n",
- "\n",
- " nb_campaigns_opened time_to_open \n",
- "0 NaN NaT \n",
- "1 NaN NaT \n",
- "2 NaN NaT \n",
- "3 NaN NaT \n",
- "4 2.0 0 days 19:53:02.500000 "
- ]
- },
- "execution_count": 144,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Fusion avec KPI campaigns liés au customer\n",
- "df1_customer = pd.merge(df1_customerplus_clean, df1_campaigns_kpi, on = 'customer_id', how = 'left')\n",
- "df1_customer.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 146,
- "id": "b438c563-e6c1-4b10-bedf-3b251f97018d",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "shape : (156289, 31)\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " customer_id \n",
- " birthdate \n",
- " street_id \n",
- " is_partner \n",
- " gender \n",
- " is_email_true \n",
- " opt_in \n",
- " structure_id \n",
- " profession \n",
- " language \n",
- " mcp_contact_id \n",
- " last_buying_date \n",
- " max_price \n",
- " ticket_sum \n",
- " average_price \n",
- " fidelity \n",
- " average_purchase_delay \n",
- " average_price_basket \n",
- " average_ticket_basket \n",
- " total_price \n",
- " purchase_count \n",
- " first_buying_date \n",
- " country \n",
- " age \n",
- " tenant_id \n",
- " nb_campaigns \n",
- " nb_campaigns_opened \n",
- " time_to_open \n",
- " event_type_id \n",
- " nb_tickets \n",
- " avg_amount \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " 0 \n",
- " 12751 \n",
- " NaN \n",
- " 2 \n",
- " False \n",
- " 1 \n",
- " True \n",
- " True \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " 0 \n",
- " 0.0 \n",
- " 0 \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " 0 \n",
- " NaT \n",
- " fr \n",
- " NaN \n",
- " 1311 \n",
- " NaN \n",
- " NaN \n",
- " NaT \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " \n",
- " \n",
- " 1 \n",
- " 12825 \n",
- " NaN \n",
- " 2 \n",
- " False \n",
- " 2 \n",
- " True \n",
- " True \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " 0 \n",
- " 0.0 \n",
- " 0 \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " 0 \n",
- " NaT \n",
- " fr \n",
- " NaN \n",
- " 1311 \n",
- " NaN \n",
- " NaN \n",
- " NaT \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " \n",
- " \n",
- " 2 \n",
- " 11261 \n",
- " NaN \n",
- " 2 \n",
- " False \n",
- " 1 \n",
- " True \n",
- " True \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " 0 \n",
- " 0.0 \n",
- " 0 \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " 0 \n",
- " NaT \n",
- " fr \n",
- " NaN \n",
- " 1311 \n",
- " NaN \n",
- " NaN \n",
- " NaT \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " \n",
- " \n",
- " 3 \n",
- " 13071 \n",
- " NaN \n",
- " 2 \n",
- " False \n",
- " 2 \n",
- " True \n",
- " True \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " 0 \n",
- " 0.0 \n",
- " 0 \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " 0 \n",
- " NaT \n",
- " fr \n",
- " NaN \n",
- " 1311 \n",
- " NaN \n",
- " NaN \n",
- " NaT \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " \n",
- " \n",
- " 4 \n",
- " 653061 \n",
- " NaN \n",
- " 10 \n",
- " False \n",
- " 2 \n",
- " True \n",
- " False \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " 0 \n",
- " 0.0 \n",
- " 0 \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " 0 \n",
- " NaT \n",
- " NaN \n",
- " NaN \n",
- " 1311 \n",
- " 80.0 \n",
- " 2.0 \n",
- " 0 days 19:53:02.500000 \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " customer_id birthdate street_id is_partner gender is_email_true \\\n",
- "0 12751 NaN 2 False 1 True \n",
- "1 12825 NaN 2 False 2 True \n",
- "2 11261 NaN 2 False 1 True \n",
- "3 13071 NaN 2 False 2 True \n",
- "4 653061 NaN 10 False 2 True \n",
- "\n",
- " opt_in structure_id profession language mcp_contact_id last_buying_date \\\n",
- "0 True NaN NaN NaN NaN NaN \n",
- "1 True NaN NaN NaN NaN NaN \n",
- "2 True NaN NaN NaN NaN NaN \n",
- "3 True NaN NaN NaN NaN NaN \n",
- "4 False NaN NaN NaN NaN NaN \n",
- "\n",
- " max_price ticket_sum average_price fidelity average_purchase_delay \\\n",
- "0 NaN 0 0.0 0 NaN \n",
- "1 NaN 0 0.0 0 NaN \n",
- "2 NaN 0 0.0 0 NaN \n",
- "3 NaN 0 0.0 0 NaN \n",
- "4 NaN 0 0.0 0 NaN \n",
- "\n",
- " average_price_basket average_ticket_basket total_price purchase_count \\\n",
- "0 NaN NaN NaN 0 \n",
- "1 NaN NaN NaN 0 \n",
- "2 NaN NaN NaN 0 \n",
- "3 NaN NaN NaN 0 \n",
- "4 NaN NaN NaN 0 \n",
- "\n",
- " first_buying_date country age tenant_id nb_campaigns \\\n",
- "0 NaT fr NaN 1311 NaN \n",
- "1 NaT fr NaN 1311 NaN \n",
- "2 NaT fr NaN 1311 NaN \n",
- "3 NaT fr NaN 1311 NaN \n",
- "4 NaT NaN NaN 1311 80.0 \n",
- "\n",
- " nb_campaigns_opened time_to_open event_type_id nb_tickets \\\n",
- "0 NaN NaT NaN NaN \n",
- "1 NaN NaT NaN NaN \n",
- "2 NaN NaT NaN NaN \n",
- "3 NaN NaT NaN NaN \n",
- "4 2.0 0 days 19:53:02.500000 NaN NaN \n",
- "\n",
- " avg_amount \n",
- "0 NaN \n",
- "1 NaN \n",
- "2 NaN \n",
- "3 NaN \n",
- "4 NaN "
- ]
- },
- "execution_count": 146,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df1_customer_product = pd.merge(df1_customer, nb_tickets, on = 'customer_id', how = 'left')\n",
- "print(\"shape : \", df1_customer_product.shape)\n",
- "df1_customer_product.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 147,
- "id": "afcfe12d-f840-4886-a08b-13a69f022f4c",
- "metadata": {},
- "outputs": [],
- "source": [
- "df1_customer_product.to_csv(\"customer_product.csv\", index = False)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "8e763591-1802-4f5b-8285-1cf980de541a",
- "metadata": {},
- "source": [
- "## End of Alexis' work"
- ]
- },
{
"cell_type": "code",
"execution_count": 36,
- "id": "2bda0b97-b28b-4070-a57d-aeab0e2f7dfe",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Nombre de client assistant à plus de 2 type d'événement\n",
- "nb_event_types = df1_products_purchased_reduced[['customer_id', 'name_event_types']].groupby('customer_id').nunique()\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 37,
- "id": "043303fe-e90f-4689-a2a9-5d690555a045",
- "metadata": {},
- "outputs": [],
- "source": [
- "def tickets_kpi_function(tickets_information = None):\n",
- " tickets_information_copy = tickets_information.copy()\n",
- " tickets_information_copy['purchase_date_max'] = tickets_information_copy['purchase_date']\n",
- " tickets_kpi = (tickets_information_copy[['event_type_id', 'customer_id', 'ticket_id','supplier_name', 'purchase_date', 'purchase_date_max', 'amount']]\n",
- " .groupby([ 'customer_id']) # 'event_type_id',\n",
- " .agg({'ticket_id': 'count', \n",
- " 'amount' : 'sum',\n",
- " 'supplier_name': 'nunique',\n",
- " 'purchase_date_max' : 'max',\n",
- " 'purchase_date' : 'min'})\n",
- " .reset_index()\n",
- " )\n",
- " \n",
- " tickets_kpi.rename(columns = {'ticket_id' : 'nb_tickets', \n",
- " 'amount' : 'total_amount',\n",
- " 'supplier_name' : 'nb_suppliers', \n",
- " 'purchase_date' : 'purchase_date_min'}, inplace = True)\n",
- " \n",
- " tickets_kpi['time_between_purchase'] = tickets_kpi['purchase_date_max'] - tickets_kpi['purchase_date_min']\n",
- " \n",
- " return tickets_kpi\n",
- " "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 38,
- "id": "5882234a-1ed5-4269-87a6-0d75613476e3",
- "metadata": {},
- "outputs": [],
- "source": [
- "df1_tickets_kpi = tickets_kpi_function(tickets_information = df1_products_purchased_reduced)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 39,
- "id": "a7a452a6-cd5e-4c8b-b250-8a7d26e48fad",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " customer_id \n",
- " nb_tickets \n",
- " total_amount \n",
- " nb_suppliers \n",
- " purchase_date_max \n",
- " purchase_date_min \n",
- " time_between_purchase \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " 0 \n",
- " 1 \n",
- " 1256574 \n",
- " 8830567.5 \n",
- " 7 \n",
- " 2023-11-08 15:59:45+00:00 \n",
- " 2013-06-10 10:37:58+00:00 \n",
- " 3803 days 05:21:47 \n",
- " \n",
- " \n",
- " 3615 \n",
- " 6733 \n",
- " 35527 \n",
- " 1188.0 \n",
- " 4 \n",
- " 2023-11-03 09:42:40+00:00 \n",
- " 2015-09-09 13:48:38+00:00 \n",
- " 2976 days 19:54:02 \n",
- " \n",
- " \n",
- " 39 \n",
- " 41 \n",
- " 16263 \n",
- " 37642.0 \n",
- " 6 \n",
- " 2023-10-25 09:13:16+00:00 \n",
- " 2014-01-23 16:56:57+00:00 \n",
- " 3561 days 16:16:19 \n",
- " \n",
- " \n",
- " 11 \n",
- " 12 \n",
- " 5871 \n",
- " 38767.0 \n",
- " 2 \n",
- " 2023-11-04 13:46:59+00:00 \n",
- " 2018-04-04 07:46:31+00:00 \n",
- " 2040 days 06:00:28 \n",
- " \n",
- " \n",
- " 32809 \n",
- " 63488 \n",
- " 5851 \n",
- " 64350.0 \n",
- " 1 \n",
- " 2022-08-25 13:08:38+00:00 \n",
- " 2020-08-18 08:32:57+00:00 \n",
- " 737 days 04:35:41 \n",
- " \n",
- " \n",
- " 3708 \n",
- " 6916 \n",
- " 5482 \n",
- " 51489.5 \n",
- " 2 \n",
- " 2021-08-26 12:49:17+00:00 \n",
- " 2018-03-26 11:13:43+00:00 \n",
- " 1249 days 01:35:34 \n",
- " \n",
- " \n",
- " 32616 \n",
- " 63194 \n",
- " 4507 \n",
- " 13232.0 \n",
- " 3 \n",
- " 2022-09-07 12:55:33+00:00 \n",
- " 2017-11-28 13:52:15+00:00 \n",
- " 1743 days 23:03:18 \n",
- " \n",
- " \n",
- " 78 \n",
- " 81 \n",
- " 3562 \n",
- " 38746.0 \n",
- " 1 \n",
- " 2022-08-30 11:51:34+00:00 \n",
- " 2017-01-05 13:04:58+00:00 \n",
- " 2062 days 22:46:36 \n",
- " \n",
- " \n",
- " 35295 \n",
- " 84002 \n",
- " 3403 \n",
- " 19830.0 \n",
- " 4 \n",
- " 2023-11-06 15:59:22+00:00 \n",
- " 2021-05-28 10:22:33+00:00 \n",
- " 892 days 05:36:49 \n",
- " \n",
- " \n",
- " 3377 \n",
- " 5618 \n",
- " 3294 \n",
- " 31684.5 \n",
- " 1 \n",
- " 2022-02-24 07:47:20+00:00 \n",
- " 2018-10-25 11:04:24+00:00 \n",
- " 1217 days 20:42:56 \n",
- " \n",
- " \n",
- " 30011 \n",
- " 59259 \n",
- " 2591 \n",
- " 4350.0 \n",
- " 3 \n",
- " 2023-06-12 14:05:19+00:00 \n",
- " 2019-11-25 08:52:48+00:00 \n",
- " 1295 days 05:12:31 \n",
- " \n",
- " \n",
- " 34937 \n",
- " 74876 \n",
- " 2571 \n",
- " 2600.0 \n",
- " 2 \n",
- " 2023-10-02 08:13:05+00:00 \n",
- " 2018-02-08 12:54:01+00:00 \n",
- " 2061 days 19:19:04 \n",
- " \n",
- " \n",
- " 270 \n",
- " 295 \n",
- " 2570 \n",
- " 17678.5 \n",
- " 6 \n",
- " 2023-10-16 10:19:22+00:00 \n",
- " 2014-01-24 15:16:17+00:00 \n",
- " 3551 days 19:03:05 \n",
- " \n",
- " \n",
- " 866 \n",
- " 1221 \n",
- " 2320 \n",
- " 9652.0 \n",
- " 2 \n",
- " 2022-09-19 12:55:15+00:00 \n",
- " 2017-03-29 08:00:09+00:00 \n",
- " 2000 days 04:55:06 \n",
- " \n",
- " \n",
- " 1022 \n",
- " 1429 \n",
- " 2249 \n",
- " 3500.0 \n",
- " 4 \n",
- " 2023-11-06 08:30:37+00:00 \n",
- " 2014-12-03 14:56:38+00:00 \n",
- " 3259 days 17:33:59 \n",
- " \n",
- " \n",
- " 3922 \n",
- " 7249 \n",
- " 1827 \n",
- " 13385.0 \n",
- " 1 \n",
- " 2021-10-26 12:28:40+00:00 \n",
- " 2019-05-07 12:34:56+00:00 \n",
- " 902 days 23:53:44 \n",
- " \n",
- " \n",
- " 54425 \n",
- " 1070539 \n",
- " 1800 \n",
- " 19800.0 \n",
- " 1 \n",
- " 2022-07-25 12:49:27+00:00 \n",
- " 2022-05-02 16:09:03+00:00 \n",
- " 83 days 20:40:24 \n",
- " \n",
- " \n",
- " 69520 \n",
- " 1216801 \n",
- " 1623 \n",
- " 12562.0 \n",
- " 2 \n",
- " 2023-09-29 16:34:38+00:00 \n",
- " 2023-06-16 14:16:04+00:00 \n",
- " 105 days 02:18:34 \n",
- " \n",
- " \n",
- " 30056 \n",
- " 59330 \n",
- " 1551 \n",
- " 0.0 \n",
- " 1 \n",
- " 2023-11-06 10:22:14+00:00 \n",
- " 2018-02-02 08:53:51+00:00 \n",
- " 2103 days 01:28:23 \n",
- " \n",
- " \n",
- " 3243 \n",
- " 5441 \n",
- " 1544 \n",
- " 14133.0 \n",
- " 2 \n",
- " 2022-09-22 08:21:47+00:00 \n",
- " 2017-12-14 12:50:23+00:00 \n",
- " 1742 days 19:31:24 \n",
- " \n",
- " \n",
- " 55195 \n",
- " 1084435 \n",
- " 1500 \n",
- " 16500.0 \n",
- " 1 \n",
- " 2022-09-27 14:32:13+00:00 \n",
- " 2022-05-18 08:04:41+00:00 \n",
- " 132 days 06:27:32 \n",
- " \n",
- " \n",
- " 28983 \n",
- " 57816 \n",
- " 1485 \n",
- " 0.0 \n",
- " 2 \n",
- " 2023-05-22 07:30:55+00:00 \n",
- " 2019-01-21 14:19:18+00:00 \n",
- " 1581 days 17:11:37 \n",
- " \n",
- " \n",
- " 2231 \n",
- " 2942 \n",
- " 1307 \n",
- " 100.0 \n",
- " 2 \n",
- " 2023-06-29 09:33:58+00:00 \n",
- " 2017-10-25 15:06:58+00:00 \n",
- " 2072 days 18:27:00 \n",
- " \n",
- " \n",
- " 23 \n",
- " 24 \n",
- " 1266 \n",
- " 0.0 \n",
- " 2 \n",
- " 2023-10-19 07:20:48+00:00 \n",
- " 2015-09-30 16:07:52+00:00 \n",
- " 2940 days 15:12:56 \n",
- " \n",
- " \n",
- " 4513 \n",
- " 9592 \n",
- " 1211 \n",
- " 62.0 \n",
- " 4 \n",
- " 2023-10-17 09:39:40+00:00 \n",
- " 2018-02-25 07:17:19+00:00 \n",
- " 2060 days 02:22:21 \n",
- " \n",
- " \n",
- " 2936 \n",
- " 5059 \n",
- " 1186 \n",
- " 6308.0 \n",
- " 3 \n",
- " 2023-05-22 13:41:22+00:00 \n",
- " 2018-02-01 11:16:51+00:00 \n",
- " 1936 days 02:24:31 \n",
- " \n",
- " \n",
- " 11484 \n",
- " 25100 \n",
- " 1123 \n",
- " 0.0 \n",
- " 1 \n",
- " 2021-07-13 07:39:57+00:00 \n",
- " 2015-12-21 15:38:05+00:00 \n",
- " 2030 days 16:01:52 \n",
- " \n",
- " \n",
- " 934 \n",
- " 1326 \n",
- " 1098 \n",
- " 798.0 \n",
- " 3 \n",
- " 2023-02-01 08:39:45+00:00 \n",
- " 2018-02-13 13:13:48+00:00 \n",
- " 1813 days 19:25:57 \n",
- " \n",
- " \n",
- " 30156 \n",
- " 59490 \n",
- " 1088 \n",
- " 0.0 \n",
- " 1 \n",
- " 2023-10-05 08:23:50+00:00 \n",
- " 2019-12-06 12:59:20+00:00 \n",
- " 1398 days 19:24:30 \n",
- " \n",
- " \n",
- " 36478 \n",
- " 251268 \n",
- " 1086 \n",
- " 0.0 \n",
- " 2 \n",
- " 2023-06-30 07:22:46+00:00 \n",
- " 2018-02-02 09:06:22+00:00 \n",
- " 1973 days 22:16:24 \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " customer_id nb_tickets total_amount nb_suppliers \\\n",
- "0 1 1256574 8830567.5 7 \n",
- "3615 6733 35527 1188.0 4 \n",
- "39 41 16263 37642.0 6 \n",
- "11 12 5871 38767.0 2 \n",
- "32809 63488 5851 64350.0 1 \n",
- "3708 6916 5482 51489.5 2 \n",
- "32616 63194 4507 13232.0 3 \n",
- "78 81 3562 38746.0 1 \n",
- "35295 84002 3403 19830.0 4 \n",
- "3377 5618 3294 31684.5 1 \n",
- "30011 59259 2591 4350.0 3 \n",
- "34937 74876 2571 2600.0 2 \n",
- "270 295 2570 17678.5 6 \n",
- "866 1221 2320 9652.0 2 \n",
- "1022 1429 2249 3500.0 4 \n",
- "3922 7249 1827 13385.0 1 \n",
- "54425 1070539 1800 19800.0 1 \n",
- "69520 1216801 1623 12562.0 2 \n",
- "30056 59330 1551 0.0 1 \n",
- "3243 5441 1544 14133.0 2 \n",
- "55195 1084435 1500 16500.0 1 \n",
- "28983 57816 1485 0.0 2 \n",
- "2231 2942 1307 100.0 2 \n",
- "23 24 1266 0.0 2 \n",
- "4513 9592 1211 62.0 4 \n",
- "2936 5059 1186 6308.0 3 \n",
- "11484 25100 1123 0.0 1 \n",
- "934 1326 1098 798.0 3 \n",
- "30156 59490 1088 0.0 1 \n",
- "36478 251268 1086 0.0 2 \n",
- "\n",
- " purchase_date_max purchase_date_min \\\n",
- "0 2023-11-08 15:59:45+00:00 2013-06-10 10:37:58+00:00 \n",
- "3615 2023-11-03 09:42:40+00:00 2015-09-09 13:48:38+00:00 \n",
- "39 2023-10-25 09:13:16+00:00 2014-01-23 16:56:57+00:00 \n",
- "11 2023-11-04 13:46:59+00:00 2018-04-04 07:46:31+00:00 \n",
- "32809 2022-08-25 13:08:38+00:00 2020-08-18 08:32:57+00:00 \n",
- "3708 2021-08-26 12:49:17+00:00 2018-03-26 11:13:43+00:00 \n",
- "32616 2022-09-07 12:55:33+00:00 2017-11-28 13:52:15+00:00 \n",
- "78 2022-08-30 11:51:34+00:00 2017-01-05 13:04:58+00:00 \n",
- "35295 2023-11-06 15:59:22+00:00 2021-05-28 10:22:33+00:00 \n",
- "3377 2022-02-24 07:47:20+00:00 2018-10-25 11:04:24+00:00 \n",
- "30011 2023-06-12 14:05:19+00:00 2019-11-25 08:52:48+00:00 \n",
- "34937 2023-10-02 08:13:05+00:00 2018-02-08 12:54:01+00:00 \n",
- "270 2023-10-16 10:19:22+00:00 2014-01-24 15:16:17+00:00 \n",
- "866 2022-09-19 12:55:15+00:00 2017-03-29 08:00:09+00:00 \n",
- "1022 2023-11-06 08:30:37+00:00 2014-12-03 14:56:38+00:00 \n",
- "3922 2021-10-26 12:28:40+00:00 2019-05-07 12:34:56+00:00 \n",
- "54425 2022-07-25 12:49:27+00:00 2022-05-02 16:09:03+00:00 \n",
- "69520 2023-09-29 16:34:38+00:00 2023-06-16 14:16:04+00:00 \n",
- "30056 2023-11-06 10:22:14+00:00 2018-02-02 08:53:51+00:00 \n",
- "3243 2022-09-22 08:21:47+00:00 2017-12-14 12:50:23+00:00 \n",
- "55195 2022-09-27 14:32:13+00:00 2022-05-18 08:04:41+00:00 \n",
- "28983 2023-05-22 07:30:55+00:00 2019-01-21 14:19:18+00:00 \n",
- "2231 2023-06-29 09:33:58+00:00 2017-10-25 15:06:58+00:00 \n",
- "23 2023-10-19 07:20:48+00:00 2015-09-30 16:07:52+00:00 \n",
- "4513 2023-10-17 09:39:40+00:00 2018-02-25 07:17:19+00:00 \n",
- "2936 2023-05-22 13:41:22+00:00 2018-02-01 11:16:51+00:00 \n",
- "11484 2021-07-13 07:39:57+00:00 2015-12-21 15:38:05+00:00 \n",
- "934 2023-02-01 08:39:45+00:00 2018-02-13 13:13:48+00:00 \n",
- "30156 2023-10-05 08:23:50+00:00 2019-12-06 12:59:20+00:00 \n",
- "36478 2023-06-30 07:22:46+00:00 2018-02-02 09:06:22+00:00 \n",
- "\n",
- " time_between_purchase \n",
- "0 3803 days 05:21:47 \n",
- "3615 2976 days 19:54:02 \n",
- "39 3561 days 16:16:19 \n",
- "11 2040 days 06:00:28 \n",
- "32809 737 days 04:35:41 \n",
- "3708 1249 days 01:35:34 \n",
- "32616 1743 days 23:03:18 \n",
- "78 2062 days 22:46:36 \n",
- "35295 892 days 05:36:49 \n",
- "3377 1217 days 20:42:56 \n",
- "30011 1295 days 05:12:31 \n",
- "34937 2061 days 19:19:04 \n",
- "270 3551 days 19:03:05 \n",
- "866 2000 days 04:55:06 \n",
- "1022 3259 days 17:33:59 \n",
- "3922 902 days 23:53:44 \n",
- "54425 83 days 20:40:24 \n",
- "69520 105 days 02:18:34 \n",
- "30056 2103 days 01:28:23 \n",
- "3243 1742 days 19:31:24 \n",
- "55195 132 days 06:27:32 \n",
- "28983 1581 days 17:11:37 \n",
- "2231 2072 days 18:27:00 \n",
- "23 2940 days 15:12:56 \n",
- "4513 2060 days 02:22:21 \n",
- "2936 1936 days 02:24:31 \n",
- "11484 2030 days 16:01:52 \n",
- "934 1813 days 19:25:57 \n",
- "30156 1398 days 19:24:30 \n",
- "36478 1973 days 22:16:24 "
- ]
- },
- "execution_count": 39,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df1_tickets_kpi.sort_values(by='nb_tickets', ascending=False).head(30)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "7c3211a5-a851-43bc-a1f0-b39d51857fb7",
- "metadata": {},
- "source": [
- "# Fusion des bases locales"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 40,
- "id": "46de1912-4a66-46e5-8b9e-7768b2d2723b",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Fusion avec KPI liés au customer\n",
- "df1_customer = pd.merge(df1_customerplus_clean, df1_campaigns_kpi, on = 'customer_id', how = 'left')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 41,
- "id": "9740d64a-e5eb-4967-a534-ca6177546465",
+ "id": "8710611c-7eb8-45ca-bdcc-009f4081f9e2",
"metadata": {},
"outputs": [
{
@@ -3921,21 +2336,30 @@
"[5 rows x 28 columns]"
]
},
- "execution_count": 41,
+ "execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
+ "# Fusion avec KPI campaigns liés au customer\n",
+ "df1_customer = pd.merge(df1_customerplus_clean, df1_campaigns_kpi, on = 'customer_id', how = 'left')\n",
"df1_customer.head()"
]
},
{
"cell_type": "code",
- "execution_count": 42,
- "id": "b5c4418c-ad2e-4bb9-bd5c-3b769e9c87d4",
+ "execution_count": 37,
+ "id": "a89fad43-ee68-4081-9384-3e9f08ec6a59",
"metadata": {},
"outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "shape : (156289, 31)\n"
+ ]
+ },
{
"data": {
"text/html": [
@@ -3967,17 +2391,7 @@
" structure_id \n",
" profession \n",
" language \n",
- " mcp_contact_id \n",
- " last_buying_date \n",
- " max_price \n",
- " ticket_sum \n",
- " average_price \n",
- " fidelity \n",
- " average_purchase_delay \n",
- " average_price_basket \n",
- " average_ticket_basket \n",
- " total_price \n",
- " purchase_count \n",
+ " ... \n",
" first_buying_date \n",
" country \n",
" age \n",
@@ -3985,159 +2399,9 @@
" nb_campaigns \n",
" nb_campaigns_opened \n",
" time_to_open \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " 58201 \n",
- " 1 \n",
- " NaN \n",
- " 2 \n",
- " False \n",
- " 2 \n",
- " True \n",
- " False \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " 2023-11-08 03:20:07 \n",
- " 45.0 \n",
- " 1254775 \n",
- " 7.030122 \n",
- " 330831 \n",
- " -67.790969 \n",
- " 13.75153 \n",
- " 1.956087 \n",
- " 8821221.5 \n",
- " 641472 \n",
- " 2013-06-10 10:37:58+00:00 \n",
- " fr \n",
- " NaN \n",
- " 1311 \n",
- " NaN \n",
- " NaN \n",
- " NaT \n",
- " \n",
- " \n",
- "\n",
- ""
- ],
- "text/plain": [
- " customer_id birthdate street_id is_partner gender is_email_true \\\n",
- "58201 1 NaN 2 False 2 True \n",
- "\n",
- " opt_in structure_id profession language mcp_contact_id \\\n",
- "58201 False NaN NaN NaN NaN \n",
- "\n",
- " last_buying_date max_price ticket_sum average_price fidelity \\\n",
- "58201 2023-11-08 03:20:07 45.0 1254775 7.030122 330831 \n",
- "\n",
- " average_purchase_delay average_price_basket average_ticket_basket \\\n",
- "58201 -67.790969 13.75153 1.956087 \n",
- "\n",
- " total_price purchase_count first_buying_date country age \\\n",
- "58201 8821221.5 641472 2013-06-10 10:37:58+00:00 fr NaN \n",
- "\n",
- " tenant_id nb_campaigns nb_campaigns_opened time_to_open \n",
- "58201 1311 NaN NaN NaT "
- ]
- },
- "execution_count": 42,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "pd.set_option('display.max_columns', None)\n",
- "\n",
- "\n",
- "df1_customer[df1_customer['customer_id'] == 1]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 43,
- "id": "2b161dfb-1593-4f1e-870b-de24735e4968",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " customer_id \n",
- " birthdate \n",
- " street_id_x \n",
- " is_partner \n",
- " gender \n",
- " is_email_true \n",
- " opt_in \n",
- " structure_id \n",
- " profession \n",
- " language \n",
- " mcp_contact_id \n",
- " last_buying_date \n",
- " max_price \n",
- " ticket_sum \n",
- " average_price \n",
- " fidelity \n",
- " average_purchase_delay \n",
- " average_price_basket \n",
- " average_ticket_basket \n",
- " total_price \n",
- " purchase_count \n",
- " first_buying_date \n",
- " country \n",
- " age \n",
- " tenant_id \n",
- " nb_campaigns \n",
- " nb_campaigns_opened \n",
- " time_to_open \n",
- " ticket_id \n",
- " product_id \n",
- " is_from_subscription \n",
- " supplier_name \n",
- " type_of_ticket_name \n",
- " children \n",
- " purchase_date \n",
- " id_products \n",
- " representation_id \n",
- " pricing_formula_id \n",
- " category_id \n",
- " products_group_id \n",
- " product_pack_id \n",
- " event_id \n",
- " id_representation_cap \n",
- " season_id \n",
- " facility_id \n",
" event_type_id \n",
- " event_type_key_id \n",
- " facility_key_id \n",
- " street_id_y \n",
- " amount \n",
- " is_full_price \n",
- " name_categories \n",
- " name_events \n",
- " name_seasons \n",
- " name_event_types \n",
- " name_facilities \n",
+ " nb_tickets \n",
+ " avg_amount \n",
" \n",
" \n",
" \n",
@@ -4153,17 +2417,7 @@
" NaN \n",
" NaN \n",
" NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " 0 \n",
- " 0.0 \n",
- " 0 \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " 0 \n",
+ " ... \n",
" NaT \n",
" fr \n",
" NaN \n",
@@ -4174,31 +2428,6 @@
" NaN \n",
" NaN \n",
" NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaT \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
" \n",
" \n",
" 1 \n",
@@ -4212,17 +2441,7 @@
" NaN \n",
" NaN \n",
" NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " 0 \n",
- " 0.0 \n",
- " 0 \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " 0 \n",
+ " ... \n",
" NaT \n",
" fr \n",
" NaN \n",
@@ -4233,31 +2452,6 @@
" NaN \n",
" NaN \n",
" NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaT \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
" \n",
" \n",
" 2 \n",
@@ -4271,17 +2465,7 @@
" NaN \n",
" NaN \n",
" NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " 0 \n",
- " 0.0 \n",
- " 0 \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " 0 \n",
+ " ... \n",
" NaT \n",
" fr \n",
" NaN \n",
@@ -4292,31 +2476,6 @@
" NaN \n",
" NaN \n",
" NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaT \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
" \n",
" \n",
" 3 \n",
@@ -4330,17 +2489,7 @@
" NaN \n",
" NaN \n",
" NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " 0 \n",
- " 0.0 \n",
- " 0 \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " 0 \n",
+ " ... \n",
" NaT \n",
" fr \n",
" NaN \n",
@@ -4351,31 +2500,6 @@
" NaN \n",
" NaN \n",
" NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaT \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
" \n",
" \n",
" 4 \n",
@@ -4389,17 +2513,7 @@
" NaN \n",
" NaN \n",
" NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " 0 \n",
- " 0.0 \n",
- " 0 \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " 0 \n",
+ " ... \n",
" NaT \n",
" NaN \n",
" NaN \n",
@@ -4410,152 +2524,103 @@
" NaN \n",
" NaN \n",
" NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaT \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
" \n",
" \n",
"
\n",
+ "
5 rows × 31 columns
\n",
"
"
],
"text/plain": [
- " customer_id birthdate street_id_x is_partner gender is_email_true \\\n",
- "0 12751 NaN 2 False 1 True \n",
- "1 12825 NaN 2 False 2 True \n",
- "2 11261 NaN 2 False 1 True \n",
- "3 13071 NaN 2 False 2 True \n",
- "4 653061 NaN 10 False 2 True \n",
+ " customer_id birthdate street_id is_partner gender is_email_true \\\n",
+ "0 12751 NaN 2 False 1 True \n",
+ "1 12825 NaN 2 False 2 True \n",
+ "2 11261 NaN 2 False 1 True \n",
+ "3 13071 NaN 2 False 2 True \n",
+ "4 653061 NaN 10 False 2 True \n",
"\n",
- " opt_in structure_id profession language mcp_contact_id last_buying_date \\\n",
- "0 True NaN NaN NaN NaN NaN \n",
- "1 True NaN NaN NaN NaN NaN \n",
- "2 True NaN NaN NaN NaN NaN \n",
- "3 True NaN NaN NaN NaN NaN \n",
- "4 False NaN NaN NaN NaN NaN \n",
+ " opt_in structure_id profession language ... first_buying_date country \\\n",
+ "0 True NaN NaN NaN ... NaT fr \n",
+ "1 True NaN NaN NaN ... NaT fr \n",
+ "2 True NaN NaN NaN ... NaT fr \n",
+ "3 True NaN NaN NaN ... NaT fr \n",
+ "4 False NaN NaN NaN ... NaT NaN \n",
"\n",
- " max_price ticket_sum average_price fidelity average_purchase_delay \\\n",
- "0 NaN 0 0.0 0 NaN \n",
- "1 NaN 0 0.0 0 NaN \n",
- "2 NaN 0 0.0 0 NaN \n",
- "3 NaN 0 0.0 0 NaN \n",
- "4 NaN 0 0.0 0 NaN \n",
+ " age tenant_id nb_campaigns nb_campaigns_opened time_to_open \\\n",
+ "0 NaN 1311 NaN NaN NaT \n",
+ "1 NaN 1311 NaN NaN NaT \n",
+ "2 NaN 1311 NaN NaN NaT \n",
+ "3 NaN 1311 NaN NaN NaT \n",
+ "4 NaN 1311 80.0 2.0 0 days 19:53:02.500000 \n",
"\n",
- " average_price_basket average_ticket_basket total_price purchase_count \\\n",
- "0 NaN NaN NaN 0 \n",
- "1 NaN NaN NaN 0 \n",
- "2 NaN NaN NaN 0 \n",
- "3 NaN NaN NaN 0 \n",
- "4 NaN NaN NaN 0 \n",
+ " event_type_id nb_tickets avg_amount \n",
+ "0 NaN NaN NaN \n",
+ "1 NaN NaN NaN \n",
+ "2 NaN NaN NaN \n",
+ "3 NaN NaN NaN \n",
+ "4 NaN NaN NaN \n",
"\n",
- " first_buying_date country age tenant_id nb_campaigns \\\n",
- "0 NaT fr NaN 1311 NaN \n",
- "1 NaT fr NaN 1311 NaN \n",
- "2 NaT fr NaN 1311 NaN \n",
- "3 NaT fr NaN 1311 NaN \n",
- "4 NaT NaN NaN 1311 80.0 \n",
- "\n",
- " nb_campaigns_opened time_to_open ticket_id product_id \\\n",
- "0 NaN NaT NaN NaN \n",
- "1 NaN NaT NaN NaN \n",
- "2 NaN NaT NaN NaN \n",
- "3 NaN NaT NaN NaN \n",
- "4 2.0 0 days 19:53:02.500000 NaN NaN \n",
- "\n",
- " is_from_subscription supplier_name type_of_ticket_name children \\\n",
- "0 NaN NaN NaN NaN \n",
- "1 NaN NaN NaN NaN \n",
- "2 NaN NaN NaN NaN \n",
- "3 NaN NaN NaN NaN \n",
- "4 NaN NaN NaN NaN \n",
- "\n",
- " purchase_date id_products representation_id pricing_formula_id \\\n",
- "0 NaT NaN NaN NaN \n",
- "1 NaT NaN NaN NaN \n",
- "2 NaT NaN NaN NaN \n",
- "3 NaT NaN NaN NaN \n",
- "4 NaT NaN NaN NaN \n",
- "\n",
- " category_id products_group_id product_pack_id event_id \\\n",
- "0 NaN NaN NaN NaN \n",
- "1 NaN NaN NaN NaN \n",
- "2 NaN NaN NaN NaN \n",
- "3 NaN NaN NaN NaN \n",
- "4 NaN NaN NaN NaN \n",
- "\n",
- " id_representation_cap season_id facility_id event_type_id \\\n",
- "0 NaN NaN NaN NaN \n",
- "1 NaN NaN NaN NaN \n",
- "2 NaN NaN NaN NaN \n",
- "3 NaN NaN NaN NaN \n",
- "4 NaN NaN NaN NaN \n",
- "\n",
- " event_type_key_id facility_key_id street_id_y amount is_full_price \\\n",
- "0 NaN NaN NaN NaN NaN \n",
- "1 NaN NaN NaN NaN NaN \n",
- "2 NaN NaN NaN NaN NaN \n",
- "3 NaN NaN NaN NaN NaN \n",
- "4 NaN NaN NaN NaN NaN \n",
- "\n",
- " name_categories name_events name_seasons name_event_types name_facilities \n",
- "0 NaN NaN NaN NaN NaN \n",
- "1 NaN NaN NaN NaN NaN \n",
- "2 NaN NaN NaN NaN NaN \n",
- "3 NaN NaN NaN NaN NaN \n",
- "4 NaN NaN NaN NaN NaN "
+ "[5 rows x 31 columns]"
]
},
- "execution_count": 43,
+ "execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "# Fusion avec KPI liés au comportement d'achat,\n",
- "df1_customer_product = pd.merge(df1_customer, df1_products_purchased, on = 'customer_id', how = 'left')\n",
+ "df1_customer_product = pd.merge(df1_customer, nb_tickets, on = 'customer_id', how = 'left')\n",
+ "print(\"shape : \", df1_customer_product.shape)\n",
"df1_customer_product.head()"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 38,
+ "id": "a19fec00-4ece-400c-937c-ce5cd8daccfd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# df1_customer_product.to_csv(\"customer_product.csv\", index = False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7c3211a5-a851-43bc-a1f0-b39d51857fb7",
+ "metadata": {},
+ "source": [
+ "# Fusion des bases locales"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "id": "46de1912-4a66-46e5-8b9e-7768b2d2723b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Fusion avec KPI liés au customer\n",
+ "df1_customer = pd.merge(df1_customerplus_clean, df1_campaigns_kpi, on = 'customer_id', how = 'left')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
"id": "1e42a790-b215-4107-a969-85005da06ebd",
"metadata": {},
"outputs": [],
"source": [
"# Fusion avec KPI liés au comportement d'achat\n",
- "#df1_customer_product = pd.merge(df1_products_purchased_reduced, df1_products_purchased, on = 'customer_id', how = 'outer')"
+ "df1_customer_product = pd.merge(df1_tickets_kpi, df1_customer, on = 'customer_id', how = 'outer')"
]
},
{
"cell_type": "code",
- "execution_count": 42,
+ "execution_count": 41,
"id": "d950f24d-a5d1-4f1e-aeaa-ca826470365f",
"metadata": {},
"outputs": [],
"source": [
- "#df1_customer_product.head()"
+ "# df1_customer_product"
]
}
],
diff --git a/1_Descriptive_Statistics.ipynb b/1_Descriptive_Statistics.ipynb
new file mode 100644
index 0000000..113fd77
--- /dev/null
+++ b/1_Descriptive_Statistics.ipynb
@@ -0,0 +1,543 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "3f41343f-7205-41d9-89dd-88039e301413",
+ "metadata": {},
+ "source": [
+ "# Statistiques descriptives"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "abfaf341-7b35-4407-9133-d21336c04027",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import os\n",
+ "import s3fs\n",
+ "import re\n",
+ "import matplotlib.pyplot as plt"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "7fb72fa3-7940-496f-ac78-c2837f65eefa",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create filesystem object\n",
+ "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
+ "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "45d5261f-4d46-49cb-8582-dd2121122b05",
+ "metadata": {},
+ "source": [
+ "# 1 - Comportement d'achat"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "9376af51-4320-44b6-8f30-1e1234371556",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Chargement des données temporaires\n",
+ "BUCKET = \"projet-bdc2324-team1\"\n",
+ "FILE_KEY_S3 = \"0_Temp/Company 1 - Purchasing behaviour.csv\"\n",
+ "FILE_PATH_S3 = BUCKET + \"/\" + FILE_KEY_S3\n",
+ "\n",
+ "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
+ " tickets_kpi = pd.read_csv(file_in, sep=\",\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "1855dcca-cfce-4c54-90ae-55d9a1ab5d45",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " customer_id \n",
+ " event_type_id \n",
+ " nb_tickets \n",
+ " total_amount \n",
+ " nb_suppliers \n",
+ " vente_internet_max \n",
+ " purchase_date_min \n",
+ " purchase_date_max \n",
+ " time_between_purchase \n",
+ " nb_tickets_internet \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 1 \n",
+ " 2 \n",
+ " 384226 \n",
+ " 2686540.5 \n",
+ " 7 \n",
+ " 1 \n",
+ " 2014-12-03 14:55:37+00:00 \n",
+ " 2023-11-04 15:12:16+00:00 \n",
+ " 3258 days 00:16:39 \n",
+ " 51.0 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 1 \n",
+ " 4 \n",
+ " 453242 \n",
+ " 3248965.5 \n",
+ " 6 \n",
+ " 1 \n",
+ " 2013-09-23 14:45:01+00:00 \n",
+ " 2023-11-03 14:11:01+00:00 \n",
+ " 3692 days 23:26:00 \n",
+ " 2988.0 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 1 \n",
+ " 5 \n",
+ " 201750 \n",
+ " 1459190.0 \n",
+ " 6 \n",
+ " 1 \n",
+ " 2013-06-10 10:37:58+00:00 \n",
+ " 2023-11-08 15:59:45+00:00 \n",
+ " 3803 days 05:21:47 \n",
+ " 9.0 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 1 \n",
+ " 6 \n",
+ " 217356 \n",
+ " 1435871.5 \n",
+ " 5 \n",
+ " 1 \n",
+ " 2017-01-01 02:20:08+00:00 \n",
+ " 2019-12-31 02:20:06+00:00 \n",
+ " 1093 days 23:59:58 \n",
+ " 5.0 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 2 \n",
+ " 2 \n",
+ " 143 \n",
+ " 0.0 \n",
+ " 1 \n",
+ " 0 \n",
+ " 2018-04-07 12:55:07+00:00 \n",
+ " 2020-03-08 12:06:43+00:00 \n",
+ " 700 days 23:11:36 \n",
+ " 0.0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " customer_id event_type_id nb_tickets total_amount nb_suppliers \\\n",
+ "0 1 2 384226 2686540.5 7 \n",
+ "1 1 4 453242 3248965.5 6 \n",
+ "2 1 5 201750 1459190.0 6 \n",
+ "3 1 6 217356 1435871.5 5 \n",
+ "4 2 2 143 0.0 1 \n",
+ "\n",
+ " vente_internet_max purchase_date_min purchase_date_max \\\n",
+ "0 1 2014-12-03 14:55:37+00:00 2023-11-04 15:12:16+00:00 \n",
+ "1 1 2013-09-23 14:45:01+00:00 2023-11-03 14:11:01+00:00 \n",
+ "2 1 2013-06-10 10:37:58+00:00 2023-11-08 15:59:45+00:00 \n",
+ "3 1 2017-01-01 02:20:08+00:00 2019-12-31 02:20:06+00:00 \n",
+ "4 0 2018-04-07 12:55:07+00:00 2020-03-08 12:06:43+00:00 \n",
+ "\n",
+ " time_between_purchase nb_tickets_internet \n",
+ "0 3258 days 00:16:39 51.0 \n",
+ "1 3692 days 23:26:00 2988.0 \n",
+ "2 3803 days 05:21:47 9.0 \n",
+ "3 1093 days 23:59:58 5.0 \n",
+ "4 700 days 23:11:36 0.0 "
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tickets_kpi.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "0e5d3b2e-1a75-4d46-80e6-c306e9f8de84",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['customer_id', 'event_type_id', 'nb_tickets', 'total_amount',\n",
+ " 'nb_suppliers', 'vente_internet_max', 'purchase_date_min',\n",
+ " 'purchase_date_max', 'time_between_purchase', 'nb_tickets_internet'],\n",
+ " dtype='object')"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tickets_kpi.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "7667e8eb-9a1e-4216-96f4-bf987c6e30b5",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " customer_id \n",
+ " event_type_id \n",
+ " nb_tickets \n",
+ " total_amount \n",
+ " nb_suppliers \n",
+ " vente_internet_max \n",
+ " purchase_date_min \n",
+ " purchase_date_max \n",
+ " time_between_purchase \n",
+ " nb_tickets_internet \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 1 \n",
+ " 4 \n",
+ " 453242 \n",
+ " 3248965.5 \n",
+ " 6 \n",
+ " 1 \n",
+ " 2013-09-23 14:45:01+00:00 \n",
+ " 2023-11-03 14:11:01+00:00 \n",
+ " 3692 days 23:26:00 \n",
+ " 2988.0 \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 1 \n",
+ " 2 \n",
+ " 384226 \n",
+ " 2686540.5 \n",
+ " 7 \n",
+ " 1 \n",
+ " 2014-12-03 14:55:37+00:00 \n",
+ " 2023-11-04 15:12:16+00:00 \n",
+ " 3258 days 00:16:39 \n",
+ " 51.0 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 1 \n",
+ " 6 \n",
+ " 217356 \n",
+ " 1435871.5 \n",
+ " 5 \n",
+ " 1 \n",
+ " 2017-01-01 02:20:08+00:00 \n",
+ " 2019-12-31 02:20:06+00:00 \n",
+ " 1093 days 23:59:58 \n",
+ " 5.0 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 1 \n",
+ " 5 \n",
+ " 201750 \n",
+ " 1459190.0 \n",
+ " 6 \n",
+ " 1 \n",
+ " 2013-06-10 10:37:58+00:00 \n",
+ " 2023-11-08 15:59:45+00:00 \n",
+ " 3803 days 05:21:47 \n",
+ " 9.0 \n",
+ " \n",
+ " \n",
+ " 5032 \n",
+ " 6733 \n",
+ " 6 \n",
+ " 14208 \n",
+ " 0.0 \n",
+ " 3 \n",
+ " 1 \n",
+ " 2017-01-11 15:00:54+00:00 \n",
+ " 2019-11-27 09:47:06+00:00 \n",
+ " 1049 days 18:46:12 \n",
+ " 13497.0 \n",
+ " \n",
+ " \n",
+ " 5029 \n",
+ " 6733 \n",
+ " 2 \n",
+ " 11656 \n",
+ " 471.0 \n",
+ " 3 \n",
+ " 1 \n",
+ " 2015-09-09 13:48:38+00:00 \n",
+ " 2022-07-07 07:37:12+00:00 \n",
+ " 2492 days 17:48:34 \n",
+ " 9815.0 \n",
+ " \n",
+ " \n",
+ " 5030 \n",
+ " 6733 \n",
+ " 4 \n",
+ " 7440 \n",
+ " 0.0 \n",
+ " 2 \n",
+ " 1 \n",
+ " 2021-01-06 10:05:01+00:00 \n",
+ " 2022-09-08 14:39:40+00:00 \n",
+ " 610 days 04:34:39 \n",
+ " 7419.0 \n",
+ " \n",
+ " \n",
+ " 60 \n",
+ " 41 \n",
+ " 6 \n",
+ " 6583 \n",
+ " 12546.5 \n",
+ " 4 \n",
+ " 1 \n",
+ " 2017-01-02 11:23:53+00:00 \n",
+ " 2019-12-30 10:36:55+00:00 \n",
+ " 1091 days 23:13:02 \n",
+ " 6391.0 \n",
+ " \n",
+ " \n",
+ " 57 \n",
+ " 41 \n",
+ " 2 \n",
+ " 6514 \n",
+ " 22423.0 \n",
+ " 6 \n",
+ " 1 \n",
+ " 2014-01-23 16:56:57+00:00 \n",
+ " 2023-03-06 13:55:23+00:00 \n",
+ " 3328 days 20:58:26 \n",
+ " 5321.0 \n",
+ " \n",
+ " \n",
+ " 36376 \n",
+ " 63488 \n",
+ " 4 \n",
+ " 5750 \n",
+ " 63250.0 \n",
+ " 1 \n",
+ " 1 \n",
+ " 2021-06-04 12:20:39+00:00 \n",
+ " 2022-08-25 13:08:38+00:00 \n",
+ " 447 days 00:47:59 \n",
+ " 5750.0 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " customer_id event_type_id nb_tickets total_amount nb_suppliers \\\n",
+ "1 1 4 453242 3248965.5 6 \n",
+ "0 1 2 384226 2686540.5 7 \n",
+ "3 1 6 217356 1435871.5 5 \n",
+ "2 1 5 201750 1459190.0 6 \n",
+ "5032 6733 6 14208 0.0 3 \n",
+ "5029 6733 2 11656 471.0 3 \n",
+ "5030 6733 4 7440 0.0 2 \n",
+ "60 41 6 6583 12546.5 4 \n",
+ "57 41 2 6514 22423.0 6 \n",
+ "36376 63488 4 5750 63250.0 1 \n",
+ "\n",
+ " vente_internet_max purchase_date_min \\\n",
+ "1 1 2013-09-23 14:45:01+00:00 \n",
+ "0 1 2014-12-03 14:55:37+00:00 \n",
+ "3 1 2017-01-01 02:20:08+00:00 \n",
+ "2 1 2013-06-10 10:37:58+00:00 \n",
+ "5032 1 2017-01-11 15:00:54+00:00 \n",
+ "5029 1 2015-09-09 13:48:38+00:00 \n",
+ "5030 1 2021-01-06 10:05:01+00:00 \n",
+ "60 1 2017-01-02 11:23:53+00:00 \n",
+ "57 1 2014-01-23 16:56:57+00:00 \n",
+ "36376 1 2021-06-04 12:20:39+00:00 \n",
+ "\n",
+ " purchase_date_max time_between_purchase nb_tickets_internet \n",
+ "1 2023-11-03 14:11:01+00:00 3692 days 23:26:00 2988.0 \n",
+ "0 2023-11-04 15:12:16+00:00 3258 days 00:16:39 51.0 \n",
+ "3 2019-12-31 02:20:06+00:00 1093 days 23:59:58 5.0 \n",
+ "2 2023-11-08 15:59:45+00:00 3803 days 05:21:47 9.0 \n",
+ "5032 2019-11-27 09:47:06+00:00 1049 days 18:46:12 13497.0 \n",
+ "5029 2022-07-07 07:37:12+00:00 2492 days 17:48:34 9815.0 \n",
+ "5030 2022-09-08 14:39:40+00:00 610 days 04:34:39 7419.0 \n",
+ "60 2019-12-30 10:36:55+00:00 1091 days 23:13:02 6391.0 \n",
+ "57 2023-03-06 13:55:23+00:00 3328 days 20:58:26 5321.0 \n",
+ "36376 2022-08-25 13:08:38+00:00 447 days 00:47:59 5750.0 "
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Présence d'outlier\n",
+ "tickets_kpi.sort_values(by = ['nb_tickets'], axis = 0, ascending = False).head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "9b2e27f2-703d-465b-a0f9-76e996de617c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Part du CA par customer\n",
+ "total_amount_share = tickets_kpi.groupby('customer_id')['total_amount'].sum().reset_index()\n",
+ "total_amount_share['total_amount_entreprise'] = total_amount_share['total_amount'].sum()\n",
+ "total_amount_share['share_total_amount'] = total_amount_share['total_amount']/total_amount_share['total_amount_entreprise']\n",
+ "\n",
+ "total_amount_share_index = total_amount_share.set_index('customer_id')\n",
+ "df_circulaire = total_amount_share_index['total_amount'].sort_values(axis = 0, ascending = False)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "36141803-8865-4210-bd39-0a980301fd0c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Customer 1 vs other customers\n",
+ "coupure = 1\n",
+ "\n",
+ "top = df_circulaire[:coupure]\n",
+ "rest = df_circulaire[coupure:]\n",
+ "\n",
+ "# Calculez la somme du reste\n",
+ "rest_sum = rest.sum()\n",
+ "\n",
+ "# Créez une nouvelle série avec les `coupure` plus grandes parts et 'Autre'\n",
+ "new_series = pd.concat([top, pd.Series([rest_sum], index=['Autre'])])\n",
+ "\n",
+ "# Créez le graphique circulaire\n",
+ "plt.figure(figsize=(3, 3))\n",
+ "plt.pie(new_series, labels=new_series.index, autopct='%1.1f%%', startangle=140, pctdistance=0.5)\n",
+ "plt.axis('equal') # Assurez-vous que le graphique est un cercle\n",
+ "plt.title('Répartition des montants totaux')\n",
+ "plt.show()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "94cf1a25-9ded-48f2-b1b2-75225bdaf49d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tickets_kpi_filtered = tickets_kpi[tickets_kpi['customer_id'] != 1]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "31e4e6f1-efc4-410d-b1d3-bb49950ef58e",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/Exploration_billet_AJ.ipynb b/Exploration_billet_AJ.ipynb
index 344dd7b..bec456e 100644
--- a/Exploration_billet_AJ.ipynb
+++ b/Exploration_billet_AJ.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "markdown",
- "id": "56b3d44e-1e3f-4726-9916-0f9af107860e",
+ "id": "5bf5c226",
"metadata": {},
"source": [
"# Business Data Challenge - Team 1"
@@ -11,7 +11,7 @@
{
"cell_type": "code",
"execution_count": 1,
- "id": "15103481-8d74-404c-aa09-7601fe7730da",
+ "id": "b1a5b9d3",
"metadata": {},
"outputs": [],
"source": [
@@ -24,7 +24,7 @@
},
{
"cell_type": "markdown",
- "id": "c3bb0d13-34b2-4e1c-9985-468cd87c5a0e",
+ "id": "ecfa2219",
"metadata": {},
"source": [
"Configuration de l'accès aux données"
@@ -33,7 +33,7 @@
{
"cell_type": "code",
"execution_count": 2,
- "id": "5d83bb1a-d341-446e-91f6-1c428607f6d4",
+ "id": "1a094277",
"metadata": {},
"outputs": [],
"source": [
@@ -44,7 +44,7 @@
},
{
"cell_type": "markdown",
- "id": "f99da24f-0d93-4618-92bc-3ba81dc0445c",
+ "id": "c437eaec",
"metadata": {},
"source": [
"# Exemple sur Company 1"
@@ -52,7 +52,7 @@
},
{
"cell_type": "markdown",
- "id": "9d74b68f-ba07-4a15-9a27-dae931762d70",
+ "id": "a1c1fc39",
"metadata": {},
"source": [
"## Chargement données"
@@ -61,7 +61,7 @@
{
"cell_type": "code",
"execution_count": 3,
- "id": "699664b9-eee4-4f8d-a207-e524526560c5",
+ "id": "66f8c17b",
"metadata": {},
"outputs": [],
"source": [
@@ -69,68 +69,12 @@
"liste_database = fs.ls(BUCKET)"
]
},
- {
- "cell_type": "code",
- "execution_count": 4,
- "id": "aaf64d60-bf92-470c-8210-d09abd6a653e",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['bdc2324-data/1/1campaign_stats.csv',\n",
- " 'bdc2324-data/1/1campaigns.csv',\n",
- " 'bdc2324-data/1/1categories.csv',\n",
- " 'bdc2324-data/1/1countries.csv',\n",
- " 'bdc2324-data/1/1currencies.csv',\n",
- " 'bdc2324-data/1/1customer_target_mappings.csv',\n",
- " 'bdc2324-data/1/1customersplus.csv',\n",
- " 'bdc2324-data/1/1event_types.csv',\n",
- " 'bdc2324-data/1/1events.csv',\n",
- " 'bdc2324-data/1/1facilities.csv',\n",
- " 'bdc2324-data/1/1link_stats.csv',\n",
- " 'bdc2324-data/1/1pricing_formulas.csv',\n",
- " 'bdc2324-data/1/1product_packs.csv',\n",
- " 'bdc2324-data/1/1products.csv',\n",
- " 'bdc2324-data/1/1products_groups.csv',\n",
- " 'bdc2324-data/1/1purchases.csv',\n",
- " 'bdc2324-data/1/1representation_category_capacities.csv',\n",
- " 'bdc2324-data/1/1representations.csv',\n",
- " 'bdc2324-data/1/1seasons.csv',\n",
- " 'bdc2324-data/1/1structure_tag_mappings.csv',\n",
- " 'bdc2324-data/1/1suppliers.csv',\n",
- " 'bdc2324-data/1/1tags.csv',\n",
- " 'bdc2324-data/1/1target_types.csv',\n",
- " 'bdc2324-data/1/1targets.csv',\n",
- " 'bdc2324-data/1/1tickets.csv',\n",
- " 'bdc2324-data/1/1type_of_categories.csv',\n",
- " 'bdc2324-data/1/1type_of_pricing_formulas.csv',\n",
- " 'bdc2324-data/1/1type_ofs.csv']"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "liste_database"
- ]
- },
{
"cell_type": "code",
"execution_count": 5,
- "id": "0cb92854-903b-4efd-ac1b-197e29f044b4",
+ "id": "c08e6798",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "['bdc2324-data/1/1purchases.csv', 'bdc2324-data/1/1suppliers.csv', 'bdc2324-data/1/1tickets.csv', 'bdc2324-data/1/1type_ofs.csv']\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"liste_database_select = ['suppliers', 'ticket', 'purchase', 'consumption', 'type_ofs']\n",
"\n",
@@ -144,15 +88,20 @@
{
"cell_type": "code",
"execution_count": 6,
- "id": "dd6a3518-b752-4a1e-b77b-9e03e853c3ed",
+ "id": "675f518d",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
+ "<<<<<<< local \n",
+ "/tmp/ipykernel_445/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+ " df = pd.read_csv(file_in)\n",
+ "=======\n",
"/tmp/ipykernel_15285/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
- " df = pd.read_csv(file_in)\n"
+ " df = pd.read_csv(file_in)\n",
+ ">>>>>>> remote \n"
]
}
],
@@ -174,8 +123,10 @@
},
{
"cell_type": "markdown",
- "id": "f01e4530-1a61-49cb-a6b0-aa188cf1c0e0",
- "metadata": {},
+ "id": "e855f403",
+ "metadata": {
+ "jp-MarkdownHeadingCollapsed": true
+ },
"source": [
"## customersplus.csv"
]
@@ -183,52 +134,9 @@
{
"cell_type": "code",
"execution_count": 22,
- "id": "a01f993a-0f9f-4aed-bd23-bcdec9041bb3",
+ "id": "91a8f8c4",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "RangeIndex: 151866 entries, 0 to 151865\n",
- "Data columns (total 29 columns):\n",
- " # Column Non-Null Count Dtype \n",
- "--- ------ -------------- ----- \n",
- " 0 id 151866 non-null int64 \n",
- " 1 birthdate 5437 non-null object \n",
- " 2 street_id 151866 non-null int64 \n",
- " 3 civility 0 non-null float64\n",
- " 4 is_partner 151866 non-null bool \n",
- " 5 deleted_at 0 non-null float64\n",
- " 6 gender 151866 non-null int64 \n",
- " 7 is_email_true 151866 non-null bool \n",
- " 8 opt_in 151866 non-null bool \n",
- " 9 structure_id 18114 non-null float64\n",
- " 10 note 906 non-null object \n",
- " 11 profession 6206 non-null object \n",
- " 12 language 1092 non-null object \n",
- " 13 mcp_contact_id 98901 non-null float64\n",
- " 14 last_buying_date 73422 non-null object \n",
- " 15 max_price 73422 non-null float64\n",
- " 16 ticket_sum 151866 non-null int64 \n",
- " 17 average_price 138746 non-null float64\n",
- " 18 fidelity 151866 non-null int64 \n",
- " 19 average_purchase_delay 73422 non-null float64\n",
- " 20 average_price_basket 73422 non-null float64\n",
- " 21 average_ticket_basket 73422 non-null float64\n",
- " 22 total_price 86542 non-null float64\n",
- " 23 purchase_count 151866 non-null int64 \n",
- " 24 first_buying_date 73422 non-null object \n",
- " 25 last_visiting_date 0 non-null float64\n",
- " 26 country 143575 non-null object \n",
- " 27 age 5437 non-null float64\n",
- " 28 tenant_id 151866 non-null int64 \n",
- "dtypes: bool(3), float64(12), int64(7), object(7)\n",
- "memory usage: 30.6+ MB\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"a = pd.DataFrame(df1_customersplus.info())"
]
@@ -236,7 +144,7 @@
{
"cell_type": "code",
"execution_count": 31,
- "id": "45e82fc0-ba17-497b-9818-8be2bdc49d22",
+ "id": "2fda171d",
"metadata": {},
"outputs": [],
"source": [
@@ -265,7 +173,7 @@
{
"cell_type": "code",
"execution_count": 35,
- "id": "d237be96-8c86-4a91-b7a1-487e87a16c3d",
+ "id": "205eeeab",
"metadata": {},
"outputs": [],
"source": [
@@ -290,7 +198,7 @@
{
"cell_type": "code",
"execution_count": 32,
- "id": "4bcdb081-c34f-4d51-b93f-abbb6fa49c5e",
+ "id": "634282c5",
"metadata": {},
"outputs": [],
"source": [
@@ -300,350 +208,9 @@
{
"cell_type": "code",
"execution_count": 33,
- "id": "319c814f-0956-4a92-9c0a-c6b9f53b04b5",
+ "id": "0e8d4133",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " Nom_colonne \n",
- " Type_colonne \n",
- " Taux_NA \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " 0 \n",
- " id \n",
- " int64 \n",
- " 0.000000 \n",
- " \n",
- " \n",
- " 1 \n",
- " lastname \n",
- " object \n",
- " 43.461341 \n",
- " \n",
- " \n",
- " 2 \n",
- " firstname \n",
- " object \n",
- " 44.995588 \n",
- " \n",
- " \n",
- " 3 \n",
- " birthdate \n",
- " object \n",
- " 96.419870 \n",
- " \n",
- " \n",
- " 4 \n",
- " email \n",
- " object \n",
- " 8.622075 \n",
- " \n",
- " \n",
- " 5 \n",
- " street_id \n",
- " int64 \n",
- " 0.000000 \n",
- " \n",
- " \n",
- " 6 \n",
- " created_at \n",
- " object \n",
- " 0.000000 \n",
- " \n",
- " \n",
- " 7 \n",
- " updated_at \n",
- " object \n",
- " 0.000000 \n",
- " \n",
- " \n",
- " 8 \n",
- " civility \n",
- " float64 \n",
- " 100.000000 \n",
- " \n",
- " \n",
- " 9 \n",
- " is_partner \n",
- " bool \n",
- " 0.000000 \n",
- " \n",
- " \n",
- " 10 \n",
- " extra \n",
- " float64 \n",
- " 100.000000 \n",
- " \n",
- " \n",
- " 11 \n",
- " deleted_at \n",
- " float64 \n",
- " 100.000000 \n",
- " \n",
- " \n",
- " 12 \n",
- " reference \n",
- " float64 \n",
- " 100.000000 \n",
- " \n",
- " \n",
- " 13 \n",
- " gender \n",
- " int64 \n",
- " 0.000000 \n",
- " \n",
- " \n",
- " 14 \n",
- " is_email_true \n",
- " bool \n",
- " 0.000000 \n",
- " \n",
- " \n",
- " 15 \n",
- " extra_field \n",
- " float64 \n",
- " 100.000000 \n",
- " \n",
- " \n",
- " 16 \n",
- " identifier \n",
- " object \n",
- " 0.000000 \n",
- " \n",
- " \n",
- " 17 \n",
- " opt_in \n",
- " bool \n",
- " 0.000000 \n",
- " \n",
- " \n",
- " 18 \n",
- " structure_id \n",
- " float64 \n",
- " 88.072380 \n",
- " \n",
- " \n",
- " 19 \n",
- " note \n",
- " object \n",
- " 99.403421 \n",
- " \n",
- " \n",
- " 20 \n",
- " profession \n",
- " object \n",
- " 95.913503 \n",
- " \n",
- " \n",
- " 21 \n",
- " language \n",
- " object \n",
- " 99.280945 \n",
- " \n",
- " \n",
- " 22 \n",
- " mcp_contact_id \n",
- " float64 \n",
- " 34.876141 \n",
- " \n",
- " \n",
- " 23 \n",
- " need_reload \n",
- " bool \n",
- " 0.000000 \n",
- " \n",
- " \n",
- " 24 \n",
- " last_buying_date \n",
- " object \n",
- " 51.653431 \n",
- " \n",
- " \n",
- " 25 \n",
- " max_price \n",
- " float64 \n",
- " 51.653431 \n",
- " \n",
- " \n",
- " 26 \n",
- " ticket_sum \n",
- " int64 \n",
- " 0.000000 \n",
- " \n",
- " \n",
- " 27 \n",
- " average_price \n",
- " float64 \n",
- " 8.639195 \n",
- " \n",
- " \n",
- " 28 \n",
- " fidelity \n",
- " int64 \n",
- " 0.000000 \n",
- " \n",
- " \n",
- " 29 \n",
- " average_purchase_delay \n",
- " float64 \n",
- " 51.653431 \n",
- " \n",
- " \n",
- " 30 \n",
- " average_price_basket \n",
- " float64 \n",
- " 51.653431 \n",
- " \n",
- " \n",
- " 31 \n",
- " average_ticket_basket \n",
- " float64 \n",
- " 51.653431 \n",
- " \n",
- " \n",
- " 32 \n",
- " total_price \n",
- " float64 \n",
- " 43.014236 \n",
- " \n",
- " \n",
- " 33 \n",
- " preferred_category \n",
- " float64 \n",
- " 100.000000 \n",
- " \n",
- " \n",
- " 34 \n",
- " preferred_supplier \n",
- " float64 \n",
- " 100.000000 \n",
- " \n",
- " \n",
- " 35 \n",
- " preferred_formula \n",
- " float64 \n",
- " 100.000000 \n",
- " \n",
- " \n",
- " 36 \n",
- " purchase_count \n",
- " int64 \n",
- " 0.000000 \n",
- " \n",
- " \n",
- " 37 \n",
- " first_buying_date \n",
- " object \n",
- " 51.653431 \n",
- " \n",
- " \n",
- " 38 \n",
- " last_visiting_date \n",
- " float64 \n",
- " 100.000000 \n",
- " \n",
- " \n",
- " 39 \n",
- " zipcode \n",
- " object \n",
- " 71.176564 \n",
- " \n",
- " \n",
- " 40 \n",
- " country \n",
- " object \n",
- " 5.459418 \n",
- " \n",
- " \n",
- " 41 \n",
- " age \n",
- " float64 \n",
- " 96.419870 \n",
- " \n",
- " \n",
- " 42 \n",
- " tenant_id \n",
- " int64 \n",
- " 0.000000 \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " Nom_colonne Type_colonne Taux_NA\n",
- "0 id int64 0.000000\n",
- "1 lastname object 43.461341\n",
- "2 firstname object 44.995588\n",
- "3 birthdate object 96.419870\n",
- "4 email object 8.622075\n",
- "5 street_id int64 0.000000\n",
- "6 created_at object 0.000000\n",
- "7 updated_at object 0.000000\n",
- "8 civility float64 100.000000\n",
- "9 is_partner bool 0.000000\n",
- "10 extra float64 100.000000\n",
- "11 deleted_at float64 100.000000\n",
- "12 reference float64 100.000000\n",
- "13 gender int64 0.000000\n",
- "14 is_email_true bool 0.000000\n",
- "15 extra_field float64 100.000000\n",
- "16 identifier object 0.000000\n",
- "17 opt_in bool 0.000000\n",
- "18 structure_id float64 88.072380\n",
- "19 note object 99.403421\n",
- "20 profession object 95.913503\n",
- "21 language object 99.280945\n",
- "22 mcp_contact_id float64 34.876141\n",
- "23 need_reload bool 0.000000\n",
- "24 last_buying_date object 51.653431\n",
- "25 max_price float64 51.653431\n",
- "26 ticket_sum int64 0.000000\n",
- "27 average_price float64 8.639195\n",
- "28 fidelity int64 0.000000\n",
- "29 average_purchase_delay float64 51.653431\n",
- "30 average_price_basket float64 51.653431\n",
- "31 average_ticket_basket float64 51.653431\n",
- "32 total_price float64 43.014236\n",
- "33 preferred_category float64 100.000000\n",
- "34 preferred_supplier float64 100.000000\n",
- "35 preferred_formula float64 100.000000\n",
- "36 purchase_count int64 0.000000\n",
- "37 first_buying_date object 51.653431\n",
- "38 last_visiting_date float64 100.000000\n",
- "39 zipcode object 71.176564\n",
- "40 country object 5.459418\n",
- "41 age float64 96.419870\n",
- "42 tenant_id int64 0.000000"
- ]
- },
- "execution_count": 33,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"a"
]
@@ -651,7 +218,7 @@
{
"cell_type": "code",
"execution_count": 16,
- "id": "e54a1170-2b10-4b22-8241-e7f5ec3fce75",
+ "id": "1268ad5a",
"metadata": {},
"outputs": [],
"source": [
@@ -661,216 +228,9 @@
{
"cell_type": "code",
"execution_count": 40,
- "id": "5c997ff6-251b-4e7f-8946-a8b722f5e97f",
+ "id": "bd41dc80",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " customer_id \n",
- " birthdate \n",
- " street_id \n",
- " is_partner \n",
- " gender \n",
- " is_email_true \n",
- " opt_in \n",
- " structure_id \n",
- " note \n",
- " profession \n",
- " ... \n",
- " fidelity \n",
- " average_purchase_delay \n",
- " average_price_basket \n",
- " average_ticket_basket \n",
- " total_price \n",
- " purchase_count \n",
- " first_buying_date \n",
- " country \n",
- " age \n",
- " tenant_id \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " 0 \n",
- " 12751 \n",
- " NaN \n",
- " 2 \n",
- " False \n",
- " 1 \n",
- " True \n",
- " True \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " ... \n",
- " 0 \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " 0 \n",
- " NaT \n",
- " fr \n",
- " NaN \n",
- " 1311 \n",
- " \n",
- " \n",
- " 1 \n",
- " 12825 \n",
- " NaN \n",
- " 2 \n",
- " False \n",
- " 2 \n",
- " True \n",
- " True \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " ... \n",
- " 0 \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " 0 \n",
- " NaT \n",
- " fr \n",
- " NaN \n",
- " 1311 \n",
- " \n",
- " \n",
- " 2 \n",
- " 11261 \n",
- " NaN \n",
- " 2 \n",
- " False \n",
- " 1 \n",
- " True \n",
- " True \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " ... \n",
- " 0 \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " 0 \n",
- " NaT \n",
- " fr \n",
- " NaN \n",
- " 1311 \n",
- " \n",
- " \n",
- " 3 \n",
- " 13071 \n",
- " NaN \n",
- " 2 \n",
- " False \n",
- " 2 \n",
- " True \n",
- " True \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " ... \n",
- " 0 \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " 0 \n",
- " NaT \n",
- " fr \n",
- " NaN \n",
- " 1311 \n",
- " \n",
- " \n",
- " 4 \n",
- " 653061 \n",
- " NaN \n",
- " 10 \n",
- " False \n",
- " 2 \n",
- " True \n",
- " False \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " ... \n",
- " 0 \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " NaN \n",
- " 0 \n",
- " NaT \n",
- " NaN \n",
- " NaN \n",
- " 1311 \n",
- " \n",
- " \n",
- "
\n",
- "
5 rows × 26 columns
\n",
- "
"
- ],
- "text/plain": [
- " customer_id birthdate street_id is_partner gender is_email_true \\\n",
- "0 12751 NaN 2 False 1 True \n",
- "1 12825 NaN 2 False 2 True \n",
- "2 11261 NaN 2 False 1 True \n",
- "3 13071 NaN 2 False 2 True \n",
- "4 653061 NaN 10 False 2 True \n",
- "\n",
- " opt_in structure_id note profession ... fidelity average_purchase_delay \\\n",
- "0 True NaN NaN NaN ... 0 NaN \n",
- "1 True NaN NaN NaN ... 0 NaN \n",
- "2 True NaN NaN NaN ... 0 NaN \n",
- "3 True NaN NaN NaN ... 0 NaN \n",
- "4 False NaN NaN NaN ... 0 NaN \n",
- "\n",
- " average_price_basket average_ticket_basket total_price purchase_count \\\n",
- "0 NaN NaN NaN 0 \n",
- "1 NaN NaN NaN 0 \n",
- "2 NaN NaN NaN 0 \n",
- "3 NaN NaN NaN 0 \n",
- "4 NaN NaN NaN 0 \n",
- "\n",
- " first_buying_date country age tenant_id \n",
- "0 NaT fr NaN 1311 \n",
- "1 NaT fr NaN 1311 \n",
- "2 NaT fr NaN 1311 \n",
- "3 NaT fr NaN 1311 \n",
- "4 NaT NaN NaN 1311 \n",
- "\n",
- "[5 rows x 26 columns]"
- ]
- },
- "execution_count": 40,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"# Selection des variables\n",
"df1_customersplus_clean = df1_customersplus.copy()\n",
@@ -885,7 +245,7 @@
},
{
"cell_type": "markdown",
- "id": "e908f516-2a74-45d6-8492-7dcdc3afbe1f",
+ "id": "64d0f76b",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
@@ -896,264 +256,9 @@
{
"cell_type": "code",
"execution_count": 6,
- "id": "14f4158e-c9c0-4beb-826a-5e0f949434a4",
+ "id": "7e683711",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " id \n",
- " number \n",
- " created_at \n",
- " updated_at \n",
- " purchase_id \n",
- " product_id \n",
- " is_from_subscription \n",
- " type_of \n",
- " supplier_id \n",
- " barcode \n",
- " identifier \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " 0 \n",
- " 13070859 \n",
- " 13593002661288 \n",
- " 2021-12-28 20:47:10.320641+01:00 \n",
- " 2022-02-14 18:46:53.614229+01:00 \n",
- " 5107462 \n",
- " 225251 \n",
- " False \n",
- " 1 \n",
- " 3 \n",
- " NaN \n",
- " b6ad7fc36f33b5e05f58c7fca06688a6 \n",
- " \n",
- " \n",
- " 1 \n",
- " 13070860 \n",
- " 13593002661399 \n",
- " 2021-12-28 20:47:10.321037+01:00 \n",
- " 2022-02-14 18:46:53.614761+01:00 \n",
- " 5107462 \n",
- " 224914 \n",
- " False \n",
- " 1 \n",
- " 3 \n",
- " NaN \n",
- " b0903af480266f27802fe5c38c277c9e \n",
- " \n",
- " \n",
- " 2 \n",
- " 13070861 \n",
- " 13593002661419 \n",
- " 2021-12-28 20:47:10.321629+01:00 \n",
- " 2022-02-14 18:46:53.615521+01:00 \n",
- " 5107462 \n",
- " 224914 \n",
- " False \n",
- " 1 \n",
- " 3 \n",
- " NaN \n",
- " 64ca12b7e26a65b90335c0702ea0faba \n",
- " \n",
- " \n",
- " 3 \n",
- " 13070862 \n",
- " 13593002661508 \n",
- " 2021-12-28 20:47:10.322029+01:00 \n",
- " 2022-02-14 18:46:53.616000+01:00 \n",
- " 5107462 \n",
- " 224914 \n",
- " False \n",
- " 1 \n",
- " 3 \n",
- " NaN \n",
- " 5ac2f8150aa9f3a6b1599df08cc2f0c7 \n",
- " \n",
- " \n",
- " 4 \n",
- " 13070863 \n",
- " 13593002661689 \n",
- " 2021-12-28 20:47:10.322449+01:00 \n",
- " 2022-02-14 18:46:53.616447+01:00 \n",
- " 5107462 \n",
- " 224914 \n",
- " False \n",
- " 1 \n",
- " 3 \n",
- " NaN \n",
- " dfe30081bae020d12094279926136b9c \n",
- " \n",
- " \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " \n",
- " \n",
- " 1826667 \n",
- " 20662815 \n",
- " 13593016154390 \n",
- " 2023-11-09 07:51:34.935983+01:00 \n",
- " 2023-11-09 07:51:34.935983+01:00 \n",
- " 8007697 \n",
- " 405689 \n",
- " False \n",
- " 1 \n",
- " 3 \n",
- " NaN \n",
- " dba9aa428f843b79ae69dfacfe8fc579 \n",
- " \n",
- " \n",
- " 1826668 \n",
- " 20662816 \n",
- " 13593016154501 \n",
- " 2023-11-09 07:51:34.937038+01:00 \n",
- " 2023-11-09 07:51:34.937038+01:00 \n",
- " 8007698 \n",
- " 403658 \n",
- " False \n",
- " 1 \n",
- " 3 \n",
- " NaN \n",
- " 93f1fcfc6ba4fa68f92eb4b4a619fcf0 \n",
- " \n",
- " \n",
- " 1826669 \n",
- " 20662817 \n",
- " 13593016154680 \n",
- " 2023-11-09 07:51:34.938224+01:00 \n",
- " 2023-11-09 07:51:34.938224+01:00 \n",
- " 8007698 \n",
- " 403658 \n",
- " False \n",
- " 1 \n",
- " 3 \n",
- " NaN \n",
- " c8bbbd25df2c158767ceef42c3237f23 \n",
- " \n",
- " \n",
- " 1826670 \n",
- " 20662818 \n",
- " 13593016154899 \n",
- " 2023-11-09 07:51:34.939328+01:00 \n",
- " 2023-11-09 07:51:34.939328+01:00 \n",
- " 8007699 \n",
- " 403658 \n",
- " False \n",
- " 1 \n",
- " 3 \n",
- " NaN \n",
- " 738f0a8b5088b5056bc3b32eff2dca1f \n",
- " \n",
- " \n",
- " 1826671 \n",
- " 20662819 \n",
- " 13593016154988 \n",
- " 2023-11-09 07:51:34.940680+01:00 \n",
- " 2023-11-09 07:51:34.940680+01:00 \n",
- " 8007699 \n",
- " 403658 \n",
- " False \n",
- " 1 \n",
- " 3 \n",
- " NaN \n",
- " 4c5a6195434377380b4e6ae63b2e9cf6 \n",
- " \n",
- " \n",
- "
\n",
- "
1826672 rows × 11 columns
\n",
- "
"
- ],
- "text/plain": [
- " id number created_at \\\n",
- "0 13070859 13593002661288 2021-12-28 20:47:10.320641+01:00 \n",
- "1 13070860 13593002661399 2021-12-28 20:47:10.321037+01:00 \n",
- "2 13070861 13593002661419 2021-12-28 20:47:10.321629+01:00 \n",
- "3 13070862 13593002661508 2021-12-28 20:47:10.322029+01:00 \n",
- "4 13070863 13593002661689 2021-12-28 20:47:10.322449+01:00 \n",
- "... ... ... ... \n",
- "1826667 20662815 13593016154390 2023-11-09 07:51:34.935983+01:00 \n",
- "1826668 20662816 13593016154501 2023-11-09 07:51:34.937038+01:00 \n",
- "1826669 20662817 13593016154680 2023-11-09 07:51:34.938224+01:00 \n",
- "1826670 20662818 13593016154899 2023-11-09 07:51:34.939328+01:00 \n",
- "1826671 20662819 13593016154988 2023-11-09 07:51:34.940680+01:00 \n",
- "\n",
- " updated_at purchase_id product_id \\\n",
- "0 2022-02-14 18:46:53.614229+01:00 5107462 225251 \n",
- "1 2022-02-14 18:46:53.614761+01:00 5107462 224914 \n",
- "2 2022-02-14 18:46:53.615521+01:00 5107462 224914 \n",
- "3 2022-02-14 18:46:53.616000+01:00 5107462 224914 \n",
- "4 2022-02-14 18:46:53.616447+01:00 5107462 224914 \n",
- "... ... ... ... \n",
- "1826667 2023-11-09 07:51:34.935983+01:00 8007697 405689 \n",
- "1826668 2023-11-09 07:51:34.937038+01:00 8007698 403658 \n",
- "1826669 2023-11-09 07:51:34.938224+01:00 8007698 403658 \n",
- "1826670 2023-11-09 07:51:34.939328+01:00 8007699 403658 \n",
- "1826671 2023-11-09 07:51:34.940680+01:00 8007699 403658 \n",
- "\n",
- " is_from_subscription type_of supplier_id barcode \\\n",
- "0 False 1 3 NaN \n",
- "1 False 1 3 NaN \n",
- "2 False 1 3 NaN \n",
- "3 False 1 3 NaN \n",
- "4 False 1 3 NaN \n",
- "... ... ... ... ... \n",
- "1826667 False 1 3 NaN \n",
- "1826668 False 1 3 NaN \n",
- "1826669 False 1 3 NaN \n",
- "1826670 False 1 3 NaN \n",
- "1826671 False 1 3 NaN \n",
- "\n",
- " identifier \n",
- "0 b6ad7fc36f33b5e05f58c7fca06688a6 \n",
- "1 b0903af480266f27802fe5c38c277c9e \n",
- "2 64ca12b7e26a65b90335c0702ea0faba \n",
- "3 5ac2f8150aa9f3a6b1599df08cc2f0c7 \n",
- "4 dfe30081bae020d12094279926136b9c \n",
- "... ... \n",
- "1826667 dba9aa428f843b79ae69dfacfe8fc579 \n",
- "1826668 93f1fcfc6ba4fa68f92eb4b4a619fcf0 \n",
- "1826669 c8bbbd25df2c158767ceef42c3237f23 \n",
- "1826670 738f0a8b5088b5056bc3b32eff2dca1f \n",
- "1826671 4c5a6195434377380b4e6ae63b2e9cf6 \n",
- "\n",
- "[1826672 rows x 11 columns]"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"df1_tickets"
]
@@ -1161,34 +266,9 @@
{
"cell_type": "code",
"execution_count": 7,
- "id": "f3c35394-b586-4ae4-b5ab-b03bb01bb618",
+ "id": "e7b9a52e",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "RangeIndex: 1826672 entries, 0 to 1826671\n",
- "Data columns (total 11 columns):\n",
- " # Column Dtype \n",
- "--- ------ ----- \n",
- " 0 id int64 \n",
- " 1 number object \n",
- " 2 created_at object \n",
- " 3 updated_at object \n",
- " 4 purchase_id int64 \n",
- " 5 product_id int64 \n",
- " 6 is_from_subscription bool \n",
- " 7 type_of int64 \n",
- " 8 supplier_id int64 \n",
- " 9 barcode float64\n",
- " 10 identifier object \n",
- "dtypes: bool(1), float64(1), int64(5), object(4)\n",
- "memory usage: 141.1+ MB\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"df1_tickets.info()"
]
@@ -1196,31 +276,9 @@
{
"cell_type": "code",
"execution_count": 8,
- "id": "c1b42769-03c7-4785-92ce-5e1e6b41908d",
+ "id": "568280e8",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "id 0.0\n",
- "number 0.0\n",
- "created_at 0.0\n",
- "updated_at 0.0\n",
- "purchase_id 0.0\n",
- "product_id 0.0\n",
- "is_from_subscription 0.0\n",
- "type_of 0.0\n",
- "supplier_id 0.0\n",
- "barcode 100.0\n",
- "identifier 0.0\n",
- "dtype: float64"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"df1_tickets.isna().sum()/len(df1_tickets)*100"
]
@@ -1228,21 +286,9 @@
{
"cell_type": "code",
"execution_count": 9,
- "id": "42896791-2d93-4725-a50b-6c7cbe535ec7",
+ "id": "29ecec90",
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/tmp/ipykernel_619/232847087.py:3: SettingWithCopyWarning: \n",
- "A value is trying to be set on a copy of a slice from a DataFrame\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
- " df1_tickets_clean.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# Selection des variables\n",
"df1_tickets_clean = df1_tickets.drop(['lastname', 'firstname', 'email', 'created_at', 'updated_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode'], axis = 1, inplace=True)\n",
@@ -1251,8 +297,10 @@
},
{
"cell_type": "markdown",
- "id": "78453f3c-4f89-44ed-a6c6-2a7443b72b52",
- "metadata": {},
+ "id": "22bb5de4",
+ "metadata": {
+ "jp-MarkdownHeadingCollapsed": true
+ },
"source": [
"## suppliers.csv"
]
@@ -1260,194 +308,9 @@
{
"cell_type": "code",
"execution_count": 10,
- "id": "2e0dada0-9457-484c-aa55-77e44613ecca",
+ "id": "6a9a91f4",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " id \n",
- " name \n",
- " manually_added \n",
- " label \n",
- " itr \n",
- " updated_at \n",
- " created_at \n",
- " commission \n",
- " identifier \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " 0 \n",
- " 1617 \n",
- " j4 administration \n",
- " False \n",
- " NaN \n",
- " NaN \n",
- " 2021-07-29 09:21:37.325772+02:00 \n",
- " 2021-07-29 09:21:37.325772+02:00 \n",
- " NaN \n",
- " 5958b2a060ac3e31678b438892a1bd2e \n",
- " \n",
- " \n",
- " 1 \n",
- " 8 \n",
- " non défini \n",
- " False \n",
- " NaN \n",
- " NaN \n",
- " 2020-09-03 13:16:35.329062+02:00 \n",
- " 2020-09-03 13:16:35.329062+02:00 \n",
- " NaN \n",
- " 52ff3466787b4d538407372e5f7afe0f \n",
- " \n",
- " \n",
- " 2 \n",
- " 4 \n",
- " vad \n",
- " False \n",
- " NaN \n",
- " NaN \n",
- " 2020-09-03 13:11:23.896992+02:00 \n",
- " 2020-09-03 13:11:23.896992+02:00 \n",
- " NaN \n",
- " 1225483c97b36018cab2bea14ab78ea6 \n",
- " \n",
- " \n",
- " 3 \n",
- " 1 \n",
- " fort saint jean \n",
- " False \n",
- " NaN \n",
- " NaN \n",
- " 2020-09-03 13:11:23.833073+02:00 \n",
- " 2020-09-03 13:11:23.833073+02:00 \n",
- " NaN \n",
- " 001b9b4a524fe407150b8235b304d4ec \n",
- " \n",
- " \n",
- " 4 \n",
- " 2 \n",
- " j4 \n",
- " False \n",
- " NaN \n",
- " NaN \n",
- " 2020-09-03 13:11:23.888993+02:00 \n",
- " 2020-09-03 13:11:23.888993+02:00 \n",
- " NaN \n",
- " 6a0cf6edf20060344b465706b61719aa \n",
- " \n",
- " \n",
- " 5 \n",
- " 5 \n",
- " revendeur \n",
- " False \n",
- " NaN \n",
- " NaN \n",
- " 2020-09-03 13:11:23.900987+02:00 \n",
- " 2020-09-03 13:11:23.900987+02:00 \n",
- " NaN \n",
- " 931239d4acb6214d7e5c98edecfb4916 \n",
- " \n",
- " \n",
- " 6 \n",
- " 3 \n",
- " vente en ligne \n",
- " False \n",
- " NaN \n",
- " NaN \n",
- " 2020-09-03 13:11:23.893097+02:00 \n",
- " 2020-09-03 13:11:23.893097+02:00 \n",
- " NaN \n",
- " bde8f2ccff510df8572d3214d86b837d \n",
- " \n",
- " \n",
- " 7 \n",
- " 6 \n",
- " ccr \n",
- " False \n",
- " NaN \n",
- " NaN \n",
- " 2020-09-03 13:11:23.904974+02:00 \n",
- " 2020-09-03 13:11:23.904974+02:00 \n",
- " NaN \n",
- " b48ec279411f7dbbb68393c61a9724d9 \n",
- " \n",
- " \n",
- " 8 \n",
- " 7 \n",
- " dab \n",
- " False \n",
- " NaN \n",
- " NaN \n",
- " 2020-09-03 13:11:23.908970+02:00 \n",
- " 2020-09-03 13:11:23.908970+02:00 \n",
- " NaN \n",
- " 11c6d471fa4e354e62e684d293694202 \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " id name manually_added label itr \\\n",
- "0 1617 j4 administration False NaN NaN \n",
- "1 8 non défini False NaN NaN \n",
- "2 4 vad False NaN NaN \n",
- "3 1 fort saint jean False NaN NaN \n",
- "4 2 j4 False NaN NaN \n",
- "5 5 revendeur False NaN NaN \n",
- "6 3 vente en ligne False NaN NaN \n",
- "7 6 ccr False NaN NaN \n",
- "8 7 dab False NaN NaN \n",
- "\n",
- " updated_at created_at \\\n",
- "0 2021-07-29 09:21:37.325772+02:00 2021-07-29 09:21:37.325772+02:00 \n",
- "1 2020-09-03 13:16:35.329062+02:00 2020-09-03 13:16:35.329062+02:00 \n",
- "2 2020-09-03 13:11:23.896992+02:00 2020-09-03 13:11:23.896992+02:00 \n",
- "3 2020-09-03 13:11:23.833073+02:00 2020-09-03 13:11:23.833073+02:00 \n",
- "4 2020-09-03 13:11:23.888993+02:00 2020-09-03 13:11:23.888993+02:00 \n",
- "5 2020-09-03 13:11:23.900987+02:00 2020-09-03 13:11:23.900987+02:00 \n",
- "6 2020-09-03 13:11:23.893097+02:00 2020-09-03 13:11:23.893097+02:00 \n",
- "7 2020-09-03 13:11:23.904974+02:00 2020-09-03 13:11:23.904974+02:00 \n",
- "8 2020-09-03 13:11:23.908970+02:00 2020-09-03 13:11:23.908970+02:00 \n",
- "\n",
- " commission identifier \n",
- "0 NaN 5958b2a060ac3e31678b438892a1bd2e \n",
- "1 NaN 52ff3466787b4d538407372e5f7afe0f \n",
- "2 NaN 1225483c97b36018cab2bea14ab78ea6 \n",
- "3 NaN 001b9b4a524fe407150b8235b304d4ec \n",
- "4 NaN 6a0cf6edf20060344b465706b61719aa \n",
- "5 NaN 931239d4acb6214d7e5c98edecfb4916 \n",
- "6 NaN bde8f2ccff510df8572d3214d86b837d \n",
- "7 NaN b48ec279411f7dbbb68393c61a9724d9 \n",
- "8 NaN 11c6d471fa4e354e62e684d293694202 "
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"df1_suppliers"
]
@@ -1455,32 +318,9 @@
{
"cell_type": "code",
"execution_count": 11,
- "id": "b583be02-ab60-4e14-9325-0204f203a1af",
+ "id": "bab4758a",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "RangeIndex: 9 entries, 0 to 8\n",
- "Data columns (total 9 columns):\n",
- " # Column Non-Null Count Dtype \n",
- "--- ------ -------------- ----- \n",
- " 0 id 9 non-null int64 \n",
- " 1 name 9 non-null object \n",
- " 2 manually_added 9 non-null bool \n",
- " 3 label 0 non-null float64\n",
- " 4 itr 0 non-null float64\n",
- " 5 updated_at 9 non-null object \n",
- " 6 created_at 9 non-null object \n",
- " 7 commission 0 non-null float64\n",
- " 8 identifier 9 non-null object \n",
- "dtypes: bool(1), float64(3), int64(1), object(4)\n",
- "memory usage: 713.0+ bytes\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"df1_suppliers.info()"
]
@@ -1488,29 +328,9 @@
{
"cell_type": "code",
"execution_count": 12,
- "id": "6d7f338e-e4d3-422b-9cdc-dec967c0b28e",
+ "id": "b5fff251",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "id 0.0\n",
- "name 0.0\n",
- "manually_added 0.0\n",
- "label 100.0\n",
- "itr 100.0\n",
- "updated_at 0.0\n",
- "created_at 0.0\n",
- "commission 100.0\n",
- "identifier 0.0\n",
- "dtype: float64"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"df1_suppliers.isna().sum()/len(df1_suppliers)*100"
]
@@ -1518,21 +338,9 @@
{
"cell_type": "code",
"execution_count": 13,
- "id": "3c645ab7-16bf-4054-9ae2-15a8c32e29c6",
+ "id": "8b09e2a3",
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/tmp/ipykernel_619/302783287.py:3: SettingWithCopyWarning: \n",
- "A value is trying to be set on a copy of a slice from a DataFrame\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
- " df1_suppliers_clean.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# Selection des variables\n",
"df1_suppliers_clean = df1_suppliers[['id', 'name']]\n",
@@ -1542,109 +350,16 @@
{
"cell_type": "code",
"execution_count": 14,
- "id": "4de7e2e2-6da4-4618-8444-b524399c5493",
+ "id": "ecee7cdc",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " id \n",
- " supplier_name \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " 0 \n",
- " 1617 \n",
- " j4 administration \n",
- " \n",
- " \n",
- " 1 \n",
- " 8 \n",
- " non défini \n",
- " \n",
- " \n",
- " 2 \n",
- " 4 \n",
- " vad \n",
- " \n",
- " \n",
- " 3 \n",
- " 1 \n",
- " fort saint jean \n",
- " \n",
- " \n",
- " 4 \n",
- " 2 \n",
- " j4 \n",
- " \n",
- " \n",
- " 5 \n",
- " 5 \n",
- " revendeur \n",
- " \n",
- " \n",
- " 6 \n",
- " 3 \n",
- " vente en ligne \n",
- " \n",
- " \n",
- " 7 \n",
- " 6 \n",
- " ccr \n",
- " \n",
- " \n",
- " 8 \n",
- " 7 \n",
- " dab \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " id supplier_name\n",
- "0 1617 j4 administration\n",
- "1 8 non défini\n",
- "2 4 vad\n",
- "3 1 fort saint jean\n",
- "4 2 j4\n",
- "5 5 revendeur\n",
- "6 3 vente en ligne\n",
- "7 6 ccr\n",
- "8 7 dab"
- ]
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"df1_suppliers_clean"
]
},
{
"cell_type": "markdown",
- "id": "0a6df975-c7fc-45bc-92af-a0bdab17d795",
+ "id": "c8e6e69b",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
@@ -1655,186 +370,9 @@
{
"cell_type": "code",
"execution_count": 15,
- "id": "a02f6594-3e91-4e87-bbb6-649c28d4f7e9",
+ "id": "1a6cff1f",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " id \n",
- " name \n",
- " children \n",
- " created_at \n",
- " updated_at \n",
- " identifier \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " 0 \n",
- " 1 \n",
- " Atelier \n",
- " pricing_formula \n",
- " 2021-01-05 11:55:51.188106+01:00 \n",
- " 2021-01-05 11:55:51.188106+01:00 \n",
- " 623ec4067827558b28972cf39fe81ee7 \n",
- " \n",
- " \n",
- " 1 \n",
- " 2 \n",
- " Billet en nombre \n",
- " pricing_formula \n",
- " 2021-01-11 12:13:19.286301+01:00 \n",
- " 2021-01-11 12:13:19.286301+01:00 \n",
- " a53d313a97296ee37caa066dbfe7a45c \n",
- " \n",
- " \n",
- " 2 \n",
- " 3 \n",
- " Groupe \n",
- " pricing_formula \n",
- " 2021-01-11 12:19:22.842917+01:00 \n",
- " 2021-01-11 12:19:22.842917+01:00 \n",
- " 1ab143efc3b85acbbc752fe8eb2b0b86 \n",
- " \n",
- " \n",
- " 3 \n",
- " 4 \n",
- " Revendeur \n",
- " pricing_formula \n",
- " 2021-01-12 12:34:20.481236+01:00 \n",
- " 2021-01-12 12:34:20.481236+01:00 \n",
- " 8b332723366a07e1eef5f1c92f9ae067 \n",
- " \n",
- " \n",
- " 4 \n",
- " 5 \n",
- " Cinéma scolaire \n",
- " pricing_formula \n",
- " 2021-01-25 19:16:05.141719+01:00 \n",
- " 2021-01-25 19:16:05.141719+01:00 \n",
- " a12e62cb4c4f47e7406bd8fbff2bfe30 \n",
- " \n",
- " \n",
- " 5 \n",
- " 6 \n",
- " Musée famille \n",
- " pricing_formula \n",
- " 2021-01-25 19:23:06.692627+01:00 \n",
- " 2021-01-25 19:23:06.692627+01:00 \n",
- " 1ec6c19283111ccb3ed67f52d414470e \n",
- " \n",
- " \n",
- " 6 \n",
- " 7 \n",
- " Spectacle famille \n",
- " pricing_formula \n",
- " 2021-01-25 19:28:21.390016+01:00 \n",
- " 2021-01-25 19:28:21.390016+01:00 \n",
- " 05e2104f1b74ced229c06847d6e91938 \n",
- " \n",
- " \n",
- " 7 \n",
- " 8 \n",
- " Masterclass \n",
- " pricing_formula \n",
- " 2021-01-25 19:31:05.076904+01:00 \n",
- " 2021-01-25 19:31:05.076904+01:00 \n",
- " 9cc946edfb25e11b4282f58db16e6ae9 \n",
- " \n",
- " \n",
- " 8 \n",
- " 9 \n",
- " Spectacle \n",
- " pricing_formula \n",
- " 2021-01-25 19:38:41.260535+01:00 \n",
- " 2021-01-25 19:38:41.260535+01:00 \n",
- " d88321c347f0e0ab101184cdf25c94bf \n",
- " \n",
- " \n",
- " 9 \n",
- " 10 \n",
- " Cinema \n",
- " pricing_formula \n",
- " 2021-02-05 11:12:31.932576+01:00 \n",
- " 2021-02-05 11:12:31.932576+01:00 \n",
- " 0870fef2bfcd5b30a12e4f5c7f4aaba7 \n",
- " \n",
- " \n",
- " 10 \n",
- " 11 \n",
- " Musee \n",
- " pricing_formula \n",
- " 2021-02-05 11:52:05.468207+01:00 \n",
- " 2021-02-05 11:52:05.468207+01:00 \n",
- " 8ba8934454cc62c7cdb3eb6e1b39df0c \n",
- " \n",
- " \n",
- " 11 \n",
- " 12 \n",
- " Tarifs plein \n",
- " category \n",
- " 2023-03-13 11:31:50.528331+01:00 \n",
- " 2023-03-13 11:31:50.528331+01:00 \n",
- " a6969df76efc15d157be48e87a7bcf9a \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " id name children created_at \\\n",
- "0 1 Atelier pricing_formula 2021-01-05 11:55:51.188106+01:00 \n",
- "1 2 Billet en nombre pricing_formula 2021-01-11 12:13:19.286301+01:00 \n",
- "2 3 Groupe pricing_formula 2021-01-11 12:19:22.842917+01:00 \n",
- "3 4 Revendeur pricing_formula 2021-01-12 12:34:20.481236+01:00 \n",
- "4 5 Cinéma scolaire pricing_formula 2021-01-25 19:16:05.141719+01:00 \n",
- "5 6 Musée famille pricing_formula 2021-01-25 19:23:06.692627+01:00 \n",
- "6 7 Spectacle famille pricing_formula 2021-01-25 19:28:21.390016+01:00 \n",
- "7 8 Masterclass pricing_formula 2021-01-25 19:31:05.076904+01:00 \n",
- "8 9 Spectacle pricing_formula 2021-01-25 19:38:41.260535+01:00 \n",
- "9 10 Cinema pricing_formula 2021-02-05 11:12:31.932576+01:00 \n",
- "10 11 Musee pricing_formula 2021-02-05 11:52:05.468207+01:00 \n",
- "11 12 Tarifs plein category 2023-03-13 11:31:50.528331+01:00 \n",
- "\n",
- " updated_at identifier \n",
- "0 2021-01-05 11:55:51.188106+01:00 623ec4067827558b28972cf39fe81ee7 \n",
- "1 2021-01-11 12:13:19.286301+01:00 a53d313a97296ee37caa066dbfe7a45c \n",
- "2 2021-01-11 12:19:22.842917+01:00 1ab143efc3b85acbbc752fe8eb2b0b86 \n",
- "3 2021-01-12 12:34:20.481236+01:00 8b332723366a07e1eef5f1c92f9ae067 \n",
- "4 2021-01-25 19:16:05.141719+01:00 a12e62cb4c4f47e7406bd8fbff2bfe30 \n",
- "5 2021-01-25 19:23:06.692627+01:00 1ec6c19283111ccb3ed67f52d414470e \n",
- "6 2021-01-25 19:28:21.390016+01:00 05e2104f1b74ced229c06847d6e91938 \n",
- "7 2021-01-25 19:31:05.076904+01:00 9cc946edfb25e11b4282f58db16e6ae9 \n",
- "8 2021-01-25 19:38:41.260535+01:00 d88321c347f0e0ab101184cdf25c94bf \n",
- "9 2021-02-05 11:12:31.932576+01:00 0870fef2bfcd5b30a12e4f5c7f4aaba7 \n",
- "10 2021-02-05 11:52:05.468207+01:00 8ba8934454cc62c7cdb3eb6e1b39df0c \n",
- "11 2023-03-13 11:31:50.528331+01:00 a6969df76efc15d157be48e87a7bcf9a "
- ]
- },
- "execution_count": 15,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"df1_type_ofs"
]
@@ -1842,29 +380,9 @@
{
"cell_type": "code",
"execution_count": 16,
- "id": "e9c8d32b-22f4-4581-8af7-31cc1c31fa0e",
+ "id": "93630b41",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "RangeIndex: 12 entries, 0 to 11\n",
- "Data columns (total 6 columns):\n",
- " # Column Non-Null Count Dtype \n",
- "--- ------ -------------- ----- \n",
- " 0 id 12 non-null int64 \n",
- " 1 name 12 non-null object\n",
- " 2 children 12 non-null object\n",
- " 3 created_at 12 non-null object\n",
- " 4 updated_at 12 non-null object\n",
- " 5 identifier 12 non-null object\n",
- "dtypes: int64(1), object(5)\n",
- "memory usage: 704.0+ bytes\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"df1_type_ofs.info()"
]
@@ -1872,21 +390,9 @@
{
"cell_type": "code",
"execution_count": 17,
- "id": "cbb5e614-1fe5-4da0-bca0-8a242e0885da",
+ "id": "4f94481a",
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/tmp/ipykernel_619/81842251.py:3: SettingWithCopyWarning: \n",
- "A value is trying to be set on a copy of a slice from a DataFrame\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
- " df1_type_ofs_clean.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# Selection des variables\n",
"df1_type_ofs_clean = df1_type_ofs[['id', 'name', 'children']]\n",
@@ -1895,7 +401,7 @@
},
{
"cell_type": "markdown",
- "id": "676a9869-9a8b-4cd2-8b1c-0644b5229c72",
+ "id": "1b2811e2",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
@@ -1906,205 +412,11 @@
{
"cell_type": "code",
"execution_count": 18,
- "id": "f8d36b72-f8e7-45e5-b4fa-e0803493fd3c",
+ "id": "2455d2e1",
"metadata": {
"scrolled": true
},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " id \n",
- " purchase_date \n",
- " customer_id \n",
- " created_at \n",
- " updated_at \n",
- " number \n",
- " identifier \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " 0 \n",
- " 5145662 \n",
- " 2019-07-17 11:17:53+02:00 \n",
- " 6632 \n",
- " 2021-12-28 20:48:51.569237+01:00 \n",
- " 2021-12-28 20:48:51.569237+01:00 \n",
- " fa80c83b29a268b45728c910a8afcf79 \n",
- " 82877c41df26f832eb823a83acd1a172 \n",
- " \n",
- " \n",
- " 1 \n",
- " 4941642 \n",
- " 2018-10-31 11:59:00+01:00 \n",
- " 1 \n",
- " 2021-12-28 20:31:48.196681+01:00 \n",
- " 2022-03-03 17:52:21.958861+01:00 \n",
- " 597b6c06adfe6acc539b29b657b80da0 \n",
- " e7102ebe65526c427245533ebabe66e5 \n",
- " \n",
- " \n",
- " 2 \n",
- " 5088860 \n",
- " 2018-10-31 12:45:12+01:00 \n",
- " 1 \n",
- " 2021-12-28 20:46:34.703542+01:00 \n",
- " 2021-12-28 20:46:34.703542+01:00 \n",
- " 4a7f6baaf9be6a99e3fead7f7e981fa8 \n",
- " af75c4ae53d1b6957875538355b162e1 \n",
- " \n",
- " \n",
- " 3 \n",
- " 5088862 \n",
- " 2018-10-31 13:07:12+01:00 \n",
- " 1 \n",
- " 2021-12-28 20:46:34.704773+01:00 \n",
- " 2021-12-28 20:46:34.704773+01:00 \n",
- " 1d83dfad44b73070d1c6d5875d0edd2d \n",
- " 4b2fe34659b177209b07270ae1043b40 \n",
- " \n",
- " \n",
- " 4 \n",
- " 5088863 \n",
- " 2018-10-31 13:08:50+01:00 \n",
- " 1 \n",
- " 2021-12-28 20:46:34.705453+01:00 \n",
- " 2021-12-28 20:46:34.705453+01:00 \n",
- " 7bfe2bc9c1670c973d0960e3fd408cf8 \n",
- " b115f04a99b94df9e4a32185844f0998 \n",
- " \n",
- " \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " \n",
- " \n",
- " 742245 \n",
- " 8007695 \n",
- " 2023-11-08 17:51:19+01:00 \n",
- " 1256133 \n",
- " 2023-11-09 07:51:33.920187+01:00 \n",
- " 2023-11-09 07:51:33.920187+01:00 \n",
- " 99ad774dedbad43feb73514765d2f0ba \n",
- " d68558180b4bf2e8a945724843655775 \n",
- " \n",
- " \n",
- " 742246 \n",
- " 8007696 \n",
- " 2023-11-08 18:17:51+01:00 \n",
- " 1256134 \n",
- " 2023-11-09 07:51:33.921967+01:00 \n",
- " 2023-11-09 07:51:33.921967+01:00 \n",
- " c1511614c511c5f95980172690179102 \n",
- " f5102d910a7731091f239ad7b0df35b4 \n",
- " \n",
- " \n",
- " 742247 \n",
- " 8007697 \n",
- " 2023-11-08 18:23:54+01:00 \n",
- " 1256135 \n",
- " 2023-11-09 07:51:33.923034+01:00 \n",
- " 2023-11-09 07:51:33.923034+01:00 \n",
- " 33b64b39cc53428b4f17d65ff5b93104 \n",
- " e2b917626be60cc2c3207cc037fe69e4 \n",
- " \n",
- " \n",
- " 742248 \n",
- " 8007698 \n",
- " 2023-11-08 19:32:18+01:00 \n",
- " 1256136 \n",
- " 2023-11-09 07:51:33.924135+01:00 \n",
- " 2023-11-09 07:51:33.924135+01:00 \n",
- " 9ae0b129e704b3d9c093ce9c7c4e5039 \n",
- " 5bfa23236c31f8562c3a0233c1b53b31 \n",
- " \n",
- " \n",
- " 742249 \n",
- " 8007699 \n",
- " 2023-11-08 20:30:28+01:00 \n",
- " 1256137 \n",
- " 2023-11-09 07:51:33.925382+01:00 \n",
- " 2023-11-09 07:51:33.925382+01:00 \n",
- " d31ced089c2b1f90479257a4686f9306 \n",
- " d86b1e0de3ff01eaf04fbcd031ac5fef \n",
- " \n",
- " \n",
- "
\n",
- "
742250 rows × 7 columns
\n",
- "
"
- ],
- "text/plain": [
- " id purchase_date customer_id \\\n",
- "0 5145662 2019-07-17 11:17:53+02:00 6632 \n",
- "1 4941642 2018-10-31 11:59:00+01:00 1 \n",
- "2 5088860 2018-10-31 12:45:12+01:00 1 \n",
- "3 5088862 2018-10-31 13:07:12+01:00 1 \n",
- "4 5088863 2018-10-31 13:08:50+01:00 1 \n",
- "... ... ... ... \n",
- "742245 8007695 2023-11-08 17:51:19+01:00 1256133 \n",
- "742246 8007696 2023-11-08 18:17:51+01:00 1256134 \n",
- "742247 8007697 2023-11-08 18:23:54+01:00 1256135 \n",
- "742248 8007698 2023-11-08 19:32:18+01:00 1256136 \n",
- "742249 8007699 2023-11-08 20:30:28+01:00 1256137 \n",
- "\n",
- " created_at updated_at \\\n",
- "0 2021-12-28 20:48:51.569237+01:00 2021-12-28 20:48:51.569237+01:00 \n",
- "1 2021-12-28 20:31:48.196681+01:00 2022-03-03 17:52:21.958861+01:00 \n",
- "2 2021-12-28 20:46:34.703542+01:00 2021-12-28 20:46:34.703542+01:00 \n",
- "3 2021-12-28 20:46:34.704773+01:00 2021-12-28 20:46:34.704773+01:00 \n",
- "4 2021-12-28 20:46:34.705453+01:00 2021-12-28 20:46:34.705453+01:00 \n",
- "... ... ... \n",
- "742245 2023-11-09 07:51:33.920187+01:00 2023-11-09 07:51:33.920187+01:00 \n",
- "742246 2023-11-09 07:51:33.921967+01:00 2023-11-09 07:51:33.921967+01:00 \n",
- "742247 2023-11-09 07:51:33.923034+01:00 2023-11-09 07:51:33.923034+01:00 \n",
- "742248 2023-11-09 07:51:33.924135+01:00 2023-11-09 07:51:33.924135+01:00 \n",
- "742249 2023-11-09 07:51:33.925382+01:00 2023-11-09 07:51:33.925382+01:00 \n",
- "\n",
- " number identifier \n",
- "0 fa80c83b29a268b45728c910a8afcf79 82877c41df26f832eb823a83acd1a172 \n",
- "1 597b6c06adfe6acc539b29b657b80da0 e7102ebe65526c427245533ebabe66e5 \n",
- "2 4a7f6baaf9be6a99e3fead7f7e981fa8 af75c4ae53d1b6957875538355b162e1 \n",
- "3 1d83dfad44b73070d1c6d5875d0edd2d 4b2fe34659b177209b07270ae1043b40 \n",
- "4 7bfe2bc9c1670c973d0960e3fd408cf8 b115f04a99b94df9e4a32185844f0998 \n",
- "... ... ... \n",
- "742245 99ad774dedbad43feb73514765d2f0ba d68558180b4bf2e8a945724843655775 \n",
- "742246 c1511614c511c5f95980172690179102 f5102d910a7731091f239ad7b0df35b4 \n",
- "742247 33b64b39cc53428b4f17d65ff5b93104 e2b917626be60cc2c3207cc037fe69e4 \n",
- "742248 9ae0b129e704b3d9c093ce9c7c4e5039 5bfa23236c31f8562c3a0233c1b53b31 \n",
- "742249 d31ced089c2b1f90479257a4686f9306 d86b1e0de3ff01eaf04fbcd031ac5fef \n",
- "\n",
- "[742250 rows x 7 columns]"
- ]
- },
- "execution_count": 18,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"df1_purchases"
]
@@ -2112,30 +424,9 @@
{
"cell_type": "code",
"execution_count": 19,
- "id": "3f266a9d-6eee-4b27-b6cc-d401bc2fa0b8",
+ "id": "5f9a159d",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "RangeIndex: 742250 entries, 0 to 742249\n",
- "Data columns (total 7 columns):\n",
- " # Column Non-Null Count Dtype \n",
- "--- ------ -------------- ----- \n",
- " 0 id 742250 non-null int64 \n",
- " 1 purchase_date 742250 non-null object\n",
- " 2 customer_id 742250 non-null int64 \n",
- " 3 created_at 742250 non-null object\n",
- " 4 updated_at 742250 non-null object\n",
- " 5 number 742250 non-null object\n",
- " 6 identifier 742250 non-null object\n",
- "dtypes: int64(2), object(5)\n",
- "memory usage: 39.6+ MB\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"df1_purchases.info()"
]
@@ -2143,7 +434,7 @@
{
"cell_type": "code",
"execution_count": 20,
- "id": "8b24ccbc-ccf0-4722-8cd9-8ee8aa90d1fd",
+ "id": "db201bf7",
"metadata": {},
"outputs": [],
"source": [
@@ -2155,30 +446,9 @@
{
"cell_type": "code",
"execution_count": 21,
- "id": "27d18584-228f-4698-85d6-4d23151ea5ed",
+ "id": "bd436fca",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "RangeIndex: 742250 entries, 0 to 742249\n",
- "Data columns (total 7 columns):\n",
- " # Column Non-Null Count Dtype \n",
- "--- ------ -------------- ----- \n",
- " 0 id 742250 non-null int64 \n",
- " 1 purchase_date 742250 non-null datetime64[ns, UTC]\n",
- " 2 customer_id 742250 non-null int64 \n",
- " 3 created_at 742250 non-null object \n",
- " 4 updated_at 742250 non-null object \n",
- " 5 number 742250 non-null object \n",
- " 6 identifier 742250 non-null object \n",
- "dtypes: datetime64[ns, UTC](1), int64(2), object(4)\n",
- "memory usage: 39.6+ MB\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"df1_purchases.info()"
]
@@ -2186,7 +456,7 @@
{
"cell_type": "code",
"execution_count": 22,
- "id": "ea22e3a2-2b25-481d-8ebc-194e11a06cd9",
+ "id": "83435862",
"metadata": {},
"outputs": [],
"source": [
@@ -2196,8 +466,10 @@
},
{
"cell_type": "markdown",
- "id": "53227600-c1c5-48aa-9f5d-db5a23a8a22a",
- "metadata": {},
+ "id": "f210e730",
+ "metadata": {
+ "jp-MarkdownHeadingCollapsed": true
+ },
"source": [
"## Fusion de l'ensemble des données billétiques"
]
@@ -2205,7 +477,7 @@
{
"cell_type": "code",
"execution_count": 23,
- "id": "e0b8b47a-b321-4a79-823c-36a131a78ac7",
+ "id": "1f8b3aa7",
"metadata": {},
"outputs": [],
"source": [
@@ -2225,225 +497,21 @@
{
"cell_type": "code",
"execution_count": 24,
- "id": "7572e6e7-f28d-43ba-b045-b9fa09e68e1d",
+ "id": "83a4d021",
"metadata": {
"scrolled": true
},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " ticket_id \n",
- " product_id \n",
- " is_from_subscription \n",
- " supplier_name \n",
- " type_of_ticket_name \n",
- " children \n",
- " purchase_date \n",
- " customer_id \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " 0 \n",
- " 13070859 \n",
- " 225251 \n",
- " False \n",
- " vente en ligne \n",
- " Atelier \n",
- " pricing_formula \n",
- " 2018-12-28 14:47:50+00:00 \n",
- " 48187 \n",
- " \n",
- " \n",
- " 1 \n",
- " 13070860 \n",
- " 224914 \n",
- " False \n",
- " vente en ligne \n",
- " Atelier \n",
- " pricing_formula \n",
- " 2018-12-28 14:47:50+00:00 \n",
- " 48187 \n",
- " \n",
- " \n",
- " 2 \n",
- " 13070861 \n",
- " 224914 \n",
- " False \n",
- " vente en ligne \n",
- " Atelier \n",
- " pricing_formula \n",
- " 2018-12-28 14:47:50+00:00 \n",
- " 48187 \n",
- " \n",
- " \n",
- " 3 \n",
- " 13070862 \n",
- " 224914 \n",
- " False \n",
- " vente en ligne \n",
- " Atelier \n",
- " pricing_formula \n",
- " 2018-12-28 14:47:50+00:00 \n",
- " 48187 \n",
- " \n",
- " \n",
- " 4 \n",
- " 13070863 \n",
- " 224914 \n",
- " False \n",
- " vente en ligne \n",
- " Atelier \n",
- " pricing_formula \n",
- " 2018-12-28 14:47:50+00:00 \n",
- " 48187 \n",
- " \n",
- " \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " \n",
- " \n",
- " 1826667 \n",
- " 20662815 \n",
- " 405689 \n",
- " False \n",
- " vente en ligne \n",
- " Atelier \n",
- " pricing_formula \n",
- " 2023-11-08 17:23:54+00:00 \n",
- " 1256135 \n",
- " \n",
- " \n",
- " 1826668 \n",
- " 20662816 \n",
- " 403658 \n",
- " False \n",
- " vente en ligne \n",
- " Atelier \n",
- " pricing_formula \n",
- " 2023-11-08 18:32:18+00:00 \n",
- " 1256136 \n",
- " \n",
- " \n",
- " 1826669 \n",
- " 20662817 \n",
- " 403658 \n",
- " False \n",
- " vente en ligne \n",
- " Atelier \n",
- " pricing_formula \n",
- " 2023-11-08 18:32:18+00:00 \n",
- " 1256136 \n",
- " \n",
- " \n",
- " 1826670 \n",
- " 20662818 \n",
- " 403658 \n",
- " False \n",
- " vente en ligne \n",
- " Atelier \n",
- " pricing_formula \n",
- " 2023-11-08 19:30:28+00:00 \n",
- " 1256137 \n",
- " \n",
- " \n",
- " 1826671 \n",
- " 20662819 \n",
- " 403658 \n",
- " False \n",
- " vente en ligne \n",
- " Atelier \n",
- " pricing_formula \n",
- " 2023-11-08 19:30:28+00:00 \n",
- " 1256137 \n",
- " \n",
- " \n",
- "
\n",
- "
1826672 rows × 8 columns
\n",
- "
"
- ],
- "text/plain": [
- " ticket_id product_id is_from_subscription supplier_name \\\n",
- "0 13070859 225251 False vente en ligne \n",
- "1 13070860 224914 False vente en ligne \n",
- "2 13070861 224914 False vente en ligne \n",
- "3 13070862 224914 False vente en ligne \n",
- "4 13070863 224914 False vente en ligne \n",
- "... ... ... ... ... \n",
- "1826667 20662815 405689 False vente en ligne \n",
- "1826668 20662816 403658 False vente en ligne \n",
- "1826669 20662817 403658 False vente en ligne \n",
- "1826670 20662818 403658 False vente en ligne \n",
- "1826671 20662819 403658 False vente en ligne \n",
- "\n",
- " type_of_ticket_name children purchase_date \\\n",
- "0 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n",
- "1 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n",
- "2 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n",
- "3 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n",
- "4 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n",
- "... ... ... ... \n",
- "1826667 Atelier pricing_formula 2023-11-08 17:23:54+00:00 \n",
- "1826668 Atelier pricing_formula 2023-11-08 18:32:18+00:00 \n",
- "1826669 Atelier pricing_formula 2023-11-08 18:32:18+00:00 \n",
- "1826670 Atelier pricing_formula 2023-11-08 19:30:28+00:00 \n",
- "1826671 Atelier pricing_formula 2023-11-08 19:30:28+00:00 \n",
- "\n",
- " customer_id \n",
- "0 48187 \n",
- "1 48187 \n",
- "2 48187 \n",
- "3 48187 \n",
- "4 48187 \n",
- "... ... \n",
- "1826667 1256135 \n",
- "1826668 1256136 \n",
- "1826669 1256136 \n",
- "1826670 1256137 \n",
- "1826671 1256137 \n",
- "\n",
- "[1826672 rows x 8 columns]"
- ]
- },
- "execution_count": 24,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"df1_ticket_information"
]
},
{
"cell_type": "markdown",
- "id": "ad2d0059-76d3-44b9-b0eb-0b0ca4d4ba75",
- "metadata": {},
+ "id": "56e6ebd1",
+ "metadata": {
+ "jp-MarkdownHeadingCollapsed": true
+ },
"source": [
"# Utilisation de fonctions"
]
@@ -2451,25 +519,9 @@
{
"cell_type": "code",
"execution_count": 51,
- "id": "c1afe322-ff41-4760-819e-0195fed5b27d",
+ "id": "88fcde4b",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "RangeIndex: 20 entries, 0 to 19\n",
- "Data columns (total 2 columns):\n",
- " # Column Non-Null Count Dtype \n",
- "--- ------ -------------- ----- \n",
- " 0 opened_at 8 non-null object \n",
- " 1 opened_at_clean 8 non-null datetime64[ns, UTC]\n",
- "dtypes: datetime64[ns, UTC](1), object(1)\n",
- "memory usage: 448.0+ bytes\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# Créer un DataFrame exemple\n",
"df_not_clean = df1_campaign_stats[['opened_at']].head(20)\n",
@@ -2485,7 +537,7 @@
},
{
"cell_type": "markdown",
- "id": "27ecf058-23eb-4018-abbd-68c4ebe7c786",
+ "id": "818f69db",
"metadata": {},
"source": [
"## Nettoyage, selection et fusion"
@@ -2494,190 +546,9 @@
{
"cell_type": "code",
"execution_count": 23,
- "id": "d887898c-6a21-41ed-901d-4d6fdbca5372",
+ "id": "c9654eda",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " ticket_id \n",
- " product_id \n",
- " is_from_subscription \n",
- " type_of \n",
- " supplier_name \n",
- " purchase_date \n",
- " customer_id \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " 0 \n",
- " 13070859 \n",
- " 225251 \n",
- " False \n",
- " 1 \n",
- " vente en ligne \n",
- " 2018-12-28 14:47:50+00:00 \n",
- " 48187 \n",
- " \n",
- " \n",
- " 1 \n",
- " 13070860 \n",
- " 224914 \n",
- " False \n",
- " 1 \n",
- " vente en ligne \n",
- " 2018-12-28 14:47:50+00:00 \n",
- " 48187 \n",
- " \n",
- " \n",
- " 2 \n",
- " 13070861 \n",
- " 224914 \n",
- " False \n",
- " 1 \n",
- " vente en ligne \n",
- " 2018-12-28 14:47:50+00:00 \n",
- " 48187 \n",
- " \n",
- " \n",
- " 3 \n",
- " 13070862 \n",
- " 224914 \n",
- " False \n",
- " 1 \n",
- " vente en ligne \n",
- " 2018-12-28 14:47:50+00:00 \n",
- " 48187 \n",
- " \n",
- " \n",
- " 4 \n",
- " 13070863 \n",
- " 224914 \n",
- " False \n",
- " 1 \n",
- " vente en ligne \n",
- " 2018-12-28 14:47:50+00:00 \n",
- " 48187 \n",
- " \n",
- " \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " \n",
- " \n",
- " 1826667 \n",
- " 20662815 \n",
- " 405689 \n",
- " False \n",
- " 1 \n",
- " vente en ligne \n",
- " 2023-11-08 17:23:54+00:00 \n",
- " 1256135 \n",
- " \n",
- " \n",
- " 1826668 \n",
- " 20662816 \n",
- " 403658 \n",
- " False \n",
- " 1 \n",
- " vente en ligne \n",
- " 2023-11-08 18:32:18+00:00 \n",
- " 1256136 \n",
- " \n",
- " \n",
- " 1826669 \n",
- " 20662817 \n",
- " 403658 \n",
- " False \n",
- " 1 \n",
- " vente en ligne \n",
- " 2023-11-08 18:32:18+00:00 \n",
- " 1256136 \n",
- " \n",
- " \n",
- " 1826670 \n",
- " 20662818 \n",
- " 403658 \n",
- " False \n",
- " 1 \n",
- " vente en ligne \n",
- " 2023-11-08 19:30:28+00:00 \n",
- " 1256137 \n",
- " \n",
- " \n",
- " 1826671 \n",
- " 20662819 \n",
- " 403658 \n",
- " False \n",
- " 1 \n",
- " vente en ligne \n",
- " 2023-11-08 19:30:28+00:00 \n",
- " 1256137 \n",
- " \n",
- " \n",
- "
\n",
- "
1826672 rows × 7 columns
\n",
- "
"
- ],
- "text/plain": [
- " ticket_id product_id is_from_subscription type_of supplier_name \\\n",
- "0 13070859 225251 False 1 vente en ligne \n",
- "1 13070860 224914 False 1 vente en ligne \n",
- "2 13070861 224914 False 1 vente en ligne \n",
- "3 13070862 224914 False 1 vente en ligne \n",
- "4 13070863 224914 False 1 vente en ligne \n",
- "... ... ... ... ... ... \n",
- "1826667 20662815 405689 False 1 vente en ligne \n",
- "1826668 20662816 403658 False 1 vente en ligne \n",
- "1826669 20662817 403658 False 1 vente en ligne \n",
- "1826670 20662818 403658 False 1 vente en ligne \n",
- "1826671 20662819 403658 False 1 vente en ligne \n",
- "\n",
- " purchase_date customer_id \n",
- "0 2018-12-28 14:47:50+00:00 48187 \n",
- "1 2018-12-28 14:47:50+00:00 48187 \n",
- "2 2018-12-28 14:47:50+00:00 48187 \n",
- "3 2018-12-28 14:47:50+00:00 48187 \n",
- "4 2018-12-28 14:47:50+00:00 48187 \n",
- "... ... ... \n",
- "1826667 2023-11-08 17:23:54+00:00 1256135 \n",
- "1826668 2023-11-08 18:32:18+00:00 1256136 \n",
- "1826669 2023-11-08 18:32:18+00:00 1256136 \n",
- "1826670 2023-11-08 19:30:28+00:00 1256137 \n",
- "1826671 2023-11-08 19:30:28+00:00 1256137 \n",
- "\n",
- "[1826672 rows x 7 columns]"
- ]
- },
- "execution_count": 23,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"df1_ticket_information"
]
@@ -2685,37 +556,16 @@
{
"cell_type": "code",
"execution_count": 14,
- "id": "ac9a6373-c1c6-46b5-873b-dc22f17bcbdb",
+ "id": "7f2b620c",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "RangeIndex: 1826672 entries, 0 to 1826671\n",
- "Data columns (total 7 columns):\n",
- " # Column Dtype \n",
- "--- ------ ----- \n",
- " 0 ticket_id int64 \n",
- " 1 product_id int64 \n",
- " 2 is_from_subscription bool \n",
- " 3 type_of int64 \n",
- " 4 supplier_name object \n",
- " 5 purchase_date datetime64[ns, UTC]\n",
- " 6 customer_id int64 \n",
- "dtypes: bool(1), datetime64[ns, UTC](1), int64(4), object(1)\n",
- "memory usage: 85.4+ MB\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"df1_ticket_information.info()"
]
},
{
"cell_type": "markdown",
- "id": "b1719943-89eb-4ba0-a107-2f96d5d01ec9",
+ "id": "637bdb72",
"metadata": {},
"source": [
"# Customer information"
@@ -2723,8 +573,10 @@
},
{
"cell_type": "markdown",
- "id": "a2132ee2-3f22-45fd-b65b-72689c8b672c",
- "metadata": {},
+ "id": "14c52894",
+ "metadata": {
+ "jp-MarkdownHeadingCollapsed": true
+ },
"source": [
"## Target area"
]
@@ -2732,7 +584,7 @@
{
"cell_type": "code",
"execution_count": 8,
- "id": "da5d4708-7147-4cc8-8686-52d4bcba5a7a",
+ "id": "d83abfbf",
"metadata": {},
"outputs": [
{
@@ -2770,21 +622,9 @@
{
"cell_type": "code",
"execution_count": 62,
- "id": "b4fa5fe3-ce8e-4b0a-af94-fb468d241bad",
+ "id": "90d71b2c",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "id 5.080902\n",
- "dtype: float64"
- ]
- },
- "execution_count": 62,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"df1_targets_test = df1_targets_full[['id', 'customer_id']].groupby(['customer_id']).count()\n",
"len(df1_targets_test[df1_targets_test['id'] > 1]) / len(df1_targets_test)\n",
@@ -2796,7 +636,7 @@
{
"cell_type": "code",
"execution_count": 10,
- "id": "c74746de-0bf4-4b83-9a75-f1d3183abf1c",
+ "id": "2301de1e",
"metadata": {},
"outputs": [
{
@@ -2900,7 +740,7 @@
{
"cell_type": "code",
"execution_count": 14,
- "id": "47c55fa0-b2f3-46f9-9abf-c4ab66bd9fcb",
+ "id": "75fbc2f7",
"metadata": {},
"outputs": [
{
@@ -2945,7 +785,7 @@
{
"cell_type": "code",
"execution_count": 19,
- "id": "8af1aeb9-ebdd-4286-a14c-3b7d801ea172",
+ "id": "55cddf92",
"metadata": {},
"outputs": [
{
@@ -2996,7 +836,7 @@
{
"cell_type": "code",
"execution_count": 22,
- "id": "ceb069e5-76c9-46e4-9ea7-8c16eb4ed3cd",
+ "id": "7fd98a85",
"metadata": {},
"outputs": [
{
@@ -3032,7 +872,7 @@
{
"cell_type": "code",
"execution_count": 18,
- "id": "8bffef87-542e-4775-bc7c-2c0323fda581",
+ "id": "cf94bb1d",
"metadata": {},
"outputs": [
{
@@ -3104,7 +944,7 @@
},
{
"cell_type": "markdown",
- "id": "2f665824-a026-4acd-8358-b408a61854b4",
+ "id": "711d3884",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
@@ -3115,34 +955,9 @@
{
"cell_type": "code",
"execution_count": 52,
- "id": "5d05203c-ea30-4208-a29f-fef7737c672e",
+ "id": "c25b5295",
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/tmp/ipykernel_9792/1967867975.py:15: SettingWithCopyWarning: \n",
- "A value is trying to be set on a copy of a slice from a DataFrame.\n",
- "Try using .loc[row_indexer,col_indexer] = value instead\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
- " df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
- "/tmp/ipykernel_9792/1967867975.py:15: SettingWithCopyWarning: \n",
- "A value is trying to be set on a copy of a slice from a DataFrame.\n",
- "Try using .loc[row_indexer,col_indexer] = value instead\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
- " df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
- "/tmp/ipykernel_9792/1967867975.py:15: SettingWithCopyWarning: \n",
- "A value is trying to be set on a copy of a slice from a DataFrame.\n",
- "Try using .loc[row_indexer,col_indexer] = value instead\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
- " df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# campaign_stats cleaning \n",
"df1_campaign_stats_clean = df1_campaign_stats[[\"id\", \"campaign_id\", \"customer_id\", \"opened_at\", \"sent_at\", \"delivered_at\"]]\n",
@@ -3162,31 +977,9 @@
{
"cell_type": "code",
"execution_count": 53,
- "id": "8ac634cf-2a30-4ccc-a34d-0fd401a49aaa",
+ "id": "2a3de6a5",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "RangeIndex: 6214808 entries, 0 to 6214807\n",
- "Data columns (total 8 columns):\n",
- " # Column Dtype \n",
- "--- ------ ----- \n",
- " 0 id int64 \n",
- " 1 customer_id int64 \n",
- " 2 opened_at datetime64[ns, UTC]\n",
- " 3 sent_at datetime64[ns, UTC]\n",
- " 4 delivered_at datetime64[ns, UTC]\n",
- " 5 campaign_name object \n",
- " 6 campaign_service_id int64 \n",
- " 7 campaign_sent_at datetime64[ns, UTC]\n",
- "dtypes: datetime64[ns, UTC](4), int64(3), object(1)\n",
- "memory usage: 379.3+ MB\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"df1_campaigns_full.info()"
]
@@ -3194,235 +987,16 @@
{
"cell_type": "code",
"execution_count": 56,
- "id": "7d22cdd5-2060-4922-8e04-27b613d4ee27",
+ "id": "3fc1f446",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " id \n",
- " customer_id \n",
- " opened_at \n",
- " sent_at \n",
- " delivered_at \n",
- " campaign_name \n",
- " campaign_service_id \n",
- " campaign_sent_at \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " 0 \n",
- " 19793 \n",
- " 112597 \n",
- " NaT \n",
- " 2021-03-28 16:01:09+00:00 \n",
- " 2021-03-28 16:24:18+00:00 \n",
- " Le Mucem chez vous, gardons le lien #22 \n",
- " 404 \n",
- " 2021-03-27 23:00:00+00:00 \n",
- " \n",
- " \n",
- " 1 \n",
- " 14211 \n",
- " 113666 \n",
- " NaT \n",
- " 2021-03-28 16:01:09+00:00 \n",
- " 2021-03-28 16:21:02+00:00 \n",
- " Le Mucem chez vous, gardons le lien #22 \n",
- " 404 \n",
- " 2021-03-27 23:00:00+00:00 \n",
- " \n",
- " \n",
- " 2 \n",
- " 13150 \n",
- " 280561 \n",
- " NaT \n",
- " 2021-03-28 16:00:59+00:00 \n",
- " 2021-03-28 16:08:45+00:00 \n",
- " Le Mucem chez vous, gardons le lien #22 \n",
- " 404 \n",
- " 2021-03-27 23:00:00+00:00 \n",
- " \n",
- " \n",
- " 3 \n",
- " 7073 \n",
- " 101007 \n",
- " 2021-03-28 18:11:06+00:00 \n",
- " 2021-03-28 16:00:59+00:00 \n",
- " 2021-03-28 16:09:47+00:00 \n",
- " Le Mucem chez vous, gardons le lien #22 \n",
- " 404 \n",
- " 2021-03-27 23:00:00+00:00 \n",
- " \n",
- " \n",
- " 4 \n",
- " 5175 \n",
- " 103972 \n",
- " NaT \n",
- " 2021-03-28 16:01:06+00:00 \n",
- " 2021-03-28 16:05:03+00:00 \n",
- " Le Mucem chez vous, gardons le lien #22 \n",
- " 404 \n",
- " 2021-03-27 23:00:00+00:00 \n",
- " \n",
- " \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " \n",
- " \n",
- " 6214803 \n",
- " 8302994 \n",
- " 266155 \n",
- " 2023-10-23 09:43:25+00:00 \n",
- " 2023-10-23 09:32:33+00:00 \n",
- " 2023-10-23 09:32:34+00:00 \n",
- " dre_nov_2023 \n",
- " 1318 \n",
- " 2023-10-23 09:31:17+00:00 \n",
- " \n",
- " \n",
- " 6214804 \n",
- " 8303307 \n",
- " 21355 \n",
- " 2023-10-23 09:44:02+00:00 \n",
- " 2023-10-23 09:32:49+00:00 \n",
- " 2023-10-23 09:32:49+00:00 \n",
- " dre_nov_2023 \n",
- " 1318 \n",
- " 2023-10-23 09:31:17+00:00 \n",
- " \n",
- " \n",
- " 6214805 \n",
- " 8304346 \n",
- " 21849 \n",
- " 2023-10-23 09:45:52+00:00 \n",
- " 2023-10-23 09:33:28+00:00 \n",
- " 2023-10-23 09:33:29+00:00 \n",
- " dre_nov_2023 \n",
- " 1318 \n",
- " 2023-10-23 09:31:17+00:00 \n",
- " \n",
- " \n",
- " 6214806 \n",
- " 8302037 \n",
- " 667789 \n",
- " 2023-10-23 09:47:32+00:00 \n",
- " 2023-10-23 09:31:53+00:00 \n",
- " 2023-10-23 09:31:54+00:00 \n",
- " dre_nov_2023 \n",
- " 1318 \n",
- " 2023-10-23 09:31:17+00:00 \n",
- " \n",
- " \n",
- " 6214807 \n",
- " 8304939 \n",
- " 294154 \n",
- " NaT \n",
- " 2023-10-23 09:33:54+00:00 \n",
- " 2023-10-23 09:33:55+00:00 \n",
- " dre_nov_2023 \n",
- " 1318 \n",
- " 2023-10-23 09:31:17+00:00 \n",
- " \n",
- " \n",
- "
\n",
- "
6214808 rows × 8 columns
\n",
- "
"
- ],
- "text/plain": [
- " id customer_id opened_at \\\n",
- "0 19793 112597 NaT \n",
- "1 14211 113666 NaT \n",
- "2 13150 280561 NaT \n",
- "3 7073 101007 2021-03-28 18:11:06+00:00 \n",
- "4 5175 103972 NaT \n",
- "... ... ... ... \n",
- "6214803 8302994 266155 2023-10-23 09:43:25+00:00 \n",
- "6214804 8303307 21355 2023-10-23 09:44:02+00:00 \n",
- "6214805 8304346 21849 2023-10-23 09:45:52+00:00 \n",
- "6214806 8302037 667789 2023-10-23 09:47:32+00:00 \n",
- "6214807 8304939 294154 NaT \n",
- "\n",
- " sent_at delivered_at \\\n",
- "0 2021-03-28 16:01:09+00:00 2021-03-28 16:24:18+00:00 \n",
- "1 2021-03-28 16:01:09+00:00 2021-03-28 16:21:02+00:00 \n",
- "2 2021-03-28 16:00:59+00:00 2021-03-28 16:08:45+00:00 \n",
- "3 2021-03-28 16:00:59+00:00 2021-03-28 16:09:47+00:00 \n",
- "4 2021-03-28 16:01:06+00:00 2021-03-28 16:05:03+00:00 \n",
- "... ... ... \n",
- "6214803 2023-10-23 09:32:33+00:00 2023-10-23 09:32:34+00:00 \n",
- "6214804 2023-10-23 09:32:49+00:00 2023-10-23 09:32:49+00:00 \n",
- "6214805 2023-10-23 09:33:28+00:00 2023-10-23 09:33:29+00:00 \n",
- "6214806 2023-10-23 09:31:53+00:00 2023-10-23 09:31:54+00:00 \n",
- "6214807 2023-10-23 09:33:54+00:00 2023-10-23 09:33:55+00:00 \n",
- "\n",
- " campaign_name campaign_service_id \\\n",
- "0 Le Mucem chez vous, gardons le lien #22 404 \n",
- "1 Le Mucem chez vous, gardons le lien #22 404 \n",
- "2 Le Mucem chez vous, gardons le lien #22 404 \n",
- "3 Le Mucem chez vous, gardons le lien #22 404 \n",
- "4 Le Mucem chez vous, gardons le lien #22 404 \n",
- "... ... ... \n",
- "6214803 dre_nov_2023 1318 \n",
- "6214804 dre_nov_2023 1318 \n",
- "6214805 dre_nov_2023 1318 \n",
- "6214806 dre_nov_2023 1318 \n",
- "6214807 dre_nov_2023 1318 \n",
- "\n",
- " campaign_sent_at \n",
- "0 2021-03-27 23:00:00+00:00 \n",
- "1 2021-03-27 23:00:00+00:00 \n",
- "2 2021-03-27 23:00:00+00:00 \n",
- "3 2021-03-27 23:00:00+00:00 \n",
- "4 2021-03-27 23:00:00+00:00 \n",
- "... ... \n",
- "6214803 2023-10-23 09:31:17+00:00 \n",
- "6214804 2023-10-23 09:31:17+00:00 \n",
- "6214805 2023-10-23 09:31:17+00:00 \n",
- "6214806 2023-10-23 09:31:17+00:00 \n",
- "6214807 2023-10-23 09:31:17+00:00 \n",
- "\n",
- "[6214808 rows x 8 columns]"
- ]
- },
- "execution_count": 56,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"df1_campaigns_information"
]
},
{
"cell_type": "markdown",
- "id": "0a5b24f0-4bca-4cde-a6ba-eb130b38cac4",
+ "id": "20e69ee3",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
@@ -3433,264 +1007,9 @@
{
"cell_type": "code",
"execution_count": 37,
- "id": "bc63bc4e-6cc1-4d35-9635-faf55339e186",
+ "id": "d9cbdbce",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " id \n",
- " name \n",
- " service_id \n",
- " created_at \n",
- " updated_at \n",
- " process_id \n",
- " report_url \n",
- " category \n",
- " to_be_synced \n",
- " identifier \n",
- " sent_at \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " 0 \n",
- " 1319613 \n",
- " newsletter enseignants janvier 2022 \n",
- " 721 \n",
- " 2022-01-14 16:06:42.586321+01:00 \n",
- " 2022-02-03 14:17:27.112963+01:00 \n",
- " NaN \n",
- " NaN \n",
- " 0.0 \n",
- " False \n",
- " aba3b6fd5d186d28e06ff97135cade7f \n",
- " 2022-01-14 00:00:00+01:00 \n",
- " \n",
- " \n",
- " 1 \n",
- " 1319586 \n",
- " lsf_janvier_2022 \n",
- " 717 \n",
- " 2022-01-07 11:30:35.315895+01:00 \n",
- " 2022-02-03 14:17:27.116171+01:00 \n",
- " NaN \n",
- " NaN \n",
- " 0.0 \n",
- " False \n",
- " 788d986905533aba051261497ecffcbb \n",
- " 2022-01-07 00:00:00+01:00 \n",
- " \n",
- " \n",
- " 2 \n",
- " 1319282 \n",
- " Invitation à déjeuner au Mucem | Vernissage « ... \n",
- " 591 \n",
- " 2021-09-28 12:50:24.448752+02:00 \n",
- " 2022-02-03 14:17:27.119582+01:00 \n",
- " NaN \n",
- " NaN \n",
- " 0.0 \n",
- " False \n",
- " 3493894fa4ea036cfc6433c3e2ee63b0 \n",
- " 2021-09-28 00:00:00+02:00 \n",
- " \n",
- " \n",
- " 3 \n",
- " 1319283 \n",
- " Vacances de la Toussaint - centres des loisirs \n",
- " 590 \n",
- " 2021-09-28 18:01:04.692073+02:00 \n",
- " 2022-02-03 14:17:27.124408+01:00 \n",
- " NaN \n",
- " NaN \n",
- " 0.0 \n",
- " False \n",
- " 08b255a5d42b89b0585260b6f2360bdd \n",
- " 2021-09-28 00:00:00+02:00 \n",
- " \n",
- " \n",
- " 4 \n",
- " 1319636 \n",
- " ddcp_promo_md_livemag \n",
- " 730 \n",
- " 2022-01-27 18:00:41.053069+01:00 \n",
- " 2022-02-03 14:17:27.127607+01:00 \n",
- " NaN \n",
- " NaN \n",
- " 0.0 \n",
- " False \n",
- " d5cfead94f5350c12c322b5b664544c1 \n",
- " 2022-01-27 00:00:00+01:00 \n",
- " \n",
- " \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " \n",
- " \n",
- " 952 \n",
- " 1320072 \n",
- " dre_gaza0106 \n",
- " 881 \n",
- " 2022-05-26 09:01:35.523639+02:00 \n",
- " 2022-12-02 17:51:22.614046+01:00 \n",
- " NaN \n",
- " NaN \n",
- " 0.0 \n",
- " False \n",
- " 7504adad8bb96320eb3afdd4df6e1f60 \n",
- " 2022-05-26 00:00:00+02:00 \n",
- " \n",
- " \n",
- " 953 \n",
- " 661398 \n",
- " DDCP Plan Bis 4 - Marketing direct - MJ5C \n",
- " 183 \n",
- " 2021-06-18 10:30:01.259578+02:00 \n",
- " 2021-09-24 11:56:09.082785+02:00 \n",
- " NaN \n",
- " NaN \n",
- " 0.0 \n",
- " False \n",
- " cedebb6e872f539bef8c3f919874e9d7 \n",
- " 2020-07-27 00:00:00+02:00 \n",
- " \n",
- " \n",
- " 954 \n",
- " 1320487 \n",
- " Invitation portes ouvertes amitiés \n",
- " 988 \n",
- " 2022-09-29 18:01:33.834090+02:00 \n",
- " 2022-12-02 17:51:23.258324+01:00 \n",
- " NaN \n",
- " NaN \n",
- " 0.0 \n",
- " False \n",
- " 9908279ebbf1f9b250ba689db6a0222b \n",
- " 2022-09-29 00:00:00+02:00 \n",
- " \n",
- " \n",
- " 955 \n",
- " 906903 \n",
- " DDCP PROMO La méditerranée des philosophes #3 ... \n",
- " 310 \n",
- " 2021-07-19 14:07:16.177390+02:00 \n",
- " 2021-09-24 11:56:09.086101+02:00 \n",
- " NaN \n",
- " NaN \n",
- " 0.0 \n",
- " False \n",
- " 06eb61b839a0cefee4967c67ccb099dc \n",
- " 2020-12-23 00:00:00+01:00 \n",
- " \n",
- " \n",
- " 956 \n",
- " 579313 \n",
- " ddcp_promo_automation_manuel_pre_visit \n",
- " 481 \n",
- " 2021-06-08 17:38:54.041310+02:00 \n",
- " 2021-09-24 11:56:09.089394+02:00 \n",
- " NaN \n",
- " NaN \n",
- " 0.0 \n",
- " False \n",
- " 9461cce28ebe3e76fb4b931c35a169b0 \n",
- " 2021-06-08 00:00:00+02:00 \n",
- " \n",
- " \n",
- "
\n",
- "
957 rows × 11 columns
\n",
- "
"
- ],
- "text/plain": [
- " id name service_id \\\n",
- "0 1319613 newsletter enseignants janvier 2022 721 \n",
- "1 1319586 lsf_janvier_2022 717 \n",
- "2 1319282 Invitation à déjeuner au Mucem | Vernissage « ... 591 \n",
- "3 1319283 Vacances de la Toussaint - centres des loisirs 590 \n",
- "4 1319636 ddcp_promo_md_livemag 730 \n",
- ".. ... ... ... \n",
- "952 1320072 dre_gaza0106 881 \n",
- "953 661398 DDCP Plan Bis 4 - Marketing direct - MJ5C 183 \n",
- "954 1320487 Invitation portes ouvertes amitiés 988 \n",
- "955 906903 DDCP PROMO La méditerranée des philosophes #3 ... 310 \n",
- "956 579313 ddcp_promo_automation_manuel_pre_visit 481 \n",
- "\n",
- " created_at updated_at \\\n",
- "0 2022-01-14 16:06:42.586321+01:00 2022-02-03 14:17:27.112963+01:00 \n",
- "1 2022-01-07 11:30:35.315895+01:00 2022-02-03 14:17:27.116171+01:00 \n",
- "2 2021-09-28 12:50:24.448752+02:00 2022-02-03 14:17:27.119582+01:00 \n",
- "3 2021-09-28 18:01:04.692073+02:00 2022-02-03 14:17:27.124408+01:00 \n",
- "4 2022-01-27 18:00:41.053069+01:00 2022-02-03 14:17:27.127607+01:00 \n",
- ".. ... ... \n",
- "952 2022-05-26 09:01:35.523639+02:00 2022-12-02 17:51:22.614046+01:00 \n",
- "953 2021-06-18 10:30:01.259578+02:00 2021-09-24 11:56:09.082785+02:00 \n",
- "954 2022-09-29 18:01:33.834090+02:00 2022-12-02 17:51:23.258324+01:00 \n",
- "955 2021-07-19 14:07:16.177390+02:00 2021-09-24 11:56:09.086101+02:00 \n",
- "956 2021-06-08 17:38:54.041310+02:00 2021-09-24 11:56:09.089394+02:00 \n",
- "\n",
- " process_id report_url category to_be_synced \\\n",
- "0 NaN NaN 0.0 False \n",
- "1 NaN NaN 0.0 False \n",
- "2 NaN NaN 0.0 False \n",
- "3 NaN NaN 0.0 False \n",
- "4 NaN NaN 0.0 False \n",
- ".. ... ... ... ... \n",
- "952 NaN NaN 0.0 False \n",
- "953 NaN NaN 0.0 False \n",
- "954 NaN NaN 0.0 False \n",
- "955 NaN NaN 0.0 False \n",
- "956 NaN NaN 0.0 False \n",
- "\n",
- " identifier sent_at \n",
- "0 aba3b6fd5d186d28e06ff97135cade7f 2022-01-14 00:00:00+01:00 \n",
- "1 788d986905533aba051261497ecffcbb 2022-01-07 00:00:00+01:00 \n",
- "2 3493894fa4ea036cfc6433c3e2ee63b0 2021-09-28 00:00:00+02:00 \n",
- "3 08b255a5d42b89b0585260b6f2360bdd 2021-09-28 00:00:00+02:00 \n",
- "4 d5cfead94f5350c12c322b5b664544c1 2022-01-27 00:00:00+01:00 \n",
- ".. ... ... \n",
- "952 7504adad8bb96320eb3afdd4df6e1f60 2022-05-26 00:00:00+02:00 \n",
- "953 cedebb6e872f539bef8c3f919874e9d7 2020-07-27 00:00:00+02:00 \n",
- "954 9908279ebbf1f9b250ba689db6a0222b 2022-09-29 00:00:00+02:00 \n",
- "955 06eb61b839a0cefee4967c67ccb099dc 2020-12-23 00:00:00+01:00 \n",
- "956 9461cce28ebe3e76fb4b931c35a169b0 2021-06-08 00:00:00+02:00 \n",
- "\n",
- "[957 rows x 11 columns]"
- ]
- },
- "execution_count": 37,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"df1_campaigns"
]
@@ -3698,185 +1017,16 @@
{
"cell_type": "code",
"execution_count": 38,
- "id": "c19b321f-65f9-4d6c-8c1f-edb2eb9d70e7",
+ "id": "c07459f0",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " id \n",
- " clicked_at \n",
- " link_id \n",
- " customer_id \n",
- " created_at \n",
- " updated_at \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " 0 \n",
- " 1 \n",
- " 2021-03-26 16:30:36+01:00 \n",
- " 1 \n",
- " 284033 \n",
- " 2021-03-26 15:30:37.050161+01:00 \n",
- " 2021-03-26 15:30:37.050161+01:00 \n",
- " \n",
- " \n",
- " 1 \n",
- " 2 \n",
- " 2021-03-26 17:16:34+01:00 \n",
- " 2 \n",
- " 119768 \n",
- " 2021-03-26 16:16:34.950871+01:00 \n",
- " 2021-03-26 16:16:34.950871+01:00 \n",
- " \n",
- " \n",
- " 2 \n",
- " 272 \n",
- " 2021-03-28 20:03:32+02:00 \n",
- " 42 \n",
- " 113105 \n",
- " 2021-03-28 18:03:32.736394+02:00 \n",
- " 2021-03-28 18:03:32.736394+02:00 \n",
- " \n",
- " \n",
- " 3 \n",
- " 4 \n",
- " 2021-03-26 17:43:19+01:00 \n",
- " 3 \n",
- " 272280 \n",
- " 2021-03-26 16:43:19.338321+01:00 \n",
- " 2021-03-26 16:43:19.338321+01:00 \n",
- " \n",
- " \n",
- " 4 \n",
- " 5 \n",
- " 2021-03-26 17:46:00+01:00 \n",
- " 3 \n",
- " 105095 \n",
- " 2021-03-26 16:46:00.502945+01:00 \n",
- " 2021-03-26 16:46:00.502945+01:00 \n",
- " \n",
- " \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " \n",
- " \n",
- " 151046 \n",
- " 243553 \n",
- " 2023-11-09 16:34:27+01:00 \n",
- " 14666 \n",
- " 998 \n",
- " 2023-11-09 15:34:29.425425+01:00 \n",
- " 2023-11-09 15:34:29.425425+01:00 \n",
- " \n",
- " \n",
- " 151047 \n",
- " 243554 \n",
- " 2023-11-09 16:34:35+01:00 \n",
- " 14670 \n",
- " 998 \n",
- " 2023-11-09 15:34:37.505505+01:00 \n",
- " 2023-11-09 15:34:37.505505+01:00 \n",
- " \n",
- " \n",
- " 151048 \n",
- " 243559 \n",
- " 2023-11-09 16:51:15+01:00 \n",
- " 14686 \n",
- " 82923 \n",
- " 2023-11-09 15:51:17.439518+01:00 \n",
- " 2023-11-09 15:51:17.439518+01:00 \n",
- " \n",
- " \n",
- " 151049 \n",
- " 243561 \n",
- " 2023-11-09 16:59:42+01:00 \n",
- " 14677 \n",
- " 82923 \n",
- " 2023-11-09 15:59:44.030922+01:00 \n",
- " 2023-11-09 15:59:44.030922+01:00 \n",
- " \n",
- " \n",
- " 151050 \n",
- " 243564 \n",
- " 2023-11-09 17:16:41+01:00 \n",
- " 14691 \n",
- " 1254355 \n",
- " 2023-11-09 16:16:43.012932+01:00 \n",
- " 2023-11-09 16:16:43.012932+01:00 \n",
- " \n",
- " \n",
- "
\n",
- "
151051 rows × 6 columns
\n",
- "
"
- ],
- "text/plain": [
- " id clicked_at link_id customer_id \\\n",
- "0 1 2021-03-26 16:30:36+01:00 1 284033 \n",
- "1 2 2021-03-26 17:16:34+01:00 2 119768 \n",
- "2 272 2021-03-28 20:03:32+02:00 42 113105 \n",
- "3 4 2021-03-26 17:43:19+01:00 3 272280 \n",
- "4 5 2021-03-26 17:46:00+01:00 3 105095 \n",
- "... ... ... ... ... \n",
- "151046 243553 2023-11-09 16:34:27+01:00 14666 998 \n",
- "151047 243554 2023-11-09 16:34:35+01:00 14670 998 \n",
- "151048 243559 2023-11-09 16:51:15+01:00 14686 82923 \n",
- "151049 243561 2023-11-09 16:59:42+01:00 14677 82923 \n",
- "151050 243564 2023-11-09 17:16:41+01:00 14691 1254355 \n",
- "\n",
- " created_at updated_at \n",
- "0 2021-03-26 15:30:37.050161+01:00 2021-03-26 15:30:37.050161+01:00 \n",
- "1 2021-03-26 16:16:34.950871+01:00 2021-03-26 16:16:34.950871+01:00 \n",
- "2 2021-03-28 18:03:32.736394+02:00 2021-03-28 18:03:32.736394+02:00 \n",
- "3 2021-03-26 16:43:19.338321+01:00 2021-03-26 16:43:19.338321+01:00 \n",
- "4 2021-03-26 16:46:00.502945+01:00 2021-03-26 16:46:00.502945+01:00 \n",
- "... ... ... \n",
- "151046 2023-11-09 15:34:29.425425+01:00 2023-11-09 15:34:29.425425+01:00 \n",
- "151047 2023-11-09 15:34:37.505505+01:00 2023-11-09 15:34:37.505505+01:00 \n",
- "151048 2023-11-09 15:51:17.439518+01:00 2023-11-09 15:51:17.439518+01:00 \n",
- "151049 2023-11-09 15:59:44.030922+01:00 2023-11-09 15:59:44.030922+01:00 \n",
- "151050 2023-11-09 16:16:43.012932+01:00 2023-11-09 16:16:43.012932+01:00 \n",
- "\n",
- "[151051 rows x 6 columns]"
- ]
- },
- "execution_count": 38,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"df1_link_stats"
]
},
{
"cell_type": "markdown",
- "id": "96ea2523-38dc-47ef-a49e-2c2d9ad0b1c6",
+ "id": "80ae4c42",
"metadata": {},
"source": [
"## Exploration variables"
@@ -3884,8 +1034,8 @@
},
{
"cell_type": "code",
- "execution_count": 28,
- "id": "aaa41688-ea7e-4dba-851c-1f0b0ec43c71",
+ "execution_count": 7,
+ "id": "b50b8f95",
"metadata": {},
"outputs": [],
"source": [
@@ -3907,8 +1057,8 @@
},
{
"cell_type": "code",
- "execution_count": 29,
- "id": "2fecc2e1-113f-46ed-9065-0b9ee416166e",
+ "execution_count": 8,
+ "id": "7e292935",
"metadata": {},
"outputs": [],
"source": [
@@ -3917,8 +1067,8 @@
},
{
"cell_type": "code",
- "execution_count": 30,
- "id": "55f6170a-36fb-4efb-9810-f982883660cf",
+ "execution_count": 9,
+ "id": "05b6f2b0",
"metadata": {},
"outputs": [
{
@@ -3965,7 +1115,7 @@
"0 9 100.0 100.0 100.0"
]
},
- "execution_count": 30,
+ "execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@@ -3976,8 +1126,8 @@
},
{
"cell_type": "code",
- "execution_count": 31,
- "id": "0030fd02-09e3-42f5-9c83-290458a38c29",
+ "execution_count": 10,
+ "id": "c9324d80",
"metadata": {},
"outputs": [],
"source": [
@@ -3991,8 +1141,8 @@
},
{
"cell_type": "code",
- "execution_count": 32,
- "id": "6b1736d1-8fd7-4fcc-9431-b8bf0c7b4f2b",
+ "execution_count": 11,
+ "id": "10304058",
"metadata": {},
"outputs": [
{
@@ -4015,8 +1165,8 @@
},
{
"cell_type": "code",
- "execution_count": 33,
- "id": "226b694b-0b00-4167-b69f-3178902254eb",
+ "execution_count": 32,
+ "id": "ffa423e5",
"metadata": {},
"outputs": [],
"source": [
@@ -4024,19 +1174,103 @@
"def database_loading(database_name = None):\n",
" files_path = database_name\n",
" \n",
- " client_number = files_path[0].split(\"/\")[1]\n",
+ " client_number = files_path.split(\"/\")[1]\n",
" df_prefix = \"df\" + str(client_number) + \"_\"\n",
" \n",
- " for i in range(len(files_path)) :\n",
- " current_path = files_path[i]\n",
- " with fs.open(current_path, mode=\"rb\") as file_in:\n",
- " df = pd.read_csv(file_in)\n",
- " # the pattern of the name is df1xxx\n",
- " nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n",
- " globals()[nom_dataframe] = df\n",
+ " current_path = files_path\n",
+ " with fs.open(current_path, mode=\"rb\") as file_in:\n",
+ " df = pd.read_csv(file_in)\n",
"\n",
+ " return df, client_number"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "70bdc88d",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "id": "6a0f567d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_all = pd.DataFrame()\n",
+ "\n",
+ "for link in liste_suppliers:\n",
+ " \n",
+ " df_supplier, tenant_id = database_loading(link)\n",
+ " \n",
+ " df_supplier['tenant_id'] = int(tenant_id)\n",
+ "\n",
+ " df_all = pd.concat([df_all, df_supplier], axis = 0)\n",
" "
]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "id": "1522d8cd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# df_all[df_all['tenant_id'] == 101]['name'].unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 66,
+ "id": "b0e42a61",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online'] \n",
+ "# vad = vente à distance\n",
+ "df_all['name'] = df_all['name'].fillna('')\n",
+ "\n",
+ "df_all['canal_vente_internet'] = df_all['name'].str.contains('|'.join(liste_mots), case=False).astype(int)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 68,
+ "id": "d299ae91",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "tenant_id\n",
+ "1 1\n",
+ "2 1\n",
+ "3 1\n",
+ "4 1\n",
+ "5 1\n",
+ "6 1\n",
+ "7 1\n",
+ "8 1\n",
+ "9 1\n",
+ "10 1\n",
+ "11 1\n",
+ "12 1\n",
+ "13 1\n",
+ "14 1\n",
+ "101 1\n",
+ "Name: canal_vente_internet, dtype: int64"
+ ]
+ },
+ "execution_count": 68,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_all.groupby('tenant_id')['canal_vente_internet'].max()"
+ ]
}
],
"metadata": {