diff --git a/0_Cleaning_and_merge.ipynb b/0_Cleaning_and_merge.ipynb
index 3f3b639..a3018ba 100644
--- a/0_Cleaning_and_merge.ipynb
+++ b/0_Cleaning_and_merge.ipynb
@@ -10,7 +10,7 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 28,
"id": "15103481-8d74-404c-aa09-7601fe7730da",
"metadata": {},
"outputs": [],
@@ -119,19 +119,10 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": null,
"id": "dd6a3518-b752-4a1e-b77b-9e03e853c3ed",
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/tmp/ipykernel_50143/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
- " df = pd.read_csv(file_in)\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# loop to create dataframes from liste\n",
"files_path = liste_database\n",
@@ -158,7 +149,7 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": null,
"id": "d237be96-8c86-4a91-b7a1-487e87a16c3d",
"metadata": {},
"outputs": [],
@@ -215,7 +206,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": null,
"id": "b95464b1-26bc-4aac-84b4-45da83b92251",
"metadata": {},
"outputs": [],
@@ -258,218 +249,20 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": null,
"id": "3e1d2ba7-ff4f-48eb-93a8-2bb648c70396",
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/tmp/ipykernel_50143/1320335767.py:5: SettingWithCopyWarning: \n",
- "A value is trying to be set on a copy of a slice from a DataFrame\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
- " tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n",
- "/tmp/ipykernel_50143/1320335767.py:9: SettingWithCopyWarning: \n",
- "A value is trying to be set on a copy of a slice from a DataFrame\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
- " suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"df1_ticket_information = preprocessing_tickets_area(tickets = df1_tickets, purchases = df1_purchases, suppliers = df1_suppliers, type_ofs = df1_type_ofs)"
]
},
{
"cell_type": "code",
- "execution_count": 70,
+ "execution_count": null,
"id": "4b18edfc-6450-4c6a-9e7b-ee5a5808c8c9",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " ticket_id | \n",
- " product_id | \n",
- " is_from_subscription | \n",
- " type_of | \n",
- " supplier_name | \n",
- " purchase_date | \n",
- " customer_id | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 13070859 | \n",
- " 225251 | \n",
- " False | \n",
- " 1 | \n",
- " vente en ligne | \n",
- " 2018-12-28 14:47:50+00:00 | \n",
- " 48187 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 13070860 | \n",
- " 224914 | \n",
- " False | \n",
- " 1 | \n",
- " vente en ligne | \n",
- " 2018-12-28 14:47:50+00:00 | \n",
- " 48187 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 13070861 | \n",
- " 224914 | \n",
- " False | \n",
- " 1 | \n",
- " vente en ligne | \n",
- " 2018-12-28 14:47:50+00:00 | \n",
- " 48187 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 13070862 | \n",
- " 224914 | \n",
- " False | \n",
- " 1 | \n",
- " vente en ligne | \n",
- " 2018-12-28 14:47:50+00:00 | \n",
- " 48187 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 13070863 | \n",
- " 224914 | \n",
- " False | \n",
- " 1 | \n",
- " vente en ligne | \n",
- " 2018-12-28 14:47:50+00:00 | \n",
- " 48187 | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 1826667 | \n",
- " 20662815 | \n",
- " 405689 | \n",
- " False | \n",
- " 1 | \n",
- " vente en ligne | \n",
- " 2023-11-08 17:23:54+00:00 | \n",
- " 1256135 | \n",
- "
\n",
- " \n",
- " 1826668 | \n",
- " 20662816 | \n",
- " 403658 | \n",
- " False | \n",
- " 1 | \n",
- " vente en ligne | \n",
- " 2023-11-08 18:32:18+00:00 | \n",
- " 1256136 | \n",
- "
\n",
- " \n",
- " 1826669 | \n",
- " 20662817 | \n",
- " 403658 | \n",
- " False | \n",
- " 1 | \n",
- " vente en ligne | \n",
- " 2023-11-08 18:32:18+00:00 | \n",
- " 1256136 | \n",
- "
\n",
- " \n",
- " 1826670 | \n",
- " 20662818 | \n",
- " 403658 | \n",
- " False | \n",
- " 1 | \n",
- " vente en ligne | \n",
- " 2023-11-08 19:30:28+00:00 | \n",
- " 1256137 | \n",
- "
\n",
- " \n",
- " 1826671 | \n",
- " 20662819 | \n",
- " 403658 | \n",
- " False | \n",
- " 1 | \n",
- " vente en ligne | \n",
- " 2023-11-08 19:30:28+00:00 | \n",
- " 1256137 | \n",
- "
\n",
- " \n",
- "
\n",
- "
1826672 rows × 7 columns
\n",
- "
"
- ],
- "text/plain": [
- " ticket_id product_id is_from_subscription type_of supplier_name \\\n",
- "0 13070859 225251 False 1 vente en ligne \n",
- "1 13070860 224914 False 1 vente en ligne \n",
- "2 13070861 224914 False 1 vente en ligne \n",
- "3 13070862 224914 False 1 vente en ligne \n",
- "4 13070863 224914 False 1 vente en ligne \n",
- "... ... ... ... ... ... \n",
- "1826667 20662815 405689 False 1 vente en ligne \n",
- "1826668 20662816 403658 False 1 vente en ligne \n",
- "1826669 20662817 403658 False 1 vente en ligne \n",
- "1826670 20662818 403658 False 1 vente en ligne \n",
- "1826671 20662819 403658 False 1 vente en ligne \n",
- "\n",
- " purchase_date customer_id \n",
- "0 2018-12-28 14:47:50+00:00 48187 \n",
- "1 2018-12-28 14:47:50+00:00 48187 \n",
- "2 2018-12-28 14:47:50+00:00 48187 \n",
- "3 2018-12-28 14:47:50+00:00 48187 \n",
- "4 2018-12-28 14:47:50+00:00 48187 \n",
- "... ... ... \n",
- "1826667 2023-11-08 17:23:54+00:00 1256135 \n",
- "1826668 2023-11-08 18:32:18+00:00 1256136 \n",
- "1826669 2023-11-08 18:32:18+00:00 1256136 \n",
- "1826670 2023-11-08 19:30:28+00:00 1256137 \n",
- "1826671 2023-11-08 19:30:28+00:00 1256137 \n",
- "\n",
- "[1826672 rows x 7 columns]"
- ]
- },
- "execution_count": 70,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"df1_ticket_information"
]
@@ -484,7 +277,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": null,
"id": "baed146a-9d3a-4397-a812-3d50c9a2f038",
"metadata": {},
"outputs": [],
@@ -513,413 +306,32 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": null,
"id": "5fbfd88b-b94c-489c-9201-670e96e453e7",
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/tmp/ipykernel_50143/3848597476.py:4: SettingWithCopyWarning: \n",
- "A value is trying to be set on a copy of a slice from a DataFrame\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
- " targets.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"df1_target_information = preprocessing_target_area(targets = df1_targets, target_types = df1_target_types, customer_target_mappings = df1_customer_target_mappings)"
]
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": null,
"id": "b4f05142-2a22-42ef-a60d-f23cc4b5cb09",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " customer_id | \n",
- "
\n",
- " \n",
- " target_name | \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " consentement optin mediation specialisee | \n",
- " 150000 | \n",
- "
\n",
- " \n",
- " consentement optin jeune public | \n",
- " 149979 | \n",
- "
\n",
- " \n",
- " consentement optin b2c | \n",
- " 108909 | \n",
- "
\n",
- " \n",
- " Arenametrix_bascule tel vers sib | \n",
- " 35216 | \n",
- "
\n",
- " \n",
- " consentement optout b2c | \n",
- " 34523 | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " Automation_parrainage_newsletter_handicap_visuel | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " consentement optout mediation specialisee | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " Inscrits NL LSF formulaire | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " Market auto - contacts inactifs post-scénario | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " Inactifs - fin du scénario | \n",
- " 1 | \n",
- "
\n",
- " \n",
- "
\n",
- "
283 rows × 1 columns
\n",
- "
"
- ],
- "text/plain": [
- " customer_id\n",
- "target_name \n",
- "consentement optin mediation specialisee 150000\n",
- "consentement optin jeune public 149979\n",
- "consentement optin b2c 108909\n",
- "Arenametrix_bascule tel vers sib 35216\n",
- "consentement optout b2c 34523\n",
- "... ...\n",
- "Automation_parrainage_newsletter_handicap_visuel 1\n",
- "consentement optout mediation specialisee 1\n",
- "Inscrits NL LSF formulaire 1\n",
- "Market auto - contacts inactifs post-scénario 1\n",
- "Inactifs - fin du scénario 1\n",
- "\n",
- "[283 rows x 1 columns]"
- ]
- },
- "execution_count": 19,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"df1_target_information[['target_name', 'customer_id']].groupby('target_name').count().sort_values(by='customer_id', ascending=False)"
]
},
{
"cell_type": "code",
- "execution_count": 29,
+ "execution_count": null,
"id": "4417ff51-f501-4ab9-a192-4ab75764a8ed",
"metadata": {
"scrolled": true
},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " customer_id | \n",
- "
\n",
- " \n",
- " target_name | \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " Arenametrix_bascule tel vers sib | \n",
- " 35216 | \n",
- "
\n",
- " \n",
- " Autres_interet_exposition | \n",
- " 1021 | \n",
- "
\n",
- " \n",
- " COM Inscrits NL générale (historique) | \n",
- " 23005 | \n",
- "
\n",
- " \n",
- " Contacts_prenomsdoubles | \n",
- " 11643 | \n",
- "
\n",
- " \n",
- " DDCP MD Procès du Siècle | \n",
- " 1684 | \n",
- "
\n",
- " \n",
- " DDCP Newsletter centres de loisirs | \n",
- " 1032 | \n",
- "
\n",
- " \n",
- " DDCP Newsletter enseignants | \n",
- " 4510 | \n",
- "
\n",
- " \n",
- " DDCP Newsletter jeune public | \n",
- " 3862 | \n",
- "
\n",
- " \n",
- " DDCP Newsletter relais champ social | \n",
- " 2270 | \n",
- "
\n",
- " \n",
- " DDCP PROMO Participants ateliers (adultes et enfants) | \n",
- " 1954 | \n",
- "
\n",
- " \n",
- " DDCP billets famille | \n",
- " 3609 | \n",
- "
\n",
- " \n",
- " DDCP promo MD pass musées dps oct 2018 | \n",
- " 1785 | \n",
- "
\n",
- " \n",
- " DDCP promo Plan B 2019 (concerts) | \n",
- " 1948 | \n",
- "
\n",
- " \n",
- " DDCP promo spectateurs prog 21-22 (spectacles, ciné, ateliers) | \n",
- " 1293 | \n",
- "
\n",
- " \n",
- " DDCP rentrée culturelle 2023 | \n",
- " 1757 | \n",
- "
\n",
- " \n",
- " DDCP_marseille_jazz_2023 | \n",
- " 1043 | \n",
- "
\n",
- " \n",
- " DRE Festival Jean Rouch | \n",
- " 1502 | \n",
- "
\n",
- " \n",
- " DRE MucemLab | \n",
- " 2302 | \n",
- "
\n",
- " \n",
- " DRE chercheurs | \n",
- " 1557 | \n",
- "
\n",
- " \n",
- " DRE institutionnels | \n",
- " 2229 | \n",
- "
\n",
- " \n",
- " FORMATION _ acheteurs optin last year | \n",
- " 10485 | \n",
- "
\n",
- " \n",
- " Inscrits NL générale (export_291019 + operation_videomaton) | \n",
- " 14086 | \n",
- "
\n",
- " \n",
- " Inscrits NL générale site web | \n",
- " 3732 | \n",
- "
\n",
- " \n",
- " Inscrits NL jeune public site web | \n",
- " 1249 | \n",
- "
\n",
- " \n",
- " Votre première liste | \n",
- " 3715 | \n",
- "
\n",
- " \n",
- " consentement optin b2b | \n",
- " 12735 | \n",
- "
\n",
- " \n",
- " consentement optin b2c | \n",
- " 108909 | \n",
- "
\n",
- " \n",
- " consentement optin dre | \n",
- " 4527 | \n",
- "
\n",
- " \n",
- " consentement optin jeune public | \n",
- " 149979 | \n",
- "
\n",
- " \n",
- " consentement optin mediation specialisee | \n",
- " 150000 | \n",
- "
\n",
- " \n",
- " consentement optin newsletter generale | \n",
- " 22095 | \n",
- "
\n",
- " \n",
- " consentement optin scolaires | \n",
- " 4849 | \n",
- "
\n",
- " \n",
- " consentement optout b2b | \n",
- " 14219 | \n",
- "
\n",
- " \n",
- " consentement optout b2c | \n",
- " 34523 | \n",
- "
\n",
- " \n",
- " consentement optout dre | \n",
- " 14328 | \n",
- "
\n",
- " \n",
- " consentement optout newsletter generale | \n",
- " 18855 | \n",
- "
\n",
- " \n",
- " consentement optout scolaires | \n",
- " 15744 | \n",
- "
\n",
- " \n",
- " ddcp_md_scene_ouverte_au_talent | \n",
- " 1577 | \n",
- "
\n",
- " \n",
- " ddcp_promo_MD_billet_musée_oct_2019_agarder2 | \n",
- " 5482 | \n",
- "
\n",
- " \n",
- " ddcp_promo_md_musée_dps 011019 | \n",
- " 6010 | \n",
- "
\n",
- " \n",
- " ddcp_promo_visiteurs occasionnels_musee_8mois | \n",
- " 6640 | \n",
- "
\n",
- " \n",
- " ddcp_visiteurs dps 010622 | \n",
- " 12355 | \n",
- "
\n",
- " \n",
- " festival_jean_rouch | \n",
- " 1502 | \n",
- "
\n",
- " \n",
- " rappel po barvalo | \n",
- " 1248 | \n",
- "
\n",
- " \n",
- " structures_etiquette champ social | \n",
- " 1488 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " customer_id\n",
- "target_name \n",
- "Arenametrix_bascule tel vers sib 35216\n",
- "Autres_interet_exposition 1021\n",
- "COM Inscrits NL générale (historique) 23005\n",
- "Contacts_prenomsdoubles 11643\n",
- "DDCP MD Procès du Siècle 1684\n",
- "DDCP Newsletter centres de loisirs 1032\n",
- "DDCP Newsletter enseignants 4510\n",
- "DDCP Newsletter jeune public 3862\n",
- "DDCP Newsletter relais champ social 2270\n",
- "DDCP PROMO Participants ateliers (adultes et en... 1954\n",
- "DDCP billets famille 3609\n",
- "DDCP promo MD pass musées dps oct 2018 1785\n",
- "DDCP promo Plan B 2019 (concerts) 1948\n",
- "DDCP promo spectateurs prog 21-22 (spectacles, ... 1293\n",
- "DDCP rentrée culturelle 2023 1757\n",
- "DDCP_marseille_jazz_2023 1043\n",
- "DRE Festival Jean Rouch 1502\n",
- "DRE MucemLab 2302\n",
- "DRE chercheurs 1557\n",
- "DRE institutionnels 2229\n",
- "FORMATION _ acheteurs optin last year 10485\n",
- "Inscrits NL générale (export_291019 + operation... 14086\n",
- "Inscrits NL générale site web 3732\n",
- "Inscrits NL jeune public site web 1249\n",
- "Votre première liste 3715\n",
- "consentement optin b2b 12735\n",
- "consentement optin b2c 108909\n",
- "consentement optin dre 4527\n",
- "consentement optin jeune public 149979\n",
- "consentement optin mediation specialisee 150000\n",
- "consentement optin newsletter generale 22095\n",
- "consentement optin scolaires 4849\n",
- "consentement optout b2b 14219\n",
- "consentement optout b2c 34523\n",
- "consentement optout dre 14328\n",
- "consentement optout newsletter generale 18855\n",
- "consentement optout scolaires 15744\n",
- "ddcp_md_scene_ouverte_au_talent 1577\n",
- "ddcp_promo_MD_billet_musée_oct_2019_agarder2 5482\n",
- "ddcp_promo_md_musée_dps 011019 6010\n",
- "ddcp_promo_visiteurs occasionnels_musee_8mois 6640\n",
- "ddcp_visiteurs dps 010622 12355\n",
- "festival_jean_rouch 1502\n",
- "rappel po barvalo 1248\n",
- "structures_etiquette champ social 1488"
- ]
- },
- "execution_count": 29,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"df1_target_information_reduced = df1_target_information[['target_name', 'customer_id']].groupby('target_name').count()\n",
"df1_target_information_reduced[df1_target_information_reduced['customer_id'] >= 1000]"
@@ -935,7 +347,7 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": null,
"id": "d883cc7b-ac43-4485-b86f-eaf595fbad85",
"metadata": {},
"outputs": [],
@@ -960,271 +372,27 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": null,
"id": "c8552dd6-52c5-4431-b43d-3cd6c578fd9f",
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/tmp/ipykernel_50143/1967867975.py:15: SettingWithCopyWarning: \n",
- "A value is trying to be set on a copy of a slice from a DataFrame.\n",
- "Try using .loc[row_indexer,col_indexer] = value instead\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
- " df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
- "/tmp/ipykernel_50143/1967867975.py:15: SettingWithCopyWarning: \n",
- "A value is trying to be set on a copy of a slice from a DataFrame.\n",
- "Try using .loc[row_indexer,col_indexer] = value instead\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
- " df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
- "/tmp/ipykernel_50143/1967867975.py:15: SettingWithCopyWarning: \n",
- "A value is trying to be set on a copy of a slice from a DataFrame.\n",
- "Try using .loc[row_indexer,col_indexer] = value instead\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
- " df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"df1_campaigns_information = preprocessing_campaigns_area(campaign_stats = df1_campaign_stats, campaigns = df1_campaigns)"
]
},
{
"cell_type": "code",
- "execution_count": 27,
+ "execution_count": null,
"id": "c24457e7-3cad-451a-a65b-7373b656bd6e",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id | \n",
- " customer_id | \n",
- " opened_at | \n",
- " sent_at | \n",
- " delivered_at | \n",
- " campaign_name | \n",
- " campaign_service_id | \n",
- " campaign_sent_at | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 19793 | \n",
- " 112597 | \n",
- " NaT | \n",
- " 2021-03-28 16:01:09+00:00 | \n",
- " 2021-03-28 16:24:18+00:00 | \n",
- " Le Mucem chez vous, gardons le lien #22 | \n",
- " 404 | \n",
- " 2021-03-27 23:00:00+00:00 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 14211 | \n",
- " 113666 | \n",
- " NaT | \n",
- " 2021-03-28 16:01:09+00:00 | \n",
- " 2021-03-28 16:21:02+00:00 | \n",
- " Le Mucem chez vous, gardons le lien #22 | \n",
- " 404 | \n",
- " 2021-03-27 23:00:00+00:00 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 13150 | \n",
- " 280561 | \n",
- " NaT | \n",
- " 2021-03-28 16:00:59+00:00 | \n",
- " 2021-03-28 16:08:45+00:00 | \n",
- " Le Mucem chez vous, gardons le lien #22 | \n",
- " 404 | \n",
- " 2021-03-27 23:00:00+00:00 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 7073 | \n",
- " 101007 | \n",
- " 2021-03-28 18:11:06+00:00 | \n",
- " 2021-03-28 16:00:59+00:00 | \n",
- " 2021-03-28 16:09:47+00:00 | \n",
- " Le Mucem chez vous, gardons le lien #22 | \n",
- " 404 | \n",
- " 2021-03-27 23:00:00+00:00 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 5175 | \n",
- " 103972 | \n",
- " NaT | \n",
- " 2021-03-28 16:01:06+00:00 | \n",
- " 2021-03-28 16:05:03+00:00 | \n",
- " Le Mucem chez vous, gardons le lien #22 | \n",
- " 404 | \n",
- " 2021-03-27 23:00:00+00:00 | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 6214803 | \n",
- " 8302994 | \n",
- " 266155 | \n",
- " 2023-10-23 09:43:25+00:00 | \n",
- " 2023-10-23 09:32:33+00:00 | \n",
- " 2023-10-23 09:32:34+00:00 | \n",
- " dre_nov_2023 | \n",
- " 1318 | \n",
- " 2023-10-23 09:31:17+00:00 | \n",
- "
\n",
- " \n",
- " 6214804 | \n",
- " 8303307 | \n",
- " 21355 | \n",
- " 2023-10-23 09:44:02+00:00 | \n",
- " 2023-10-23 09:32:49+00:00 | \n",
- " 2023-10-23 09:32:49+00:00 | \n",
- " dre_nov_2023 | \n",
- " 1318 | \n",
- " 2023-10-23 09:31:17+00:00 | \n",
- "
\n",
- " \n",
- " 6214805 | \n",
- " 8304346 | \n",
- " 21849 | \n",
- " 2023-10-23 09:45:52+00:00 | \n",
- " 2023-10-23 09:33:28+00:00 | \n",
- " 2023-10-23 09:33:29+00:00 | \n",
- " dre_nov_2023 | \n",
- " 1318 | \n",
- " 2023-10-23 09:31:17+00:00 | \n",
- "
\n",
- " \n",
- " 6214806 | \n",
- " 8302037 | \n",
- " 667789 | \n",
- " 2023-10-23 09:47:32+00:00 | \n",
- " 2023-10-23 09:31:53+00:00 | \n",
- " 2023-10-23 09:31:54+00:00 | \n",
- " dre_nov_2023 | \n",
- " 1318 | \n",
- " 2023-10-23 09:31:17+00:00 | \n",
- "
\n",
- " \n",
- " 6214807 | \n",
- " 8304939 | \n",
- " 294154 | \n",
- " NaT | \n",
- " 2023-10-23 09:33:54+00:00 | \n",
- " 2023-10-23 09:33:55+00:00 | \n",
- " dre_nov_2023 | \n",
- " 1318 | \n",
- " 2023-10-23 09:31:17+00:00 | \n",
- "
\n",
- " \n",
- "
\n",
- "
6214808 rows × 8 columns
\n",
- "
"
- ],
- "text/plain": [
- " id customer_id opened_at \\\n",
- "0 19793 112597 NaT \n",
- "1 14211 113666 NaT \n",
- "2 13150 280561 NaT \n",
- "3 7073 101007 2021-03-28 18:11:06+00:00 \n",
- "4 5175 103972 NaT \n",
- "... ... ... ... \n",
- "6214803 8302994 266155 2023-10-23 09:43:25+00:00 \n",
- "6214804 8303307 21355 2023-10-23 09:44:02+00:00 \n",
- "6214805 8304346 21849 2023-10-23 09:45:52+00:00 \n",
- "6214806 8302037 667789 2023-10-23 09:47:32+00:00 \n",
- "6214807 8304939 294154 NaT \n",
- "\n",
- " sent_at delivered_at \\\n",
- "0 2021-03-28 16:01:09+00:00 2021-03-28 16:24:18+00:00 \n",
- "1 2021-03-28 16:01:09+00:00 2021-03-28 16:21:02+00:00 \n",
- "2 2021-03-28 16:00:59+00:00 2021-03-28 16:08:45+00:00 \n",
- "3 2021-03-28 16:00:59+00:00 2021-03-28 16:09:47+00:00 \n",
- "4 2021-03-28 16:01:06+00:00 2021-03-28 16:05:03+00:00 \n",
- "... ... ... \n",
- "6214803 2023-10-23 09:32:33+00:00 2023-10-23 09:32:34+00:00 \n",
- "6214804 2023-10-23 09:32:49+00:00 2023-10-23 09:32:49+00:00 \n",
- "6214805 2023-10-23 09:33:28+00:00 2023-10-23 09:33:29+00:00 \n",
- "6214806 2023-10-23 09:31:53+00:00 2023-10-23 09:31:54+00:00 \n",
- "6214807 2023-10-23 09:33:54+00:00 2023-10-23 09:33:55+00:00 \n",
- "\n",
- " campaign_name campaign_service_id \\\n",
- "0 Le Mucem chez vous, gardons le lien #22 404 \n",
- "1 Le Mucem chez vous, gardons le lien #22 404 \n",
- "2 Le Mucem chez vous, gardons le lien #22 404 \n",
- "3 Le Mucem chez vous, gardons le lien #22 404 \n",
- "4 Le Mucem chez vous, gardons le lien #22 404 \n",
- "... ... ... \n",
- "6214803 dre_nov_2023 1318 \n",
- "6214804 dre_nov_2023 1318 \n",
- "6214805 dre_nov_2023 1318 \n",
- "6214806 dre_nov_2023 1318 \n",
- "6214807 dre_nov_2023 1318 \n",
- "\n",
- " campaign_sent_at \n",
- "0 2021-03-27 23:00:00+00:00 \n",
- "1 2021-03-27 23:00:00+00:00 \n",
- "2 2021-03-27 23:00:00+00:00 \n",
- "3 2021-03-27 23:00:00+00:00 \n",
- "4 2021-03-27 23:00:00+00:00 \n",
- "... ... \n",
- "6214803 2023-10-23 09:31:17+00:00 \n",
- "6214804 2023-10-23 09:31:17+00:00 \n",
- "6214805 2023-10-23 09:31:17+00:00 \n",
- "6214806 2023-10-23 09:31:17+00:00 \n",
- "6214807 2023-10-23 09:31:17+00:00 \n",
- "\n",
- "[6214808 rows x 8 columns]"
- ]
- },
- "execution_count": 27,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"df1_campaigns_information"
]
},
{
"cell_type": "code",
- "execution_count": 67,
+ "execution_count": null,
"id": "e2c88552-b863-47a2-be23-8d2898fb28bc",
"metadata": {},
"outputs": [],
@@ -1258,39 +426,239 @@
},
{
"cell_type": "code",
- "execution_count": 69,
+ "execution_count": null,
"id": "24537647-bc29-4777-9848-ac4120a4aa60",
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/tmp/ipykernel_50143/2679359833.py:11: SettingWithCopyWarning: \n",
- "A value is trying to be set on a copy of a slice from a DataFrame\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
- " opened_campaign.dropna(subset=['opened_at'], inplace=True)\n",
- "/tmp/ipykernel_50143/2679359833.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
- "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
- "\n",
- "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
- "\n",
- "\n",
- " campaigns_reduced['nb_campaigns_opened'].fillna(0, inplace=True)\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"df1_campaigns_kpi = campaigns_kpi(campaigns_information = df1_campaigns_information) "
]
},
{
"cell_type": "code",
- "execution_count": 66,
+ "execution_count": null,
"id": "6be2a9a6-056b-4e19-8c26-a18ba3df36b3",
"metadata": {},
+ "outputs": [],
+ "source": [
+ "df1_campaigns_kpi"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "56520a97-ede8-4920-a211-3b5b136af33d",
+ "metadata": {},
+ "source": [
+ "## Create Products Table"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9782e9d3-ba20-46bf-8562-bd0969972ddc",
+ "metadata": {},
+ "source": [
+ "Some useful functions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "id": "30488a40-1b38-4b9a-9d3b-26a0597c5e6d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "BUCKET = \"bdc2324-data\"  # first component of every path that display_databases opens\n",
+ "directory_path = '1'  # second path component; the file names loaded below start with the same '1'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "id": "607eb4b4-eed9-4b50-b823-f75c116dd37c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def display_databases(file_name):\n",
+ "    \"\"\"\n",
+ "    Read BUCKET/directory_path/file_name as a CSV through `fs`, print its path and shape, and return the DataFrame.\n",
+ "    \"\"\"\n",
+ "    file_path = \"/\".join([BUCKET, directory_path, file_name])\n",
+ "    print(\"File path : \", file_path)\n",
+ "    with fs.open(file_path, mode=\"rb\") as csv_handle:\n",
+ "        loaded = pd.read_csv(csv_handle, sep=\",\")\n",
+ "\n",
+ "    print(\"Shape : \", loaded.shape)\n",
+ "    return loaded\n",
+ "\n",
+ "\n",
+ "def remove_horodates(df):\n",
+ "    \"\"\"\n",
+ "    Return a copy of df without the created_at / updated_at timestamp columns; a column that is absent is skipped.\n",
+ "    \"\"\"\n",
+ "    # errors=\"ignore\": a table without one of these columns is returned as is instead of raising KeyError.\n",
+ "    return df.drop(columns = [\"created_at\", \"updated_at\"], errors = \"ignore\")\n",
+ "\n",
+ "\n",
+ "def order_columns_id(df):\n",
+ "    \"\"\"\n",
+ "    Return df with its id columns ('id', 'id_x', 'x_id') first and every other column after, original order kept.\n",
+ "    \"\"\"\n",
+ "    # Test the '_'-separated tokens, not the substring: 'identifier' or 'fidelity_delay' contain \"id\" but are not ids.\n",
+ "    is_id = [\"id\" in str(col).split(\"_\") for col in df.columns]\n",
+ "    id_columns = [col for col, flag in zip(df.columns, is_id) if flag]\n",
+ "    remaining_col = [col for col, flag in zip(df.columns, is_id) if not flag]\n",
+ "    return df[id_columns + remaining_col]\n",
+ "\n",
+ "\n",
+ "def process_df_2(df):\n",
+ "    \"\"\"\n",
+ "    Apply remove_horodates then order_columns_id to df, printing the column count and the column index on the way.\n",
+ "    \"\"\"\n",
+ "    without_dates = remove_horodates(df)\n",
+ "    print(\"Number of columns : \", len(without_dates.columns))\n",
+ "    reordered = order_columns_id(without_dates)\n",
+ "    print(\"Columns : \", reordered.columns)\n",
+ "    return reordered\n",
+ "\n",
+ "def load_dataset(name):\n",
+ "    \"\"\"\n",
+ "    Load the CSV called `name` with display_databases, tidy it with process_df_2, and drop 'identifier' if present.\n",
+ "    \"\"\"\n",
+ "    tidy = process_df_2(display_databases(name))\n",
+ "\n",
+ "    # Left disabled by the author: keep only the columns that have no missing value.\n",
+ "    # tidy = tidy.dropna(axis=1, thresh=len(tidy))\n",
+ "    if 'identifier' not in tidy.columns:\n",
+ "        return tidy\n",
+ "    # The table carries an 'identifier' column: return it without that column.\n",
+ "    return tidy.drop(columns = 'identifier')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d23f28c0-bc95-438b-8d14-5b7bb6e267bd",
+ "metadata": {},
+ "source": [
+ "Create theme tables"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "id": "350b09b9-451f-4d47-81fe-f34b892db027",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def create_products_table():\n",
+ "    \"\"\"\n",
+ "    Left-join products with categories, then with type_of_categories, and return the result with id columns first.\n",
+ "    \"\"\"\n",
+ "    print(\"first merge products and categories\")\n",
+ "    products = load_dataset(\"1products.csv\")\n",
+ "    categories = load_dataset(\"1categories.csv\")\n",
+ "\n",
+ "    # Columns that are not carried into the merged table.\n",
+ "    products = products.drop(columns = ['apply_price', 'extra_field', 'amount_consumption'])\n",
+ "    categories = categories.drop(columns = ['extra_field', 'quota'])\n",
+ "\n",
+ "    # Join on products.category_id = categories.id; a clashing 'id' comes out as id_products / id_categories.\n",
+ "    with_category = products.merge(categories, how = 'left', left_on = 'category_id', right_on = 'id',\n",
+ "                                   suffixes = ('_products', '_categories'))\n",
+ "    with_category = with_category.rename(columns = {\"name\" : \"name_categories\"})\n",
+ "\n",
+ "    print(\"Second merge products_theme and type of categories\")\n",
+ "    category_types = load_dataset(\"1type_of_categories.csv\").drop(columns = 'id')\n",
+ "    with_type = with_category.merge(category_types, how = 'left',\n",
+ "                                    left_on = 'category_id', right_on = 'category_id')\n",
+ "\n",
+ "    # Index cleaning: the category table's own id goes, category_id stays.\n",
+ "    tidy = with_type.drop(columns = ['id_categories'])\n",
+ "    return order_columns_id(tidy)\n",
+ "\n",
+ "\n",
+ "def create_events_table():\n",
+ "    \"\"\"\n",
+ "    Left-join events with seasons, event_types and facilities (in that order); return the result with id columns first.\n",
+ "    \"\"\"\n",
+ "    print(\"first merge events and seasons : \")\n",
+ "    events = load_dataset(\"1events.csv\")\n",
+ "    seasons = load_dataset(\"1seasons.csv\")\n",
+ "\n",
+ "    # Columns that are not carried into the merged table.\n",
+ "    events = events.drop(columns = ['manual_added', 'is_display'])\n",
+ "    seasons = seasons.drop(columns = ['start_date_time'])\n",
+ "\n",
+ "    merged = events.merge(seasons, how = 'left', left_on = 'season_id', right_on = 'id',\n",
+ "                          suffixes = ('_events', '_seasons'))\n",
+ "\n",
+ "    print(\"Secondly merge events_theme and event_types : \")\n",
+ "    event_types = load_dataset(\"1event_types.csv\").drop(columns = ['fidelity_delay'])\n",
+ "    merged = merged.merge(event_types, how = 'left', left_on = 'event_type_id', right_on = 'id',\n",
+ "                          suffixes = ('_events', '_event_type'))\n",
+ "    # The rename / drop after each join rely on the right table's 'name' and 'id' arriving un-suffixed.\n",
+ "    merged = merged.rename(columns = {\"name\" : \"name_event_types\"})\n",
+ "    merged = merged.drop(columns = 'id')\n",
+ "\n",
+ "    print(\"thirdly merge events_theme and facilities : \")\n",
+ "    facilities = load_dataset(\"1facilities.csv\").drop(columns = ['fixed_capacity'])\n",
+ "    merged = merged.merge(facilities, how = 'left', left_on = 'facility_id', right_on = 'id',\n",
+ "                          suffixes = ('_events', '_facility'))\n",
+ "    merged = merged.rename(columns = {\"name\" : \"name_facilities\", \"id_events\" : \"event_id\"})\n",
+ "    merged = merged.drop(columns = 'id')\n",
+ "\n",
+ "    # Index cleaning: the season table's own id goes, season_id stays.\n",
+ "    merged = merged.drop(columns = ['id_seasons'])\n",
+ "    merged = order_columns_id(merged)\n",
+ "    return merged\n",
+ "\n",
+ "\n",
+ "def create_representations_table():\n",
+ "    \"\"\"\n",
+ "    Left-join representations with representation_category_capacities; return the result with id columns first.\n",
+ "    \"\"\"\n",
+ "    unused_columns = ['serial', 'open', 'satisfaction', 'is_display', 'expected_filling',\n",
+ "                      'max_filling', 'extra_field', 'start_date_time', 'end_date_time', 'name',\n",
+ "                      'representation_type_id']\n",
+ "    representations = load_dataset(\"1representations.csv\").drop(columns = unused_columns)\n",
+ "    capacities = load_dataset(\"1representation_category_capacities.csv\")\n",
+ "    capacities = capacities.drop(columns = ['expected_filling', 'max_filling'])\n",
+ "\n",
+ "    joined = representations.merge(capacities, how = 'left', left_on = 'id', right_on = 'representation_id',\n",
+ "                                   suffixes = ('_representation', '_representation_cap'))\n",
+ "    # index cleaning. NOTE(review): the left id is dropped; unmatched rows presumably keep a NaN representation_id - confirm.\n",
+ "    joined = joined.drop(columns = [\"id_representation\"])\n",
+ "    return order_columns_id(joined)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "id": "0fccc8ef-e575-4857-a401-94a7274394df",
+ "metadata": {},
"outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "first merge products and categories\n",
+ "File path : bdc2324-data/1/1products.csv\n",
+ "Shape : (94803, 14)\n",
+ "Number of columns : 12\n",
+ "Columns : Index(['id', 'representation_id', 'pricing_formula_id', 'category_id',\n",
+ " 'products_group_id', 'product_pack_id', 'identifier', 'amount',\n",
+ " 'is_full_price', 'apply_price', 'extra_field', 'amount_consumption'],\n",
+ " dtype='object')\n",
+ "File path : bdc2324-data/1/1categories.csv\n",
+ "Shape : (27, 7)\n",
+ "Number of columns : 5\n",
+ "Columns : Index(['id', 'identifier', 'name', 'extra_field', 'quota'], dtype='object')\n",
+ "Second merge products_theme and type of categories\n",
+ "File path : bdc2324-data/1/1type_of_categories.csv\n",
+ "Shape : (5, 6)\n",
+ "Number of columns : 4\n",
+ "Columns : Index(['id', 'type_of_id', 'category_id', 'identifier'], dtype='object')\n"
+ ]
+ },
{
"data": {
"text/html": [
@@ -1312,133 +680,673 @@
" \n",
" \n",
" | \n",
- " customer_id | \n",
- " nb_campaigns | \n",
- " nb_campaigns_opened | \n",
- " time_to_open | \n",
+ " id_products | \n",
+ " representation_id | \n",
+ " pricing_formula_id | \n",
+ " category_id | \n",
+ " products_group_id | \n",
+ " product_pack_id | \n",
+ " type_of_id | \n",
+ " amount | \n",
+ " is_full_price | \n",
+ " name_categories | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
- " 2 | \n",
- " 4 | \n",
- " 0.0 | \n",
- " NaT | \n",
+ " 10682 | \n",
+ " 914 | \n",
+ " 114 | \n",
+ " 41 | \n",
+ " 10655 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " 9.0 | \n",
+ " False | \n",
+ " indiv activité tr | \n",
"
\n",
" \n",
" 1 | \n",
- " 3 | \n",
- " 222 | \n",
- " 124.0 | \n",
- " 1 days 00:28:30.169354838 | \n",
+ " 478 | \n",
+ " 273 | \n",
+ " 131 | \n",
+ " 1 | \n",
+ " 471 | \n",
+ " 1 | \n",
+ " 12.0 | \n",
+ " 9.5 | \n",
+ " False | \n",
+ " indiv entrées tp | \n",
"
\n",
" \n",
" 2 | \n",
- " 4 | \n",
- " 7 | \n",
- " 7.0 | \n",
- " 1 days 04:31:01.428571428 | \n",
+ " 20873 | \n",
+ " 275 | \n",
+ " 137 | \n",
+ " 1 | \n",
+ " 20825 | \n",
+ " 1 | \n",
+ " 12.0 | \n",
+ " 11.5 | \n",
+ " False | \n",
+ " indiv entrées tp | \n",
"
\n",
" \n",
" 3 | \n",
+ " 157142 | \n",
+ " 82519 | \n",
+ " 9 | \n",
" 5 | \n",
- " 4 | \n",
- " 0.0 | \n",
- " NaT | \n",
+ " 156773 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " 8.0 | \n",
+ " False | \n",
+ " indiv entrées tr | \n",
"
\n",
" \n",
" 4 | \n",
- " 6 | \n",
- " 20 | \n",
- " 0.0 | \n",
- " NaT | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 130467 | \n",
- " 1256097 | \n",
+ " 1341 | \n",
+ " 9 | \n",
+ " 93 | \n",
" 1 | \n",
- " 1.0 | \n",
- " 0 days 02:11:15 | \n",
- "
\n",
- " \n",
- " 130468 | \n",
- " 1256098 | \n",
+ " 1175 | \n",
" 1 | \n",
- " 0.0 | \n",
- " NaT | \n",
- "
\n",
- " \n",
- " 130469 | \n",
- " 1256099 | \n",
- " 1 | \n",
- " 0.0 | \n",
- " NaT | \n",
- "
\n",
- " \n",
- " 130470 | \n",
- " 1256100 | \n",
- " 1 | \n",
- " 0.0 | \n",
- " NaT | \n",
- "
\n",
- " \n",
- " 130471 | \n",
- " 1256101 | \n",
- " 1 | \n",
- " 0.0 | \n",
- " NaT | \n",
+ " 12.0 | \n",
+ " 8.5 | \n",
+ " False | \n",
+ " indiv entrées tp | \n",
"
\n",
" \n",
"\n",
- "130472 rows × 4 columns
\n",
""
],
"text/plain": [
- " customer_id nb_campaigns nb_campaigns_opened \\\n",
- "0 2 4 0.0 \n",
- "1 3 222 124.0 \n",
- "2 4 7 7.0 \n",
- "3 5 4 0.0 \n",
- "4 6 20 0.0 \n",
- "... ... ... ... \n",
- "130467 1256097 1 1.0 \n",
- "130468 1256098 1 0.0 \n",
- "130469 1256099 1 0.0 \n",
- "130470 1256100 1 0.0 \n",
- "130471 1256101 1 0.0 \n",
+ " id_products representation_id pricing_formula_id category_id \\\n",
+ "0 10682 914 114 41 \n",
+ "1 478 273 131 1 \n",
+ "2 20873 275 137 1 \n",
+ "3 157142 82519 9 5 \n",
+ "4 1341 9 93 1 \n",
"\n",
- " time_to_open \n",
- "0 NaT \n",
- "1 1 days 00:28:30.169354838 \n",
- "2 1 days 04:31:01.428571428 \n",
- "3 NaT \n",
- "4 NaT \n",
- "... ... \n",
- "130467 0 days 02:11:15 \n",
- "130468 NaT \n",
- "130469 NaT \n",
- "130470 NaT \n",
- "130471 NaT \n",
+ " products_group_id product_pack_id type_of_id amount is_full_price \\\n",
+ "0 10655 1 NaN 9.0 False \n",
+ "1 471 1 12.0 9.5 False \n",
+ "2 20825 1 12.0 11.5 False \n",
+ "3 156773 1 NaN 8.0 False \n",
+ "4 1175 1 12.0 8.5 False \n",
"\n",
- "[130472 rows x 4 columns]"
+ " name_categories \n",
+ "0 indiv activité tr \n",
+ "1 indiv entrées tp \n",
+ "2 indiv entrées tp \n",
+ "3 indiv entrées tr \n",
+ "4 indiv entrées tp "
]
},
- "execution_count": 66,
+ "execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "df1_campaigns_kpi"
+ "products_theme = create_products_table()\n",
+ "products_theme.head()"
]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "id": "779d8aaf-6668-4f66-8852-847304407ea3",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "first merge events and seasons : \n",
+ "File path : bdc2324-data/1/1events.csv\n",
+ "Shape : (1232, 12)\n",
+ "Number of columns : 10\n",
+ "Columns : Index(['id', 'season_id', 'facility_id', 'event_type_id', 'event_type_key_id',\n",
+ " 'facility_key_id', 'identifier', 'name', 'manual_added', 'is_display'],\n",
+ " dtype='object')\n",
+ "File path : bdc2324-data/1/1seasons.csv\n",
+ "Shape : (13, 6)\n",
+ "Number of columns : 4\n",
+ "Columns : Index(['id', 'identifier', 'name', 'start_date_time'], dtype='object')\n",
+ "Secondly merge events_theme and event_types : \n",
+ "File path : bdc2324-data/1/1event_types.csv\n",
+ "Shape : (9, 6)\n",
+ "Number of columns : 4\n",
+ "Columns : Index(['id', 'fidelity_delay', 'identifier', 'name'], dtype='object')\n",
+ "thirdly merge events_theme and facilities : \n",
+ "File path : bdc2324-data/1/1facilities.csv\n",
+ "Shape : (2, 7)\n",
+ "Number of columns : 5\n",
+ "Columns : Index(['id', 'street_id', 'identifier', 'name', 'fixed_capacity'], dtype='object')\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " event_id | \n",
+ " season_id | \n",
+ " facility_id | \n",
+ " event_type_id | \n",
+ " event_type_key_id | \n",
+ " facility_key_id | \n",
+ " street_id | \n",
+ " name_events | \n",
+ " name_seasons | \n",
+ " name_event_types | \n",
+ " name_facilities | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 192 | \n",
+ " 16 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " frontières | \n",
+ " 2018 | \n",
+ " spectacle vivant | \n",
+ " mucem | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 30329 | \n",
+ " 2767 | \n",
+ " 1 | \n",
+ " 5 | \n",
+ " 5 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " visite guidée une autre histoire du monde (1h00) | \n",
+ " 2023 | \n",
+ " offre muséale groupe | \n",
+ " mucem | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 161 | \n",
+ " 16 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " visite contée les chercheurs d'or indiv | \n",
+ " 2018 | \n",
+ " offre muséale individuel | \n",
+ " mucem | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 5957 | \n",
+ " 582 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " we dreamt of utopia and we woke up screaming. | \n",
+ " 2021 | \n",
+ " spectacle vivant | \n",
+ " mucem | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 8337 | \n",
+ " 582 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " jeff koons épisodes 4 | \n",
+ " 2021 | \n",
+ " spectacle vivant | \n",
+ " mucem | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " event_id season_id facility_id event_type_id event_type_key_id \\\n",
+ "0 192 16 1 4 4 \n",
+ "1 30329 2767 1 5 5 \n",
+ "2 161 16 1 2 2 \n",
+ "3 5957 582 1 4 4 \n",
+ "4 8337 582 1 4 4 \n",
+ "\n",
+ " facility_key_id street_id \\\n",
+ "0 1 1 \n",
+ "1 1 1 \n",
+ "2 1 1 \n",
+ "3 1 1 \n",
+ "4 1 1 \n",
+ "\n",
+ " name_events name_seasons \\\n",
+ "0 frontières 2018 \n",
+ "1 visite guidée une autre histoire du monde (1h00) 2023 \n",
+ "2 visite contée les chercheurs d'or indiv 2018 \n",
+ "3 we dreamt of utopia and we woke up screaming. 2021 \n",
+ "4 jeff koons épisodes 4 2021 \n",
+ "\n",
+ " name_event_types name_facilities \n",
+ "0 spectacle vivant mucem \n",
+ "1 offre muséale groupe mucem \n",
+ "2 offre muséale individuel mucem \n",
+ "3 spectacle vivant mucem \n",
+ "4 spectacle vivant mucem "
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "events_theme= create_events_table()\n",
+ "events_theme.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "id": "7714fa32-303b-4ea7-b174-3fd0fcab5af0",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "File path : bdc2324-data/1/1representations.csv\n",
+ "Shape : (36095, 16)\n",
+ "Number of columns : 14\n",
+ "Columns : Index(['id', 'event_id', 'representation_type_id', 'identifier', 'serial',\n",
+ " 'start_date_time', 'open', 'satisfaction', 'end_date_time', 'name',\n",
+ " 'is_display', 'expected_filling', 'max_filling', 'extra_field'],\n",
+ " dtype='object')\n",
+ "File path : bdc2324-data/1/1representation_category_capacities.csv\n",
+ "Shape : (65241, 7)\n",
+ "Number of columns : 5\n",
+ "Columns : Index(['id', 'representation_id', 'category_id', 'expected_filling',\n",
+ " 'max_filling'],\n",
+ " dtype='object')\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " event_id | \n",
+ " id_representation_cap | \n",
+ " representation_id | \n",
+ " category_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 12384 | \n",
+ " 123058 | \n",
+ " 84820 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 37 | \n",
+ " 2514 | \n",
+ " 269 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 37 | \n",
+ " 384 | \n",
+ " 269 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 37 | \n",
+ " 2515 | \n",
+ " 269 | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 37 | \n",
+ " 383 | \n",
+ " 269 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " event_id id_representation_cap representation_id category_id\n",
+ "0 12384 123058 84820 2\n",
+ "1 37 2514 269 2\n",
+ "2 37 384 269 5\n",
+ "3 37 2515 269 10\n",
+ "4 37 383 269 1"
+ ]
+ },
+ "execution_count": 34,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "representation_theme = create_representations_table()\n",
+ "representation_theme.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8fa191d5-c867-4d4d-bbab-f29d7d91ce6a",
+ "metadata": {},
+ "source": [
+ "Create the uniform product database by merging the products, representations and events tables."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "id": "15a62ed6-35e4-4abc-aeef-a7daeec0a4ba",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def uniform_product_df(products=None, representations=None, events=None):\n",
+ "    \"\"\"\n",
+ "    Build the uniform product dataset by joining products, representations and events.\n",
+ "\n",
+ "    Every argument is optional. An argument left to None falls back to the\n",
+ "    notebook-level DataFrame of the same theme (products_theme,\n",
+ "    representation_theme, events_theme), so the zero-argument call keeps working.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    products : pandas.DataFrame, optional\n",
+ "        Products table, joined on representation_id and category_id.\n",
+ "    representations : pandas.DataFrame, optional\n",
+ "        Representations table providing representation_id, category_id and event_id.\n",
+ "    events : pandas.DataFrame, optional\n",
+ "        Events table, joined on event_id.\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    pandas.DataFrame\n",
+ "        The joined table after being passed through order_columns_id, without the\n",
+ "        type_of_id, name_events, name_seasons and name_categories columns.\n",
+ "    \"\"\"\n",
+ "    # fall back to the notebook globals only for the inputs that were not supplied\n",
+ "    products = products_theme if products is None else products\n",
+ "    representations = representation_theme if representations is None else representations\n",
+ "    events = events_theme if events is None else events\n",
+ "\n",
+ "    print(\"Products theme columns : \", products.columns)\n",
+ "    print(\"\\n Representation theme columns : \", representations.columns)\n",
+ "    print(\"\\n Events theme columns : \", events.columns)\n",
+ "\n",
+ "    # left join : every product row is kept, even without a matching representation\n",
+ "    products_global = products.merge(representations, how='left',\n",
+ "                                     on=[\"representation_id\", \"category_id\"])\n",
+ "\n",
+ "    # event_id comes from the representations table, so events are joined second\n",
+ "    products_global = products_global.merge(events, how='left', on='event_id',\n",
+ "                                            suffixes=(\"_representation\", \"_event\"))\n",
+ "\n",
+ "    products_global = order_columns_id(products_global)\n",
+ "\n",
+ "    # remove useless columns\n",
+ "    products_global = products_global.drop(columns=['type_of_id', 'name_events', 'name_seasons', 'name_categories'])\n",
+ "    return products_global"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "id": "89dc9685-1de9-4ce3-a6c0-8d7f1931a951",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Products theme columns : Index(['id_products', 'representation_id', 'pricing_formula_id', 'category_id',\n",
+ " 'products_group_id', 'product_pack_id', 'type_of_id', 'amount',\n",
+ " 'is_full_price', 'name_categories'],\n",
+ " dtype='object')\n",
+ "\n",
+ " Representation theme columns : Index(['event_id', 'id_representation_cap', 'representation_id',\n",
+ " 'category_id'],\n",
+ " dtype='object')\n",
+ "\n",
+ " Events theme columns : Index(['event_id', 'season_id', 'facility_id', 'event_type_id',\n",
+ " 'event_type_key_id', 'facility_key_id', 'street_id', 'name_events',\n",
+ " 'name_seasons', 'name_event_types', 'name_facilities'],\n",
+ " dtype='object')\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id_products | \n",
+ " representation_id | \n",
+ " pricing_formula_id | \n",
+ " category_id | \n",
+ " products_group_id | \n",
+ " product_pack_id | \n",
+ " event_id | \n",
+ " id_representation_cap | \n",
+ " season_id | \n",
+ " facility_id | \n",
+ " event_type_id | \n",
+ " event_type_key_id | \n",
+ " facility_key_id | \n",
+ " street_id | \n",
+ " amount | \n",
+ " is_full_price | \n",
+ " name_event_types | \n",
+ " name_facilities | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 10682 | \n",
+ " 914 | \n",
+ " 114 | \n",
+ " 41 | \n",
+ " 10655 | \n",
+ " 1 | \n",
+ " 132 | \n",
+ " 8789 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 5 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 9.0 | \n",
+ " False | \n",
+ " offre muséale individuel | \n",
+ " mucem | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 478 | \n",
+ " 273 | \n",
+ " 131 | \n",
+ " 1 | \n",
+ " 471 | \n",
+ " 1 | \n",
+ " 37 | \n",
+ " 390 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 9.5 | \n",
+ " False | \n",
+ " offre muséale individuel | \n",
+ " mucem | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 20873 | \n",
+ " 275 | \n",
+ " 137 | \n",
+ " 1 | \n",
+ " 20825 | \n",
+ " 1 | \n",
+ " 37 | \n",
+ " 395 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 11.5 | \n",
+ " False | \n",
+ " offre muséale individuel | \n",
+ " mucem | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 157142 | \n",
+ " 82519 | \n",
+ " 9 | \n",
+ " 5 | \n",
+ " 156773 | \n",
+ " 1 | \n",
+ " 12365 | \n",
+ " 120199 | \n",
+ " 1754 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 8.0 | \n",
+ " False | \n",
+ " offre muséale individuel | \n",
+ " mucem | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1341 | \n",
+ " 9 | \n",
+ " 93 | \n",
+ " 1 | \n",
+ " 1175 | \n",
+ " 1 | \n",
+ " 8 | \n",
+ " 21 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 6 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 8.5 | \n",
+ " False | \n",
+ " non défini | \n",
+ " mucem | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id_products representation_id pricing_formula_id category_id \\\n",
+ "0 10682 914 114 41 \n",
+ "1 478 273 131 1 \n",
+ "2 20873 275 137 1 \n",
+ "3 157142 82519 9 5 \n",
+ "4 1341 9 93 1 \n",
+ "\n",
+ " products_group_id product_pack_id event_id id_representation_cap \\\n",
+ "0 10655 1 132 8789 \n",
+ "1 471 1 37 390 \n",
+ "2 20825 1 37 395 \n",
+ "3 156773 1 12365 120199 \n",
+ "4 1175 1 8 21 \n",
+ "\n",
+ " season_id facility_id event_type_id event_type_key_id facility_key_id \\\n",
+ "0 4 1 2 5 1 \n",
+ "1 2 1 2 2 1 \n",
+ "2 2 1 2 2 1 \n",
+ "3 1754 1 2 4 1 \n",
+ "4 4 1 3 6 1 \n",
+ "\n",
+ " street_id amount is_full_price name_event_types name_facilities \n",
+ "0 1 9.0 False offre muséale individuel mucem \n",
+ "1 1 9.5 False offre muséale individuel mucem \n",
+ "2 1 11.5 False offre muséale individuel mucem \n",
+ "3 1 8.0 False offre muséale individuel mucem \n",
+ "4 1 8.5 False non défini mucem "
+ ]
+ },
+ "execution_count": 36,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "products_global = uniform_product_df()\n",
+ "products_global.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "117d172a-2195-4060-9245-96c6f637ebbd",
+ "metadata": {},
+ "outputs": [],
+ "source": []
}
],
"metadata": {