diff --git a/0_Cleaning_and_merge.ipynb b/0_Cleaning_and_merge.ipynb
index 3f3b639..a3018ba 100644
--- a/0_Cleaning_and_merge.ipynb
+++ b/0_Cleaning_and_merge.ipynb
@@ -10,7 +10,7 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 28,
"id": "15103481-8d74-404c-aa09-7601fe7730da",
"metadata": {},
"outputs": [],
@@ -119,19 +119,10 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": null,
"id": "dd6a3518-b752-4a1e-b77b-9e03e853c3ed",
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/tmp/ipykernel_50143/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
- " df = pd.read_csv(file_in)\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# loop to create dataframes from liste\n",
"files_path = liste_database\n",
@@ -158,7 +149,7 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": null,
"id": "d237be96-8c86-4a91-b7a1-487e87a16c3d",
"metadata": {},
"outputs": [],
@@ -215,7 +206,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": null,
"id": "b95464b1-26bc-4aac-84b4-45da83b92251",
"metadata": {},
"outputs": [],
@@ -258,218 +249,20 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": null,
"id": "3e1d2ba7-ff4f-48eb-93a8-2bb648c70396",
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/tmp/ipykernel_50143/1320335767.py:5: SettingWithCopyWarning: \n",
- "A value is trying to be set on a copy of a slice from a DataFrame\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
- " tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n",
- "/tmp/ipykernel_50143/1320335767.py:9: SettingWithCopyWarning: \n",
- "A value is trying to be set on a copy of a slice from a DataFrame\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
- " suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"df1_ticket_information = preprocessing_tickets_area(tickets = df1_tickets, purchases = df1_purchases, suppliers = df1_suppliers, type_ofs = df1_type_ofs)"
]
},
{
"cell_type": "code",
- "execution_count": 70,
+ "execution_count": null,
"id": "4b18edfc-6450-4c6a-9e7b-ee5a5808c8c9",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " ticket_id | \n",
- " product_id | \n",
- " is_from_subscription | \n",
- " type_of | \n",
- " supplier_name | \n",
- " purchase_date | \n",
- " customer_id | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 13070859 | \n",
- " 225251 | \n",
- " False | \n",
- " 1 | \n",
- " vente en ligne | \n",
- " 2018-12-28 14:47:50+00:00 | \n",
- " 48187 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 13070860 | \n",
- " 224914 | \n",
- " False | \n",
- " 1 | \n",
- " vente en ligne | \n",
- " 2018-12-28 14:47:50+00:00 | \n",
- " 48187 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 13070861 | \n",
- " 224914 | \n",
- " False | \n",
- " 1 | \n",
- " vente en ligne | \n",
- " 2018-12-28 14:47:50+00:00 | \n",
- " 48187 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 13070862 | \n",
- " 224914 | \n",
- " False | \n",
- " 1 | \n",
- " vente en ligne | \n",
- " 2018-12-28 14:47:50+00:00 | \n",
- " 48187 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 13070863 | \n",
- " 224914 | \n",
- " False | \n",
- " 1 | \n",
- " vente en ligne | \n",
- " 2018-12-28 14:47:50+00:00 | \n",
- " 48187 | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 1826667 | \n",
- " 20662815 | \n",
- " 405689 | \n",
- " False | \n",
- " 1 | \n",
- " vente en ligne | \n",
- " 2023-11-08 17:23:54+00:00 | \n",
- " 1256135 | \n",
- "
\n",
- " \n",
- " 1826668 | \n",
- " 20662816 | \n",
- " 403658 | \n",
- " False | \n",
- " 1 | \n",
- " vente en ligne | \n",
- " 2023-11-08 18:32:18+00:00 | \n",
- " 1256136 | \n",
- "
\n",
- " \n",
- " 1826669 | \n",
- " 20662817 | \n",
- " 403658 | \n",
- " False | \n",
- " 1 | \n",
- " vente en ligne | \n",
- " 2023-11-08 18:32:18+00:00 | \n",
- " 1256136 | \n",
- "
\n",
- " \n",
- " 1826670 | \n",
- " 20662818 | \n",
- " 403658 | \n",
- " False | \n",
- " 1 | \n",
- " vente en ligne | \n",
- " 2023-11-08 19:30:28+00:00 | \n",
- " 1256137 | \n",
- "
\n",
- " \n",
- " 1826671 | \n",
- " 20662819 | \n",
- " 403658 | \n",
- " False | \n",
- " 1 | \n",
- " vente en ligne | \n",
- " 2023-11-08 19:30:28+00:00 | \n",
- " 1256137 | \n",
- "
\n",
- " \n",
- "
\n",
- "
1826672 rows × 7 columns
\n",
- "
"
- ],
- "text/plain": [
- " ticket_id product_id is_from_subscription type_of supplier_name \\\n",
- "0 13070859 225251 False 1 vente en ligne \n",
- "1 13070860 224914 False 1 vente en ligne \n",
- "2 13070861 224914 False 1 vente en ligne \n",
- "3 13070862 224914 False 1 vente en ligne \n",
- "4 13070863 224914 False 1 vente en ligne \n",
- "... ... ... ... ... ... \n",
- "1826667 20662815 405689 False 1 vente en ligne \n",
- "1826668 20662816 403658 False 1 vente en ligne \n",
- "1826669 20662817 403658 False 1 vente en ligne \n",
- "1826670 20662818 403658 False 1 vente en ligne \n",
- "1826671 20662819 403658 False 1 vente en ligne \n",
- "\n",
- " purchase_date customer_id \n",
- "0 2018-12-28 14:47:50+00:00 48187 \n",
- "1 2018-12-28 14:47:50+00:00 48187 \n",
- "2 2018-12-28 14:47:50+00:00 48187 \n",
- "3 2018-12-28 14:47:50+00:00 48187 \n",
- "4 2018-12-28 14:47:50+00:00 48187 \n",
- "... ... ... \n",
- "1826667 2023-11-08 17:23:54+00:00 1256135 \n",
- "1826668 2023-11-08 18:32:18+00:00 1256136 \n",
- "1826669 2023-11-08 18:32:18+00:00 1256136 \n",
- "1826670 2023-11-08 19:30:28+00:00 1256137 \n",
- "1826671 2023-11-08 19:30:28+00:00 1256137 \n",
- "\n",
- "[1826672 rows x 7 columns]"
- ]
- },
- "execution_count": 70,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"df1_ticket_information"
]
@@ -484,7 +277,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": null,
"id": "baed146a-9d3a-4397-a812-3d50c9a2f038",
"metadata": {},
"outputs": [],
@@ -513,413 +306,32 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": null,
"id": "5fbfd88b-b94c-489c-9201-670e96e453e7",
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/tmp/ipykernel_50143/3848597476.py:4: SettingWithCopyWarning: \n",
- "A value is trying to be set on a copy of a slice from a DataFrame\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
- " targets.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"df1_target_information = preprocessing_target_area(targets = df1_targets, target_types = df1_target_types, customer_target_mappings = df1_customer_target_mappings)"
]
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": null,
"id": "b4f05142-2a22-42ef-a60d-f23cc4b5cb09",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " customer_id | \n",
- "
\n",
- " \n",
- " target_name | \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " consentement optin mediation specialisee | \n",
- " 150000 | \n",
- "
\n",
- " \n",
- " consentement optin jeune public | \n",
- " 149979 | \n",
- "
\n",
- " \n",
- " consentement optin b2c | \n",
- " 108909 | \n",
- "
\n",
- " \n",
- " Arenametrix_bascule tel vers sib | \n",
- " 35216 | \n",
- "
\n",
- " \n",
- " consentement optout b2c | \n",
- " 34523 | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " Automation_parrainage_newsletter_handicap_visuel | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " consentement optout mediation specialisee | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " Inscrits NL LSF formulaire | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " Market auto - contacts inactifs post-scénario | \n",
- " 1 | \n",
- "
\n",
- " \n",
- " Inactifs - fin du scénario | \n",
- " 1 | \n",
- "
\n",
- " \n",
- "
\n",
- "
283 rows × 1 columns
\n",
- "
"
- ],
- "text/plain": [
- " customer_id\n",
- "target_name \n",
- "consentement optin mediation specialisee 150000\n",
- "consentement optin jeune public 149979\n",
- "consentement optin b2c 108909\n",
- "Arenametrix_bascule tel vers sib 35216\n",
- "consentement optout b2c 34523\n",
- "... ...\n",
- "Automation_parrainage_newsletter_handicap_visuel 1\n",
- "consentement optout mediation specialisee 1\n",
- "Inscrits NL LSF formulaire 1\n",
- "Market auto - contacts inactifs post-scénario 1\n",
- "Inactifs - fin du scénario 1\n",
- "\n",
- "[283 rows x 1 columns]"
- ]
- },
- "execution_count": 19,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"df1_target_information[['target_name', 'customer_id']].groupby('target_name').count().sort_values(by='customer_id', ascending=False)"
]
},
{
"cell_type": "code",
- "execution_count": 29,
+ "execution_count": null,
"id": "4417ff51-f501-4ab9-a192-4ab75764a8ed",
"metadata": {
"scrolled": true
},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " customer_id | \n",
- "
\n",
- " \n",
- " target_name | \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " Arenametrix_bascule tel vers sib | \n",
- " 35216 | \n",
- "
\n",
- " \n",
- " Autres_interet_exposition | \n",
- " 1021 | \n",
- "
\n",
- " \n",
- " COM Inscrits NL générale (historique) | \n",
- " 23005 | \n",
- "
\n",
- " \n",
- " Contacts_prenomsdoubles | \n",
- " 11643 | \n",
- "
\n",
- " \n",
- " DDCP MD Procès du Siècle | \n",
- " 1684 | \n",
- "
\n",
- " \n",
- " DDCP Newsletter centres de loisirs | \n",
- " 1032 | \n",
- "
\n",
- " \n",
- " DDCP Newsletter enseignants | \n",
- " 4510 | \n",
- "
\n",
- " \n",
- " DDCP Newsletter jeune public | \n",
- " 3862 | \n",
- "
\n",
- " \n",
- " DDCP Newsletter relais champ social | \n",
- " 2270 | \n",
- "
\n",
- " \n",
- " DDCP PROMO Participants ateliers (adultes et enfants) | \n",
- " 1954 | \n",
- "
\n",
- " \n",
- " DDCP billets famille | \n",
- " 3609 | \n",
- "
\n",
- " \n",
- " DDCP promo MD pass musées dps oct 2018 | \n",
- " 1785 | \n",
- "
\n",
- " \n",
- " DDCP promo Plan B 2019 (concerts) | \n",
- " 1948 | \n",
- "
\n",
- " \n",
- " DDCP promo spectateurs prog 21-22 (spectacles, ciné, ateliers) | \n",
- " 1293 | \n",
- "
\n",
- " \n",
- " DDCP rentrée culturelle 2023 | \n",
- " 1757 | \n",
- "
\n",
- " \n",
- " DDCP_marseille_jazz_2023 | \n",
- " 1043 | \n",
- "
\n",
- " \n",
- " DRE Festival Jean Rouch | \n",
- " 1502 | \n",
- "
\n",
- " \n",
- " DRE MucemLab | \n",
- " 2302 | \n",
- "
\n",
- " \n",
- " DRE chercheurs | \n",
- " 1557 | \n",
- "
\n",
- " \n",
- " DRE institutionnels | \n",
- " 2229 | \n",
- "
\n",
- " \n",
- " FORMATION _ acheteurs optin last year | \n",
- " 10485 | \n",
- "
\n",
- " \n",
- " Inscrits NL générale (export_291019 + operation_videomaton) | \n",
- " 14086 | \n",
- "
\n",
- " \n",
- " Inscrits NL générale site web | \n",
- " 3732 | \n",
- "
\n",
- " \n",
- " Inscrits NL jeune public site web | \n",
- " 1249 | \n",
- "
\n",
- " \n",
- " Votre première liste | \n",
- " 3715 | \n",
- "
\n",
- " \n",
- " consentement optin b2b | \n",
- " 12735 | \n",
- "
\n",
- " \n",
- " consentement optin b2c | \n",
- " 108909 | \n",
- "
\n",
- " \n",
- " consentement optin dre | \n",
- " 4527 | \n",
- "
\n",
- " \n",
- " consentement optin jeune public | \n",
- " 149979 | \n",
- "
\n",
- " \n",
- " consentement optin mediation specialisee | \n",
- " 150000 | \n",
- "
\n",
- " \n",
- " consentement optin newsletter generale | \n",
- " 22095 | \n",
- "
\n",
- " \n",
- " consentement optin scolaires | \n",
- " 4849 | \n",
- "
\n",
- " \n",
- " consentement optout b2b | \n",
- " 14219 | \n",
- "
\n",
- " \n",
- " consentement optout b2c | \n",
- " 34523 | \n",
- "
\n",
- " \n",
- " consentement optout dre | \n",
- " 14328 | \n",
- "
\n",
- " \n",
- " consentement optout newsletter generale | \n",
- " 18855 | \n",
- "
\n",
- " \n",
- " consentement optout scolaires | \n",
- " 15744 | \n",
- "
\n",
- " \n",
- " ddcp_md_scene_ouverte_au_talent | \n",
- " 1577 | \n",
- "
\n",
- " \n",
- " ddcp_promo_MD_billet_musée_oct_2019_agarder2 | \n",
- " 5482 | \n",
- "
\n",
- " \n",
- " ddcp_promo_md_musée_dps 011019 | \n",
- " 6010 | \n",
- "
\n",
- " \n",
- " ddcp_promo_visiteurs occasionnels_musee_8mois | \n",
- " 6640 | \n",
- "
\n",
- " \n",
- " ddcp_visiteurs dps 010622 | \n",
- " 12355 | \n",
- "
\n",
- " \n",
- " festival_jean_rouch | \n",
- " 1502 | \n",
- "
\n",
- " \n",
- " rappel po barvalo | \n",
- " 1248 | \n",
- "
\n",
- " \n",
- " structures_etiquette champ social | \n",
- " 1488 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " customer_id\n",
- "target_name \n",
- "Arenametrix_bascule tel vers sib 35216\n",
- "Autres_interet_exposition 1021\n",
- "COM Inscrits NL générale (historique) 23005\n",
- "Contacts_prenomsdoubles 11643\n",
- "DDCP MD Procès du Siècle 1684\n",
- "DDCP Newsletter centres de loisirs 1032\n",
- "DDCP Newsletter enseignants 4510\n",
- "DDCP Newsletter jeune public 3862\n",
- "DDCP Newsletter relais champ social 2270\n",
- "DDCP PROMO Participants ateliers (adultes et en... 1954\n",
- "DDCP billets famille 3609\n",
- "DDCP promo MD pass musées dps oct 2018 1785\n",
- "DDCP promo Plan B 2019 (concerts) 1948\n",
- "DDCP promo spectateurs prog 21-22 (spectacles, ... 1293\n",
- "DDCP rentrée culturelle 2023 1757\n",
- "DDCP_marseille_jazz_2023 1043\n",
- "DRE Festival Jean Rouch 1502\n",
- "DRE MucemLab 2302\n",
- "DRE chercheurs 1557\n",
- "DRE institutionnels 2229\n",
- "FORMATION _ acheteurs optin last year 10485\n",
- "Inscrits NL générale (export_291019 + operation... 14086\n",
- "Inscrits NL générale site web 3732\n",
- "Inscrits NL jeune public site web 1249\n",
- "Votre première liste 3715\n",
- "consentement optin b2b 12735\n",
- "consentement optin b2c 108909\n",
- "consentement optin dre 4527\n",
- "consentement optin jeune public 149979\n",
- "consentement optin mediation specialisee 150000\n",
- "consentement optin newsletter generale 22095\n",
- "consentement optin scolaires 4849\n",
- "consentement optout b2b 14219\n",
- "consentement optout b2c 34523\n",
- "consentement optout dre 14328\n",
- "consentement optout newsletter generale 18855\n",
- "consentement optout scolaires 15744\n",
- "ddcp_md_scene_ouverte_au_talent 1577\n",
- "ddcp_promo_MD_billet_musée_oct_2019_agarder2 5482\n",
- "ddcp_promo_md_musée_dps 011019 6010\n",
- "ddcp_promo_visiteurs occasionnels_musee_8mois 6640\n",
- "ddcp_visiteurs dps 010622 12355\n",
- "festival_jean_rouch 1502\n",
- "rappel po barvalo 1248\n",
- "structures_etiquette champ social 1488"
- ]
- },
- "execution_count": 29,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"df1_target_information_reduced = df1_target_information[['target_name', 'customer_id']].groupby('target_name').count()\n",
"df1_target_information_reduced[df1_target_information_reduced['customer_id'] >= 1000]"
@@ -935,7 +347,7 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": null,
"id": "d883cc7b-ac43-4485-b86f-eaf595fbad85",
"metadata": {},
"outputs": [],
@@ -960,271 +372,27 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": null,
"id": "c8552dd6-52c5-4431-b43d-3cd6c578fd9f",
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/tmp/ipykernel_50143/1967867975.py:15: SettingWithCopyWarning: \n",
- "A value is trying to be set on a copy of a slice from a DataFrame.\n",
- "Try using .loc[row_indexer,col_indexer] = value instead\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
- " df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
- "/tmp/ipykernel_50143/1967867975.py:15: SettingWithCopyWarning: \n",
- "A value is trying to be set on a copy of a slice from a DataFrame.\n",
- "Try using .loc[row_indexer,col_indexer] = value instead\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
- " df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
- "/tmp/ipykernel_50143/1967867975.py:15: SettingWithCopyWarning: \n",
- "A value is trying to be set on a copy of a slice from a DataFrame.\n",
- "Try using .loc[row_indexer,col_indexer] = value instead\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
- " df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"df1_campaigns_information = preprocessing_campaigns_area(campaign_stats = df1_campaign_stats, campaigns = df1_campaigns)"
]
},
{
"cell_type": "code",
- "execution_count": 27,
+ "execution_count": null,
"id": "c24457e7-3cad-451a-a65b-7373b656bd6e",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id | \n",
- " customer_id | \n",
- " opened_at | \n",
- " sent_at | \n",
- " delivered_at | \n",
- " campaign_name | \n",
- " campaign_service_id | \n",
- " campaign_sent_at | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 19793 | \n",
- " 112597 | \n",
- " NaT | \n",
- " 2021-03-28 16:01:09+00:00 | \n",
- " 2021-03-28 16:24:18+00:00 | \n",
- " Le Mucem chez vous, gardons le lien #22 | \n",
- " 404 | \n",
- " 2021-03-27 23:00:00+00:00 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 14211 | \n",
- " 113666 | \n",
- " NaT | \n",
- " 2021-03-28 16:01:09+00:00 | \n",
- " 2021-03-28 16:21:02+00:00 | \n",
- " Le Mucem chez vous, gardons le lien #22 | \n",
- " 404 | \n",
- " 2021-03-27 23:00:00+00:00 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 13150 | \n",
- " 280561 | \n",
- " NaT | \n",
- " 2021-03-28 16:00:59+00:00 | \n",
- " 2021-03-28 16:08:45+00:00 | \n",
- " Le Mucem chez vous, gardons le lien #22 | \n",
- " 404 | \n",
- " 2021-03-27 23:00:00+00:00 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 7073 | \n",
- " 101007 | \n",
- " 2021-03-28 18:11:06+00:00 | \n",
- " 2021-03-28 16:00:59+00:00 | \n",
- " 2021-03-28 16:09:47+00:00 | \n",
- " Le Mucem chez vous, gardons le lien #22 | \n",
- " 404 | \n",
- " 2021-03-27 23:00:00+00:00 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 5175 | \n",
- " 103972 | \n",
- " NaT | \n",
- " 2021-03-28 16:01:06+00:00 | \n",
- " 2021-03-28 16:05:03+00:00 | \n",
- " Le Mucem chez vous, gardons le lien #22 | \n",
- " 404 | \n",
- " 2021-03-27 23:00:00+00:00 | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 6214803 | \n",
- " 8302994 | \n",
- " 266155 | \n",
- " 2023-10-23 09:43:25+00:00 | \n",
- " 2023-10-23 09:32:33+00:00 | \n",
- " 2023-10-23 09:32:34+00:00 | \n",
- " dre_nov_2023 | \n",
- " 1318 | \n",
- " 2023-10-23 09:31:17+00:00 | \n",
- "
\n",
- " \n",
- " 6214804 | \n",
- " 8303307 | \n",
- " 21355 | \n",
- " 2023-10-23 09:44:02+00:00 | \n",
- " 2023-10-23 09:32:49+00:00 | \n",
- " 2023-10-23 09:32:49+00:00 | \n",
- " dre_nov_2023 | \n",
- " 1318 | \n",
- " 2023-10-23 09:31:17+00:00 | \n",
- "
\n",
- " \n",
- " 6214805 | \n",
- " 8304346 | \n",
- " 21849 | \n",
- " 2023-10-23 09:45:52+00:00 | \n",
- " 2023-10-23 09:33:28+00:00 | \n",
- " 2023-10-23 09:33:29+00:00 | \n",
- " dre_nov_2023 | \n",
- " 1318 | \n",
- " 2023-10-23 09:31:17+00:00 | \n",
- "
\n",
- " \n",
- " 6214806 | \n",
- " 8302037 | \n",
- " 667789 | \n",
- " 2023-10-23 09:47:32+00:00 | \n",
- " 2023-10-23 09:31:53+00:00 | \n",
- " 2023-10-23 09:31:54+00:00 | \n",
- " dre_nov_2023 | \n",
- " 1318 | \n",
- " 2023-10-23 09:31:17+00:00 | \n",
- "
\n",
- " \n",
- " 6214807 | \n",
- " 8304939 | \n",
- " 294154 | \n",
- " NaT | \n",
- " 2023-10-23 09:33:54+00:00 | \n",
- " 2023-10-23 09:33:55+00:00 | \n",
- " dre_nov_2023 | \n",
- " 1318 | \n",
- " 2023-10-23 09:31:17+00:00 | \n",
- "
\n",
- " \n",
- "
\n",
- "
6214808 rows × 8 columns
\n",
- "
"
- ],
- "text/plain": [
- " id customer_id opened_at \\\n",
- "0 19793 112597 NaT \n",
- "1 14211 113666 NaT \n",
- "2 13150 280561 NaT \n",
- "3 7073 101007 2021-03-28 18:11:06+00:00 \n",
- "4 5175 103972 NaT \n",
- "... ... ... ... \n",
- "6214803 8302994 266155 2023-10-23 09:43:25+00:00 \n",
- "6214804 8303307 21355 2023-10-23 09:44:02+00:00 \n",
- "6214805 8304346 21849 2023-10-23 09:45:52+00:00 \n",
- "6214806 8302037 667789 2023-10-23 09:47:32+00:00 \n",
- "6214807 8304939 294154 NaT \n",
- "\n",
- " sent_at delivered_at \\\n",
- "0 2021-03-28 16:01:09+00:00 2021-03-28 16:24:18+00:00 \n",
- "1 2021-03-28 16:01:09+00:00 2021-03-28 16:21:02+00:00 \n",
- "2 2021-03-28 16:00:59+00:00 2021-03-28 16:08:45+00:00 \n",
- "3 2021-03-28 16:00:59+00:00 2021-03-28 16:09:47+00:00 \n",
- "4 2021-03-28 16:01:06+00:00 2021-03-28 16:05:03+00:00 \n",
- "... ... ... \n",
- "6214803 2023-10-23 09:32:33+00:00 2023-10-23 09:32:34+00:00 \n",
- "6214804 2023-10-23 09:32:49+00:00 2023-10-23 09:32:49+00:00 \n",
- "6214805 2023-10-23 09:33:28+00:00 2023-10-23 09:33:29+00:00 \n",
- "6214806 2023-10-23 09:31:53+00:00 2023-10-23 09:31:54+00:00 \n",
- "6214807 2023-10-23 09:33:54+00:00 2023-10-23 09:33:55+00:00 \n",
- "\n",
- " campaign_name campaign_service_id \\\n",
- "0 Le Mucem chez vous, gardons le lien #22 404 \n",
- "1 Le Mucem chez vous, gardons le lien #22 404 \n",
- "2 Le Mucem chez vous, gardons le lien #22 404 \n",
- "3 Le Mucem chez vous, gardons le lien #22 404 \n",
- "4 Le Mucem chez vous, gardons le lien #22 404 \n",
- "... ... ... \n",
- "6214803 dre_nov_2023 1318 \n",
- "6214804 dre_nov_2023 1318 \n",
- "6214805 dre_nov_2023 1318 \n",
- "6214806 dre_nov_2023 1318 \n",
- "6214807 dre_nov_2023 1318 \n",
- "\n",
- " campaign_sent_at \n",
- "0 2021-03-27 23:00:00+00:00 \n",
- "1 2021-03-27 23:00:00+00:00 \n",
- "2 2021-03-27 23:00:00+00:00 \n",
- "3 2021-03-27 23:00:00+00:00 \n",
- "4 2021-03-27 23:00:00+00:00 \n",
- "... ... \n",
- "6214803 2023-10-23 09:31:17+00:00 \n",
- "6214804 2023-10-23 09:31:17+00:00 \n",
- "6214805 2023-10-23 09:31:17+00:00 \n",
- "6214806 2023-10-23 09:31:17+00:00 \n",
- "6214807 2023-10-23 09:31:17+00:00 \n",
- "\n",
- "[6214808 rows x 8 columns]"
- ]
- },
- "execution_count": 27,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"df1_campaigns_information"
]
},
{
"cell_type": "code",
- "execution_count": 67,
+ "execution_count": null,
"id": "e2c88552-b863-47a2-be23-8d2898fb28bc",
"metadata": {},
"outputs": [],
@@ -1258,39 +426,239 @@
},
{
"cell_type": "code",
- "execution_count": 69,
+ "execution_count": null,
"id": "24537647-bc29-4777-9848-ac4120a4aa60",
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/tmp/ipykernel_50143/2679359833.py:11: SettingWithCopyWarning: \n",
- "A value is trying to be set on a copy of a slice from a DataFrame\n",
- "\n",
- "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
- " opened_campaign.dropna(subset=['opened_at'], inplace=True)\n",
- "/tmp/ipykernel_50143/2679359833.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
- "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
- "\n",
- "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
- "\n",
- "\n",
- " campaigns_reduced['nb_campaigns_opened'].fillna(0, inplace=True)\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"df1_campaigns_kpi = campaigns_kpi(campaigns_information = df1_campaigns_information) "
]
},
{
"cell_type": "code",
- "execution_count": 66,
+ "execution_count": null,
"id": "6be2a9a6-056b-4e19-8c26-a18ba3df36b3",
"metadata": {},
+ "outputs": [],
+ "source": [
+ "df1_campaigns_kpi"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "56520a97-ede8-4920-a211-3b5b136af33d",
+ "metadata": {},
+ "source": [
+ "## Create Products Table"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9782e9d3-ba20-46bf-8562-bd0969972ddc",
+ "metadata": {},
+ "source": [
+ "Helper functions to load the raw CSV files and tidy their columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "id": "30488a40-1b38-4b9a-9d3b-26a0597c5e6d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "BUCKET = \"bdc2324-data\"\n",
+ "directory_path = '1'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "id": "607eb4b4-eed9-4b50-b823-f75c116dd37c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def display_databases(file_name):\n",
+ " \"\"\"\n",
+ " This function reads the CSV file file_name from the S3 bucket and returns it as a DataFrame\n",
+ " \"\"\"\n",
+ " file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n",
+ " print(\"File path : \", file_path)\n",
+ " with fs.open(file_path, mode=\"rb\") as file_in:\n",
+ " df = pd.read_csv(file_in, sep=\",\")\n",
+ " \n",
+ " print(\"Shape : \", df.shape)\n",
+ " return df\n",
+ "\n",
+ "\n",
+ "def remove_horodates(df):\n",
+ " \"\"\"\n",
+ " This function removes the timestamp columns created_at and updated_at\n",
+ " \"\"\"\n",
+ " df = df.drop(columns = [\"created_at\", \"updated_at\"])\n",
+ " return df\n",
+ "\n",
+ "\n",
+ "def order_columns_id(df):\n",
+ " \"\"\"\n",
+ " This function moves every column whose name contains 'id' to the front so that the dataset is easier to read\n",
+ " \"\"\"\n",
+ " substring = 'id'\n",
+ " id_columns = [col for col in df.columns if substring in col]\n",
+ " remaining_col = [col for col in df.columns if substring not in col]\n",
+ " new_order = id_columns + remaining_col\n",
+ " return df[new_order]\n",
+ "\n",
+ "\n",
+ "def process_df_2(df):\n",
+ " \"\"\"\n",
+ " This function drops the timestamp columns and moves the id columns to the front of the dataframe\n",
+ " \"\"\"\n",
+ " df = remove_horodates(df)\n",
+ " print(\"Number of columns : \", len(df.columns))\n",
+ " df = order_columns_id(df)\n",
+ " print(\"Columns : \", df.columns)\n",
+ " return df\n",
+ "\n",
+ "def load_dataset(name):\n",
+ " \"\"\"\n",
+ " This function loads a CSV file, tidies its columns and drops the 'identifier' column if present\n",
+ " \"\"\"\n",
+ " df = display_databases(name)\n",
+ " df = process_df_2(df)\n",
+ " # drop na :\n",
+ " #df = df.dropna(axis=1, thresh=len(df))\n",
+ " # if identifier in table : delete it\n",
+ " if 'identifier' in df.columns:\n",
+ " df = df.drop(columns = 'identifier')\n",
+ " return df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d23f28c0-bc95-438b-8d14-5b7bb6e267bd",
+ "metadata": {},
+ "source": [
+ "Functions that build the merged theme tables (products, events, representations)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "id": "350b09b9-451f-4d47-81fe-f34b892db027",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def create_products_table():\n",
+ " # first merge products and categories\n",
+ " print(\"first merge products and categories\")\n",
+ " products = load_dataset(\"1products.csv\")\n",
+ " categories = load_dataset(\"1categories.csv\")\n",
+ " # Drop useless columns\n",
+ " products = products.drop(columns = ['apply_price', 'extra_field', 'amount_consumption'])\n",
+ " categories = categories.drop(columns = ['extra_field', 'quota'])\n",
+ "\n",
+ " #Merge\n",
+ " products_theme = products.merge(categories, how = 'left', left_on = 'category_id',\n",
+ " right_on = 'id', suffixes=('_products', '_categories'))\n",
+ " products_theme = products_theme.rename(columns = {\"name\" : \"name_categories\"})\n",
+ " \n",
+ " # Second merge products_theme and type of categories\n",
+ " print(\"Second merge products_theme and type of categories\")\n",
+ " type_of_categories = load_dataset(\"1type_of_categories.csv\")\n",
+ " type_of_categories = type_of_categories.drop(columns = 'id')\n",
+ " products_theme = products_theme.merge(type_of_categories, how = 'left', left_on = 'category_id',\n",
+ " right_on = 'category_id' )\n",
+ "\n",
+ " # Index cleaning\n",
+ " products_theme = products_theme.drop(columns = ['id_categories'])\n",
+ " products_theme = order_columns_id(products_theme)\n",
+ " return products_theme\n",
+ "\n",
+ "\n",
+ "def create_events_table():\n",
+ " # first merge events and seasons : \n",
+ " print(\"first merge events and seasons : \")\n",
+ " events = load_dataset(\"1events.csv\")\n",
+ " seasons = load_dataset(\"1seasons.csv\")\n",
+ "\n",
+ " # Drop useless columns\n",
+ " events = events.drop(columns = ['manual_added', 'is_display'])\n",
+ " seasons = seasons.drop(columns = ['start_date_time'])\n",
+ " \n",
+ " events_theme = events.merge(seasons, how = 'left', left_on = 'season_id', right_on = 'id', suffixes=('_events', '_seasons'))\n",
+ "\n",
+ " # Secondly merge events_theme and event_types\n",
+ " print(\"Secondly merge events_theme and event_types : \")\n",
+ " event_types = load_dataset(\"1event_types.csv\")\n",
+ " event_types = event_types.drop(columns = ['fidelity_delay'])\n",
+ " \n",
+ " events_theme = events_theme.merge(event_types, how = 'left', left_on = 'event_type_id', right_on = 'id', suffixes=('_events', '_event_type'))\n",
+ " events_theme = events_theme.rename(columns = {\"name\" : \"name_event_types\"})\n",
+ " events_theme = events_theme.drop(columns = 'id')\n",
+ "\n",
+ " # thirdly merge events_theme and facilities\n",
+ " print(\"thirdly merge events_theme and facilities : \")\n",
+ " facilities = load_dataset(\"1facilities.csv\")\n",
+ " facilities = facilities.drop(columns = ['fixed_capacity'])\n",
+ " \n",
+ " events_theme = events_theme.merge(facilities, how = 'left', left_on = 'facility_id', right_on = 'id', suffixes=('_events', '_facility'))\n",
+ " events_theme = events_theme.rename(columns = {\"name\" : \"name_facilities\", \"id_events\" : \"event_id\"})\n",
+ " events_theme = events_theme.drop(columns = 'id')\n",
+ "\n",
+ " # Index cleaning\n",
+ " events_theme = events_theme.drop(columns = ['id_seasons'])\n",
+ " events_theme = order_columns_id(events_theme)\n",
+ " return events_theme\n",
+ "\n",
+ "\n",
+ "def create_representations_table():\n",
+ " representations = load_dataset(\"1representations.csv\")\n",
+ " representations = representations.drop(columns = ['serial', 'open', 'satisfaction', 'is_display', 'expected_filling',\n",
+ " 'max_filling', 'extra_field', 'start_date_time', 'end_date_time', 'name',\n",
+ " 'representation_type_id'])\n",
+ " \n",
+ " representations_capacity = load_dataset(\"1representation_category_capacities.csv\")\n",
+ " representations_capacity = representations_capacity.drop(columns = ['expected_filling', 'max_filling'])\n",
+ "\n",
+ " representations_theme = representations.merge(representations_capacity, how='left',\n",
+ " left_on='id', right_on='representation_id',\n",
+ " suffixes=('_representation', '_representation_cap'))\n",
+ " # index cleaning\n",
+ " representations_theme = representations_theme.drop(columns = [\"id_representation\"])\n",
+ " representations_theme = order_columns_id(representations_theme)\n",
+ " return representations_theme"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "id": "0fccc8ef-e575-4857-a401-94a7274394df",
+ "metadata": {},
"outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "first merge products and categories\n",
+ "File path : bdc2324-data/1/1products.csv\n",
+ "Shape : (94803, 14)\n",
+ "Number of columns : 12\n",
+ "Columns : Index(['id', 'representation_id', 'pricing_formula_id', 'category_id',\n",
+ " 'products_group_id', 'product_pack_id', 'identifier', 'amount',\n",
+ " 'is_full_price', 'apply_price', 'extra_field', 'amount_consumption'],\n",
+ " dtype='object')\n",
+ "File path : bdc2324-data/1/1categories.csv\n",
+ "Shape : (27, 7)\n",
+ "Number of columns : 5\n",
+ "Columns : Index(['id', 'identifier', 'name', 'extra_field', 'quota'], dtype='object')\n",
+ "Second merge products_theme and type of categories\n",
+ "File path : bdc2324-data/1/1type_of_categories.csv\n",
+ "Shape : (5, 6)\n",
+ "Number of columns : 4\n",
+ "Columns : Index(['id', 'type_of_id', 'category_id', 'identifier'], dtype='object')\n"
+ ]
+ },
{
"data": {
"text/html": [
@@ -1312,133 +680,673 @@
" \n",
" \n",
" | \n",
- " customer_id | \n",
- " nb_campaigns | \n",
- " nb_campaigns_opened | \n",
- " time_to_open | \n",
+ " id_products | \n",
+ " representation_id | \n",
+ " pricing_formula_id | \n",
+ " category_id | \n",
+ " products_group_id | \n",
+ " product_pack_id | \n",
+ " type_of_id | \n",
+ " amount | \n",
+ " is_full_price | \n",
+ " name_categories | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
- " 2 | \n",
- " 4 | \n",
- " 0.0 | \n",
- " NaT | \n",
+ " 10682 | \n",
+ " 914 | \n",
+ " 114 | \n",
+ " 41 | \n",
+ " 10655 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " 9.0 | \n",
+ " False | \n",
+ " indiv activité tr | \n",
"
\n",
" \n",
" 1 | \n",
- " 3 | \n",
- " 222 | \n",
- " 124.0 | \n",
- " 1 days 00:28:30.169354838 | \n",
+ " 478 | \n",
+ " 273 | \n",
+ " 131 | \n",
+ " 1 | \n",
+ " 471 | \n",
+ " 1 | \n",
+ " 12.0 | \n",
+ " 9.5 | \n",
+ " False | \n",
+ " indiv entrées tp | \n",
"
\n",
" \n",
" 2 | \n",
- " 4 | \n",
- " 7 | \n",
- " 7.0 | \n",
- " 1 days 04:31:01.428571428 | \n",
+ " 20873 | \n",
+ " 275 | \n",
+ " 137 | \n",
+ " 1 | \n",
+ " 20825 | \n",
+ " 1 | \n",
+ " 12.0 | \n",
+ " 11.5 | \n",
+ " False | \n",
+ " indiv entrées tp | \n",
"
\n",
" \n",
" 3 | \n",
+ " 157142 | \n",
+ " 82519 | \n",
+ " 9 | \n",
" 5 | \n",
- " 4 | \n",
- " 0.0 | \n",
- " NaT | \n",
+ " 156773 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " 8.0 | \n",
+ " False | \n",
+ " indiv entrées tr | \n",
"
\n",
" \n",
" 4 | \n",
- " 6 | \n",
- " 20 | \n",
- " 0.0 | \n",
- " NaT | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 130467 | \n",
- " 1256097 | \n",
+ " 1341 | \n",
+ " 9 | \n",
+ " 93 | \n",
" 1 | \n",
- " 1.0 | \n",
- " 0 days 02:11:15 | \n",
- "
\n",
- " \n",
- " 130468 | \n",
- " 1256098 | \n",
+ " 1175 | \n",
" 1 | \n",
- " 0.0 | \n",
- " NaT | \n",
- "
\n",
- " \n",
- " 130469 | \n",
- " 1256099 | \n",
- " 1 | \n",
- " 0.0 | \n",
- " NaT | \n",
- "
\n",
- " \n",
- " 130470 | \n",
- " 1256100 | \n",
- " 1 | \n",
- " 0.0 | \n",
- " NaT | \n",
- "
\n",
- " \n",
- " 130471 | \n",
- " 1256101 | \n",
- " 1 | \n",
- " 0.0 | \n",
- " NaT | \n",
+ " 12.0 | \n",
+ " 8.5 | \n",
+ " False | \n",
+ " indiv entrées tp | \n",
"
\n",
" \n",
"\n",
- "130472 rows × 4 columns
\n",
""
],
"text/plain": [
- " customer_id nb_campaigns nb_campaigns_opened \\\n",
- "0 2 4 0.0 \n",
- "1 3 222 124.0 \n",
- "2 4 7 7.0 \n",
- "3 5 4 0.0 \n",
- "4 6 20 0.0 \n",
- "... ... ... ... \n",
- "130467 1256097 1 1.0 \n",
- "130468 1256098 1 0.0 \n",
- "130469 1256099 1 0.0 \n",
- "130470 1256100 1 0.0 \n",
- "130471 1256101 1 0.0 \n",
+ " id_products representation_id pricing_formula_id category_id \\\n",
+ "0 10682 914 114 41 \n",
+ "1 478 273 131 1 \n",
+ "2 20873 275 137 1 \n",
+ "3 157142 82519 9 5 \n",
+ "4 1341 9 93 1 \n",
"\n",
- " time_to_open \n",
- "0 NaT \n",
- "1 1 days 00:28:30.169354838 \n",
- "2 1 days 04:31:01.428571428 \n",
- "3 NaT \n",
- "4 NaT \n",
- "... ... \n",
- "130467 0 days 02:11:15 \n",
- "130468 NaT \n",
- "130469 NaT \n",
- "130470 NaT \n",
- "130471 NaT \n",
+ " products_group_id product_pack_id type_of_id amount is_full_price \\\n",
+ "0 10655 1 NaN 9.0 False \n",
+ "1 471 1 12.0 9.5 False \n",
+ "2 20825 1 12.0 11.5 False \n",
+ "3 156773 1 NaN 8.0 False \n",
+ "4 1175 1 12.0 8.5 False \n",
"\n",
- "[130472 rows x 4 columns]"
+ " name_categories \n",
+ "0 indiv activité tr \n",
+ "1 indiv entrées tp \n",
+ "2 indiv entrées tp \n",
+ "3 indiv entrées tr \n",
+ "4 indiv entrées tp "
]
},
- "execution_count": 66,
+ "execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "df1_campaigns_kpi"
+ "products_theme = create_products_table()\n",
+ "products_theme.head()"
]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "id": "779d8aaf-6668-4f66-8852-847304407ea3",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "first merge events and seasons : \n",
+ "File path : bdc2324-data/1/1events.csv\n",
+ "Shape : (1232, 12)\n",
+ "Number of columns : 10\n",
+ "Columns : Index(['id', 'season_id', 'facility_id', 'event_type_id', 'event_type_key_id',\n",
+ " 'facility_key_id', 'identifier', 'name', 'manual_added', 'is_display'],\n",
+ " dtype='object')\n",
+ "File path : bdc2324-data/1/1seasons.csv\n",
+ "Shape : (13, 6)\n",
+ "Number of columns : 4\n",
+ "Columns : Index(['id', 'identifier', 'name', 'start_date_time'], dtype='object')\n",
+ "Secondly merge events_theme and event_types : \n",
+ "File path : bdc2324-data/1/1event_types.csv\n",
+ "Shape : (9, 6)\n",
+ "Number of columns : 4\n",
+ "Columns : Index(['id', 'fidelity_delay', 'identifier', 'name'], dtype='object')\n",
+ "thirdly merge events_theme and facilities : \n",
+ "File path : bdc2324-data/1/1facilities.csv\n",
+ "Shape : (2, 7)\n",
+ "Number of columns : 5\n",
+ "Columns : Index(['id', 'street_id', 'identifier', 'name', 'fixed_capacity'], dtype='object')\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " event_id | \n",
+ " season_id | \n",
+ " facility_id | \n",
+ " event_type_id | \n",
+ " event_type_key_id | \n",
+ " facility_key_id | \n",
+ " street_id | \n",
+ " name_events | \n",
+ " name_seasons | \n",
+ " name_event_types | \n",
+ " name_facilities | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 192 | \n",
+ " 16 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " frontières | \n",
+ " 2018 | \n",
+ " spectacle vivant | \n",
+ " mucem | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 30329 | \n",
+ " 2767 | \n",
+ " 1 | \n",
+ " 5 | \n",
+ " 5 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " visite guidée une autre histoire du monde (1h00) | \n",
+ " 2023 | \n",
+ " offre muséale groupe | \n",
+ " mucem | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 161 | \n",
+ " 16 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " visite contée les chercheurs d'or indiv | \n",
+ " 2018 | \n",
+ " offre muséale individuel | \n",
+ " mucem | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 5957 | \n",
+ " 582 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " we dreamt of utopia and we woke up screaming. | \n",
+ " 2021 | \n",
+ " spectacle vivant | \n",
+ " mucem | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 8337 | \n",
+ " 582 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " jeff koons épisodes 4 | \n",
+ " 2021 | \n",
+ " spectacle vivant | \n",
+ " mucem | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " event_id season_id facility_id event_type_id event_type_key_id \\\n",
+ "0 192 16 1 4 4 \n",
+ "1 30329 2767 1 5 5 \n",
+ "2 161 16 1 2 2 \n",
+ "3 5957 582 1 4 4 \n",
+ "4 8337 582 1 4 4 \n",
+ "\n",
+ " facility_key_id street_id \\\n",
+ "0 1 1 \n",
+ "1 1 1 \n",
+ "2 1 1 \n",
+ "3 1 1 \n",
+ "4 1 1 \n",
+ "\n",
+ " name_events name_seasons \\\n",
+ "0 frontières 2018 \n",
+ "1 visite guidée une autre histoire du monde (1h00) 2023 \n",
+ "2 visite contée les chercheurs d'or indiv 2018 \n",
+ "3 we dreamt of utopia and we woke up screaming. 2021 \n",
+ "4 jeff koons épisodes 4 2021 \n",
+ "\n",
+ " name_event_types name_facilities \n",
+ "0 spectacle vivant mucem \n",
+ "1 offre muséale groupe mucem \n",
+ "2 offre muséale individuel mucem \n",
+ "3 spectacle vivant mucem \n",
+ "4 spectacle vivant mucem "
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "events_theme= create_events_table()\n",
+ "events_theme.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "id": "7714fa32-303b-4ea7-b174-3fd0fcab5af0",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "File path : bdc2324-data/1/1representations.csv\n",
+ "Shape : (36095, 16)\n",
+ "Number of columns : 14\n",
+ "Columns : Index(['id', 'event_id', 'representation_type_id', 'identifier', 'serial',\n",
+ " 'start_date_time', 'open', 'satisfaction', 'end_date_time', 'name',\n",
+ " 'is_display', 'expected_filling', 'max_filling', 'extra_field'],\n",
+ " dtype='object')\n",
+ "File path : bdc2324-data/1/1representation_category_capacities.csv\n",
+ "Shape : (65241, 7)\n",
+ "Number of columns : 5\n",
+ "Columns : Index(['id', 'representation_id', 'category_id', 'expected_filling',\n",
+ " 'max_filling'],\n",
+ " dtype='object')\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " event_id | \n",
+ " id_representation_cap | \n",
+ " representation_id | \n",
+ " category_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 12384 | \n",
+ " 123058 | \n",
+ " 84820 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 37 | \n",
+ " 2514 | \n",
+ " 269 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 37 | \n",
+ " 384 | \n",
+ " 269 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 37 | \n",
+ " 2515 | \n",
+ " 269 | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 37 | \n",
+ " 383 | \n",
+ " 269 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " event_id id_representation_cap representation_id category_id\n",
+ "0 12384 123058 84820 2\n",
+ "1 37 2514 269 2\n",
+ "2 37 384 269 5\n",
+ "3 37 2515 269 10\n",
+ "4 37 383 269 1"
+ ]
+ },
+ "execution_count": 34,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "representation_theme = create_representations_table()\n",
+ "representation_theme.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8fa191d5-c867-4d4d-bbab-f29d7d91ce6a",
+ "metadata": {},
+ "source": [
+ "Create uniform product database "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "id": "15a62ed6-35e4-4abc-aeef-a7daeec0a4ba",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def uniform_product_df():\n",
+ " \"\"\"\n",
+ " This function returns the uniform product dataset\n",
+ " \"\"\"\n",
+ " print(\"Products theme columns : \", products_theme.columns)\n",
+ " print(\"\\n Representation theme columns : \", representation_theme.columns)\n",
+ " print(\"\\n Events theme columns : \", events_theme.columns)\n",
+ "\n",
+ " products_global = products_theme.merge(representation_theme, how='left',\n",
+ " on= [\"representation_id\", \"category_id\"])\n",
+ " \n",
+ " products_global = products_global.merge(events_theme, how='left', on='event_id',\n",
+ " suffixes = (\"_representation\", \"_event\"))\n",
+ " \n",
+ " products_global = order_columns_id(products_global)\n",
+ "\n",
+ " # remove useless columns \n",
+ " products_global = products_global.drop(columns = ['type_of_id', 'name_events', 'name_seasons', 'name_categories'])\n",
+ " return products_global"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "id": "89dc9685-1de9-4ce3-a6c0-8d7f1931a951",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Products theme columns : Index(['id_products', 'representation_id', 'pricing_formula_id', 'category_id',\n",
+ " 'products_group_id', 'product_pack_id', 'type_of_id', 'amount',\n",
+ " 'is_full_price', 'name_categories'],\n",
+ " dtype='object')\n",
+ "\n",
+ " Representation theme columns : Index(['event_id', 'id_representation_cap', 'representation_id',\n",
+ " 'category_id'],\n",
+ " dtype='object')\n",
+ "\n",
+ " Events theme columns : Index(['event_id', 'season_id', 'facility_id', 'event_type_id',\n",
+ " 'event_type_key_id', 'facility_key_id', 'street_id', 'name_events',\n",
+ " 'name_seasons', 'name_event_types', 'name_facilities'],\n",
+ " dtype='object')\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id_products | \n",
+ " representation_id | \n",
+ " pricing_formula_id | \n",
+ " category_id | \n",
+ " products_group_id | \n",
+ " product_pack_id | \n",
+ " event_id | \n",
+ " id_representation_cap | \n",
+ " season_id | \n",
+ " facility_id | \n",
+ " event_type_id | \n",
+ " event_type_key_id | \n",
+ " facility_key_id | \n",
+ " street_id | \n",
+ " amount | \n",
+ " is_full_price | \n",
+ " name_event_types | \n",
+ " name_facilities | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 10682 | \n",
+ " 914 | \n",
+ " 114 | \n",
+ " 41 | \n",
+ " 10655 | \n",
+ " 1 | \n",
+ " 132 | \n",
+ " 8789 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 5 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 9.0 | \n",
+ " False | \n",
+ " offre muséale individuel | \n",
+ " mucem | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 478 | \n",
+ " 273 | \n",
+ " 131 | \n",
+ " 1 | \n",
+ " 471 | \n",
+ " 1 | \n",
+ " 37 | \n",
+ " 390 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 9.5 | \n",
+ " False | \n",
+ " offre muséale individuel | \n",
+ " mucem | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 20873 | \n",
+ " 275 | \n",
+ " 137 | \n",
+ " 1 | \n",
+ " 20825 | \n",
+ " 1 | \n",
+ " 37 | \n",
+ " 395 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 11.5 | \n",
+ " False | \n",
+ " offre muséale individuel | \n",
+ " mucem | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 157142 | \n",
+ " 82519 | \n",
+ " 9 | \n",
+ " 5 | \n",
+ " 156773 | \n",
+ " 1 | \n",
+ " 12365 | \n",
+ " 120199 | \n",
+ " 1754 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 8.0 | \n",
+ " False | \n",
+ " offre muséale individuel | \n",
+ " mucem | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1341 | \n",
+ " 9 | \n",
+ " 93 | \n",
+ " 1 | \n",
+ " 1175 | \n",
+ " 1 | \n",
+ " 8 | \n",
+ " 21 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 6 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 8.5 | \n",
+ " False | \n",
+ " non défini | \n",
+ " mucem | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id_products representation_id pricing_formula_id category_id \\\n",
+ "0 10682 914 114 41 \n",
+ "1 478 273 131 1 \n",
+ "2 20873 275 137 1 \n",
+ "3 157142 82519 9 5 \n",
+ "4 1341 9 93 1 \n",
+ "\n",
+ " products_group_id product_pack_id event_id id_representation_cap \\\n",
+ "0 10655 1 132 8789 \n",
+ "1 471 1 37 390 \n",
+ "2 20825 1 37 395 \n",
+ "3 156773 1 12365 120199 \n",
+ "4 1175 1 8 21 \n",
+ "\n",
+ " season_id facility_id event_type_id event_type_key_id facility_key_id \\\n",
+ "0 4 1 2 5 1 \n",
+ "1 2 1 2 2 1 \n",
+ "2 2 1 2 2 1 \n",
+ "3 1754 1 2 4 1 \n",
+ "4 4 1 3 6 1 \n",
+ "\n",
+ " street_id amount is_full_price name_event_types name_facilities \n",
+ "0 1 9.0 False offre muséale individuel mucem \n",
+ "1 1 9.5 False offre muséale individuel mucem \n",
+ "2 1 11.5 False offre muséale individuel mucem \n",
+ "3 1 8.0 False offre muséale individuel mucem \n",
+ "4 1 8.5 False non défini mucem "
+ ]
+ },
+ "execution_count": 36,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "products_global = uniform_product_df()\n",
+ "products_global.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "117d172a-2195-4060-9245-96c6f637ebbd",
+ "metadata": {},
+ "outputs": [],
+ "source": []
}
],
"metadata": {
diff --git a/Notebook_AR.ipynb b/Notebook_AR.ipynb
index a3c291b..18b06d1 100644
--- a/Notebook_AR.ipynb
+++ b/Notebook_AR.ipynb
@@ -78,7 +78,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
"id": "f1cce705-46e1-42de-8e93-2ee15312d288",
"metadata": {},
"outputs": [],
@@ -88,10 +88,43 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 4,
"id": "82d4db0e-0cd5-49af-a4d3-f17f54b1c03c",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "bdc2324-data/8/8campaign_stats.csv\n",
+ "bdc2324-data/8/8campaigns.csv\n",
+ "bdc2324-data/8/8categories.csv\n",
+ "bdc2324-data/8/8countries.csv\n",
+ "bdc2324-data/8/8currencies.csv\n",
+ "bdc2324-data/8/8customer_target_mappings.csv\n",
+ "bdc2324-data/8/8customersplus.csv\n",
+ "bdc2324-data/8/8event_types.csv\n",
+ "bdc2324-data/8/8events.csv\n",
+ "bdc2324-data/8/8facilities.csv\n",
+ "bdc2324-data/8/8link_stats.csv\n",
+ "bdc2324-data/8/8pricing_formulas.csv\n",
+ "bdc2324-data/8/8product_packs.csv\n",
+ "bdc2324-data/8/8products.csv\n",
+ "bdc2324-data/8/8products_groups.csv\n",
+ "bdc2324-data/8/8purchases.csv\n",
+ "bdc2324-data/8/8representation_category_capacities.csv\n",
+ "bdc2324-data/8/8representations.csv\n",
+ "bdc2324-data/8/8seasons.csv\n",
+ "bdc2324-data/8/8suppliers.csv\n",
+ "bdc2324-data/8/8target_types.csv\n",
+ "bdc2324-data/8/8targets.csv\n",
+ "bdc2324-data/8/8tickets.csv\n",
+ "bdc2324-data/8/8type_of_categories.csv\n",
+ "bdc2324-data/8/8type_of_pricing_formulas.csv\n",
+ "bdc2324-data/8/8type_ofs.csv\n"
+ ]
+ }
+ ],
"source": [
"# check the files in the directory\n",
"\n",
@@ -103,7 +136,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 5,
"id": "65cb38ad-52ae-4266-85d8-c47d81b00283",
"metadata": {},
"outputs": [],
@@ -132,10 +165,162 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 6,
"id": "0214d30d-5f83-498f-867f-e67b5793b731",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "File path : bdc2324-data/8/8campaigns.csv\n",
+ "Shape : (1689, 11)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " name | \n",
+ " service_id | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " process_id | \n",
+ " report_url | \n",
+ " category | \n",
+ " to_be_synced | \n",
+ " identifier | \n",
+ " sent_at | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " #LOUSFP RELANCE P'TITS LOU | \n",
+ " 1436 | \n",
+ " 2022-02-01 15:22:53.564432+01:00 | \n",
+ " 2022-02-01 15:22:53.564432+01:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " False | \n",
+ " eaa32c96f620053cf442ad32258076b9 | \n",
+ " 2022-01-31 00:00:00+01:00 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " #LOUSFP BRASSERIE ACHETEURS | \n",
+ " 1435 | \n",
+ " 2022-02-01 15:22:53.572592+01:00 | \n",
+ " 2022-02-01 15:22:53.572592+01:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " False | \n",
+ " 1f3202d820180a39f736f20fce790de8 | \n",
+ " 2022-01-31 00:00:00+01:00 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " PRESSE. LOU/SF Paris - RDV et protocole | \n",
+ " 1433 | \n",
+ " 2022-02-01 15:22:53.578426+01:00 | \n",
+ " 2022-02-01 15:22:53.578426+01:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " False | \n",
+ " b069b3415151fa7217e870017374de7c | \n",
+ " 2022-01-31 00:00:00+01:00 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4 | \n",
+ " #LOUSFP ÉTUDIANTS | \n",
+ " 1432 | \n",
+ " 2022-02-01 15:22:53.584235+01:00 | \n",
+ " 2022-02-01 15:22:53.584235+01:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " False | \n",
+ " 56468d5607a5aaf1604ff5e15593b003 | \n",
+ " 2022-01-27 00:00:00+01:00 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5 | \n",
+ " #LOUSFP P'TITS LOU | \n",
+ " 1431 | \n",
+ " 2022-02-01 15:22:53.590187+01:00 | \n",
+ " 2022-02-01 15:22:53.590187+01:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " False | \n",
+ " e11943a6031a0e6114ae69c257617980 | \n",
+ " 2022-01-27 00:00:00+01:00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id name service_id \\\n",
+ "0 1 #LOUSFP RELANCE P'TITS LOU 1436 \n",
+ "1 2 #LOUSFP BRASSERIE ACHETEURS 1435 \n",
+ "2 3 PRESSE. LOU/SF Paris - RDV et protocole 1433 \n",
+ "3 4 #LOUSFP ÉTUDIANTS 1432 \n",
+ "4 5 #LOUSFP P'TITS LOU 1431 \n",
+ "\n",
+ " created_at updated_at \\\n",
+ "0 2022-02-01 15:22:53.564432+01:00 2022-02-01 15:22:53.564432+01:00 \n",
+ "1 2022-02-01 15:22:53.572592+01:00 2022-02-01 15:22:53.572592+01:00 \n",
+ "2 2022-02-01 15:22:53.578426+01:00 2022-02-01 15:22:53.578426+01:00 \n",
+ "3 2022-02-01 15:22:53.584235+01:00 2022-02-01 15:22:53.584235+01:00 \n",
+ "4 2022-02-01 15:22:53.590187+01:00 2022-02-01 15:22:53.590187+01:00 \n",
+ "\n",
+ " process_id report_url category to_be_synced \\\n",
+ "0 NaN NaN 0 False \n",
+ "1 NaN NaN 0 False \n",
+ "2 NaN NaN 0 False \n",
+ "3 NaN NaN 0 False \n",
+ "4 NaN NaN 0 False \n",
+ "\n",
+ " identifier sent_at \n",
+ "0 eaa32c96f620053cf442ad32258076b9 2022-01-31 00:00:00+01:00 \n",
+ "1 1f3202d820180a39f736f20fce790de8 2022-01-31 00:00:00+01:00 \n",
+ "2 b069b3415151fa7217e870017374de7c 2022-01-31 00:00:00+01:00 \n",
+ "3 56468d5607a5aaf1604ff5e15593b003 2022-01-27 00:00:00+01:00 \n",
+ "4 e11943a6031a0e6114ae69c257617980 2022-01-27 00:00:00+01:00 "
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"campaigns = display_databases(\"8campaigns.csv\")\n",
"campaigns.head()"
@@ -143,10 +328,137 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 7,
"id": "e7982be4-2c42-4a91-be5a-329a999644cc",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "File path : bdc2324-data/8/8campaign_stats.csv\n",
+ "Shape : (2527083, 8)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " campaign_id | \n",
+ " customer_id | \n",
+ " opened_at | \n",
+ " sent_at | \n",
+ " delivered_at | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 5 | \n",
+ " 161410 | \n",
+ " 2022-02-02 18:16:07+01:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2022-02-02 17:16:08.616899+01:00 | \n",
+ " 2022-02-02 17:16:08.623098+01:00 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 54228 | \n",
+ " 2022-02-02 18:18:11+01:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2022-02-02 17:18:12.030260+01:00 | \n",
+ " 2022-02-02 17:18:12.036606+01:00 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 6 | \n",
+ " 120794 | \n",
+ " 2022-02-02 18:18:58+01:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2022-02-02 17:19:00.129697+01:00 | \n",
+ " 2022-02-02 17:19:00.134704+01:00 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 3 | \n",
+ " 467025 | \n",
+ " 2022-02-02 18:19:33+01:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2022-02-02 17:19:34.023492+01:00 | \n",
+ " 2022-02-02 17:19:34.027570+01:00 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 2 | \n",
+ " 142106 | \n",
+ " 2022-02-02 18:19:35+01:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2022-02-02 17:19:36.553321+01:00 | \n",
+ " 2022-02-02 17:19:36.557473+01:00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id campaign_id customer_id opened_at sent_at \\\n",
+ "0 1 5 161410 2022-02-02 18:16:07+01:00 NaN \n",
+ "1 2 1 54228 2022-02-02 18:18:11+01:00 NaN \n",
+ "2 3 6 120794 2022-02-02 18:18:58+01:00 NaN \n",
+ "3 4 3 467025 2022-02-02 18:19:33+01:00 NaN \n",
+ "4 5 2 142106 2022-02-02 18:19:35+01:00 NaN \n",
+ "\n",
+ " delivered_at created_at \\\n",
+ "0 NaN 2022-02-02 17:16:08.616899+01:00 \n",
+ "1 NaN 2022-02-02 17:18:12.030260+01:00 \n",
+ "2 NaN 2022-02-02 17:19:00.129697+01:00 \n",
+ "3 NaN 2022-02-02 17:19:34.023492+01:00 \n",
+ "4 NaN 2022-02-02 17:19:36.553321+01:00 \n",
+ "\n",
+ " updated_at \n",
+ "0 2022-02-02 17:16:08.623098+01:00 \n",
+ "1 2022-02-02 17:18:12.036606+01:00 \n",
+ "2 2022-02-02 17:19:00.134704+01:00 \n",
+ "3 2022-02-02 17:19:34.027570+01:00 \n",
+ "4 2022-02-02 17:19:36.557473+01:00 "
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"campaign_stats = display_databases(\"8campaign_stats.csv\")\n",
"campaign_stats.head()"
@@ -170,10 +482,118 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 8,
"id": "e973575b-4ed6-4b23-8024-f383ac82e87c",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "File path : bdc2324-data/8/8link_stats.csv\n",
+ "Shape : (108461, 6)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " clicked_at | \n",
+ " link_id | \n",
+ " customer_id | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2022-02-02 18:33:17+01:00 | \n",
+ " 1 | \n",
+ " 62137 | \n",
+ " 2022-02-02 17:33:19.237759+01:00 | \n",
+ " 2022-02-02 17:33:19.237759+01:00 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 2022-02-02 18:33:26+01:00 | \n",
+ " 1 | \n",
+ " 556048 | \n",
+ " 2022-02-02 17:33:28.101943+01:00 | \n",
+ " 2022-02-02 17:33:28.101943+01:00 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 2022-02-02 18:33:49+01:00 | \n",
+ " 2 | \n",
+ " 194456 | \n",
+ " 2022-02-02 17:33:50.595125+01:00 | \n",
+ " 2022-02-02 17:33:50.595125+01:00 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 2022-02-02 18:34:19+01:00 | \n",
+ " 1 | \n",
+ " 194456 | \n",
+ " 2022-02-02 17:34:20.493986+01:00 | \n",
+ " 2022-02-02 17:34:20.493986+01:00 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 2022-02-02 18:34:21+01:00 | \n",
+ " 2 | \n",
+ " 21571 | \n",
+ " 2022-02-02 17:34:22.300427+01:00 | \n",
+ " 2022-02-02 17:34:22.300427+01:00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id clicked_at link_id customer_id \\\n",
+ "0 1 2022-02-02 18:33:17+01:00 1 62137 \n",
+ "1 2 2022-02-02 18:33:26+01:00 1 556048 \n",
+ "2 3 2022-02-02 18:33:49+01:00 2 194456 \n",
+ "3 4 2022-02-02 18:34:19+01:00 1 194456 \n",
+ "4 5 2022-02-02 18:34:21+01:00 2 21571 \n",
+ "\n",
+ " created_at updated_at \n",
+ "0 2022-02-02 17:33:19.237759+01:00 2022-02-02 17:33:19.237759+01:00 \n",
+ "1 2022-02-02 17:33:28.101943+01:00 2022-02-02 17:33:28.101943+01:00 \n",
+ "2 2022-02-02 17:33:50.595125+01:00 2022-02-02 17:33:50.595125+01:00 \n",
+ "3 2022-02-02 17:34:20.493986+01:00 2022-02-02 17:34:20.493986+01:00 \n",
+ "4 2022-02-02 17:34:22.300427+01:00 2022-02-02 17:34:22.300427+01:00 "
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"links_stats = display_databases(\"8link_stats.csv\")\n",
"links_stats.head()"
@@ -189,10 +609,239 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 9,
"id": "3b523575-c779-451c-a12e-a36fb4ad232c",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "bdc2324-data/8/8customersplus.csv\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/tmp/ipykernel_513/2210053343.py:5: DtypeWarning: Columns (20) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+ " customersplus = pd.read_csv(file_in, sep=\",\")\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " lastname | \n",
+ " firstname | \n",
+ " birthdate | \n",
+ " email | \n",
+ " street_id | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " civility | \n",
+ " is_partner | \n",
+ " ... | \n",
+ " preferred_category | \n",
+ " preferred_supplier | \n",
+ " preferred_formula | \n",
+ " purchase_count | \n",
+ " first_buying_date | \n",
+ " last_visiting_date | \n",
+ " zipcode | \n",
+ " country | \n",
+ " age | \n",
+ " tenant_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1411166 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " email1411166 | \n",
+ " 1 | \n",
+ " 2022-12-19 15:03:39.419371+01:00 | \n",
+ " 2022-12-19 15:03:39.419371+01:00 | \n",
+ " NaN | \n",
+ " False | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " fr | \n",
+ " NaN | \n",
+ " 1594 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 478498 | \n",
+ " lastname478498 | \n",
+ " firstname478498 | \n",
+ " NaN | \n",
+ " email478498 | \n",
+ " 339167 | \n",
+ " 2021-09-17 18:58:30.259053+02:00 | \n",
+ " 2023-06-28 15:25:24.146689+02:00 | \n",
+ " NaN | \n",
+ " False | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1594 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 473678 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " email473678 | \n",
+ " 339167 | \n",
+ " 2021-09-17 18:44:04.119713+02:00 | \n",
+ " 2021-09-17 18:44:04.124204+02:00 | \n",
+ " NaN | \n",
+ " False | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1594 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 475026 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " email475026 | \n",
+ " 339167 | \n",
+ " 2021-09-17 18:47:28.789618+02:00 | \n",
+ " 2021-09-17 18:47:28.793958+02:00 | \n",
+ " NaN | \n",
+ " False | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1594 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 487146 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " email487146 | \n",
+ " 339167 | \n",
+ " 2021-09-17 19:10:24.070460+02:00 | \n",
+ " 2021-09-17 19:10:24.076033+02:00 | \n",
+ " NaN | \n",
+ " False | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1594 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 43 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id lastname firstname birthdate email \\\n",
+ "0 1411166 NaN NaN NaN email1411166 \n",
+ "1 478498 lastname478498 firstname478498 NaN email478498 \n",
+ "2 473678 NaN NaN NaN email473678 \n",
+ "3 475026 NaN NaN NaN email475026 \n",
+ "4 487146 NaN NaN NaN email487146 \n",
+ "\n",
+ " street_id created_at \\\n",
+ "0 1 2022-12-19 15:03:39.419371+01:00 \n",
+ "1 339167 2021-09-17 18:58:30.259053+02:00 \n",
+ "2 339167 2021-09-17 18:44:04.119713+02:00 \n",
+ "3 339167 2021-09-17 18:47:28.789618+02:00 \n",
+ "4 339167 2021-09-17 19:10:24.070460+02:00 \n",
+ "\n",
+ " updated_at civility is_partner ... \\\n",
+ "0 2022-12-19 15:03:39.419371+01:00 NaN False ... \n",
+ "1 2023-06-28 15:25:24.146689+02:00 NaN False ... \n",
+ "2 2021-09-17 18:44:04.124204+02:00 NaN False ... \n",
+ "3 2021-09-17 18:47:28.793958+02:00 NaN False ... \n",
+ "4 2021-09-17 19:10:24.076033+02:00 NaN False ... \n",
+ "\n",
+ " preferred_category preferred_supplier preferred_formula purchase_count \\\n",
+ "0 NaN NaN NaN 0 \n",
+ "1 NaN NaN NaN 0 \n",
+ "2 NaN NaN NaN 0 \n",
+ "3 NaN NaN NaN 0 \n",
+ "4 NaN NaN NaN 0 \n",
+ "\n",
+ " first_buying_date last_visiting_date zipcode country age tenant_id \n",
+ "0 NaN NaN NaN fr NaN 1594 \n",
+ "1 NaN NaN NaN NaN NaN 1594 \n",
+ "2 NaN NaN NaN NaN NaN 1594 \n",
+ "3 NaN NaN NaN NaN NaN 1594 \n",
+ "4 NaN NaN NaN NaN NaN 1594 \n",
+ "\n",
+ "[5 rows x 43 columns]"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"file_name = \"8customersplus.csv\"\n",
"file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n",
@@ -213,10 +862,19 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 10,
"id": "87d801fc-d19a-4c45-9b21-9b6d7a8451fd",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "bdc2324-data/8/8structures.csv\n",
+ "No structures database\n"
+ ]
+ }
+ ],
"source": [
"file_name = \"8structures.csv\"\n",
"file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n",
@@ -246,10 +904,124 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 11,
"id": "b6e4c3ea-5ccf-4aec-bd2d-79a5a1194178",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "bdc2324-data/8/8customer_target_mappings.csv\n",
+ "Shape : (1449147, 7)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " customer_id | \n",
+ " target_id | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " name | \n",
+ " extra_field | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 460062 | \n",
+ " 68 | \n",
+ " 2021-09-17 20:20:24.562734+02:00 | \n",
+ " 2021-09-17 20:20:24.562734+02:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 460056 | \n",
+ " 68 | \n",
+ " 2021-09-17 20:20:24.610139+02:00 | \n",
+ " 2021-09-17 20:20:24.610139+02:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 460051 | \n",
+ " 65 | \n",
+ " 2021-09-17 20:20:24.641381+02:00 | \n",
+ " 2021-09-17 20:20:24.641381+02:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 460051 | \n",
+ " 66 | \n",
+ " 2021-09-17 20:20:24.672238+02:00 | \n",
+ " 2021-09-17 20:20:24.672238+02:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 460049 | \n",
+ " 71 | \n",
+ " 2021-09-17 20:20:24.703110+02:00 | \n",
+ " 2021-09-17 20:20:24.703110+02:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id customer_id target_id created_at \\\n",
+ "0 1 460062 68 2021-09-17 20:20:24.562734+02:00 \n",
+ "1 2 460056 68 2021-09-17 20:20:24.610139+02:00 \n",
+ "2 3 460051 65 2021-09-17 20:20:24.641381+02:00 \n",
+ "3 4 460051 66 2021-09-17 20:20:24.672238+02:00 \n",
+ "4 5 460049 71 2021-09-17 20:20:24.703110+02:00 \n",
+ "\n",
+ " updated_at name extra_field \n",
+ "0 2021-09-17 20:20:24.562734+02:00 NaN NaN \n",
+ "1 2021-09-17 20:20:24.610139+02:00 NaN NaN \n",
+ "2 2021-09-17 20:20:24.641381+02:00 NaN NaN \n",
+ "3 2021-09-17 20:20:24.672238+02:00 NaN NaN \n",
+ "4 2021-09-17 20:20:24.703110+02:00 NaN NaN "
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"file_name = \"8customer_target_mappings.csv\"\n",
"file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n",
@@ -267,10 +1039,112 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 12,
"id": "6e81a35c-3c6f-403d-9ebd-e8399ecd4263",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "bdc2324-data/8/8targets.csv\n",
+ "Shape : (331, 5)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " target_type_id | \n",
+ " name | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " ÉTUDIANTS (OPÉ PANIERS) 21-22 | \n",
+ " 2021-09-17 18:10:40.879995+02:00 | \n",
+ " 2021-09-17 18:10:40.879995+02:00 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " EFFECTIF + STAFF 21-22 | \n",
+ " 2021-09-17 18:10:40.894758+02:00 | \n",
+ " 2021-09-17 18:10:40.894758+02:00 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " Acheteurs LOU / USAP | \n",
+ " 2021-09-17 18:10:40.911969+02:00 | \n",
+ " 2021-09-17 18:10:40.911969+02:00 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " Liste Compensation 21-22 | \n",
+ " 2021-09-17 18:10:40.928796+02:00 | \n",
+ " 2021-09-17 18:10:40.928796+02:00 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 1 | \n",
+ " Partenaires 21-22 | \n",
+ " 2021-09-17 18:10:40.945476+02:00 | \n",
+ " 2021-09-17 18:10:40.945476+02:00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id target_type_id name \\\n",
+ "0 1 1 ÉTUDIANTS (OPÉ PANIERS) 21-22 \n",
+ "1 2 1 EFFECTIF + STAFF 21-22 \n",
+ "2 3 1 Acheteurs LOU / USAP \n",
+ "3 4 1 Liste Compensation 21-22 \n",
+ "4 5 1 Partenaires 21-22 \n",
+ "\n",
+ " created_at updated_at \n",
+ "0 2021-09-17 18:10:40.879995+02:00 2021-09-17 18:10:40.879995+02:00 \n",
+ "1 2021-09-17 18:10:40.894758+02:00 2021-09-17 18:10:40.894758+02:00 \n",
+ "2 2021-09-17 18:10:40.911969+02:00 2021-09-17 18:10:40.911969+02:00 \n",
+ "3 2021-09-17 18:10:40.928796+02:00 2021-09-17 18:10:40.928796+02:00 \n",
+ "4 2021-09-17 18:10:40.945476+02:00 2021-09-17 18:10:40.945476+02:00 "
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"file_name = \"8targets.csv\"\n",
"file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n",
@@ -288,10 +1162,107 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 13,
"id": "85696d74-3b2f-4368-9045-44db5322b60d",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "bdc2324-data/8/8target_types.csv\n",
+ "Shape : (4, 6)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " is_import | \n",
+ " name | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " identifier | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " manual_static_filter | \n",
+ " 2021-09-17 18:10:40.864320+02:00 | \n",
+ " 2021-09-17 18:10:40.864320+02:00 | \n",
+ " e34e3aa838a6eb4c41df6ed4444b796a | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " False | \n",
+ " manual_dynamic_filter | \n",
+ " 2022-03-09 14:41:45.695407+01:00 | \n",
+ " 2022-03-09 14:41:45.695407+01:00 | \n",
+ " e0f4b8693184850fefd6d2a38f10584e | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " False | \n",
+ " manual_static_filter | \n",
+ " 2022-04-01 17:02:49.588910+02:00 | \n",
+ " 2022-04-01 17:02:49.588910+02:00 | \n",
+ " fb27e81baa4debc6a4e1a8639c20e808 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4 | \n",
+ " True | \n",
+ " manual_import | \n",
+ " 2022-05-06 14:26:01.923160+02:00 | \n",
+ " 2022-05-06 14:26:01.923160+02:00 | \n",
+ " 12213df2ce68a624e4c0070521437bac | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id is_import name created_at \\\n",
+ "0 1 NaN manual_static_filter 2021-09-17 18:10:40.864320+02:00 \n",
+ "1 2 False manual_dynamic_filter 2022-03-09 14:41:45.695407+01:00 \n",
+ "2 3 False manual_static_filter 2022-04-01 17:02:49.588910+02:00 \n",
+ "3 4 True manual_import 2022-05-06 14:26:01.923160+02:00 \n",
+ "\n",
+ " updated_at identifier \n",
+ "0 2021-09-17 18:10:40.864320+02:00 e34e3aa838a6eb4c41df6ed4444b796a \n",
+ "1 2022-03-09 14:41:45.695407+01:00 e0f4b8693184850fefd6d2a38f10584e \n",
+ "2 2022-04-01 17:02:49.588910+02:00 fb27e81baa4debc6a4e1a8639c20e808 \n",
+ "3 2022-05-06 14:26:01.923160+02:00 12213df2ce68a624e4c0070521437bac "
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"file_name = \"8target_types.csv\"\n",
"file_path = BUCKET + \"/\" + directory_path + \"/\" + file_name\n",
@@ -327,10 +1298,131 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 14,
"id": "7c57529b-2ffb-4039-9795-b27c6fbd54a4",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "File path : bdc2324-data/8/8purchases.csv\n",
+ "Shape : (975703, 7)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " purchase_date | \n",
+ " customer_id | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " number | \n",
+ " identifier | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 119609 | \n",
+ " 2017-09-09 15:39:45.913000+02:00 | \n",
+ " 1149 | \n",
+ " 2021-06-29 21:52:21.816195+02:00 | \n",
+ " 2021-06-29 21:52:21.816195+02:00 | \n",
+ " 193416 | \n",
+ " f2956e2d53321317e7c15c1cb992156c | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 119610 | \n",
+ " 2017-09-09 15:39:46.033000+02:00 | \n",
+ " 1149 | \n",
+ " 2021-06-29 21:52:21.817846+02:00 | \n",
+ " 2021-06-29 21:52:21.817846+02:00 | \n",
+ " 193416 | \n",
+ " faabab441b2668a85bb484490b2166c3 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 5464 | \n",
+ " 2017-07-24 19:44:11.923000+02:00 | \n",
+ " 1251 | \n",
+ " 2021-06-29 21:33:45.604224+02:00 | \n",
+ " 2021-06-29 21:33:45.604224+02:00 | \n",
+ " 184354 | \n",
+ " f63c69fa585ce4f91681f0d9ebeb770f | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 119613 | \n",
+ " 2017-09-10 11:25:45.820000+02:00 | \n",
+ " 12558 | \n",
+ " 2021-06-29 21:52:21.822033+02:00 | \n",
+ " 2021-06-29 21:52:21.822033+02:00 | \n",
+ " 193462 | \n",
+ " ffce5fd8d2348eb6885d0ee9c7bd017c | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1422860 | \n",
+ " 2018-10-08 10:30:42.980000+02:00 | \n",
+ " 17935 | \n",
+ " 2021-07-16 04:20:55.347369+02:00 | \n",
+ " 2021-07-16 04:20:55.347369+02:00 | \n",
+ " 247459 | \n",
+ " 193e41eae8ee078537107a569c0426ef | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id purchase_date customer_id \\\n",
+ "0 119609 2017-09-09 15:39:45.913000+02:00 1149 \n",
+ "1 119610 2017-09-09 15:39:46.033000+02:00 1149 \n",
+ "2 5464 2017-07-24 19:44:11.923000+02:00 1251 \n",
+ "3 119613 2017-09-10 11:25:45.820000+02:00 12558 \n",
+ "4 1422860 2018-10-08 10:30:42.980000+02:00 17935 \n",
+ "\n",
+ " created_at updated_at number \\\n",
+ "0 2021-06-29 21:52:21.816195+02:00 2021-06-29 21:52:21.816195+02:00 193416 \n",
+ "1 2021-06-29 21:52:21.817846+02:00 2021-06-29 21:52:21.817846+02:00 193416 \n",
+ "2 2021-06-29 21:33:45.604224+02:00 2021-06-29 21:33:45.604224+02:00 184354 \n",
+ "3 2021-06-29 21:52:21.822033+02:00 2021-06-29 21:52:21.822033+02:00 193462 \n",
+ "4 2021-07-16 04:20:55.347369+02:00 2021-07-16 04:20:55.347369+02:00 247459 \n",
+ "\n",
+ " identifier \n",
+ "0 f2956e2d53321317e7c15c1cb992156c \n",
+ "1 faabab441b2668a85bb484490b2166c3 \n",
+ "2 f63c69fa585ce4f91681f0d9ebeb770f \n",
+ "3 ffce5fd8d2348eb6885d0ee9c7bd017c \n",
+ "4 193e41eae8ee078537107a569c0426ef "
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"purchases = display_databases(\"8purchases.csv\")\n",
"purchases.head()"
@@ -338,10 +1430,162 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 15,
"id": "903321fb-99f8-475d-b4a6-c70ec2efe190",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "File path : bdc2324-data/8/8tickets.csv\n",
+ "Shape : (2370152, 11)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " number | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " purchase_id | \n",
+ " product_id | \n",
+ " is_from_subscription | \n",
+ " type_of | \n",
+ " supplier_id | \n",
+ " barcode | \n",
+ " identifier | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 254164 | \n",
+ " 193416_763837_650_688_326212 | \n",
+ " 2021-06-29 21:53:14.951871+02:00 | \n",
+ " 2021-06-29 21:53:14.951871+02:00 | \n",
+ " 119609 | \n",
+ " 3334 | \n",
+ " False | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " NaN | \n",
+ " 9ec3b5617fc54512acf131aa5fa26870 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 254165 | \n",
+ " 193416_763838_650_688_326236 | \n",
+ " 2021-06-29 21:53:14.953717+02:00 | \n",
+ " 2021-06-29 21:53:14.953717+02:00 | \n",
+ " 119610 | \n",
+ " 3334 | \n",
+ " False | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " NaN | \n",
+ " b227c664e2574a919672683f5cc4c98e | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 254168 | \n",
+ " 193462_763921_649_687_305676 | \n",
+ " 2021-06-29 21:53:14.958207+02:00 | \n",
+ " 2021-06-29 21:53:14.958207+02:00 | \n",
+ " 119613 | \n",
+ " 3432 | \n",
+ " False | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " NaN | \n",
+ " 28ac507ad84a30993bdfc0996fd2476b | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 254169 | \n",
+ " 193462_763922_649_687_305653 | \n",
+ " 2021-06-29 21:53:14.959681+02:00 | \n",
+ " 2021-06-29 21:53:14.959681+02:00 | \n",
+ " 119614 | \n",
+ " 3268 | \n",
+ " False | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " NaN | \n",
+ " 131dbaeef23f5ac2271bf0266ce35476 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 254170 | \n",
+ " 193462_763923_649_687_305630 | \n",
+ " 2021-06-29 21:53:14.961157+02:00 | \n",
+ " 2021-06-29 21:53:14.961157+02:00 | \n",
+ " 119615 | \n",
+ " 3268 | \n",
+ " False | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " NaN | \n",
+ " 1a6342ad2c213b626aa55e5374cd661a | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id number created_at \\\n",
+ "0 254164 193416_763837_650_688_326212 2021-06-29 21:53:14.951871+02:00 \n",
+ "1 254165 193416_763838_650_688_326236 2021-06-29 21:53:14.953717+02:00 \n",
+ "2 254168 193462_763921_649_687_305676 2021-06-29 21:53:14.958207+02:00 \n",
+ "3 254169 193462_763922_649_687_305653 2021-06-29 21:53:14.959681+02:00 \n",
+ "4 254170 193462_763923_649_687_305630 2021-06-29 21:53:14.961157+02:00 \n",
+ "\n",
+ " updated_at purchase_id product_id \\\n",
+ "0 2021-06-29 21:53:14.951871+02:00 119609 3334 \n",
+ "1 2021-06-29 21:53:14.953717+02:00 119610 3334 \n",
+ "2 2021-06-29 21:53:14.958207+02:00 119613 3432 \n",
+ "3 2021-06-29 21:53:14.959681+02:00 119614 3268 \n",
+ "4 2021-06-29 21:53:14.961157+02:00 119615 3268 \n",
+ "\n",
+ " is_from_subscription type_of supplier_id barcode \\\n",
+ "0 False 1 2 NaN \n",
+ "1 False 1 2 NaN \n",
+ "2 False 1 2 NaN \n",
+ "3 False 1 2 NaN \n",
+ "4 False 1 2 NaN \n",
+ "\n",
+ " identifier \n",
+ "0 9ec3b5617fc54512acf131aa5fa26870 \n",
+ "1 b227c664e2574a919672683f5cc4c98e \n",
+ "2 28ac507ad84a30993bdfc0996fd2476b \n",
+ "3 131dbaeef23f5ac2271bf0266ce35476 \n",
+ "4 1a6342ad2c213b626aa55e5374cd661a "
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"tickets = display_databases(\"8tickets.csv\")\n",
"tickets.head()"
@@ -349,10 +1593,143 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 16,
"id": "243e6942-0233-4cd5-b32b-e005457131d2",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "File path : bdc2324-data/8/8suppliers.csv\n",
+ "Shape : (16, 9)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " name | \n",
+ " manually_added | \n",
+ " label | \n",
+ " itr | \n",
+ " updated_at | \n",
+ " created_at | \n",
+ " commission | \n",
+ " identifier | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 152 | \n",
+ " plateformeceweb | \n",
+ " False | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2021-07-16 00:02:17.805193+02:00 | \n",
+ " 2021-07-16 00:02:17.805193+02:00 | \n",
+ " NaN | \n",
+ " 0fc934f49bfa9f1f4e6ab7e2593b6839 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 6 | \n",
+ " accreditation annuelle | \n",
+ " False | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2021-06-29 21:33:14.138349+02:00 | \n",
+ " 2021-06-29 21:33:14.138349+02:00 | \n",
+ " NaN | \n",
+ " fe13238540e0ff293ec8aad29aeae6c3 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 68 | \n",
+ " abonnement parking | \n",
+ " False | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2021-06-29 22:10:31.167367+02:00 | \n",
+ " 2021-06-29 22:10:31.167367+02:00 | \n",
+ " NaN | \n",
+ " 0f7defc52a97cdca533af74f4e6e5b1e | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 9 | \n",
+ " accreditation match | \n",
+ " False | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2021-06-29 21:33:14.142084+02:00 | \n",
+ " 2021-06-29 21:33:14.142084+02:00 | \n",
+ " NaN | \n",
+ " 40e19a7c4824eaad298e0107ed7e3691 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 154 | \n",
+ " web lnr-lou | \n",
+ " False | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2021-07-16 00:02:17.806521+02:00 | \n",
+ " 2021-07-16 00:02:17.806521+02:00 | \n",
+ " NaN | \n",
+ " b144dd617807b02e0d9002fac6c61768 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id name manually_added label itr \\\n",
+ "0 152 plateformeceweb False NaN NaN \n",
+ "1 6 accreditation annuelle False NaN NaN \n",
+ "2 68 abonnement parking False NaN NaN \n",
+ "3 9 accreditation match False NaN NaN \n",
+ "4 154 web lnr-lou False NaN NaN \n",
+ "\n",
+ " updated_at created_at \\\n",
+ "0 2021-07-16 00:02:17.805193+02:00 2021-07-16 00:02:17.805193+02:00 \n",
+ "1 2021-06-29 21:33:14.138349+02:00 2021-06-29 21:33:14.138349+02:00 \n",
+ "2 2021-06-29 22:10:31.167367+02:00 2021-06-29 22:10:31.167367+02:00 \n",
+ "3 2021-06-29 21:33:14.142084+02:00 2021-06-29 21:33:14.142084+02:00 \n",
+ "4 2021-07-16 00:02:17.806521+02:00 2021-07-16 00:02:17.806521+02:00 \n",
+ "\n",
+ " commission identifier \n",
+ "0 NaN 0fc934f49bfa9f1f4e6ab7e2593b6839 \n",
+ "1 NaN fe13238540e0ff293ec8aad29aeae6c3 \n",
+ "2 NaN 0f7defc52a97cdca533af74f4e6e5b1e \n",
+ "3 NaN 40e19a7c4824eaad298e0107ed7e3691 \n",
+ "4 NaN b144dd617807b02e0d9002fac6c61768 "
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"suppliers = display_databases(\"8suppliers.csv\")\n",
"suppliers.head()"
@@ -368,10 +1745,180 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 17,
"id": "6b82efce-1dee-4d89-8585-28c4ad477eef",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "File path : bdc2324-data/8/8products.csv\n",
+ "Shape : (45411, 14)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " amount | \n",
+ " is_full_price | \n",
+ " representation_id | \n",
+ " pricing_formula_id | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " category_id | \n",
+ " apply_price | \n",
+ " products_group_id | \n",
+ " product_pack_id | \n",
+ " extra_field | \n",
+ " amount_consumption | \n",
+ " identifier | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 90013 | \n",
+ " 0.0 | \n",
+ " False | \n",
+ " 1961 | \n",
+ " 912 | \n",
+ " 2021-07-16 04:56:05.797551+02:00 | \n",
+ " 2021-07-16 04:56:05.797551+02:00 | \n",
+ " 34 | \n",
+ " 0.0 | \n",
+ " 87917 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 476e111175b1660688b7c13dade2b57e | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 662 | \n",
+ " 0.0 | \n",
+ " False | \n",
+ " 11 | \n",
+ " 29 | \n",
+ " 2021-06-29 21:33:17.389201+02:00 | \n",
+ " 2021-06-29 21:33:17.389201+02:00 | \n",
+ " 16 | \n",
+ " 0.0 | \n",
+ " 640 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2c765698e9bedd48e8a3fd27dc8dbc97 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 646 | \n",
+ " 0.0 | \n",
+ " False | \n",
+ " 46 | \n",
+ " 10 | \n",
+ " 2021-06-29 21:33:17.366742+02:00 | \n",
+ " 2021-06-29 21:33:17.366742+02:00 | \n",
+ " 15 | \n",
+ " 0.0 | \n",
+ " 624 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 4e719148651fd7f175e3fb51bdb5d31b | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 5703 | \n",
+ " 5.0 | \n",
+ " False | \n",
+ " 7 | \n",
+ " 188 | \n",
+ " 2021-06-29 21:52:09.374365+02:00 | \n",
+ " 2021-06-29 21:52:09.374365+02:00 | \n",
+ " 4 | \n",
+ " 0.0 | \n",
+ " 5540 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " e4d7beeb0a631e2e51e61951623ba9b1 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 648 | \n",
+ " 0.0 | \n",
+ " False | \n",
+ " 49 | \n",
+ " 10 | \n",
+ " 2021-06-29 21:33:17.369471+02:00 | \n",
+ " 2021-06-29 21:33:17.369471+02:00 | \n",
+ " 15 | \n",
+ " 0.0 | \n",
+ " 626 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 07a5dd9e125345b9458651ab73605255 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id amount is_full_price representation_id pricing_formula_id \\\n",
+ "0 90013 0.0 False 1961 912 \n",
+ "1 662 0.0 False 11 29 \n",
+ "2 646 0.0 False 46 10 \n",
+ "3 5703 5.0 False 7 188 \n",
+ "4 648 0.0 False 49 10 \n",
+ "\n",
+ " created_at updated_at \\\n",
+ "0 2021-07-16 04:56:05.797551+02:00 2021-07-16 04:56:05.797551+02:00 \n",
+ "1 2021-06-29 21:33:17.389201+02:00 2021-06-29 21:33:17.389201+02:00 \n",
+ "2 2021-06-29 21:33:17.366742+02:00 2021-06-29 21:33:17.366742+02:00 \n",
+ "3 2021-06-29 21:52:09.374365+02:00 2021-06-29 21:52:09.374365+02:00 \n",
+ "4 2021-06-29 21:33:17.369471+02:00 2021-06-29 21:33:17.369471+02:00 \n",
+ "\n",
+ " category_id apply_price products_group_id product_pack_id extra_field \\\n",
+ "0 34 0.0 87917 1 NaN \n",
+ "1 16 0.0 640 1 NaN \n",
+ "2 15 0.0 624 1 NaN \n",
+ "3 4 0.0 5540 1 NaN \n",
+ "4 15 0.0 626 1 NaN \n",
+ "\n",
+ " amount_consumption identifier \n",
+ "0 NaN 476e111175b1660688b7c13dade2b57e \n",
+ "1 NaN 2c765698e9bedd48e8a3fd27dc8dbc97 \n",
+ "2 NaN 4e719148651fd7f175e3fb51bdb5d31b \n",
+ "3 NaN e4d7beeb0a631e2e51e61951623ba9b1 \n",
+ "4 NaN 07a5dd9e125345b9458651ab73605255 "
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"products = display_databases(\"8products.csv\")\n",
"products.head()"
@@ -395,10 +1942,125 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 18,
"id": "daf37bff-a26d-4ff5-ad50-c90f917164bd",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "File path : bdc2324-data/8/8pricing_formulas.csv\n",
+ "Shape : (516, 6)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " name | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " extra_field | \n",
+ " identifier | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 7 | \n",
+ " visite stade enfant | \n",
+ " 2021-06-29 21:33:14.160728+02:00 | \n",
+ " 2021-06-29 21:33:14.160728+02:00 | \n",
+ " NaN | \n",
+ " bbc80e5761a0ea325f6f6a5411752659 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 3229 | \n",
+ " tarif bloc etudiants | \n",
+ " 2021-07-16 04:20:46.684601+02:00 | \n",
+ " 2021-09-03 16:44:46.096785+02:00 | \n",
+ " NaN | \n",
+ " 205122cc7e96d559330972b0ec0cf35a | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 42 | \n",
+ " invitation eiffage | \n",
+ " 2021-06-29 21:33:14.204483+02:00 | \n",
+ " 2021-06-29 21:33:14.204483+02:00 | \n",
+ " NaN | \n",
+ " e4e6365c02e2a7b01ebe2ce8ace624f2 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4379 | \n",
+ " invitation offre speciale | \n",
+ " 2021-07-16 05:21:44.984893+02:00 | \n",
+ " 2021-07-16 05:21:44.984893+02:00 | \n",
+ " NaN | \n",
+ " 307817b6205535a35915a64027ee161e | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 2641 | \n",
+ " prevente reabo enfant | \n",
+ " 2021-07-16 03:47:40.896805+02:00 | \n",
+ " 2021-09-03 16:08:35.304298+02:00 | \n",
+ " NaN | \n",
+ " 478eb63c71ba35d8d3d64c8637dafdee | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id name created_at \\\n",
+ "0 7 visite stade enfant 2021-06-29 21:33:14.160728+02:00 \n",
+ "1 3229 tarif bloc etudiants 2021-07-16 04:20:46.684601+02:00 \n",
+ "2 42 invitation eiffage 2021-06-29 21:33:14.204483+02:00 \n",
+ "3 4379 invitation offre speciale 2021-07-16 05:21:44.984893+02:00 \n",
+ "4 2641 prevente reabo enfant 2021-07-16 03:47:40.896805+02:00 \n",
+ "\n",
+ " updated_at extra_field \\\n",
+ "0 2021-06-29 21:33:14.160728+02:00 NaN \n",
+ "1 2021-09-03 16:44:46.096785+02:00 NaN \n",
+ "2 2021-06-29 21:33:14.204483+02:00 NaN \n",
+ "3 2021-07-16 05:21:44.984893+02:00 NaN \n",
+ "4 2021-09-03 16:08:35.304298+02:00 NaN \n",
+ "\n",
+ " identifier \n",
+ "0 bbc80e5761a0ea325f6f6a5411752659 \n",
+ "1 205122cc7e96d559330972b0ec0cf35a \n",
+ "2 e4e6365c02e2a7b01ebe2ce8ace624f2 \n",
+ "3 307817b6205535a35915a64027ee161e \n",
+ "4 478eb63c71ba35d8d3d64c8637dafdee "
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"pricing_formulas = display_databases(\"8pricing_formulas.csv\")\n",
"pricing_formulas.head()"
@@ -406,10 +2068,118 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 19,
"id": "cdb14488-b093-4b39-84fa-1c2b4576208f",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "File path : bdc2324-data/8/8type_of_pricing_formulas.csv\n",
+ "Shape : (103, 6)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " type_of_id | \n",
+ " pricing_formula_id | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " identifier | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 7 | \n",
+ " 1021 | \n",
+ " 2021-09-03 14:17:19.816110+02:00 | \n",
+ " 2021-09-03 14:17:19.816110+02:00 | \n",
+ " 41047fbeb7cd3e1cb2713c608d2f786d | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 7 | \n",
+ " 4305 | \n",
+ " 2021-09-03 14:17:19.848088+02:00 | \n",
+ " 2021-09-03 14:17:19.848088+02:00 | \n",
+ " a62a4dad7d62738129244bbb5ede0747 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 7 | \n",
+ " 4306 | \n",
+ " 2021-09-03 14:17:19.864067+02:00 | \n",
+ " 2021-09-03 14:17:19.864067+02:00 | \n",
+ " c3770373e09f55412068c447736d9da3 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 7 | \n",
+ " 29 | \n",
+ " 2021-09-03 14:17:19.880078+02:00 | \n",
+ " 2021-09-03 14:17:19.880078+02:00 | \n",
+ " 7b7b1242ae7a8c9eb66d35d8a4348ccd | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 8 | \n",
+ " 10 | \n",
+ " 2021-09-03 14:18:03.616081+02:00 | \n",
+ " 2021-09-03 14:18:03.616081+02:00 | \n",
+ " 0a2b941c46b31258c03b316aa064e86a | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id type_of_id pricing_formula_id created_at \\\n",
+ "0 1 7 1021 2021-09-03 14:17:19.816110+02:00 \n",
+ "1 2 7 4305 2021-09-03 14:17:19.848088+02:00 \n",
+ "2 3 7 4306 2021-09-03 14:17:19.864067+02:00 \n",
+ "3 4 7 29 2021-09-03 14:17:19.880078+02:00 \n",
+ "4 5 8 10 2021-09-03 14:18:03.616081+02:00 \n",
+ "\n",
+ " updated_at identifier \n",
+ "0 2021-09-03 14:17:19.816110+02:00 41047fbeb7cd3e1cb2713c608d2f786d \n",
+ "1 2021-09-03 14:17:19.848088+02:00 a62a4dad7d62738129244bbb5ede0747 \n",
+ "2 2021-09-03 14:17:19.864067+02:00 c3770373e09f55412068c447736d9da3 \n",
+ "3 2021-09-03 14:17:19.880078+02:00 7b7b1242ae7a8c9eb66d35d8a4348ccd \n",
+ "4 2021-09-03 14:18:03.616081+02:00 0a2b941c46b31258c03b316aa064e86a "
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"type_pricing_formulas = display_databases(\"8type_of_pricing_formulas.csv\")\n",
"type_pricing_formulas.head()"
@@ -433,10 +2203,131 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 20,
"id": "6582694d-5339-4f33-a943-c73033121a90",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "File path : bdc2324-data/8/8categories.csv\n",
+ "Shape : (148, 7)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " name | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " extra_field | \n",
+ " quota | \n",
+ " identifier | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 653 | \n",
+ " acces village implid | \n",
+ " 2021-07-16 00:04:37.181331+02:00 | \n",
+ " 2021-07-16 00:04:37.181331+02:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " c447d053646a6503d3cd84d4798bf5b7 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 805 | \n",
+ " parking organisation | \n",
+ " 2021-07-16 01:54:15.822407+02:00 | \n",
+ " 2021-07-16 01:54:15.822407+02:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 02bf9871964345f505ad305080daec36 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 809 | \n",
+ " rose rouge orange | \n",
+ " 2021-07-16 01:54:15.825345+02:00 | \n",
+ " 2021-07-16 01:54:15.825345+02:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 31fb5b57bc1a2bcd5c155fb0d9e7c0dd | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 2183 | \n",
+ " 2eme catégorie j.b. centrale | \n",
+ " 2021-07-16 04:37:25.446835+02:00 | \n",
+ " 2021-07-16 04:37:25.446835+02:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " c9eb6651caaed42b809b3f4407a847c9 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 621 | \n",
+ " acces brasserie | \n",
+ " 2021-07-16 00:02:17.249701+02:00 | \n",
+ " 2021-07-16 00:02:17.249701+02:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 349e6a59585d78d80d46acbc6a520c50 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id name created_at \\\n",
+ "0 653 acces village implid 2021-07-16 00:04:37.181331+02:00 \n",
+ "1 805 parking organisation 2021-07-16 01:54:15.822407+02:00 \n",
+ "2 809 rose rouge orange 2021-07-16 01:54:15.825345+02:00 \n",
+ "3 2183 2eme catégorie j.b. centrale 2021-07-16 04:37:25.446835+02:00 \n",
+ "4 621 acces brasserie 2021-07-16 00:02:17.249701+02:00 \n",
+ "\n",
+ " updated_at extra_field quota \\\n",
+ "0 2021-07-16 00:04:37.181331+02:00 NaN NaN \n",
+ "1 2021-07-16 01:54:15.822407+02:00 NaN NaN \n",
+ "2 2021-07-16 01:54:15.825345+02:00 NaN NaN \n",
+ "3 2021-07-16 04:37:25.446835+02:00 NaN NaN \n",
+ "4 2021-07-16 00:02:17.249701+02:00 NaN NaN \n",
+ "\n",
+ " identifier \n",
+ "0 c447d053646a6503d3cd84d4798bf5b7 \n",
+ "1 02bf9871964345f505ad305080daec36 \n",
+ "2 31fb5b57bc1a2bcd5c155fb0d9e7c0dd \n",
+ "3 c9eb6651caaed42b809b3f4407a847c9 \n",
+ "4 349e6a59585d78d80d46acbc6a520c50 "
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"categories = display_databases(\"8categories.csv\")\n",
"categories.head()"
@@ -444,10 +2335,118 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 21,
"id": "589076df-1958-42de-9941-1aff9fa8536f",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "File path : bdc2324-data/8/8type_of_categories.csv\n",
+ "Shape : (6, 6)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " type_of_id | \n",
+ " category_id | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " identifier | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 2021-08-20 15:22:05.558209+02:00 | \n",
+ " 2021-08-20 15:22:05.558209+02:00 | \n",
+ " af8fa6d57f6b19a7600a69e7771c7c3a | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2021-09-02 17:29:32.582002+02:00 | \n",
+ " 2021-09-02 17:29:32.582002+02:00 | \n",
+ " 63718e7ad306912427758ddf988ad34f | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 2021-09-02 17:32:38.299733+02:00 | \n",
+ " 2021-09-02 17:32:38.299733+02:00 | \n",
+ " 5e147d4d90888df14c4584f5c6887c96 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 4 | \n",
+ " 4 | \n",
+ " 2021-09-02 17:35:04.748993+02:00 | \n",
+ " 2021-09-02 17:35:04.748993+02:00 | \n",
+ " a9dfdc3f40b41e3018933c6167fc38a5 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 5 | \n",
+ " 17 | \n",
+ " 2021-09-02 17:35:37.396740+02:00 | \n",
+ " 2021-09-02 17:35:37.396740+02:00 | \n",
+ " c05b0061d2a875adbc35d3dfa6a50a12 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id type_of_id category_id created_at \\\n",
+ "0 1 1 2 2021-08-20 15:22:05.558209+02:00 \n",
+ "1 2 2 1 2021-09-02 17:29:32.582002+02:00 \n",
+ "2 3 3 3 2021-09-02 17:32:38.299733+02:00 \n",
+ "3 4 4 4 2021-09-02 17:35:04.748993+02:00 \n",
+ "4 5 5 17 2021-09-02 17:35:37.396740+02:00 \n",
+ "\n",
+ " updated_at identifier \n",
+ "0 2021-08-20 15:22:05.558209+02:00 af8fa6d57f6b19a7600a69e7771c7c3a \n",
+ "1 2021-09-02 17:29:32.582002+02:00 63718e7ad306912427758ddf988ad34f \n",
+ "2 2021-09-02 17:32:38.299733+02:00 5e147d4d90888df14c4584f5c6887c96 \n",
+ "3 2021-09-02 17:35:04.748993+02:00 a9dfdc3f40b41e3018933c6167fc38a5 \n",
+ "4 2021-09-02 17:35:37.396740+02:00 c05b0061d2a875adbc35d3dfa6a50a12 "
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"type_categories = display_databases(\"8type_of_categories.csv\")\n",
"type_categories.head()"
@@ -473,10 +2472,124 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 22,
"id": "6f06d72a-5725-4eee-8e4c-e9ef5820f346",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "File path : bdc2324-data/8/8representation_category_capacities.csv\n",
+ "Shape : (7378, 7)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " representation_id | \n",
+ " category_id | \n",
+ " expected_filling | \n",
+ " max_filling | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 561 | \n",
+ " 2021-06-29 21:33:14.096827+02:00 | \n",
+ " 2021-06-29 21:33:14.096827+02:00 | \n",
+ " 17 | \n",
+ " 37 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 571 | \n",
+ " 2021-06-29 21:33:14.110047+02:00 | \n",
+ " 2021-06-29 21:33:14.110047+02:00 | \n",
+ " 14 | \n",
+ " 39 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 9665 | \n",
+ " 2021-07-16 00:02:17.736387+02:00 | \n",
+ " 2021-07-16 00:02:17.736387+02:00 | \n",
+ " 1887 | \n",
+ " 8 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 383906 | \n",
+ " 2023-03-04 02:55:01.585418+01:00 | \n",
+ " 2023-03-04 02:55:01.585418+01:00 | \n",
+ " 52729 | \n",
+ " 476 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 393 | \n",
+ " 2021-06-29 21:33:13.876766+02:00 | \n",
+ " 2021-06-29 21:33:13.876766+02:00 | \n",
+ " 9 | \n",
+ " 23 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id created_at updated_at \\\n",
+ "0 561 2021-06-29 21:33:14.096827+02:00 2021-06-29 21:33:14.096827+02:00 \n",
+ "1 571 2021-06-29 21:33:14.110047+02:00 2021-06-29 21:33:14.110047+02:00 \n",
+ "2 9665 2021-07-16 00:02:17.736387+02:00 2021-07-16 00:02:17.736387+02:00 \n",
+ "3 383906 2023-03-04 02:55:01.585418+01:00 2023-03-04 02:55:01.585418+01:00 \n",
+ "4 393 2021-06-29 21:33:13.876766+02:00 2021-06-29 21:33:13.876766+02:00 \n",
+ "\n",
+ " representation_id category_id expected_filling max_filling \n",
+ "0 17 37 NaN NaN \n",
+ "1 14 39 NaN NaN \n",
+ "2 1887 8 NaN NaN \n",
+ "3 52729 476 NaN NaN \n",
+ "4 9 23 NaN NaN "
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"representation_category_capacities = display_databases(\"8representation_category_capacities.csv\")\n",
"representation_category_capacities.head()"
@@ -484,10 +2597,199 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 23,
"id": "bd405913-033d-4f15-a5b9-103d577baaff",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "File path : bdc2324-data/8/8representations.csv\n",
+ "Shape : (1015, 16)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " serial | \n",
+ " event_id | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " start_date_time | \n",
+ " open | \n",
+ " satisfaction | \n",
+ " end_date_time | \n",
+ " name | \n",
+ " is_display | \n",
+ " representation_type_id | \n",
+ " expected_filling | \n",
+ " max_filling | \n",
+ " extra_field | \n",
+ " identifier | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 5903 | \n",
+ " NaN | \n",
+ " 5836 | \n",
+ " 2021-07-16 05:16:57.419565+02:00 | \n",
+ " 2021-07-16 05:16:57.419565+02:00 | \n",
+ " 2019-08-24 18:00:00+02:00 | \n",
+ " True | \n",
+ " NaN | \n",
+ " 1901-01-01 00:09:21+00:09 | \n",
+ " NaN | \n",
+ " True | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 8009c34cae4e79e3781f16f3ceeab244 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 67133 | \n",
+ " NaN | \n",
+ " 65652 | \n",
+ " 2023-09-27 02:21:36.573001+02:00 | \n",
+ " 2023-09-27 02:21:36.573001+02:00 | \n",
+ " 2023-10-04 10:30:00+02:00 | \n",
+ " True | \n",
+ " NaN | \n",
+ " 1901-01-01 00:09:21+00:09 | \n",
+ " NaN | \n",
+ " True | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 4e9d3fc8d1f7bf563dc586548fe6390e | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 1874 | \n",
+ " NaN | \n",
+ " 1826 | \n",
+ " 2021-07-16 00:02:17.390274+02:00 | \n",
+ " 2021-07-16 00:02:17.390274+02:00 | \n",
+ " 2019-09-14 18:00:00+02:00 | \n",
+ " True | \n",
+ " NaN | \n",
+ " 1901-01-01 00:09:21+00:09 | \n",
+ " NaN | \n",
+ " True | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 19f666370c1fc781dff638c20ae04c8a | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 5904 | \n",
+ " NaN | \n",
+ " 5837 | \n",
+ " 2021-07-16 05:16:57.420302+02:00 | \n",
+ " 2021-07-16 05:16:57.420302+02:00 | \n",
+ " 2019-09-01 17:05:00+02:00 | \n",
+ " True | \n",
+ " NaN | \n",
+ " 1901-01-01 00:09:21+00:09 | \n",
+ " NaN | \n",
+ " True | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 4221acd3f49179f5d0b292c15d1ab8e4 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 4165 | \n",
+ " NaN | \n",
+ " 4106 | \n",
+ " 2021-07-16 03:53:05.929713+02:00 | \n",
+ " 2021-07-16 03:53:05.929713+02:00 | \n",
+ " 2018-10-14 14:00:00+02:00 | \n",
+ " True | \n",
+ " NaN | \n",
+ " 1901-01-01 00:09:21+00:09 | \n",
+ " NaN | \n",
+ " True | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 733104286519c0614b2d45470eb180a1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id serial event_id created_at \\\n",
+ "0 5903 NaN 5836 2021-07-16 05:16:57.419565+02:00 \n",
+ "1 67133 NaN 65652 2023-09-27 02:21:36.573001+02:00 \n",
+ "2 1874 NaN 1826 2021-07-16 00:02:17.390274+02:00 \n",
+ "3 5904 NaN 5837 2021-07-16 05:16:57.420302+02:00 \n",
+ "4 4165 NaN 4106 2021-07-16 03:53:05.929713+02:00 \n",
+ "\n",
+ " updated_at start_date_time open \\\n",
+ "0 2021-07-16 05:16:57.419565+02:00 2019-08-24 18:00:00+02:00 True \n",
+ "1 2023-09-27 02:21:36.573001+02:00 2023-10-04 10:30:00+02:00 True \n",
+ "2 2021-07-16 00:02:17.390274+02:00 2019-09-14 18:00:00+02:00 True \n",
+ "3 2021-07-16 05:16:57.420302+02:00 2019-09-01 17:05:00+02:00 True \n",
+ "4 2021-07-16 03:53:05.929713+02:00 2018-10-14 14:00:00+02:00 True \n",
+ "\n",
+ " satisfaction end_date_time name is_display \\\n",
+ "0 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
+ "1 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
+ "2 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
+ "3 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
+ "4 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
+ "\n",
+ " representation_type_id expected_filling max_filling extra_field \\\n",
+ "0 NaN NaN NaN NaN \n",
+ "1 NaN NaN NaN NaN \n",
+ "2 NaN NaN NaN NaN \n",
+ "3 NaN NaN NaN NaN \n",
+ "4 NaN NaN NaN NaN \n",
+ "\n",
+ " identifier \n",
+ "0 8009c34cae4e79e3781f16f3ceeab244 \n",
+ "1 4e9d3fc8d1f7bf563dc586548fe6390e \n",
+ "2 19f666370c1fc781dff638c20ae04c8a \n",
+ "3 4221acd3f49179f5d0b292c15d1ab8e4 \n",
+ "4 733104286519c0614b2d45470eb180a1 "
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"representations = display_databases(\"8representations.csv\")\n",
"representations.head()"
@@ -495,7 +2797,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 24,
"id": "0f2c7ea3-6964-48fd-9411-17547b2c3a3f",
"metadata": {},
"outputs": [],
@@ -521,10 +2823,168 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 25,
"id": "cba22ee2-338d-4ce1-a1e8-829a11a94bcf",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "File path : bdc2324-data/8/8events.csv\n",
+ "Shape : (922, 12)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " season_id | \n",
+ " facility_id | \n",
+ " name | \n",
+ " event_type_id | \n",
+ " manual_added | \n",
+ " is_display | \n",
+ " event_type_key_id | \n",
+ " facility_key_id | \n",
+ " identifier | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 41542 | \n",
+ " 2022-10-29 02:54:32.756920+02:00 | \n",
+ " 2022-10-29 02:57:35.511792+02:00 | \n",
+ " 52 | \n",
+ " 1 | \n",
+ " match lou feminin - lons | \n",
+ " 5588 | \n",
+ " False | \n",
+ " True | \n",
+ " 5588 | \n",
+ " 1 | \n",
+ " 40cc5a346b1af4ee7108ac28b144fb77 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 21068 | \n",
+ " 2021-12-17 03:43:53.166446+01:00 | \n",
+ " 2021-12-17 03:46:40.346096+01:00 | \n",
+ " 51 | \n",
+ " 1 | \n",
+ " repas brasserie lou-racing | \n",
+ " 2310 | \n",
+ " False | \n",
+ " True | \n",
+ " 2310 | \n",
+ " 1 | \n",
+ " 500b670b79aa592ecb06f4957800a752 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 59812 | \n",
+ " 2023-05-26 01:45:54.321665+02:00 | \n",
+ " 2023-05-26 01:46:01.571397+02:00 | \n",
+ " 1501 | \n",
+ " 2 | \n",
+ " parking match 2 | \n",
+ " 10185 | \n",
+ " False | \n",
+ " True | \n",
+ " 10185 | \n",
+ " 2 | \n",
+ " d5f62ed879867b8b51ed7b85f1fc3ab0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 3424 | \n",
+ " 2021-07-16 03:13:06.988358+02:00 | \n",
+ " 2021-07-16 05:33:31.321933+02:00 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " rugby + hockey sur glace | \n",
+ " 5 | \n",
+ " False | \n",
+ " True | \n",
+ " 5 | \n",
+ " 1 | \n",
+ " 822b47176c355a647aa2dbdf8dfbc594 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 21379 | \n",
+ " 2021-12-23 02:37:22.948114+01:00 | \n",
+ " 2021-12-23 02:38:20.726329+01:00 | \n",
+ " 51 | \n",
+ " 1 | \n",
+ " bloc des etudiants lou-racing | \n",
+ " 2562 | \n",
+ " False | \n",
+ " True | \n",
+ " 2562 | \n",
+ " 1 | \n",
+ " 17b91f19c71ff6287ffc1f44af952576 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id created_at updated_at \\\n",
+ "0 41542 2022-10-29 02:54:32.756920+02:00 2022-10-29 02:57:35.511792+02:00 \n",
+ "1 21068 2021-12-17 03:43:53.166446+01:00 2021-12-17 03:46:40.346096+01:00 \n",
+ "2 59812 2023-05-26 01:45:54.321665+02:00 2023-05-26 01:46:01.571397+02:00 \n",
+ "3 3424 2021-07-16 03:13:06.988358+02:00 2021-07-16 05:33:31.321933+02:00 \n",
+ "4 21379 2021-12-23 02:37:22.948114+01:00 2021-12-23 02:38:20.726329+01:00 \n",
+ "\n",
+ " season_id facility_id name event_type_id \\\n",
+ "0 52 1 match lou feminin - lons 5588 \n",
+ "1 51 1 repas brasserie lou-racing 2310 \n",
+ "2 1501 2 parking match 2 10185 \n",
+ "3 1 1 rugby + hockey sur glace 5 \n",
+ "4 51 1 bloc des etudiants lou-racing 2562 \n",
+ "\n",
+ " manual_added is_display event_type_key_id facility_key_id \\\n",
+ "0 False True 5588 1 \n",
+ "1 False True 2310 1 \n",
+ "2 False True 10185 2 \n",
+ "3 False True 5 1 \n",
+ "4 False True 2562 1 \n",
+ "\n",
+ " identifier \n",
+ "0 40cc5a346b1af4ee7108ac28b144fb77 \n",
+ "1 500b670b79aa592ecb06f4957800a752 \n",
+ "2 d5f62ed879867b8b51ed7b85f1fc3ab0 \n",
+ "3 822b47176c355a647aa2dbdf8dfbc594 \n",
+ "4 17b91f19c71ff6287ffc1f44af952576 "
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"events = display_databases(\"8events.csv\")\n",
"events.head()"
@@ -532,10 +2992,125 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 26,
"id": "3db00b9d-2187-4cb6-980d-8ac6ab9eb460",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "File path : bdc2324-data/8/8event_types.csv\n",
+ "Shape : (73, 6)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " name | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " fidelity_delay | \n",
+ " identifier | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " standard | \n",
+ " 2021-06-29 13:52:10.434850+02:00 | \n",
+ " 2021-06-29 13:52:10.434850+02:00 | \n",
+ " 36 | \n",
+ " c00f0c4675b91fb8b918e4079a0b1bac | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 11 | \n",
+ " ptit lou | \n",
+ " 2021-06-29 21:33:13.000743+02:00 | \n",
+ " 2021-06-29 21:33:13.000743+02:00 | \n",
+ " 36 | \n",
+ " dedd3579bc13b3ed7a90277247d9944b | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 274 | \n",
+ " parking 19-20 | \n",
+ " 2021-07-16 00:02:17.225410+02:00 | \n",
+ " 2021-07-16 00:02:17.225410+02:00 | \n",
+ " 36 | \n",
+ " 0d348caeec0b66f9d4987dfbe30e1e8b | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 129 | \n",
+ " events 2018-2019 | \n",
+ " 2021-06-30 01:35:18.110429+02:00 | \n",
+ " 2021-06-30 01:35:18.110429+02:00 | \n",
+ " 36 | \n",
+ " 65eb39ddf8f79d28d93c2f2c53118f50 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 10 | \n",
+ " accreditations 2017-2018 | \n",
+ " 2021-06-29 21:33:12.999510+02:00 | \n",
+ " 2021-06-29 21:33:12.999510+02:00 | \n",
+ " 36 | \n",
+ " 732cfdcf2065fa0005faf42793ddd76c | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id name created_at \\\n",
+ "0 1 standard 2021-06-29 13:52:10.434850+02:00 \n",
+ "1 11 ptit lou 2021-06-29 21:33:13.000743+02:00 \n",
+ "2 274 parking 19-20 2021-07-16 00:02:17.225410+02:00 \n",
+ "3 129 events 2018-2019 2021-06-30 01:35:18.110429+02:00 \n",
+ "4 10 accreditations 2017-2018 2021-06-29 21:33:12.999510+02:00 \n",
+ "\n",
+ " updated_at fidelity_delay \\\n",
+ "0 2021-06-29 13:52:10.434850+02:00 36 \n",
+ "1 2021-06-29 21:33:13.000743+02:00 36 \n",
+ "2 2021-07-16 00:02:17.225410+02:00 36 \n",
+ "3 2021-06-30 01:35:18.110429+02:00 36 \n",
+ "4 2021-06-29 21:33:12.999510+02:00 36 \n",
+ "\n",
+ " identifier \n",
+ "0 c00f0c4675b91fb8b918e4079a0b1bac \n",
+ "1 dedd3579bc13b3ed7a90277247d9944b \n",
+ "2 0d348caeec0b66f9d4987dfbe30e1e8b \n",
+ "3 65eb39ddf8f79d28d93c2f2c53118f50 \n",
+ "4 732cfdcf2065fa0005faf42793ddd76c "
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"event_types = display_databases(\"8event_types.csv\")\n",
"event_types.head()"
@@ -543,10 +3118,125 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 27,
"id": "cba0ee58-6280-45fe-99b3-0be09db5922b",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "File path : bdc2324-data/8/8seasons.csv\n",
+ "Shape : (16, 6)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " name | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " start_date_time | \n",
+ " identifier | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1501 | \n",
+ " saison 2023-2024 | \n",
+ " 2022-06-25 03:07:31.209270+02:00 | \n",
+ " 2022-06-25 03:07:31.209270+02:00 | \n",
+ " NaN | \n",
+ " 71f5c069ce45c5e933dcc37c22507fbf | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1194 | \n",
+ " saison 2049-2050 | \n",
+ " 2022-02-17 03:24:23.942691+01:00 | \n",
+ " 2022-02-17 03:24:23.942691+01:00 | \n",
+ " NaN | \n",
+ " 44e20620bbc5926db2e295d38b606afd | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2 | \n",
+ " saison 2016-2017 | \n",
+ " 2021-06-29 21:33:00.702563+02:00 | \n",
+ " 2021-06-29 21:33:00.702563+02:00 | \n",
+ " NaN | \n",
+ " f9cf989d4f49300220df67ef93aa2294 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 47 | \n",
+ " saison 2018-2019 | \n",
+ " 2021-06-30 01:35:15.156097+02:00 | \n",
+ " 2021-06-30 01:35:15.156097+02:00 | \n",
+ " NaN | \n",
+ " eec50c35fbf8593b364ced287335d90c | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 100 | \n",
+ " saison 2010-2011 | \n",
+ " 2021-07-16 00:23:27.607648+02:00 | \n",
+ " 2021-07-16 00:23:27.607648+02:00 | \n",
+ " NaN | \n",
+ " 7ccc51049a85e0df9b80662e45b6ddb8 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id name created_at \\\n",
+ "0 1501 saison 2023-2024 2022-06-25 03:07:31.209270+02:00 \n",
+ "1 1194 saison 2049-2050 2022-02-17 03:24:23.942691+01:00 \n",
+ "2 2 saison 2016-2017 2021-06-29 21:33:00.702563+02:00 \n",
+ "3 47 saison 2018-2019 2021-06-30 01:35:15.156097+02:00 \n",
+ "4 100 saison 2010-2011 2021-07-16 00:23:27.607648+02:00 \n",
+ "\n",
+ " updated_at start_date_time \\\n",
+ "0 2022-06-25 03:07:31.209270+02:00 NaN \n",
+ "1 2022-02-17 03:24:23.942691+01:00 NaN \n",
+ "2 2021-06-29 21:33:00.702563+02:00 NaN \n",
+ "3 2021-06-30 01:35:15.156097+02:00 NaN \n",
+ "4 2021-07-16 00:23:27.607648+02:00 NaN \n",
+ "\n",
+ " identifier \n",
+ "0 71f5c069ce45c5e933dcc37c22507fbf \n",
+ "1 44e20620bbc5926db2e295d38b606afd \n",
+ "2 f9cf989d4f49300220df67ef93aa2294 \n",
+ "3 eec50c35fbf8593b364ced287335d90c \n",
+ "4 7ccc51049a85e0df9b80662e45b6ddb8 "
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"seasons = display_databases(\"8seasons.csv\")\n",
"seasons.head()"
@@ -554,10 +3244,131 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 28,
"id": "6fa82fd7-d6d3-4857-af24-ea573b1129d0",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "File path : bdc2324-data/8/8facilities.csv\n",
+ "Shape : (5, 7)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " name | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " street_id | \n",
+ " fixed_capacity | \n",
+ " identifier | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 74 | \n",
+ " plan pour campagne d'abo 2011/2012 | \n",
+ " 2021-07-16 00:23:30.337698+02:00 | \n",
+ " 2021-07-16 00:23:30.337698+02:00 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " 2e1d25d5f7e46e23c734fe0e4951390e | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 3 | \n",
+ " accreditation | \n",
+ " 2021-06-29 21:33:13.018552+02:00 | \n",
+ " 2021-06-29 21:33:13.018552+02:00 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " da37a04e592cbd344142730ce05a6887 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 4 | \n",
+ " organisation match exterieur | \n",
+ " 2021-06-29 21:33:13.019878+02:00 | \n",
+ " 2021-06-29 21:33:13.019878+02:00 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " 8f9ee8c2e954585f7c68096d7f1cf4f1 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 2 | \n",
+ " parking matmut stadium | \n",
+ " 2021-06-29 21:33:13.017165+02:00 | \n",
+ " 2021-06-29 21:33:13.017165+02:00 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " aeab282982ea738674dbf5c3763a0be0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1 | \n",
+ " matmut stadium | \n",
+ " 2021-06-29 21:33:13.004560+02:00 | \n",
+ " 2021-06-29 21:33:13.004560+02:00 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " 89feffd283ebdabdc3b81fb62ea4f6f0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id name created_at \\\n",
+ "0 74 plan pour campagne d'abo 2011/2012 2021-07-16 00:23:30.337698+02:00 \n",
+ "1 3 accreditation 2021-06-29 21:33:13.018552+02:00 \n",
+ "2 4 organisation match exterieur 2021-06-29 21:33:13.019878+02:00 \n",
+ "3 2 parking matmut stadium 2021-06-29 21:33:13.017165+02:00 \n",
+ "4 1 matmut stadium 2021-06-29 21:33:13.004560+02:00 \n",
+ "\n",
+ " updated_at street_id fixed_capacity \\\n",
+ "0 2021-07-16 00:23:30.337698+02:00 1 NaN \n",
+ "1 2021-06-29 21:33:13.018552+02:00 1 NaN \n",
+ "2 2021-06-29 21:33:13.019878+02:00 1 NaN \n",
+ "3 2021-06-29 21:33:13.017165+02:00 1 NaN \n",
+ "4 2021-06-29 21:33:13.004560+02:00 1 NaN \n",
+ "\n",
+ " identifier \n",
+ "0 2e1d25d5f7e46e23c734fe0e4951390e \n",
+ "1 da37a04e592cbd344142730ce05a6887 \n",
+ "2 8f9ee8c2e954585f7c68096d7f1cf4f1 \n",
+ "3 aeab282982ea738674dbf5c3763a0be0 \n",
+ "4 89feffd283ebdabdc3b81fb62ea4f6f0 "
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"facilities = display_databases(\"8facilities.csv\")\n",
"facilities.head()"
@@ -597,7 +3408,7 @@
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 29,
"id": "c240b811-48a6-4501-9e70-bc51d69e3ac4",
"metadata": {},
"outputs": [],
@@ -613,7 +3424,7 @@
},
{
"cell_type": "code",
- "execution_count": 24,
+ "execution_count": 30,
"id": "54057367-9df9-42f4-aa07-bf524bb76462",
"metadata": {},
"outputs": [
@@ -634,7 +3445,7 @@
},
{
"cell_type": "code",
- "execution_count": 39,
+ "execution_count": 31,
"id": "63914e20-9efc-4088-877b-edab5f225d00",
"metadata": {},
"outputs": [
@@ -670,13 +3481,3683 @@
"## Create Universal database"
]
},
+ {
+ "cell_type": "markdown",
+ "id": "7e460fbe-5067-4998-a1a8-9e3d07401750",
+ "metadata": {},
+ "source": [
+ "We will first create a procedure to clean the datasets of a company and then merge them. Hence, we will be able to replicate this procedure for all companies and create a universal database.\n",
+ "\n",
+ "Let's first create our procedure for company 1 and the datasets belonging to the products theme"
+ ]
+ },
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 32,
+ "id": "590a132a-4f57-4ea3-a282-2ef913e4b753",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "directory_path = '1'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
"id": "0fbebfb7-a827-46b1-890b-86c9def7cdbb",
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "theme_products = [\"products.csv\" ,\"categories.csv\", \"type_of_categories.csv\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "id": "b8aa5f8f-845e-4ee5-b80d-38b7061a94a2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def remove_horodates(df):\n",
+ " \"\"\"\n",
+ " this function removes timestamp columns such as created_at and updated_at\n",
+ " \"\"\"\n",
+ " df = df.drop(columns = [\"created_at\", \"updated_at\"])\n",
+ " return df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "id": "2c478213-09ae-44ef-8c7c-125bcb571642",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def order_columns_id(df):\n",
+ " \"\"\"\n",
+ " this function moves every column whose name contains 'id' to the front to make the dataset easier to read\n",
+ " \"\"\"\n",
+ " substring = 'id'\n",
+ " id_columns = [col for col in df.columns if substring in col]\n",
+ " remaining_col = [col for col in df.columns if substring not in col]\n",
+ " new_order = id_columns + remaining_col\n",
+ " return df[new_order]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "id": "327e44b0-eb99-4022-b4ca-79548072f0f0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def percent_na(df):\n",
+ " \"\"\"\n",
+ " this function returns the percentage of na for each column\n",
+ " \"\"\"\n",
+ " percent_missing = df.isna().sum() * 100 / len(df)\n",
+ " return percent_missing"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "id": "10926def-267f-4e86-b2c9-72e27ff9a9df",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def process_df(df):\n",
+ " df = remove_horodates(df)\n",
+ " print(\"Number of columns : \", len(df.columns))\n",
+ " df = order_columns_id(df)\n",
+ " print(\"Columns : \", df.columns)\n",
+ " print(\"Percent of NA for each column : \", percent_na(df))\n",
+ " return df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "98ac02cb-5295-47ca-99c6-99e622c5f388",
+ "metadata": {},
+ "source": [
+ "#### Deep analysis of products.csv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "id": "862a7658-0602-4d94-bb58-d23774c00d32",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "File path : bdc2324-data/1/1products.csv\n",
+ "Shape : (94803, 14)\n",
+ "Number of columns : 14\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " amount | \n",
+ " is_full_price | \n",
+ " representation_id | \n",
+ " pricing_formula_id | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " category_id | \n",
+ " apply_price | \n",
+ " products_group_id | \n",
+ " product_pack_id | \n",
+ " extra_field | \n",
+ " amount_consumption | \n",
+ " identifier | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 10682 | \n",
+ " 9.0 | \n",
+ " False | \n",
+ " 914 | \n",
+ " 114 | \n",
+ " 2020-09-03 14:09:43.119798+02:00 | \n",
+ " 2020-09-03 14:09:43.119798+02:00 | \n",
+ " 41 | \n",
+ " 0.0 | \n",
+ " 10655 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 35c88f2db8a63d7474e46eb8ca9260e7 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 478 | \n",
+ " 9.5 | \n",
+ " False | \n",
+ " 273 | \n",
+ " 131 | \n",
+ " 2020-09-03 13:21:22.711773+02:00 | \n",
+ " 2020-09-03 13:21:22.711773+02:00 | \n",
+ " 1 | \n",
+ " 0.0 | \n",
+ " 471 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 8a179671ab198e570e6a104c4451379f | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 20873 | \n",
+ " 11.5 | \n",
+ " False | \n",
+ " 275 | \n",
+ " 137 | \n",
+ " 2020-09-03 14:46:33.589030+02:00 | \n",
+ " 2020-09-03 14:46:33.589030+02:00 | \n",
+ " 1 | \n",
+ " 0.0 | \n",
+ " 20825 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " ee83779ce29e67ad251e40234b426d6a | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 157142 | \n",
+ " 8.0 | \n",
+ " False | \n",
+ " 82519 | \n",
+ " 9 | \n",
+ " 2022-01-28 19:29:23.525722+01:00 | \n",
+ " 2022-01-28 19:29:23.525722+01:00 | \n",
+ " 5 | \n",
+ " 0.0 | \n",
+ " 156773 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " d865383579314b791aa4bcf3fb418f17 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1341 | \n",
+ " 8.5 | \n",
+ " False | \n",
+ " 9 | \n",
+ " 93 | \n",
+ " 2020-09-03 13:29:30.773089+02:00 | \n",
+ " 2020-09-03 13:29:30.773089+02:00 | \n",
+ " 1 | \n",
+ " 0.0 | \n",
+ " 1175 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " f1c4689bc47dee6f60b56d74b593dd46 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id amount is_full_price representation_id pricing_formula_id \\\n",
+ "0 10682 9.0 False 914 114 \n",
+ "1 478 9.5 False 273 131 \n",
+ "2 20873 11.5 False 275 137 \n",
+ "3 157142 8.0 False 82519 9 \n",
+ "4 1341 8.5 False 9 93 \n",
+ "\n",
+ " created_at updated_at \\\n",
+ "0 2020-09-03 14:09:43.119798+02:00 2020-09-03 14:09:43.119798+02:00 \n",
+ "1 2020-09-03 13:21:22.711773+02:00 2020-09-03 13:21:22.711773+02:00 \n",
+ "2 2020-09-03 14:46:33.589030+02:00 2020-09-03 14:46:33.589030+02:00 \n",
+ "3 2022-01-28 19:29:23.525722+01:00 2022-01-28 19:29:23.525722+01:00 \n",
+ "4 2020-09-03 13:29:30.773089+02:00 2020-09-03 13:29:30.773089+02:00 \n",
+ "\n",
+ " category_id apply_price products_group_id product_pack_id extra_field \\\n",
+ "0 41 0.0 10655 1 NaN \n",
+ "1 1 0.0 471 1 NaN \n",
+ "2 1 0.0 20825 1 NaN \n",
+ "3 5 0.0 156773 1 NaN \n",
+ "4 1 0.0 1175 1 NaN \n",
+ "\n",
+ " amount_consumption identifier \n",
+ "0 NaN 35c88f2db8a63d7474e46eb8ca9260e7 \n",
+ "1 NaN 8a179671ab198e570e6a104c4451379f \n",
+ "2 NaN ee83779ce29e67ad251e40234b426d6a \n",
+ "3 NaN d865383579314b791aa4bcf3fb418f17 \n",
+ "4 NaN f1c4689bc47dee6f60b56d74b593dd46 "
+ ]
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "products = display_databases(\"1products.csv\")\n",
+ "print(\"Number of columns : \", len(products.columns))\n",
+ "products.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "id": "f0db8c51-2792-4d49-9b1a-d98ce0d9ea28",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of columns : 12\n",
+ "Columns : Index(['id', 'representation_id', 'pricing_formula_id', 'category_id',\n",
+ " 'products_group_id', 'product_pack_id', 'identifier', 'amount',\n",
+ " 'is_full_price', 'apply_price', 'extra_field', 'amount_consumption'],\n",
+ " dtype='object')\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " representation_id | \n",
+ " pricing_formula_id | \n",
+ " category_id | \n",
+ " products_group_id | \n",
+ " product_pack_id | \n",
+ " identifier | \n",
+ " amount | \n",
+ " is_full_price | \n",
+ " apply_price | \n",
+ " extra_field | \n",
+ " amount_consumption | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 10682 | \n",
+ " 914 | \n",
+ " 114 | \n",
+ " 41 | \n",
+ " 10655 | \n",
+ " 1 | \n",
+ " 35c88f2db8a63d7474e46eb8ca9260e7 | \n",
+ " 9.0 | \n",
+ " False | \n",
+ " 0.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 478 | \n",
+ " 273 | \n",
+ " 131 | \n",
+ " 1 | \n",
+ " 471 | \n",
+ " 1 | \n",
+ " 8a179671ab198e570e6a104c4451379f | \n",
+ " 9.5 | \n",
+ " False | \n",
+ " 0.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 20873 | \n",
+ " 275 | \n",
+ " 137 | \n",
+ " 1 | \n",
+ " 20825 | \n",
+ " 1 | \n",
+ " ee83779ce29e67ad251e40234b426d6a | \n",
+ " 11.5 | \n",
+ " False | \n",
+ " 0.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 157142 | \n",
+ " 82519 | \n",
+ " 9 | \n",
+ " 5 | \n",
+ " 156773 | \n",
+ " 1 | \n",
+ " d865383579314b791aa4bcf3fb418f17 | \n",
+ " 8.0 | \n",
+ " False | \n",
+ " 0.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1341 | \n",
+ " 9 | \n",
+ " 93 | \n",
+ " 1 | \n",
+ " 1175 | \n",
+ " 1 | \n",
+ " f1c4689bc47dee6f60b56d74b593dd46 | \n",
+ " 8.5 | \n",
+ " False | \n",
+ " 0.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id representation_id pricing_formula_id category_id \\\n",
+ "0 10682 914 114 41 \n",
+ "1 478 273 131 1 \n",
+ "2 20873 275 137 1 \n",
+ "3 157142 82519 9 5 \n",
+ "4 1341 9 93 1 \n",
+ "\n",
+ " products_group_id product_pack_id identifier \\\n",
+ "0 10655 1 35c88f2db8a63d7474e46eb8ca9260e7 \n",
+ "1 471 1 8a179671ab198e570e6a104c4451379f \n",
+ "2 20825 1 ee83779ce29e67ad251e40234b426d6a \n",
+ "3 156773 1 d865383579314b791aa4bcf3fb418f17 \n",
+ "4 1175 1 f1c4689bc47dee6f60b56d74b593dd46 \n",
+ "\n",
+ " amount is_full_price apply_price extra_field amount_consumption \n",
+ "0 9.0 False 0.0 NaN NaN \n",
+ "1 9.5 False 0.0 NaN NaN \n",
+ "2 11.5 False 0.0 NaN NaN \n",
+ "3 8.0 False 0.0 NaN NaN \n",
+ "4 8.5 False 0.0 NaN NaN "
+ ]
+ },
+ "execution_count": 39,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "products = remove_horodates(products)\n",
+ "print(\"Number of columns : \", len(products.columns))\n",
+ "products = order_columns_id(products)\n",
+ "print(\"Columns : \", products.columns)\n",
+ "products.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "id": "a383474f-7da9-422c-bb69-3f0cc0b7053f",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "id int64\n",
+ "representation_id int64\n",
+ "pricing_formula_id int64\n",
+ "category_id int64\n",
+ "products_group_id int64\n",
+ "product_pack_id int64\n",
+ "identifier object\n",
+ "amount float64\n",
+ "is_full_price bool\n",
+ "apply_price float64\n",
+ "extra_field float64\n",
+ "amount_consumption float64\n",
+ "dtype: object\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(products.dtypes)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "id": "460749ac-aa26-4216-8667-518546f72f72",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "id 0.0\n",
+ "representation_id 0.0\n",
+ "pricing_formula_id 0.0\n",
+ "category_id 0.0\n",
+ "products_group_id 0.0\n",
+ "product_pack_id 0.0\n",
+ "identifier 0.0\n",
+ "amount 0.0\n",
+ "is_full_price 0.0\n",
+ "apply_price 0.0\n",
+ "extra_field 100.0\n",
+ "amount_consumption 100.0\n",
+ "dtype: float64\n"
+ ]
+ }
+ ],
+ "source": [
+ "percent_missing = products.isna().sum() * 100 / len(products)\n",
+ "print(percent_missing)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ebcb48ab-adad-42e5-b5d7-7275771cd200",
+ "metadata": {},
+ "source": [
+ "#### Deep analysis of categories.csv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "id": "3efce2b6-2d2f-4da9-98ed-1aae17da624c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "name_dataset = '1categories.csv'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "id": "38aa39fd-58af-4fb8-98f2-4269dbaf35de",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "File path : bdc2324-data/1/1categories.csv\n",
+ "Shape : (27, 7)\n",
+ "Number of columns : 7\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " name | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " extra_field | \n",
+ " quota | \n",
+ " identifier | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 30 | \n",
+ " en nb entrées gr | \n",
+ " 2020-09-03 13:21:20.019202+02:00 | \n",
+ " 2020-09-03 13:21:20.019202+02:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 849ab2791a14f5fc2bb4d87ab2b78bf6 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 16 | \n",
+ " indiv activité enfant | \n",
+ " 2020-09-03 13:11:23.306968+02:00 | \n",
+ " 2020-09-03 13:11:23.306968+02:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 425fd2f01984cc4ba030c1be98f42c33 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 39 | \n",
+ " indiv activité gr | \n",
+ " 2020-09-03 13:21:20.029901+02:00 | \n",
+ " 2020-09-03 13:21:20.029901+02:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 9244dd3738788db0d22a5d0afe687b69 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1108 | \n",
+ " groupe forfait adulte | \n",
+ " 2020-09-19 02:06:43.145697+02:00 | \n",
+ " 2020-09-19 02:06:43.145697+02:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 3edda20c877a93b5ff883827238eb711 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 6 | \n",
+ " groupe forfait entrées tr | \n",
+ " 2020-09-03 13:11:23.264997+02:00 | \n",
+ " 2020-09-03 13:11:23.264997+02:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " ff48df4b2dd5a14116bf4d280b31621e | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id name created_at \\\n",
+ "0 30 en nb entrées gr 2020-09-03 13:21:20.019202+02:00 \n",
+ "1 16 indiv activité enfant 2020-09-03 13:11:23.306968+02:00 \n",
+ "2 39 indiv activité gr 2020-09-03 13:21:20.029901+02:00 \n",
+ "3 1108 groupe forfait adulte 2020-09-19 02:06:43.145697+02:00 \n",
+ "4 6 groupe forfait entrées tr 2020-09-03 13:11:23.264997+02:00 \n",
+ "\n",
+ " updated_at extra_field quota \\\n",
+ "0 2020-09-03 13:21:20.019202+02:00 NaN NaN \n",
+ "1 2020-09-03 13:11:23.306968+02:00 NaN NaN \n",
+ "2 2020-09-03 13:21:20.029901+02:00 NaN NaN \n",
+ "3 2020-09-19 02:06:43.145697+02:00 NaN NaN \n",
+ "4 2020-09-03 13:11:23.264997+02:00 NaN NaN \n",
+ "\n",
+ " identifier \n",
+ "0 849ab2791a14f5fc2bb4d87ab2b78bf6 \n",
+ "1 425fd2f01984cc4ba030c1be98f42c33 \n",
+ "2 9244dd3738788db0d22a5d0afe687b69 \n",
+ "3 3edda20c877a93b5ff883827238eb711 \n",
+ "4 ff48df4b2dd5a14116bf4d280b31621e "
+ ]
+ },
+ "execution_count": 43,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = display_databases(name_dataset)\n",
+ "print(\"Number of columns : \", len(df.columns))\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "id": "99eb6d14-8b4b-4d55-8fc7-ddf2726096f4",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of columns : 5\n",
+ "Columns : Index(['id', 'identifier', 'name', 'extra_field', 'quota'], dtype='object')\n",
+ "Percent of NA for each column : id 0.000000\n",
+ "identifier 0.000000\n",
+ "name 3.703704\n",
+ "extra_field 100.000000\n",
+ "quota 100.000000\n",
+ "dtype: float64\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " identifier | \n",
+ " name | \n",
+ " extra_field | \n",
+ " quota | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 30 | \n",
+ " 849ab2791a14f5fc2bb4d87ab2b78bf6 | \n",
+ " en nb entrées gr | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 16 | \n",
+ " 425fd2f01984cc4ba030c1be98f42c33 | \n",
+ " indiv activité enfant | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 39 | \n",
+ " 9244dd3738788db0d22a5d0afe687b69 | \n",
+ " indiv activité gr | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1108 | \n",
+ " 3edda20c877a93b5ff883827238eb711 | \n",
+ " groupe forfait adulte | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 6 | \n",
+ " ff48df4b2dd5a14116bf4d280b31621e | \n",
+ " groupe forfait entrées tr | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id identifier name \\\n",
+ "0 30 849ab2791a14f5fc2bb4d87ab2b78bf6 en nb entrées gr \n",
+ "1 16 425fd2f01984cc4ba030c1be98f42c33 indiv activité enfant \n",
+ "2 39 9244dd3738788db0d22a5d0afe687b69 indiv activité gr \n",
+ "3 1108 3edda20c877a93b5ff883827238eb711 groupe forfait adulte \n",
+ "4 6 ff48df4b2dd5a14116bf4d280b31621e groupe forfait entrées tr \n",
+ "\n",
+ " extra_field quota \n",
+ "0 NaN NaN \n",
+ "1 NaN NaN \n",
+ "2 NaN NaN \n",
+ "3 NaN NaN \n",
+ "4 NaN NaN "
+ ]
+ },
+ "execution_count": 44,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = process_df(df)\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "id": "c5f39cc9-dff8-452c-9a3e-9f7df81a8a19",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "id int64\n",
+ "identifier object\n",
+ "name object\n",
+ "extra_field float64\n",
+ "quota float64\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 45,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.dtypes"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c4cb0b37-2262-45c0-97be-b12c503016e3",
+ "metadata": {},
+ "source": [
+ "#### Deep analysis of type_of_categories.csv"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3b4a3af9-ed12-43ec-b17e-fd425b238265",
+ "metadata": {},
+ "source": [
+ "#### Deep analysis of representation_category_capacities.csv"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "135966fb-aab1-48d7-bb4c-39a53ee643ca",
+ "metadata": {},
+ "source": [
+ "#### Deep analysis of representations.csv"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b480f39f-d5c7-4ded-8f64-ea8ac31f5db5",
+ "metadata": {},
+ "source": [
+ "#### Deep analysis of events.csv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "id": "2d52d6da-cca5-4abd-be05-2f00fd3eca8e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "name_dataset = '1events.csv'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "id": "6cab507d-8b11-404d-9286-5cc205228af9",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "File path : bdc2324-data/1/1events.csv\n",
+ "Shape : (1232, 12)\n",
+ "Number of columns : 12\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " season_id | \n",
+ " facility_id | \n",
+ " name | \n",
+ " event_type_id | \n",
+ " manual_added | \n",
+ " is_display | \n",
+ " event_type_key_id | \n",
+ " facility_key_id | \n",
+ " identifier | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 192 | \n",
+ " 2020-09-03 13:36:42.216991+02:00 | \n",
+ " 2021-11-02 15:06:40.663219+01:00 | \n",
+ " 16 | \n",
+ " 1 | \n",
+ " frontières | \n",
+ " 4 | \n",
+ " False | \n",
+ " True | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " c1cecd093146068fd57896e254e98170 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 30329 | \n",
+ " 2023-11-04 02:50:34.602462+01:00 | \n",
+ " 2023-11-04 02:52:26.138154+01:00 | \n",
+ " 2767 | \n",
+ " 1 | \n",
+ " visite guidée une autre histoire du monde (1h00) | \n",
+ " 5 | \n",
+ " False | \n",
+ " True | \n",
+ " 5 | \n",
+ " 1 | \n",
+ " f510a6710878d7aca36e71c54abab525 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 161 | \n",
+ " 2020-09-03 13:29:27.944002+02:00 | \n",
+ " 2021-11-02 15:06:40.652026+01:00 | \n",
+ " 16 | \n",
+ " 1 | \n",
+ " visite contée les chercheurs d'or indiv | \n",
+ " 2 | \n",
+ " False | \n",
+ " True | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 21177fa9acad1ae2b1f595690fb853d3 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 5957 | \n",
+ " 2021-07-31 11:16:42.575583+02:00 | \n",
+ " 2021-11-02 15:06:40.663219+01:00 | \n",
+ " 582 | \n",
+ " 1 | \n",
+ " we dreamt of utopia and we woke up screaming. | \n",
+ " 4 | \n",
+ " False | \n",
+ " True | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 962601f1eb153d45d49437f8fe839f7f | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 8337 | \n",
+ " 2021-08-17 13:40:34.111923+02:00 | \n",
+ " 2021-11-02 15:06:40.663219+01:00 | \n",
+ " 582 | \n",
+ " 1 | \n",
+ " jeff koons épisodes 4 | \n",
+ " 4 | \n",
+ " False | \n",
+ " True | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " bfa22f5a2364a2dacfc45cca1c8d3215 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id created_at updated_at \\\n",
+ "0 192 2020-09-03 13:36:42.216991+02:00 2021-11-02 15:06:40.663219+01:00 \n",
+ "1 30329 2023-11-04 02:50:34.602462+01:00 2023-11-04 02:52:26.138154+01:00 \n",
+ "2 161 2020-09-03 13:29:27.944002+02:00 2021-11-02 15:06:40.652026+01:00 \n",
+ "3 5957 2021-07-31 11:16:42.575583+02:00 2021-11-02 15:06:40.663219+01:00 \n",
+ "4 8337 2021-08-17 13:40:34.111923+02:00 2021-11-02 15:06:40.663219+01:00 \n",
+ "\n",
+ " season_id facility_id name \\\n",
+ "0 16 1 frontières \n",
+ "1 2767 1 visite guidée une autre histoire du monde (1h00) \n",
+ "2 16 1 visite contée les chercheurs d'or indiv \n",
+ "3 582 1 we dreamt of utopia and we woke up screaming. \n",
+ "4 582 1 jeff koons épisodes 4 \n",
+ "\n",
+ " event_type_id manual_added is_display event_type_key_id \\\n",
+ "0 4 False True 4 \n",
+ "1 5 False True 5 \n",
+ "2 2 False True 2 \n",
+ "3 4 False True 4 \n",
+ "4 4 False True 4 \n",
+ "\n",
+ " facility_key_id identifier \n",
+ "0 1 c1cecd093146068fd57896e254e98170 \n",
+ "1 1 f510a6710878d7aca36e71c54abab525 \n",
+ "2 1 21177fa9acad1ae2b1f595690fb853d3 \n",
+ "3 1 962601f1eb153d45d49437f8fe839f7f \n",
+ "4 1 bfa22f5a2364a2dacfc45cca1c8d3215 "
+ ]
+ },
+ "execution_count": 47,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = display_databases(name_dataset)\n",
+ "print(\"Number of columns : \", len(df.columns))\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "id": "9fe57873-8108-44c9-b8a5-f58d3cbb6d17",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of columns : 10\n",
+ "Columns : Index(['id', 'season_id', 'facility_id', 'event_type_id', 'event_type_key_id',\n",
+ " 'facility_key_id', 'identifier', 'name', 'manual_added', 'is_display'],\n",
+ " dtype='object')\n",
+ "Percent of NA for each column : id 0.000000\n",
+ "season_id 0.000000\n",
+ "facility_id 0.000000\n",
+ "event_type_id 0.000000\n",
+ "event_type_key_id 0.000000\n",
+ "facility_key_id 0.000000\n",
+ "identifier 0.000000\n",
+ "name 0.974026\n",
+ "manual_added 0.000000\n",
+ "is_display 0.000000\n",
+ "dtype: float64\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " season_id | \n",
+ " facility_id | \n",
+ " event_type_id | \n",
+ " event_type_key_id | \n",
+ " facility_key_id | \n",
+ " identifier | \n",
+ " name | \n",
+ " manual_added | \n",
+ " is_display | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 192 | \n",
+ " 16 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " c1cecd093146068fd57896e254e98170 | \n",
+ " frontières | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 30329 | \n",
+ " 2767 | \n",
+ " 1 | \n",
+ " 5 | \n",
+ " 5 | \n",
+ " 1 | \n",
+ " f510a6710878d7aca36e71c54abab525 | \n",
+ " visite guidée une autre histoire du monde (1h00) | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 161 | \n",
+ " 16 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 21177fa9acad1ae2b1f595690fb853d3 | \n",
+ " visite contée les chercheurs d'or indiv | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 5957 | \n",
+ " 582 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 962601f1eb153d45d49437f8fe839f7f | \n",
+ " we dreamt of utopia and we woke up screaming. | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 8337 | \n",
+ " 582 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " bfa22f5a2364a2dacfc45cca1c8d3215 | \n",
+ " jeff koons épisodes 4 | \n",
+ " False | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id season_id facility_id event_type_id event_type_key_id \\\n",
+ "0 192 16 1 4 4 \n",
+ "1 30329 2767 1 5 5 \n",
+ "2 161 16 1 2 2 \n",
+ "3 5957 582 1 4 4 \n",
+ "4 8337 582 1 4 4 \n",
+ "\n",
+ " facility_key_id identifier \\\n",
+ "0 1 c1cecd093146068fd57896e254e98170 \n",
+ "1 1 f510a6710878d7aca36e71c54abab525 \n",
+ "2 1 21177fa9acad1ae2b1f595690fb853d3 \n",
+ "3 1 962601f1eb153d45d49437f8fe839f7f \n",
+ "4 1 bfa22f5a2364a2dacfc45cca1c8d3215 \n",
+ "\n",
+ " name manual_added is_display \n",
+ "0 frontières False True \n",
+ "1 visite guidée une autre histoire du monde (1h00) False True \n",
+ "2 visite contée les chercheurs d'or indiv False True \n",
+ "3 we dreamt of utopia and we woke up screaming. False True \n",
+ "4 jeff koons épisodes 4 False True "
+ ]
+ },
+ "execution_count": 48,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = process_df(df)\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "id": "7fd9e5bd-baac-4b3b-9ffb-5a9baa18399b",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "id int64\n",
+ "season_id int64\n",
+ "facility_id int64\n",
+ "event_type_id int64\n",
+ "event_type_key_id int64\n",
+ "facility_key_id int64\n",
+ "identifier object\n",
+ "name object\n",
+ "manual_added bool\n",
+ "is_display bool\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 49,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.dtypes"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "24186efa-5908-4b03-bf52-96415fc8bd54",
+ "metadata": {},
+ "source": [
+ "#### Deep analysis of event_types.csv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "id": "90ab62d4-a086-4469-961c-67eefb375388",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "name_dataset = '1event_types.csv'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "id": "58db1751-fd56-4c28-b49e-bc8235bb0dc8",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "File path : bdc2324-data/1/1event_types.csv\n",
+ "Shape : (9, 6)\n",
+ "Number of columns : 6\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " name | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " fidelity_delay | \n",
+ " identifier | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " standard | \n",
+ " 2020-09-03 12:24:22.574262+02:00 | \n",
+ " 2020-09-03 12:24:22.574262+02:00 | \n",
+ " 36 | \n",
+ " c00f0c4675b91fb8b918e4079a0b1bac | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 66 | \n",
+ " package | \n",
+ " 2020-09-03 14:05:04.648137+02:00 | \n",
+ " 2020-09-03 14:05:04.648137+02:00 | \n",
+ " 36 | \n",
+ " efe90a8e604a7c840e88d03a67f6b7d8 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 83 | \n",
+ " guide multimédias | \n",
+ " 2020-09-03 14:15:17.252539+02:00 | \n",
+ " 2020-09-03 14:15:17.252539+02:00 | \n",
+ " 36 | \n",
+ " ee14c62b3b9f6c7dd5401685a18e4460 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 3 | \n",
+ " non défini | \n",
+ " 2020-09-03 13:11:23.117024+02:00 | \n",
+ " 2020-09-03 13:11:23.117024+02:00 | \n",
+ " 36 | \n",
+ " 52ff3466787b4d538407372e5f7afe0f | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 2723 | \n",
+ " NaN | \n",
+ " 2021-12-22 09:45:47.715105+01:00 | \n",
+ " 2021-12-22 09:45:47.715105+01:00 | \n",
+ " 36 | \n",
+ " d41d8cd98f00b204e9800998ecf8427e | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id name created_at \\\n",
+ "0 1 standard 2020-09-03 12:24:22.574262+02:00 \n",
+ "1 66 package 2020-09-03 14:05:04.648137+02:00 \n",
+ "2 83 guide multimédias 2020-09-03 14:15:17.252539+02:00 \n",
+ "3 3 non défini 2020-09-03 13:11:23.117024+02:00 \n",
+ "4 2723 NaN 2021-12-22 09:45:47.715105+01:00 \n",
+ "\n",
+ " updated_at fidelity_delay \\\n",
+ "0 2020-09-03 12:24:22.574262+02:00 36 \n",
+ "1 2020-09-03 14:05:04.648137+02:00 36 \n",
+ "2 2020-09-03 14:15:17.252539+02:00 36 \n",
+ "3 2020-09-03 13:11:23.117024+02:00 36 \n",
+ "4 2021-12-22 09:45:47.715105+01:00 36 \n",
+ "\n",
+ " identifier \n",
+ "0 c00f0c4675b91fb8b918e4079a0b1bac \n",
+ "1 efe90a8e604a7c840e88d03a67f6b7d8 \n",
+ "2 ee14c62b3b9f6c7dd5401685a18e4460 \n",
+ "3 52ff3466787b4d538407372e5f7afe0f \n",
+ "4 d41d8cd98f00b204e9800998ecf8427e "
+ ]
+ },
+ "execution_count": 51,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = display_databases(name_dataset)\n",
+ "print(\"Number of columns : \", len(df.columns))\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "id": "ac93382c-0b5f-462d-8021-0dd1e7201b8c",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of columns : 4\n",
+ "Columns : Index(['id', 'fidelity_delay', 'identifier', 'name'], dtype='object')\n",
+ "Percent of NA for each column : id 0.000000\n",
+ "fidelity_delay 0.000000\n",
+ "identifier 0.000000\n",
+ "name 11.111111\n",
+ "dtype: float64\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " fidelity_delay | \n",
+ " identifier | \n",
+ " name | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 36 | \n",
+ " c00f0c4675b91fb8b918e4079a0b1bac | \n",
+ " standard | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 66 | \n",
+ " 36 | \n",
+ " efe90a8e604a7c840e88d03a67f6b7d8 | \n",
+ " package | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 83 | \n",
+ " 36 | \n",
+ " ee14c62b3b9f6c7dd5401685a18e4460 | \n",
+ " guide multimédias | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 36 | \n",
+ " 52ff3466787b4d538407372e5f7afe0f | \n",
+ " non défini | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 2723 | \n",
+ " 36 | \n",
+ " d41d8cd98f00b204e9800998ecf8427e | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id fidelity_delay identifier name\n",
+ "0 1 36 c00f0c4675b91fb8b918e4079a0b1bac standard\n",
+ "1 66 36 efe90a8e604a7c840e88d03a67f6b7d8 package\n",
+ "2 83 36 ee14c62b3b9f6c7dd5401685a18e4460 guide multimédias\n",
+ "3 3 36 52ff3466787b4d538407372e5f7afe0f non défini\n",
+ "4 2723 36 d41d8cd98f00b204e9800998ecf8427e NaN"
+ ]
+ },
+ "execution_count": 52,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = process_df(df)\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "id": "18cbd630-3c7d-49e1-932b-9460badf3758",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "id int64\n",
+ "fidelity_delay int64\n",
+ "identifier object\n",
+ "name object\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 53,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.dtypes"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5847a441-31b9-4802-a5ae-90d8c6d6e153",
+ "metadata": {},
+ "source": [
+ "#### Deep analysis of seasons.csv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "id": "ae544dcc-f23d-4216-bb5b-597cc1b3765e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "name_dataset = '1seasons.csv'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "id": "1ac97963-9208-4329-be41-d71a5797487f",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "File path : bdc2324-data/1/1seasons.csv\n",
+ "Shape : (13, 6)\n",
+ "Number of columns : 6\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " name | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " start_date_time | \n",
+ " identifier | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 943 | \n",
+ " 2013 | \n",
+ " 2021-07-29 08:55:33.282607+02:00 | \n",
+ " 2021-07-29 08:55:33.282607+02:00 | \n",
+ " NaN | \n",
+ " 8038da89e49ac5eabb489cfc6cea9fc1 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 129 | \n",
+ " 2014 | \n",
+ " 2020-09-03 15:13:08.105567+02:00 | \n",
+ " 2020-09-03 15:13:08.105567+02:00 | \n",
+ " NaN | \n",
+ " cee8d6b7ce52554fd70354e37bbf44a2 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 2015 | \n",
+ " 2020-09-03 13:11:19.405037+02:00 | \n",
+ " 2020-09-03 13:11:19.405037+02:00 | \n",
+ " NaN | \n",
+ " 65d2ea03425887a717c435081cfc5dbb | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 2016 | \n",
+ " 2020-09-03 13:11:19.401001+02:00 | \n",
+ " 2020-09-03 13:11:19.401001+02:00 | \n",
+ " NaN | \n",
+ " 95192c98732387165bf8e396c0f2dad2 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 4 | \n",
+ " 2017 | \n",
+ " 2020-09-03 13:11:19.409005+02:00 | \n",
+ " 2020-09-03 13:11:19.409005+02:00 | \n",
+ " NaN | \n",
+ " 8d8818c8e140c64c743113f563cf750f | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id name created_at \\\n",
+ "0 943 2013 2021-07-29 08:55:33.282607+02:00 \n",
+ "1 129 2014 2020-09-03 15:13:08.105567+02:00 \n",
+ "2 3 2015 2020-09-03 13:11:19.405037+02:00 \n",
+ "3 2 2016 2020-09-03 13:11:19.401001+02:00 \n",
+ "4 4 2017 2020-09-03 13:11:19.409005+02:00 \n",
+ "\n",
+ " updated_at start_date_time \\\n",
+ "0 2021-07-29 08:55:33.282607+02:00 NaN \n",
+ "1 2020-09-03 15:13:08.105567+02:00 NaN \n",
+ "2 2020-09-03 13:11:19.405037+02:00 NaN \n",
+ "3 2020-09-03 13:11:19.401001+02:00 NaN \n",
+ "4 2020-09-03 13:11:19.409005+02:00 NaN \n",
+ "\n",
+ " identifier \n",
+ "0 8038da89e49ac5eabb489cfc6cea9fc1 \n",
+ "1 cee8d6b7ce52554fd70354e37bbf44a2 \n",
+ "2 65d2ea03425887a717c435081cfc5dbb \n",
+ "3 95192c98732387165bf8e396c0f2dad2 \n",
+ "4 8d8818c8e140c64c743113f563cf750f "
+ ]
+ },
+ "execution_count": 55,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = display_databases(name_dataset)\n",
+ "print(\"Number of columns : \", len(df.columns))\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "id": "b4593d46-105c-47dd-aa71-babd8e63e65b",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of columns : 4\n",
+ "Columns : Index(['id', 'identifier', 'name', 'start_date_time'], dtype='object')\n",
+ "Percent of NA for each column : id 0.000000\n",
+ "identifier 0.000000\n",
+ "name 7.692308\n",
+ "start_date_time 100.000000\n",
+ "dtype: float64\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " identifier | \n",
+ " name | \n",
+ " start_date_time | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 943 | \n",
+ " 8038da89e49ac5eabb489cfc6cea9fc1 | \n",
+ " 2013 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 129 | \n",
+ " cee8d6b7ce52554fd70354e37bbf44a2 | \n",
+ " 2014 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 65d2ea03425887a717c435081cfc5dbb | \n",
+ " 2015 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 95192c98732387165bf8e396c0f2dad2 | \n",
+ " 2016 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 4 | \n",
+ " 8d8818c8e140c64c743113f563cf750f | \n",
+ " 2017 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id identifier name start_date_time\n",
+ "0 943 8038da89e49ac5eabb489cfc6cea9fc1 2013 NaN\n",
+ "1 129 cee8d6b7ce52554fd70354e37bbf44a2 2014 NaN\n",
+ "2 3 65d2ea03425887a717c435081cfc5dbb 2015 NaN\n",
+ "3 2 95192c98732387165bf8e396c0f2dad2 2016 NaN\n",
+ "4 4 8d8818c8e140c64c743113f563cf750f 2017 NaN"
+ ]
+ },
+ "execution_count": 56,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = process_df(df)\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "id": "5d3b096d-8e73-4514-94e5-f2dcd4d0a89c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "id int64\n",
+ "identifier object\n",
+ "name object\n",
+ "start_date_time float64\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 57,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.dtypes"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a7b00bc7-eae6-457c-ac68-a4a55a6d1c8c",
+ "metadata": {},
+ "source": [
+ "#### Deep analysis of facilities.csv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "id": "d95ef015-d44c-4353-8761-771b910d21c9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "name_dataset = '1facilities.csv'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "id": "ef5fe794-8df7-4f27-8554-ecdc4074ac0b",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "File path : bdc2324-data/1/1facilities.csv\n",
+ "Shape : (2, 7)\n",
+ "Number of columns : 7\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " name | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " street_id | \n",
+ " fixed_capacity | \n",
+ " identifier | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 2 | \n",
+ " non défini | \n",
+ " 2020-09-03 13:16:35.293111+02:00 | \n",
+ " 2020-09-03 13:16:35.293111+02:00 | \n",
+ " 2 | \n",
+ " NaN | \n",
+ " 52ff3466787b4d538407372e5f7afe0f | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " mucem | \n",
+ " 2020-09-03 13:11:23.133059+02:00 | \n",
+ " 2020-09-03 13:11:23.133059+02:00 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " 702bd76fe3dd5dbcf118a6965a946f54 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id name created_at \\\n",
+ "0 2 non défini 2020-09-03 13:16:35.293111+02:00 \n",
+ "1 1 mucem 2020-09-03 13:11:23.133059+02:00 \n",
+ "\n",
+ " updated_at street_id fixed_capacity \\\n",
+ "0 2020-09-03 13:16:35.293111+02:00 2 NaN \n",
+ "1 2020-09-03 13:11:23.133059+02:00 1 NaN \n",
+ "\n",
+ " identifier \n",
+ "0 52ff3466787b4d538407372e5f7afe0f \n",
+ "1 702bd76fe3dd5dbcf118a6965a946f54 "
+ ]
+ },
+ "execution_count": 59,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = display_databases(name_dataset)\n",
+ "print(\"Number of columns : \", len(df.columns))\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "id": "e3621201-fab9-49fd-95c1-0b9d5da76e50",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of columns : 5\n",
+ "Columns : Index(['id', 'street_id', 'identifier', 'name', 'fixed_capacity'], dtype='object')\n",
+ "Percent of NA for each column : id 0.0\n",
+ "street_id 0.0\n",
+ "identifier 0.0\n",
+ "name 0.0\n",
+ "fixed_capacity 100.0\n",
+ "dtype: float64\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " street_id | \n",
+ " identifier | \n",
+ " name | \n",
+ " fixed_capacity | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 52ff3466787b4d538407372e5f7afe0f | \n",
+ " non défini | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 702bd76fe3dd5dbcf118a6965a946f54 | \n",
+ " mucem | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id street_id identifier name fixed_capacity\n",
+ "0 2 2 52ff3466787b4d538407372e5f7afe0f non défini NaN\n",
+ "1 1 1 702bd76fe3dd5dbcf118a6965a946f54 mucem NaN"
+ ]
+ },
+ "execution_count": 60,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = process_df(df)\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "id": "1b198b92-8654-4531-a0dd-8f2e01c2e6c1",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "id int64\n",
+ "street_id int64\n",
+ "identifier object\n",
+ "name object\n",
+ "fixed_capacity float64\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 61,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.dtypes"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ab5c4c2d-3e04-457d-a183-e173df89b650",
+ "metadata": {},
+ "source": [
+ "## Merge"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 82,
+ "id": "43576244-c8cf-4ca0-b056-7aea1fbf0bc7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def process_df_2(df):\n",
+ " df = remove_horodates(df)\n",
+ " print(\"Number of columns : \", len(df.columns))\n",
+ " df = order_columns_id(df)\n",
+ " print(\"Columns : \", df.columns)\n",
+ " return df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 83,
+ "id": "0fad097e-474c-4af7-b1e1-7d8dda3f09ea",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def load_dataset(name):\n",
+ " df = display_databases(name)\n",
+ " df = process_df_2(df)\n",
+ " # drop na :\n",
+ " #df = df.dropna(axis=1, thresh=len(df))\n",
+ " # if identifier in table : delete it\n",
+ " if 'identifier' in df.columns:\n",
+ " df = df.drop(columns = 'identifier')\n",
+ " return df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b60034ef-fdd6-4640-a012-cf74c17b333f",
+ "metadata": {},
+ "source": [
+ "### Products Table"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 84,
+ "id": "6213b1eb-c5f8-49dd-ab69-366542380e80",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def create_products_table():\n",
+ " # first merge products and categories\n",
+ " print(\"first merge products and categories\")\n",
+ " products = load_dataset(\"1products.csv\")\n",
+ " categories = load_dataset(\"1categories.csv\")\n",
+ " # Drop useless columns\n",
+ " products = products.drop(columns = ['apply_price', 'extra_field', 'amount_consumption'])\n",
+ " categories = categories.drop(columns = ['extra_field', 'quota'])\n",
+ "\n",
+ " #Merge\n",
+ " products_theme = products.merge(categories, how = 'left', left_on = 'category_id',\n",
+ " right_on = 'id', suffixes=('_products', '_categories'))\n",
+ " products_theme = products_theme.rename(columns = {\"name\" : \"name_categories\"})\n",
+ " \n",
+ " # Second merge products_theme and type of categories\n",
+ " print(\"Second merge products_theme and type of categories\")\n",
+ " type_of_categories = load_dataset(\"1type_of_categories.csv\")\n",
+ " type_of_categories = type_of_categories.drop(columns = 'id')\n",
+ " products_theme = products_theme.merge(type_of_categories, how = 'left', left_on = 'category_id',\n",
+ " right_on = 'category_id' )\n",
+ "\n",
+ " # Index cleaning\n",
+ " products_theme = products_theme.drop(columns = ['id_categories'])\n",
+ " products_theme = order_columns_id(products_theme)\n",
+ "\n",
+ " \n",
+ "\n",
+ " return products_theme"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 85,
+ "id": "b853e020-f73d-44e8-b086-e5548ce21011",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "first merge products and categories\n",
+ "File path : bdc2324-data/1/1products.csv\n",
+ "Shape : (94803, 14)\n",
+ "Number of columns : 12\n",
+ "Columns : Index(['id', 'representation_id', 'pricing_formula_id', 'category_id',\n",
+ " 'products_group_id', 'product_pack_id', 'identifier', 'amount',\n",
+ " 'is_full_price', 'apply_price', 'extra_field', 'amount_consumption'],\n",
+ " dtype='object')\n",
+ "File path : bdc2324-data/1/1categories.csv\n",
+ "Shape : (27, 7)\n",
+ "Number of columns : 5\n",
+ "Columns : Index(['id', 'identifier', 'name', 'extra_field', 'quota'], dtype='object')\n",
+ "Second merge products_theme and type of categories\n",
+ "File path : bdc2324-data/1/1type_of_categories.csv\n",
+ "Shape : (5, 6)\n",
+ "Number of columns : 4\n",
+ "Columns : Index(['id', 'type_of_id', 'category_id', 'identifier'], dtype='object')\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id_products | \n",
+ " representation_id | \n",
+ " pricing_formula_id | \n",
+ " category_id | \n",
+ " products_group_id | \n",
+ " product_pack_id | \n",
+ " type_of_id | \n",
+ " amount | \n",
+ " is_full_price | \n",
+ " name_categories | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 10682 | \n",
+ " 914 | \n",
+ " 114 | \n",
+ " 41 | \n",
+ " 10655 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " 9.0 | \n",
+ " False | \n",
+ " indiv activité tr | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 478 | \n",
+ " 273 | \n",
+ " 131 | \n",
+ " 1 | \n",
+ " 471 | \n",
+ " 1 | \n",
+ " 12.0 | \n",
+ " 9.5 | \n",
+ " False | \n",
+ " indiv entrées tp | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 20873 | \n",
+ " 275 | \n",
+ " 137 | \n",
+ " 1 | \n",
+ " 20825 | \n",
+ " 1 | \n",
+ " 12.0 | \n",
+ " 11.5 | \n",
+ " False | \n",
+ " indiv entrées tp | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 157142 | \n",
+ " 82519 | \n",
+ " 9 | \n",
+ " 5 | \n",
+ " 156773 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " 8.0 | \n",
+ " False | \n",
+ " indiv entrées tr | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1341 | \n",
+ " 9 | \n",
+ " 93 | \n",
+ " 1 | \n",
+ " 1175 | \n",
+ " 1 | \n",
+ " 12.0 | \n",
+ " 8.5 | \n",
+ " False | \n",
+ " indiv entrées tp | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id_products representation_id pricing_formula_id category_id \\\n",
+ "0 10682 914 114 41 \n",
+ "1 478 273 131 1 \n",
+ "2 20873 275 137 1 \n",
+ "3 157142 82519 9 5 \n",
+ "4 1341 9 93 1 \n",
+ "\n",
+ " products_group_id product_pack_id type_of_id amount is_full_price \\\n",
+ "0 10655 1 NaN 9.0 False \n",
+ "1 471 1 12.0 9.5 False \n",
+ "2 20825 1 12.0 11.5 False \n",
+ "3 156773 1 NaN 8.0 False \n",
+ "4 1175 1 12.0 8.5 False \n",
+ "\n",
+ " name_categories \n",
+ "0 indiv activité tr \n",
+ "1 indiv entrées tp \n",
+ "2 indiv entrées tp \n",
+ "3 indiv entrées tr \n",
+ "4 indiv entrées tp "
+ ]
+ },
+ "execution_count": 85,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "products_theme = create_products_table()\n",
+ "products_theme.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8bd7b7ab-fd04-48d2-898b-48c5815457f3",
+ "metadata": {},
+ "source": [
+ "### Events Table"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 86,
+ "id": "6ed0ad20-8315-4112-9a85-10e5f04ef852",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "def create_events_table():\n",
+    "    \"\"\"\n",
+    "    Build the events theme table.\n",
+    "\n",
+    "    Loads the events, seasons, event types and facilities CSV files through\n",
+    "    `load_dataset` and left-joins the three lookup tables onto events, one\n",
+    "    after the other.\n",
+    "\n",
+    "    Returns:\n",
+    "        The merged dataframe, with the event key exposed as `event_id`, after\n",
+    "        it has been passed through `order_columns_id`.\n",
+    "    \"\"\"\n",
+    "    # Step 1 : events x seasons\n",
+    "    print(\"first merge events and seasons : \")\n",
+    "    events_raw = load_dataset(\"1events.csv\")\n",
+    "    seasons_raw = load_dataset(\"1seasons.csv\")\n",
+    "\n",
+    "    # Columns that are not kept in the theme table\n",
+    "    events_clean = events_raw.drop(columns = ['manual_added', 'is_display'])\n",
+    "    seasons_clean = seasons_raw.drop(columns = ['start_date_time'])\n",
+    "\n",
+    "    # Columns present on both sides receive the '_events' / '_seasons' suffixes\n",
+    "    merged = events_clean.merge(seasons_clean, how = 'left', left_on = 'season_id', right_on = 'id',\n",
+    "                                suffixes = ('_events', '_seasons'))\n",
+    "\n",
+    "    # Step 2 : add the event types ; their `name` column is renamed and the join key `id` dropped\n",
+    "    print(\"Secondly merge events_theme and event_types : \")\n",
+    "    event_types = load_dataset(\"1event_types.csv\").drop(columns = ['fidelity_delay'])\n",
+    "    merged = (\n",
+    "        merged.merge(event_types, how = 'left', left_on = 'event_type_id', right_on = 'id',\n",
+    "                     suffixes = ('_events', '_event_type'))\n",
+    "        .rename(columns = {\"name\" : \"name_event_types\"})\n",
+    "        .drop(columns = 'id')\n",
+    "    )\n",
+    "\n",
+    "    # Step 3 : add the facilities ; `id_events` becomes the public `event_id` key\n",
+    "    print(\"thirdly merge events_theme and facilities : \")\n",
+    "    facilities = load_dataset(\"1facilities.csv\").drop(columns = ['fixed_capacity'])\n",
+    "    merged = (\n",
+    "        merged.merge(facilities, how = 'left', left_on = 'facility_id', right_on = 'id',\n",
+    "                     suffixes = ('_events', '_facility'))\n",
+    "        .rename(columns = {\"name\" : \"name_facilities\", \"id_events\" : \"event_id\"})\n",
+    "        .drop(columns = 'id')\n",
+    "    )\n",
+    "\n",
+    "    # Index cleaning : the season key is already available as `season_id`\n",
+    "    merged = merged.drop(columns = ['id_seasons'])\n",
+    "    return order_columns_id(merged)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 87,
+ "id": "98ef0636-8c45-4a23-a62a-1fbe1544f8ce",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "first merge events and seasons : \n",
+ "File path : bdc2324-data/1/1events.csv\n",
+ "Shape : (1232, 12)\n",
+ "Number of columns : 10\n",
+ "Columns : Index(['id', 'season_id', 'facility_id', 'event_type_id', 'event_type_key_id',\n",
+ " 'facility_key_id', 'identifier', 'name', 'manual_added', 'is_display'],\n",
+ " dtype='object')\n",
+ "File path : bdc2324-data/1/1seasons.csv\n",
+ "Shape : (13, 6)\n",
+ "Number of columns : 4\n",
+ "Columns : Index(['id', 'identifier', 'name', 'start_date_time'], dtype='object')\n",
+ "Secondly merge events_theme and event_types : \n",
+ "File path : bdc2324-data/1/1event_types.csv\n",
+ "Shape : (9, 6)\n",
+ "Number of columns : 4\n",
+ "Columns : Index(['id', 'fidelity_delay', 'identifier', 'name'], dtype='object')\n",
+ "thirdly merge events_theme and facilities : \n",
+ "File path : bdc2324-data/1/1facilities.csv\n",
+ "Shape : (2, 7)\n",
+ "Number of columns : 5\n",
+ "Columns : Index(['id', 'street_id', 'identifier', 'name', 'fixed_capacity'], dtype='object')\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " event_id | \n",
+ " season_id | \n",
+ " facility_id | \n",
+ " event_type_id | \n",
+ " event_type_key_id | \n",
+ " facility_key_id | \n",
+ " street_id | \n",
+ " name_events | \n",
+ " name_seasons | \n",
+ " name_event_types | \n",
+ " name_facilities | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 192 | \n",
+ " 16 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " frontières | \n",
+ " 2018 | \n",
+ " spectacle vivant | \n",
+ " mucem | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 30329 | \n",
+ " 2767 | \n",
+ " 1 | \n",
+ " 5 | \n",
+ " 5 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " visite guidée une autre histoire du monde (1h00) | \n",
+ " 2023 | \n",
+ " offre muséale groupe | \n",
+ " mucem | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 161 | \n",
+ " 16 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " visite contée les chercheurs d'or indiv | \n",
+ " 2018 | \n",
+ " offre muséale individuel | \n",
+ " mucem | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 5957 | \n",
+ " 582 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " we dreamt of utopia and we woke up screaming. | \n",
+ " 2021 | \n",
+ " spectacle vivant | \n",
+ " mucem | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 8337 | \n",
+ " 582 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " jeff koons épisodes 4 | \n",
+ " 2021 | \n",
+ " spectacle vivant | \n",
+ " mucem | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " event_id season_id facility_id event_type_id event_type_key_id \\\n",
+ "0 192 16 1 4 4 \n",
+ "1 30329 2767 1 5 5 \n",
+ "2 161 16 1 2 2 \n",
+ "3 5957 582 1 4 4 \n",
+ "4 8337 582 1 4 4 \n",
+ "\n",
+ " facility_key_id street_id \\\n",
+ "0 1 1 \n",
+ "1 1 1 \n",
+ "2 1 1 \n",
+ "3 1 1 \n",
+ "4 1 1 \n",
+ "\n",
+ " name_events name_seasons \\\n",
+ "0 frontières 2018 \n",
+ "1 visite guidée une autre histoire du monde (1h00) 2023 \n",
+ "2 visite contée les chercheurs d'or indiv 2018 \n",
+ "3 we dreamt of utopia and we woke up screaming. 2021 \n",
+ "4 jeff koons épisodes 4 2021 \n",
+ "\n",
+ " name_event_types name_facilities \n",
+ "0 spectacle vivant mucem \n",
+ "1 offre muséale groupe mucem \n",
+ "2 offre muséale individuel mucem \n",
+ "3 spectacle vivant mucem \n",
+ "4 spectacle vivant mucem "
+ ]
+ },
+ "execution_count": 87,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "events_theme= create_events_table()\n",
+ "events_theme.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4ad5b680-bb27-4f86-a5f3-7ff4fd1be96a",
+ "metadata": {},
+ "source": [
+    "## Representations Table"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 96,
+ "id": "481dddd6-80a8-4b9e-a05e-ed06fa3ed7a6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "def create_representations_table():\n",
+    "    \"\"\"\n",
+    "    Build the representations theme table.\n",
+    "\n",
+    "    Loads the representations and the representation category capacities\n",
+    "    through `load_dataset`, removes the columns listed below and left-joins\n",
+    "    the capacities onto the representations.\n",
+    "\n",
+    "    Returns:\n",
+    "        The merged dataframe after it has been passed through `order_columns_id`.\n",
+    "    \"\"\"\n",
+    "    unused_representation_cols = ['serial', 'open', 'satisfaction', 'is_display', 'expected_filling',\n",
+    "                                  'max_filling', 'extra_field', 'start_date_time', 'end_date_time', 'name',\n",
+    "                                  'representation_type_id']\n",
+    "    unused_capacity_cols = ['expected_filling', 'max_filling']\n",
+    "\n",
+    "    representations = load_dataset(\"1representations.csv\").drop(columns = unused_representation_cols)\n",
+    "    capacities = load_dataset(\"1representation_category_capacities.csv\").drop(columns = unused_capacity_cols)\n",
+    "\n",
+    "    # Columns present on both sides receive the suffixes below\n",
+    "    joined = representations.merge(capacities, how='left',\n",
+    "                                   left_on='id', right_on='representation_id',\n",
+    "                                   suffixes=('_representation', '_representation_cap'))\n",
+    "\n",
+    "    # index cleaning : the left-hand key is dropped, `representation_id` comes from the capacities table\n",
+    "    joined = joined.drop(columns = [\"id_representation\"])\n",
+    "    return order_columns_id(joined)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 97,
+ "id": "677f4ed8-ef58-45f2-9056-ede0898c6a64",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "File path : bdc2324-data/1/1representations.csv\n",
+ "Shape : (36095, 16)\n",
+ "Number of columns : 14\n",
+ "Columns : Index(['id', 'event_id', 'representation_type_id', 'identifier', 'serial',\n",
+ " 'start_date_time', 'open', 'satisfaction', 'end_date_time', 'name',\n",
+ " 'is_display', 'expected_filling', 'max_filling', 'extra_field'],\n",
+ " dtype='object')\n",
+ "File path : bdc2324-data/1/1representation_category_capacities.csv\n",
+ "Shape : (65241, 7)\n",
+ "Number of columns : 5\n",
+ "Columns : Index(['id', 'representation_id', 'category_id', 'expected_filling',\n",
+ " 'max_filling'],\n",
+ " dtype='object')\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " event_id | \n",
+ " id_representation_cap | \n",
+ " representation_id | \n",
+ " category_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 12384 | \n",
+ " 123058 | \n",
+ " 84820 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 37 | \n",
+ " 2514 | \n",
+ " 269 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 37 | \n",
+ " 384 | \n",
+ " 269 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 37 | \n",
+ " 2515 | \n",
+ " 269 | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 37 | \n",
+ " 383 | \n",
+ " 269 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " event_id id_representation_cap representation_id category_id\n",
+ "0 12384 123058 84820 2\n",
+ "1 37 2514 269 2\n",
+ "2 37 384 269 5\n",
+ "3 37 2515 269 10\n",
+ "4 37 383 269 1"
+ ]
+ },
+ "execution_count": 97,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "representation_theme = create_representations_table()\n",
+ "representation_theme.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e274e3cc-1b41-43e0-8412-1563166060cb",
+ "metadata": {},
+ "source": [
+ "## Price Table"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 112,
+ "id": "c52621e7-01de-48dc-b572-2974542a8be5",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "File path : bdc2324-data/1/1product_packs.csv\n",
+ "Shape : (1, 6)\n",
+ "Number of columns : 4\n",
+ "Columns : Index(['id', 'identifier', 'name', 'type_of'], dtype='object')\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " name | \n",
+ " type_of | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id name type_of\n",
+ "0 1 NaN 0"
+ ]
+ },
+ "execution_count": 112,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "product_packs = load_dataset(\"1product_packs.csv\")\n",
+ "product_packs.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 114,
+ "id": "9e4f60ab-9a2c-4090-b0c4-f9a1530b2d39",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "File path : bdc2324-data/1/1pricing_formulas.csv\n",
+ "Shape : (556, 6)\n",
+ "Number of columns : 4\n",
+ "Columns : Index(['id', 'identifier', 'name', 'extra_field'], dtype='object')\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " name | \n",
+ " extra_field | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 41909 | \n",
+ " visite mécènes 1h30 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 502 | \n",
+ " entree mucem tp( expo picasso) | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 504 | \n",
+ " nombre de personnes cinema | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 117 | \n",
+ " spectacle tarif e famille tr | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1496 | \n",
+ " billet nb famille mecene 1a | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id name extra_field\n",
+ "0 41909 visite mécènes 1h30 NaN\n",
+ "1 502 entree mucem tp( expo picasso) NaN\n",
+ "2 504 nombre de personnes cinema NaN\n",
+ "3 117 spectacle tarif e famille tr NaN\n",
+ "4 1496 billet nb famille mecene 1a NaN"
+ ]
+ },
+ "execution_count": 114,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pricing_formula = load_dataset(\"1pricing_formulas.csv\")\n",
+ "pricing_formula.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 115,
+ "id": "247b5c45-a18a-4cfd-86b4-d3453e157bcd",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "File path : bdc2324-data/1/1type_of_pricing_formulas.csv\n",
+ "Shape : (568, 6)\n",
+ "Number of columns : 4\n",
+ "Columns : Index(['id', 'type_of_id', 'pricing_formula_id', 'identifier'], dtype='object')\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " type_of_id | \n",
+ " pricing_formula_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 127 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2425 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 2937 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 48 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 1 | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id type_of_id pricing_formula_id\n",
+ "0 1 1 127\n",
+ "1 2 1 2425\n",
+ "2 3 1 2937\n",
+ "3 4 1 48\n",
+ "4 5 1 7"
+ ]
+ },
+ "execution_count": 115,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "type_pricing_formula = load_dataset(\"1type_of_pricing_formulas.csv\")\n",
+ "type_pricing_formula.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 117,
+ "id": "4b48f7b3-0f06-4ef6-9355-5016af82f49c",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "File path : bdc2324-data/1/1products_groups.csv\n",
+ "Shape : (92973, 9)\n",
+ "Number of columns : 7\n",
+ "Columns : Index(['id', 'category_id', 'pricing_formula_id', 'representation_id',\n",
+ " 'percent_price', 'max_price', 'min_price'],\n",
+ " dtype='object')\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " category_id | \n",
+ " pricing_formula_id | \n",
+ " representation_id | \n",
+ " percent_price | \n",
+ " max_price | \n",
+ " min_price | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 2735 | \n",
+ " 8 | \n",
+ " 97 | \n",
+ " 1534 | \n",
+ " 100.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 156773 | \n",
+ " 5 | \n",
+ " 9 | \n",
+ " 82519 | \n",
+ " 100.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 14387 | \n",
+ " 16 | \n",
+ " 79 | \n",
+ " 8046 | \n",
+ " 100.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 2770 | \n",
+ " 2 | \n",
+ " 37 | \n",
+ " 1563 | \n",
+ " 100.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 27179 | \n",
+ " 13 | \n",
+ " 119 | \n",
+ " 14192 | \n",
+ " 100.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id category_id pricing_formula_id representation_id percent_price \\\n",
+ "0 2735 8 97 1534 100.0 \n",
+ "1 156773 5 9 82519 100.0 \n",
+ "2 14387 16 79 8046 100.0 \n",
+ "3 2770 2 37 1563 100.0 \n",
+ "4 27179 13 119 14192 100.0 \n",
+ "\n",
+ " max_price min_price \n",
+ "0 0.0 0.0 \n",
+ "1 0.0 0.0 \n",
+ "2 0.0 0.0 \n",
+ "3 0.0 0.0 \n",
+ "4 0.0 0.0 "
+ ]
+ },
+ "execution_count": 117,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "product_groups = load_dataset(\"1products_groups.csv\")\n",
+ "product_groups.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "71c26a38-6818-42df-8aee-0135681a5563",
+ "metadata": {},
+ "source": [
+ "## Uniform Products theme database"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 98,
+ "id": "b26f4e7e-134d-4e32-a615-4b0e6bb80b25",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Products theme columns : Index(['id_products', 'representation_id', 'pricing_formula_id', 'category_id',\n",
+ " 'products_group_id', 'product_pack_id', 'type_of_id', 'amount',\n",
+ " 'is_full_price', 'name_categories'],\n",
+ " dtype='object')\n",
+ "\n",
+ " Representation theme columns : Index(['event_id', 'id_representation_cap', 'representation_id',\n",
+ " 'category_id'],\n",
+ " dtype='object')\n",
+ "\n",
+ " Events theme columns : Index(['event_id', 'season_id', 'facility_id', 'event_type_id',\n",
+ " 'event_type_key_id', 'facility_key_id', 'street_id', 'name_events',\n",
+ " 'name_seasons', 'name_event_types', 'name_facilities'],\n",
+ " dtype='object')\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Products theme columns : \", products_theme.columns)\n",
+ "print(\"\\n Representation theme columns : \", representation_theme.columns)\n",
+ "print(\"\\n Events theme columns : \", events_theme.columns)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 99,
+ "id": "d40b1e3b-b1f3-4915-8ebc-6bb7856da42a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id_products | \n",
+ " representation_id | \n",
+ " pricing_formula_id | \n",
+ " category_id | \n",
+ " products_group_id | \n",
+ " product_pack_id | \n",
+ " type_of_id | \n",
+ " amount | \n",
+ " is_full_price | \n",
+ " name_categories | \n",
+ " event_id | \n",
+ " id_representation_cap | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 10682 | \n",
+ " 914 | \n",
+ " 114 | \n",
+ " 41 | \n",
+ " 10655 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " 9.0 | \n",
+ " False | \n",
+ " indiv activité tr | \n",
+ " 132 | \n",
+ " 8789 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 478 | \n",
+ " 273 | \n",
+ " 131 | \n",
+ " 1 | \n",
+ " 471 | \n",
+ " 1 | \n",
+ " 12.0 | \n",
+ " 9.5 | \n",
+ " False | \n",
+ " indiv entrées tp | \n",
+ " 37 | \n",
+ " 390 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 20873 | \n",
+ " 275 | \n",
+ " 137 | \n",
+ " 1 | \n",
+ " 20825 | \n",
+ " 1 | \n",
+ " 12.0 | \n",
+ " 11.5 | \n",
+ " False | \n",
+ " indiv entrées tp | \n",
+ " 37 | \n",
+ " 395 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 157142 | \n",
+ " 82519 | \n",
+ " 9 | \n",
+ " 5 | \n",
+ " 156773 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " 8.0 | \n",
+ " False | \n",
+ " indiv entrées tr | \n",
+ " 12365 | \n",
+ " 120199 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1341 | \n",
+ " 9 | \n",
+ " 93 | \n",
+ " 1 | \n",
+ " 1175 | \n",
+ " 1 | \n",
+ " 12.0 | \n",
+ " 8.5 | \n",
+ " False | \n",
+ " indiv entrées tp | \n",
+ " 8 | \n",
+ " 21 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id_products representation_id pricing_formula_id category_id \\\n",
+ "0 10682 914 114 41 \n",
+ "1 478 273 131 1 \n",
+ "2 20873 275 137 1 \n",
+ "3 157142 82519 9 5 \n",
+ "4 1341 9 93 1 \n",
+ "\n",
+ " products_group_id product_pack_id type_of_id amount is_full_price \\\n",
+ "0 10655 1 NaN 9.0 False \n",
+ "1 471 1 12.0 9.5 False \n",
+ "2 20825 1 12.0 11.5 False \n",
+ "3 156773 1 NaN 8.0 False \n",
+ "4 1175 1 12.0 8.5 False \n",
+ "\n",
+ " name_categories event_id id_representation_cap \n",
+ "0 indiv activité tr 132 8789 \n",
+ "1 indiv entrées tp 37 390 \n",
+ "2 indiv entrées tp 37 395 \n",
+ "3 indiv entrées tr 12365 120199 \n",
+ "4 indiv entrées tp 8 21 "
+ ]
+ },
+ "execution_count": 99,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "products_global = products_theme.merge(representation_theme, how='left',\n",
+ " on= [\"representation_id\", \"category_id\"])\n",
+ "\n",
+ "\n",
+ "products_global.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 100,
+ "id": "78d75a08-e959-429c-847a-7d70a2804806",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id_products | \n",
+ " representation_id | \n",
+ " pricing_formula_id | \n",
+ " category_id | \n",
+ " products_group_id | \n",
+ " product_pack_id | \n",
+ " type_of_id | \n",
+ " event_id | \n",
+ " id_representation_cap | \n",
+ " season_id | \n",
+ " ... | \n",
+ " event_type_key_id | \n",
+ " facility_key_id | \n",
+ " street_id | \n",
+ " amount | \n",
+ " is_full_price | \n",
+ " name_categories | \n",
+ " name_events | \n",
+ " name_seasons | \n",
+ " name_event_types | \n",
+ " name_facilities | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 10682 | \n",
+ " 914 | \n",
+ " 114 | \n",
+ " 41 | \n",
+ " 10655 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " 132 | \n",
+ " 8789 | \n",
+ " 4 | \n",
+ " ... | \n",
+ " 5 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 9.0 | \n",
+ " False | \n",
+ " indiv activité tr | \n",
+ " visite-jeu \"le classico des minots\" (1h30) | \n",
+ " 2017 | \n",
+ " offre muséale individuel | \n",
+ " mucem | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 478 | \n",
+ " 273 | \n",
+ " 131 | \n",
+ " 1 | \n",
+ " 471 | \n",
+ " 1 | \n",
+ " 12.0 | \n",
+ " 37 | \n",
+ " 390 | \n",
+ " 2 | \n",
+ " ... | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 9.5 | \n",
+ " False | \n",
+ " indiv entrées tp | \n",
+ " billet mucem picasso | \n",
+ " 2016 | \n",
+ " offre muséale individuel | \n",
+ " mucem | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 20873 | \n",
+ " 275 | \n",
+ " 137 | \n",
+ " 1 | \n",
+ " 20825 | \n",
+ " 1 | \n",
+ " 12.0 | \n",
+ " 37 | \n",
+ " 395 | \n",
+ " 2 | \n",
+ " ... | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 11.5 | \n",
+ " False | \n",
+ " indiv entrées tp | \n",
+ " billet mucem picasso | \n",
+ " 2016 | \n",
+ " offre muséale individuel | \n",
+ " mucem | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 157142 | \n",
+ " 82519 | \n",
+ " 9 | \n",
+ " 5 | \n",
+ " 156773 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " 12365 | \n",
+ " 120199 | \n",
+ " 1754 | \n",
+ " ... | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 8.0 | \n",
+ " False | \n",
+ " indiv entrées tr | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " offre muséale individuel | \n",
+ " mucem | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1341 | \n",
+ " 9 | \n",
+ " 93 | \n",
+ " 1 | \n",
+ " 1175 | \n",
+ " 1 | \n",
+ " 12.0 | \n",
+ " 8 | \n",
+ " 21 | \n",
+ " 4 | \n",
+ " ... | \n",
+ " 6 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 8.5 | \n",
+ " False | \n",
+ " indiv entrées tp | \n",
+ " non défini | \n",
+ " 2017 | \n",
+ " non défini | \n",
+ " mucem | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 22 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id_products representation_id pricing_formula_id category_id \\\n",
+ "0 10682 914 114 41 \n",
+ "1 478 273 131 1 \n",
+ "2 20873 275 137 1 \n",
+ "3 157142 82519 9 5 \n",
+ "4 1341 9 93 1 \n",
+ "\n",
+ " products_group_id product_pack_id type_of_id event_id \\\n",
+ "0 10655 1 NaN 132 \n",
+ "1 471 1 12.0 37 \n",
+ "2 20825 1 12.0 37 \n",
+ "3 156773 1 NaN 12365 \n",
+ "4 1175 1 12.0 8 \n",
+ "\n",
+ " id_representation_cap season_id ... event_type_key_id facility_key_id \\\n",
+ "0 8789 4 ... 5 1 \n",
+ "1 390 2 ... 2 1 \n",
+ "2 395 2 ... 2 1 \n",
+ "3 120199 1754 ... 4 1 \n",
+ "4 21 4 ... 6 1 \n",
+ "\n",
+ " street_id amount is_full_price name_categories \\\n",
+ "0 1 9.0 False indiv activité tr \n",
+ "1 1 9.5 False indiv entrées tp \n",
+ "2 1 11.5 False indiv entrées tp \n",
+ "3 1 8.0 False indiv entrées tr \n",
+ "4 1 8.5 False indiv entrées tp \n",
+ "\n",
+ " name_events name_seasons \\\n",
+ "0 visite-jeu \"le classico des minots\" (1h30) 2017 \n",
+ "1 billet mucem picasso 2016 \n",
+ "2 billet mucem picasso 2016 \n",
+ "3 NaN NaN \n",
+ "4 non défini 2017 \n",
+ "\n",
+ " name_event_types name_facilities \n",
+ "0 offre muséale individuel mucem \n",
+ "1 offre muséale individuel mucem \n",
+ "2 offre muséale individuel mucem \n",
+ "3 offre muséale individuel mucem \n",
+ "4 non défini mucem \n",
+ "\n",
+ "[5 rows x 22 columns]"
+ ]
+ },
+ "execution_count": 100,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "products_global = products_global.merge(events_theme, how='left', on='event_id',\n",
+ " suffixes = (\"_representation\", \"_event\"))\n",
+ "products_global = order_columns_id(products_global)\n",
+ "products_global.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 101,
+ "id": "4a6950e8-4818-4df2-afa9-562e0921698c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['id_products', 'representation_id', 'pricing_formula_id', 'category_id',\n",
+ " 'products_group_id', 'product_pack_id', 'type_of_id', 'event_id',\n",
+ " 'id_representation_cap', 'season_id', 'facility_id', 'event_type_id',\n",
+ " 'event_type_key_id', 'facility_key_id', 'street_id', 'amount',\n",
+ " 'is_full_price', 'name_categories', 'name_events', 'name_seasons',\n",
+ " 'name_event_types', 'name_facilities'],\n",
+ " dtype='object')"
+ ]
+ },
+ "execution_count": 101,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "products_global.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 102,
+ "id": "b18f6428-90e0-4b1b-9b8d-bad995fb6c98",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(94803, 22)"
+ ]
+ },
+ "execution_count": 102,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "products_global.shape"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c3caf2fd-178e-48e9-b95f-5798bd576f5d",
+ "metadata": {},
+ "source": [
+ "## Analysis of Products_global"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 103,
+ "id": "33ee07a2-d871-4436-9860-9be389bc4902",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "id_products 0\n",
+ "representation_id 0\n",
+ "pricing_formula_id 0\n",
+ "category_id 0\n",
+ "products_group_id 0\n",
+ "product_pack_id 0\n",
+ "type_of_id 67589\n",
+ "event_id 0\n",
+ "id_representation_cap 0\n",
+ "season_id 0\n",
+ "facility_id 0\n",
+ "event_type_id 0\n",
+ "event_type_key_id 0\n",
+ "facility_key_id 0\n",
+ "street_id 0\n",
+ "amount 0\n",
+ "is_full_price 0\n",
+ "name_categories 3991\n",
+ "name_events 46657\n",
+ "name_seasons 30663\n",
+ "name_event_types 0\n",
+ "name_facilities 0\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 103,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "products_global.isna().sum()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 105,
+ "id": "557fc475-4417-4d9f-8d4e-8c49bc42367f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['offre muséale individuel', 'non défini', 'spectacle vivant',\n",
+ " 'offre muséale groupe', 'formule adhésion'], dtype=object)"
+ ]
+ },
+ "execution_count": 105,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# how many event types ?\n",
+ "\n",
+ "products_global['name_event_types'].unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 107,
+ "id": "a9b9a23c-b0de-4685-97e5-d52dd78349f5",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "644"
+ ]
+ },
+ "execution_count": 107,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# how many events ?\n",
+ "\n",
+ "len(products_global['name_events'].unique())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 108,
+ "id": "fb374c72-58ca-404d-a86b-e834a2fc4a34",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array(['indiv activité tr', 'indiv entrées tp', 'indiv entrées tr',\n",
+ " 'indiv prog enfant', 'indiv activité gr', 'indiv prog gr',\n",
+ " 'indiv activité tp', 'indiv activité enfant', 'indiv entrées gr',\n",
+ " 'groupe forfait entrées tr', 'groupe autonome adulte',\n",
+ " 'indiv prog tp', 'indiv prog tr', 'indiv entrées fa',\n",
+ " 'groupe forfait scolaire', 'en nb entrées tr', 'non défini', nan,\n",
+ " 'en nb entrées gr', 'groupe autonome entrées gr',\n",
+ " 'groupe forfait entrées gr', 'groupe autonome entrées tr',\n",
+ " 'en nb entrées tp', 'groupe autonome gr',\n",
+ " 'groupe autonome entrées tp', 'groupe forfait adulte',\n",
+ " 'groupe forfait etudiant'], dtype=object)"
+ ]
+ },
+ "execution_count": 108,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# how many categories ?\n",
+ "products_global['name_categories'].unique()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 109,
+ "id": "11f89771-8d50-4ef4-b34e-53e4f6b419bb",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "27"
+ ]
+ },
+ "execution_count": 109,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "len(products_global['category_id'].unique())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8add1ff2-b7e8-4381-90d8-d18d8660ed39",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "def uniform_product_df(products_theme = None, representation_theme = None, events_theme = None):\n",
+    "    \"\"\"\n",
+    "    This function returns the uniform product dataset.\n",
+    "\n",
+    "    Products are left-joined with the representations table on the\n",
+    "    (representation_id, category_id) pair, then with the events table on\n",
+    "    event_id.\n",
+    "\n",
+    "    Args:\n",
+    "        products_theme: products table ; built with create_products_table() when None.\n",
+    "        representation_theme: representations table ; built with\n",
+    "            create_representations_table() when None.\n",
+    "        events_theme: events table ; built with create_events_table() when None.\n",
+    "\n",
+    "    Returns:\n",
+    "        The merged dataframe, passed through order_columns_id, without the\n",
+    "        'type_of_id', 'name_events', 'name_seasons' and 'name_categories' columns.\n",
+    "    \"\"\"\n",
+    "    # Build every table that was not supplied, so the result does not depend on\n",
+    "    # which notebook cells were executed (or re-executed) before this call\n",
+    "    if products_theme is None:\n",
+    "        products_theme = create_products_table()\n",
+    "    if representation_theme is None:\n",
+    "        representation_theme = create_representations_table()\n",
+    "    if events_theme is None:\n",
+    "        events_theme = create_events_table()\n",
+    "\n",
+    "    print(\"Products theme columns : \", products_theme.columns)\n",
+    "    print(\"\\n Representation theme columns : \", representation_theme.columns)\n",
+    "    print(\"\\n Events theme columns : \", events_theme.columns)\n",
+    "\n",
+    "    # Left joins : every product row is kept\n",
+    "    products_global = products_theme.merge(representation_theme, how='left',\n",
+    "                                           on= [\"representation_id\", \"category_id\"])\n",
+    "\n",
+    "    products_global = products_global.merge(events_theme, how='left', on='event_id',\n",
+    "                                            suffixes = (\"_representation\", \"_event\"))\n",
+    "\n",
+    "    products_global = order_columns_id(products_global)\n",
+    "\n",
+    "    # remove useless columns\n",
+    "    products_global = products_global.drop(columns = ['type_of_id', 'name_events', 'name_seasons', 'name_categories'])\n",
+    "    return products_global"
+ ]
}
],
"metadata": {