2024-02-05 19:29:12 +01:00
11 changed files with 9416 additions and 5042 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,187 @@
-.ipynb_checkpoints/Clean-Notebook-checkpoint.ipynb
+# Created by https://www.toptal.com/developers/gitignore/api/jupyternotebooks,python
+# Edit at https://www.toptal.com/developers/gitignore?templates=jupyternotebooks,python
+
+### JupyterNotebooks ###
+# gitignore template for Jupyter Notebooks
+# website: http://jupyter.org/
+
+.ipynb_checkpoints
+*/.ipynb_checkpoints/*
+
+# IPython
+profile_default/
+ipython_config.py
+
+# Remove previous ipynb_checkpoints
+#   git rm -r .ipynb_checkpoints/
+
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+
+# IPython
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+### Python Patch ###
+# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
+poetry.toml
+
+# ruff
+.ruff_cache/
+
+# LSP config files
+pyrightconfig.json
+
+# End of https://www.toptal.com/developers/gitignore/api/jupyternotebooks,python
--- a/.gitignore.txt
+++ b/.gitignore.txt
@@ -1,187 +0,0 @@
-# Created by https://www.toptal.com/developers/gitignore/api/jupyternotebooks,python
-# Edit at https://www.toptal.com/developers/gitignore?templates=jupyternotebooks,python
-
-### JupyterNotebooks ###
-# gitignore template for Jupyter Notebooks
-# website: http://jupyter.org/
-
-.ipynb_checkpoints
-*/.ipynb_checkpoints/*
-
-# IPython
-profile_default/
-ipython_config.py
-
-# Remove previous ipynb_checkpoints
-#   git rm -r .ipynb_checkpoints/
-
-### Python ###
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-*$py.class
-
-# C extensions
-*.so
-
-# Distribution / packaging
-.Python
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-share/python-wheels/
-*.egg-info/
-.installed.cfg
-*.egg
-MANIFEST
-
-# PyInstaller
-#  Usually these files are written by a python script from a template
-#  before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.nox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-*.py,cover
-.hypothesis/
-.pytest_cache/
-cover/
-
-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-db.sqlite3-journal
-
-# Flask stuff:
-instance/
-.webassets-cache
-
-# Scrapy stuff:
-.scrapy
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-.pybuilder/
-target/
-
-# Jupyter Notebook
-
-# IPython
-
-# pyenv
-#   For a library or package, you might want to ignore these files since the code is
-#   intended to run in multiple environments; otherwise, check them in:
-# .python-version
-
-# pipenv
-#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-#   However, in case of collaboration, if having platform-specific dependencies or dependencies
-#   having no cross-platform support, pipenv may install dependencies that don't work, or not
-#   install all needed dependencies.
-#Pipfile.lock
-
-# poetry
-#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
-#   This is especially recommended for binary packages to ensure reproducibility, and is more
-#   commonly ignored for libraries.
-#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
-#poetry.lock
-
-# pdm
-#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
-#pdm.lock
-#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
-#   in version control.
-#   https://pdm.fming.dev/#use-with-ide
-.pdm.toml
-
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
-__pypackages__/
-
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid
-
-# SageMath parsed files
-*.sage.py
-
-# Environments
-.env
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-
-# Spyder project settings
-.spyderproject
-.spyproject
-
-# Rope project settings
-.ropeproject
-
-# mkdocs documentation
-/site
-
-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json
-
-# Pyre type checker
-.pyre/
-
-# pytype static type analyzer
-.pytype/
-
-# Cython debug symbols
-cython_debug/
-
-# PyCharm
-#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
-#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
-#  and can be added to the global gitignore or merged into this file.  For a more nuclear
-#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
-
-### Python Patch ###
-# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
-poetry.toml
-
-# ruff
-.ruff_cache/
-
-# LSP config files
-pyrightconfig.json
-
-# End of https://www.toptal.com/developers/gitignore/api/jupyternotebooks,python
--- a/.ipynb_checkpoints/Notebook_AJ-checkpoint.ipynb
+++ b/.ipynb_checkpoints/Notebook_AJ-checkpoint.ipynb
@@ -1,76 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "3ba1f385-2a2f-4b0c-be79-66f618469a9f",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['bdc2324-data/1',\n",
-       " 'bdc2324-data/10',\n",
-       " 'bdc2324-data/101',\n",
-       " 'bdc2324-data/11',\n",
-       " 'bdc2324-data/12',\n",
-       " 'bdc2324-data/13',\n",
-       " 'bdc2324-data/14',\n",
-       " 'bdc2324-data/2',\n",
-       " 'bdc2324-data/3',\n",
-       " 'bdc2324-data/4',\n",
-       " 'bdc2324-data/5',\n",
-       " 'bdc2324-data/6',\n",
-       " 'bdc2324-data/7',\n",
-       " 'bdc2324-data/8',\n",
-       " 'bdc2324-data/9']"
-      ]
-     },
-     "execution_count": 1,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "import os\n",
-    "import s3fs\n",
-    "\n",
-    "# Create filesystem object\n",
-    "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
-    "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n",
-    "\n",
-    "BUCKET = \"bdc2324-data\"\n",
-    "fs.ls(BUCKET)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "023bfa2b-97c2-4d53-80fb-e2290c73b92f",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.13"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
--- a/0_Cleaning_and_merge.ipynb
+++ b/0_Cleaning_and_merge.ipynb
--- a/Brouillon_AJ.ipynb
+++ b/Brouillon_AJ.ipynb
@@ -0,0 +1,695 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "8c8e008c-9b92-41f1-88c1-8ec462e4ecab",
+   "metadata": {},
+   "source": [
+    "# Business Data Challenge - Team 1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "88af2795-8bf9-4df0-a059-be7c28fb4289",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e05cd2c9-3f76-48e3-b4a6-5055445af2e4",
+   "metadata": {},
+   "source": [
+    "Configuration de l'accès aux données"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3ba1f385-2a2f-4b0c-be79-66f618469a9f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import s3fs\n",
+    "# Create filesystem object\n",
+    "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
+    "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n",
+    "\n",
+    "BUCKET = \"bdc2324-data\"\n",
+    "fs.ls(BUCKET)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ba9d04ad-6cc1-4bac-b1a0-44bedfb09763",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Chargement des fichiers campaign_stats.csv\n",
+    "FILE_PATH_S3 = 'bdc2324-data/1/1campaign_stats.csv'\n",
+    "\n",
+    "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
+    "    campaign_stats_1 = pd.read_csv(file_in, sep=\",\")\n",
+    "\n",
+    "FILE_PATH_S3 = 'bdc2324-data/2/2campaign_stats.csv'\n",
+    "\n",
+    "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
+    "    campaign_stats_2 = pd.read_csv(file_in, sep=\",\")\n",
+    "\n",
+    "FILE_PATH_S3 = 'bdc2324-data/3/3campaign_stats.csv'\n",
+    "\n",
+    "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
+    "    campaign_stats_3 = pd.read_csv(file_in, sep=\",\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cacaecc1-4d8a-4e20-8cd3-b452cf17db56",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Conversion des dates 'sent_at'\n",
+    "campaign_stats_1['sent_at'] = pd.to_datetime(campaign_stats_1['sent_at'], format = 'ISO8601', utc = True)\n",
+    "campaign_stats_2['sent_at'] = pd.to_datetime(campaign_stats_2['sent_at'], format = 'ISO8601', utc = True)\n",
+    "campaign_stats_3['sent_at'] = pd.to_datetime(campaign_stats_3['sent_at'], format = 'ISO8601', utc = True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2ec4b583-dc64-43e9-b3ae-6bbaee0bc135",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Chaque unité correspond-elle à une période ? --> Non, les dossiers ont juste pour but de réduire la taille des fichiers\n",
+    "print(campaign_stats_1['sent_at'].max())\n",
+    "print(campaign_stats_1['sent_at'].min())\n",
+    "\n",
+    "print(campaign_stats_2['sent_at'].max())\n",
+    "print(campaign_stats_2['sent_at'].min())\n",
+    "\n",
+    "print(campaign_stats_3['sent_at'].max())\n",
+    "print(campaign_stats_3['sent_at'].min())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "77894273-b3e5-4f29-bd63-9f4df8082b9b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "campaign_stats_1['sent_at']"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "31f2edbf-5661-4516-9835-06d4da615c13",
+   "metadata": {},
+   "source": [
+    "### Customersplus.csv"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4223c873-cbd3-46d1-ac96-c9a3b9e97092",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "FILE_PATH_S3 = 'bdc2324-data/1/1customersplus.csv'\n",
+    "\n",
+    "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
+    "    customers_plus_1 = pd.read_csv(file_in, sep=\",\")\n",
+    "\n",
+    "FILE_PATH_S3 = 'bdc2324-data/2/2customersplus.csv'\n",
+    "\n",
+    "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
+    "    customers_plus_2 = pd.read_csv(file_in, sep=\",\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "460f853a-68c0-42a7-9877-b83d3aaec813",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "customers_plus_1.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d5a9398f-72fc-4548-9f53-b20b372144b2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "customers_plus_1.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7467ddbe-0bd4-44cc-8a16-84aa41853638",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "customers_plus_1['id'].nunique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e15f05f8-3a89-4fc3-84a9-dae70e168440",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "customers_plus_2['id'].nunique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b40a653e-013f-48d0-8b57-0284587b36c5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "32fa2215-3c79-40b5-8643-755865959fc7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])\n",
+    "# Exemple id commun = caractéristiques communes\n",
+    "print(customers_plus_2[customers_plus_2['id'] == list(common_id)[0]])\n",
+    "\n",
+    "print(customers_plus_1[customers_plus_1['id'] == list(common_id)[0]])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0eb345e4-69f5-4e16-ac57-e33674c6c43d",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "customers_plus_1.isna().mean()*100"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6f6ce60d-0912-497d-9108-330acccef394",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Chargement de toutes les données\n",
+    "liste_base = ['customer_target_mappings', 'customersplus', 'target_types', 'tags', 'events', 'tickets', 'representations', 'purchases', 'products']\n",
+    "\n",
+    "for nom_base in liste_base:\n",
+    "    FILE_PATH_S3 = 'bdc2324-data/11/11' + nom_base + '.csv'\n",
+    "    with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
+    "        globals()[nom_base] = pd.read_csv(file_in, sep=\",\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fa8ee17d-5092-40ac-8a0a-3790b016dd4e",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# Jointure\n",
+    "merge_1 = pd.merge(purchases, tickets, left_on='id', right_on='purchase_id', how='inner')[['id_x', 'customer_id','product_id', 'purchase_date', 'type_of', 'is_from_subscription']]\n",
+    "merge_2 = pd.merge(products, merge_1, left_on='id', right_on='product_id', how='inner')[['id_x', 'customer_id', 'representation_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price']]\n",
+    "merge_3 = pd.merge(representations, merge_2, left_on='id', right_on='representation_id', how='inner')[['id_x', 'customer_id', 'event_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time']]\n",
+    "merge_4 = pd.merge(events, merge_3, left_on='id', right_on='event_id', how='inner')[['id_x', 'customer_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'name']]\n",
+    "merge_4 = merge_4.rename(columns={'name': 'event_name'})\n",
+    "df_customer_event = pd.merge(customersplus, merge_4, left_on = 'id', right_on = 'customer_id', how = 'inner')[['id_x', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'event_name']]\n",
+    "df_customer_event"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f1d4aeb8-ec74-4d49-989a-9116e01afe2f",
+   "metadata": {},
+   "source": [
+    "# Fusion et exploration"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "22bfad2b-d52a-4077-9b39-bee35004e01c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Jointure\n",
+    "var_choosed = ['id_x', 'customer_id','product_id', 'purchase_date', 'type_of', 'is_from_subscription']\n",
+    "merge_1 = pd.merge(purchases, tickets, left_on='id', right_on='purchase_id', how='inner')[var_choosed]\n",
+    "\n",
+    "var_choosed.extend(['amount', 'is_full_price', 'representation_id'])\n",
+    "merge_2 = pd.merge(products, merge_1, left_on='id', right_on='product_id', how='inner')[var_choosed]\n",
+    "\n",
+    "var_choosed.remove('representation_id')\n",
+    "var_choosed.extend(['start_date_time', 'event_id'])\n",
+    "merge_3 = pd.merge(representations, merge_2, left_on='id', right_on='representation_id', how='inner')[var_choosed]\n",
+    "\n",
+    "var_choosed.remove('event_id')\n",
+    "var_choosed.extend(['name', 'customer_id'])\n",
+    "merge_4 = pd.merge(events, merge_3, left_on='id', right_on='event_id', how='inner')[var_choosed]\n",
+    "\n",
+    "# Changement de nom\n",
+    "merge_4 = merge_4.rename(columns={'name': 'event_name'})\n",
+    "var_choosed[var_choosed.index('name')] = \"event_name\"\n",
+    "\n",
+    "# Base finale\n",
+    "var_choosed.extend(['age', 'gender', 'country', 'fidelity', 'profession'])\n",
+    "df_customer_event = pd.merge(customersplus, merge_4, left_on = 'id', right_on = 'customer_id', how = 'inner')[var_choosed]\n",
+    "df_customer_event"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4cb08d7a-ff04-4951-863d-20aaf33f0b31",
+   "metadata": {},
+   "source": [
+    "## Type de client au global"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f47ba14a-8601-4b91-9712-223a5ed8a1d1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Client\n",
+    "print(customer_target_mappings.columns)\n",
+    "print(customer_target_mappings.shape)\n",
+    "customer_target_mappings.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f11f829e-66b1-4fd0-a46f-5ae7cb78073f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "customer_target_mappings['extra_field'].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c240ab80-c746-4a64-ac6a-be8382c4f0ec",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "customer_target_mappings['name'].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c03c0597-3f21-4673-8a0f-24d7d9bc5ce4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Segmentation existante\n",
+    "print(target_types.columns)\n",
+    "print(target_types.shape)\n",
+    "target_types.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5adb1773-648d-4683-bc08-d1f2298c1283",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "target_types"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3d65f74e-47fc-4296-b493-a1ebefb91cde",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Tags = clients\n",
+    "FILE_PATH_S3 = 'bdc2324-data/11/11tags.csv'\n",
+    "\n",
+    "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
+    "    tags = pd.read_csv(file_in, sep=\",\")\n",
+    "\n",
+    "print(tags.columns)\n",
+    "print(tags.shape)\n",
+    "tags.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8a689a63-165b-4c4e-bbb0-695b661048d9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tags"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "69e38c52-0570-4531-aebb-9deb6db8c40b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Structure = clients\n",
+    "FILE_PATH_S3 = 'bdc2324-data/11/11structure_tag_mappings.csv'\n",
+    "\n",
+    "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
+    "    structure_tag_mappings = pd.read_csv(file_in, sep=\",\")\n",
+    "\n",
+    "print(structure_tag_mappings.columns)\n",
+    "print(structure_tag_mappings.shape)\n",
+    "structure_tag_mappings.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "74dc34ad-375b-48df-a900-40d92c5fff13",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "structure_tag_mappings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a479ceeb-0135-4899-9cbc-90ed7bf941fe",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Customersplus = clients\n",
+    "FILE_PATH_S3 = 'bdc2324-data/11/11customersplus.csv'\n",
+    "\n",
+    "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
+    "    customersplus = pd.read_csv(file_in, sep=\",\")\n",
+    "\n",
+    "print(customersplus.columns)\n",
+    "print(customersplus.shape)\n",
+    "customersplus.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "383e892c-606a-45ce-bdd6-b503b3e0be33",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "customersplus"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "70324d06-b855-4386-a7de-eef1eb13dfdf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# But : lier les caractéristiques socio-demo et les comportements d'achat\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4bbd743d-51fe-4786-8ad3-5a4a4d09439c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# tickets\n",
+    "FILE_PATH_S3 = 'bdc2324-data/11/11tickets.csv'\n",
+    "\n",
+    "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
+    "    tickets = pd.read_csv(file_in, sep=\",\")\n",
+    "\n",
+    "print(tickets.columns)\n",
+    "print(tickets.shape)\n",
+    "tickets.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ea83ea5c-3d47-4a66-a523-04b69b149a20",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tickets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ba15708e-eb84-4b5d-a86c-05ebed188cf6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tickets['type_of'].unique()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "bc192b08-30a5-486a-8bea-93e765dbfce6",
+   "metadata": {},
+   "source": [
+    "## Types d'événement et client"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e14dcf62-2def-4ed5-834b-cf21abbc2894",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Événement = events.csv\n",
+    "FILE_PATH_S3 = 'bdc2324-data/11/11events.csv'\n",
+    "\n",
+    "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
+    "    events = pd.read_csv(file_in, sep=\",\")\n",
+    "\n",
+    "print(events.columns)\n",
+    "print(events.shape)\n",
+    "events.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d1a1d63c-d7de-4b63-93a8-1c734eb5b316",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "events"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "af80eee8-f717-4159-a0fd-09d47ec96621",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "events['name'].nunique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6afc6f3d-4292-4a92-a4d6-14f1edc25df2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Représentation des événements = representations.csv\n",
+    "FILE_PATH_S3 = 'bdc2324-data/11/11representations.csv'\n",
+    "\n",
+    "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
+    "    representations = pd.read_csv(file_in, sep=\",\")\n",
+    "\n",
+    "print(representations.columns)\n",
+    "print(representations.shape)\n",
+    "representations.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1487402a-a49b-4737-b7d7-40c764d2f0b4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "representations"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "99b27418-2c15-4a6e-bcf5-d329ca492085",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Produits vendus = products.csv\n",
+    "FILE_PATH_S3 = 'bdc2324-data/11/11products.csv'\n",
+    "\n",
+    "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
+    "    products = pd.read_csv(file_in, sep=\",\")\n",
+    "\n",
+    "print(products.columns)\n",
+    "print(products.shape)\n",
+    "products.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c49bcd47-672f-4e0f-aee9-a7475151b97f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "products"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a4aec5ce-d0c9-4625-bb29-9ac154818621",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Lieu = facilities.csv\n",
+    "FILE_PATH_S3 = 'bdc2324-data/11/11facilities.csv'\n",
+    "\n",
+    "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
+    "    facilities = pd.read_csv(file_in, sep=\",\")\n",
+    "\n",
+    "print(facilities.columns)\n",
+    "print(facilities.shape)\n",
+    "facilities.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b3642483-2879-442a-ad69-efcd2331a200",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "facilities"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "da1e9807-2a8d-4be7-a785-55cffd734f36",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Saisons = seasons.csv (période sur deux années consécutives)\n",
+    "FILE_PATH_S3 = 'bdc2324-data/11/11seasons.csv'\n",
+    "\n",
+    "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
+    "    seasons = pd.read_csv(file_in, sep=\",\")\n",
+    "\n",
+    "print(seasons.columns)\n",
+    "print(seasons.shape)\n",
+    "seasons.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ec8a37b5-2d78-4b1c-aa47-bd923fdc2ba9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "seasons['name'].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "abb3aa20-774b-4761-983a-df5eb2bc51c6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Achats = purchases.csv \n",
+    "FILE_PATH_S3 = 'bdc2324-data/11/11purchases.csv'\n",
+    "\n",
+    "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
+    "    purchases = pd.read_csv(file_in, sep=\",\")\n",
+    "\n",
+    "print(purchases.columns)\n",
+    "print(purchases.shape)\n",
+    "purchases.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "30e204ab-4f63-430c-a818-5c8035b6e17b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "purchases"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/Clean-Notebook.ipynb
+++ b/Clean-Notebook.ipynb
--- a/Exploration_billet_AJ.ipynb
+++ b/Exploration_billet_AJ.ipynb
--- a/Notebook_AJ.ipynb
+++ b/Notebook_AJ.ipynb
@ -1,823 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "8c8e008c-9b92-41f1-88c1-8ec462e4ecab",
-   "metadata": {},
-   "source": [
-    "# Business Data Challenge - Team 1"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "88af2795-8bf9-4df0-a059-be7c28fb4289",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas as pd\n",
-    "import numpy as np"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "e05cd2c9-3f76-48e3-b4a6-5055445af2e4",
-   "metadata": {},
-   "source": [
-    "Configuration de l'accès aux données"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "3ba1f385-2a2f-4b0c-be79-66f618469a9f",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['bdc2324-data/1',\n",
-       " 'bdc2324-data/10',\n",
-       " 'bdc2324-data/101',\n",
-       " 'bdc2324-data/11',\n",
-       " 'bdc2324-data/12',\n",
-       " 'bdc2324-data/13',\n",
-       " 'bdc2324-data/14',\n",
-       " 'bdc2324-data/2',\n",
-       " 'bdc2324-data/3',\n",
-       " 'bdc2324-data/4',\n",
-       " 'bdc2324-data/5',\n",
-       " 'bdc2324-data/6',\n",
-       " 'bdc2324-data/7',\n",
-       " 'bdc2324-data/8',\n",
-       " 'bdc2324-data/9']"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "import os\n",
-    "import s3fs\n",
-    "# Create filesystem object\n",
-    "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
-    "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n",
-    "\n",
-    "BUCKET = \"bdc2324-data\"\n",
-    "fs.ls(BUCKET)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "id": "ba9d04ad-6cc1-4bac-b1a0-44bedfb09763",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Chargement des fichiers campaign_stats.csv\n",
-    "FILE_PATH_S3 = 'bdc2324-data/1/1campaign_stats.csv'\n",
-    "\n",
-    "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
-    "    campaign_stats_1 = pd.read_csv(file_in, sep=\",\")\n",
-    "\n",
-    "FILE_PATH_S3 = 'bdc2324-data/2/2campaign_stats.csv'\n",
-    "\n",
-    "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
-    "    campaign_stats_2 = pd.read_csv(file_in, sep=\",\")\n",
-    "\n",
-    "FILE_PATH_S3 = 'bdc2324-data/3/3campaign_stats.csv'\n",
-    "\n",
-    "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
-    "    campaign_stats_3 = pd.read_csv(file_in, sep=\",\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "id": "cacaecc1-4d8a-4e20-8cd3-b452cf17db56",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Conversion des dates 'sent_at'\n",
-    "campaign_stats_1['sent_at'] = pd.to_datetime(campaign_stats_1['sent_at'], format = 'ISO8601', utc = True)\n",
-    "campaign_stats_2['sent_at'] = pd.to_datetime(campaign_stats_2['sent_at'], format = 'ISO8601', utc = True)\n",
-    "campaign_stats_3['sent_at'] = pd.to_datetime(campaign_stats_3['sent_at'], format = 'ISO8601', utc = True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "2ec4b583-dc64-43e9-b3ae-6bbaee0bc135",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "2023-11-09 18:10:45+00:00\n",
-      "2020-06-02 08:24:08+00:00\n",
-      "2023-10-12 01:39:48+00:00\n",
-      "2023-10-10 17:06:29+00:00\n",
-      "2023-11-01 09:20:48+00:00\n",
-      "2021-03-31 14:59:02+00:00\n"
-     ]
-    }
-   ],
-   "source": [
-    "# Chaque unites correspond à une période ? --> Non, les dossiers ont juste pour but de réduire la taille des fichiers\n",
-    "print(campaign_stats_1['sent_at'].max())\n",
-    "print(campaign_stats_1['sent_at'].min())\n",
-    "\n",
-    "print(campaign_stats_2['sent_at'].max())\n",
-    "print(campaign_stats_2['sent_at'].min())\n",
-    "\n",
-    "print(campaign_stats_3['sent_at'].max())\n",
-    "print(campaign_stats_3['sent_at'].min())"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "77894273-b3e5-4f29-bd63-9f4df8082b9b",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "0         2021-03-28 16:01:09+00:00\n",
-       "1         2021-03-28 16:01:09+00:00\n",
-       "2         2021-03-28 16:00:59+00:00\n",
-       "3         2021-03-28 16:00:59+00:00\n",
-       "4         2021-03-28 16:01:06+00:00\n",
-       "                     ...           \n",
-       "6214803   2023-10-23 09:32:33+00:00\n",
-       "6214804   2023-10-23 09:32:49+00:00\n",
-       "6214805   2023-10-23 09:33:28+00:00\n",
-       "6214806   2023-10-23 09:31:53+00:00\n",
-       "6214807   2023-10-23 09:33:54+00:00\n",
-       "Name: sent_at, Length: 6214808, dtype: datetime64[ns, UTC]"
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "campaign_stats_1['sent_at']"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "31f2edbf-5661-4516-9835-06d4da615c13",
-   "metadata": {},
-   "source": [
-    "### Customersplus.csv"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "id": "4223c873-cbd3-46d1-ac96-c9a3b9e97092",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/tmp/ipykernel_1362/4118060109.py:9: DtypeWarning: Columns (20) have mixed types. Specify dtype option on import or set low_memory=False.\n",
-      "  customers_plus_2 = pd.read_csv(file_in, sep=\",\")\n"
-     ]
-    }
-   ],
-   "source": [
-    "FILE_PATH_S3 = 'bdc2324-data/1/1customersplus.csv'\n",
-    "\n",
-    "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
-    "    customers_plus_1 = pd.read_csv(file_in, sep=\",\")\n",
-    "\n",
-    "FILE_PATH_S3 = 'bdc2324-data/2/2customersplus.csv'\n",
-    "\n",
-    "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
-    "    customers_plus_2 = pd.read_csv(file_in, sep=\",\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "id": "460f853a-68c0-42a7-9877-b83d3aaec813",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "Index(['id', 'lastname', 'firstname', 'birthdate', 'email', 'street_id',\n",
-       "       'created_at', 'updated_at', 'civility', 'is_partner', 'extra',\n",
-       "       'deleted_at', 'reference', 'gender', 'is_email_true', 'extra_field',\n",
-       "       'identifier', 'opt_in', 'structure_id', 'note', 'profession',\n",
-       "       'language', 'mcp_contact_id', 'need_reload', 'last_buying_date',\n",
-       "       'max_price', 'ticket_sum', 'average_price', 'fidelity',\n",
-       "       'average_purchase_delay', 'average_price_basket',\n",
-       "       'average_ticket_basket', 'total_price', 'preferred_category',\n",
-       "       'preferred_supplier', 'preferred_formula', 'purchase_count',\n",
-       "       'first_buying_date', 'last_visiting_date', 'zipcode', 'country', 'age',\n",
-       "       'tenant_id'],\n",
-       "      dtype='object')"
-      ]
-     },
-     "execution_count": 10,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "customers_plus_1.columns"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "d5a9398f-72fc-4548-9f53-b20b372144b2",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "customers_plus_1.shape"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "7467ddbe-0bd4-44cc-8a16-84aa41853638",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "customers_plus_1['id'].nunique()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e15f05f8-3a89-4fc3-84a9-dae70e168440",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "customers_plus_2['id'].nunique()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b40a653e-013f-48d0-8b57-0284587b36c5",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 61,
-   "id": "32fa2215-3c79-40b5-8643-755865959fc7",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "1"
-      ]
-     },
-     "execution_count": 61,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])\n",
-    "# Exemple id commun = caractéristiques communes\n",
-    "print(customers_plus_2[customers_plus_2['id'] == list(common_id)[0]])\n",
-    "\n",
-    "print(customers_plus_1[customers_plus_1['id'] == list(common_id)[0]])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 49,
-   "id": "0eb345e4-69f5-4e16-ac57-e33674c6c43d",
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "id                          0.000000\n",
-      "lastname                   43.461341\n",
-      "firstname                  44.995588\n",
-      "birthdate                  96.419870\n",
-      "email                       8.622075\n",
-      "street_id                   0.000000\n",
-      "created_at                  0.000000\n",
-      "updated_at                  0.000000\n",
-      "civility                  100.000000\n",
-      "is_partner                  0.000000\n",
-      "extra                     100.000000\n",
-      "deleted_at                100.000000\n",
-      "reference                 100.000000\n",
-      "gender                      0.000000\n",
-      "is_email_true               0.000000\n",
-      "extra_field               100.000000\n",
-      "identifier                  0.000000\n",
-      "opt_in                      0.000000\n",
-      "structure_id               88.072380\n",
-      "note                       99.403421\n",
-      "profession                 95.913503\n",
-      "language                   99.280945\n",
-      "mcp_contact_id             34.876141\n",
-      "need_reload                 0.000000\n",
-      "last_buying_date           51.653431\n",
-      "max_price                  51.653431\n",
-      "ticket_sum                  0.000000\n",
-      "average_price               8.639195\n",
-      "fidelity                    0.000000\n",
-      "average_purchase_delay     51.653431\n",
-      "average_price_basket       51.653431\n",
-      "average_ticket_basket      51.653431\n",
-      "total_price                43.014236\n",
-      "preferred_category        100.000000\n",
-      "preferred_supplier        100.000000\n",
-      "preferred_formula         100.000000\n",
-      "purchase_count              0.000000\n",
-      "first_buying_date          51.653431\n",
-      "last_visiting_date        100.000000\n",
-      "zipcode                    71.176564\n",
-      "country                     5.459418\n",
-      "age                        96.419870\n",
-      "tenant_id                   0.000000\n",
-      "dtype: float64\n"
-     ]
-    }
-   ],
-   "source": [
-    "pd.DataFrame(customers_plus_1.isna().mean()*100)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "id": "6f6ce60d-0912-497d-9108-330acccef394",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Chargement de toutes les données\n",
-    "liste_base = ['customer_target_mappings', 'customersplus', 'target_types', 'tags', 'events', 'tickets', 'representations', 'purchases', 'products']\n",
-    "\n",
-    "for nom_base in liste_base:\n",
-    "    FILE_PATH_S3 = 'bdc2324-data/11/11' + nom_base + '.csv'\n",
-    "    with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
-    "        globals()[nom_base] = pd.read_csv(file_in, sep=\",\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "id": "fa8ee17d-5092-40ac-8a0a-3790b016dd4e",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>id</th>\n",
-       "      <th>lastname</th>\n",
-       "      <th>firstname</th>\n",
-       "      <th>birthdate</th>\n",
-       "      <th>email</th>\n",
-       "      <th>street_id</th>\n",
-       "      <th>created_at</th>\n",
-       "      <th>updated_at</th>\n",
-       "      <th>civility</th>\n",
-       "      <th>is_partner</th>\n",
-       "      <th>...</th>\n",
-       "      <th>tenant_id</th>\n",
-       "      <th>id_x</th>\n",
-       "      <th>customer_id</th>\n",
-       "      <th>purchase_date</th>\n",
-       "      <th>type_of</th>\n",
-       "      <th>is_from_subscription</th>\n",
-       "      <th>amount</th>\n",
-       "      <th>is_full_price</th>\n",
-       "      <th>start_date_time</th>\n",
-       "      <th>event_name</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>405082</td>\n",
-       "      <td>lastname405082</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>6</td>\n",
-       "      <td>2023-01-12 06:30:31.197484+01:00</td>\n",
-       "      <td>2023-01-12 06:30:31.197484+01:00</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>False</td>\n",
-       "      <td>...</td>\n",
-       "      <td>1556</td>\n",
-       "      <td>992423</td>\n",
-       "      <td>405082</td>\n",
-       "      <td>2023-01-11 17:08:41+01:00</td>\n",
-       "      <td>3</td>\n",
-       "      <td>False</td>\n",
-       "      <td>13.0</td>\n",
-       "      <td>False</td>\n",
-       "      <td>2023-02-06 20:00:00+01:00</td>\n",
-       "      <td>zaide</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>405082</td>\n",
-       "      <td>lastname405082</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>6</td>\n",
-       "      <td>2023-01-12 06:30:31.197484+01:00</td>\n",
-       "      <td>2023-01-12 06:30:31.197484+01:00</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>False</td>\n",
-       "      <td>...</td>\n",
-       "      <td>1556</td>\n",
-       "      <td>992423</td>\n",
-       "      <td>405082</td>\n",
-       "      <td>2023-01-11 17:08:41+01:00</td>\n",
-       "      <td>3</td>\n",
-       "      <td>False</td>\n",
-       "      <td>13.0</td>\n",
-       "      <td>False</td>\n",
-       "      <td>2023-02-06 20:00:00+01:00</td>\n",
-       "      <td>zaide</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>411168</td>\n",
-       "      <td>lastname411168</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>6</td>\n",
-       "      <td>2023-03-17 06:30:35.431967+01:00</td>\n",
-       "      <td>2023-03-17 06:30:35.431967+01:00</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>False</td>\n",
-       "      <td>...</td>\n",
-       "      <td>1556</td>\n",
-       "      <td>1053934</td>\n",
-       "      <td>411168</td>\n",
-       "      <td>2023-03-16 16:23:10+01:00</td>\n",
-       "      <td>3</td>\n",
-       "      <td>False</td>\n",
-       "      <td>62.0</td>\n",
-       "      <td>False</td>\n",
-       "      <td>2023-03-19 16:00:00+01:00</td>\n",
-       "      <td>luisa miller</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>411168</td>\n",
-       "      <td>lastname411168</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>6</td>\n",
-       "      <td>2023-03-17 06:30:35.431967+01:00</td>\n",
-       "      <td>2023-03-17 06:30:35.431967+01:00</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>False</td>\n",
-       "      <td>...</td>\n",
-       "      <td>1556</td>\n",
-       "      <td>1053934</td>\n",
-       "      <td>411168</td>\n",
-       "      <td>2023-03-16 16:23:10+01:00</td>\n",
-       "      <td>3</td>\n",
-       "      <td>False</td>\n",
-       "      <td>62.0</td>\n",
-       "      <td>False</td>\n",
-       "      <td>2023-03-19 16:00:00+01:00</td>\n",
-       "      <td>luisa miller</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>4380</td>\n",
-       "      <td>lastname4380</td>\n",
-       "      <td>firstname4380</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>1</td>\n",
-       "      <td>2021-04-22 14:51:55.432952+02:00</td>\n",
-       "      <td>2022-04-14 11:41:33.738500+02:00</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>False</td>\n",
-       "      <td>...</td>\n",
-       "      <td>1556</td>\n",
-       "      <td>1189141</td>\n",
-       "      <td>4380</td>\n",
-       "      <td>2020-11-26 13:12:53+01:00</td>\n",
-       "      <td>3</td>\n",
-       "      <td>False</td>\n",
-       "      <td>51.3</td>\n",
-       "      <td>False</td>\n",
-       "      <td>2020-12-01 20:00:00+01:00</td>\n",
-       "      <td>iphigenie en tauride</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>318964</th>\n",
-       "      <td>19095</td>\n",
-       "      <td>lastname19095</td>\n",
-       "      <td>firstname19095</td>\n",
-       "      <td>1979-07-16</td>\n",
-       "      <td>email19095</td>\n",
-       "      <td>6</td>\n",
-       "      <td>2021-04-22 15:06:30.120537+02:00</td>\n",
-       "      <td>2023-09-12 18:27:36.904104+02:00</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>False</td>\n",
-       "      <td>...</td>\n",
-       "      <td>1556</td>\n",
-       "      <td>1090839</td>\n",
-       "      <td>19095</td>\n",
-       "      <td>2019-05-19 21:18:36+02:00</td>\n",
-       "      <td>1</td>\n",
-       "      <td>False</td>\n",
-       "      <td>4.5</td>\n",
-       "      <td>False</td>\n",
-       "      <td>2019-05-27 20:00:00+02:00</td>\n",
-       "      <td>entre femmes</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>318965</th>\n",
-       "      <td>19095</td>\n",
-       "      <td>lastname19095</td>\n",
-       "      <td>firstname19095</td>\n",
-       "      <td>1979-07-16</td>\n",
-       "      <td>email19095</td>\n",
-       "      <td>6</td>\n",
-       "      <td>2021-04-22 15:06:30.120537+02:00</td>\n",
-       "      <td>2023-09-12 18:27:36.904104+02:00</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>False</td>\n",
-       "      <td>...</td>\n",
-       "      <td>1556</td>\n",
-       "      <td>1090839</td>\n",
-       "      <td>19095</td>\n",
-       "      <td>2019-05-19 21:18:36+02:00</td>\n",
-       "      <td>1</td>\n",
-       "      <td>False</td>\n",
-       "      <td>4.5</td>\n",
-       "      <td>False</td>\n",
-       "      <td>2019-05-27 20:00:00+02:00</td>\n",
-       "      <td>entre femmes</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>318966</th>\n",
-       "      <td>19095</td>\n",
-       "      <td>lastname19095</td>\n",
-       "      <td>firstname19095</td>\n",
-       "      <td>1979-07-16</td>\n",
-       "      <td>email19095</td>\n",
-       "      <td>6</td>\n",
-       "      <td>2021-04-22 15:06:30.120537+02:00</td>\n",
-       "      <td>2023-09-12 18:27:36.904104+02:00</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>False</td>\n",
-       "      <td>...</td>\n",
-       "      <td>1556</td>\n",
-       "      <td>1090839</td>\n",
-       "      <td>19095</td>\n",
-       "      <td>2019-05-19 21:18:36+02:00</td>\n",
-       "      <td>1</td>\n",
-       "      <td>False</td>\n",
-       "      <td>4.5</td>\n",
-       "      <td>False</td>\n",
-       "      <td>2019-05-27 20:00:00+02:00</td>\n",
-       "      <td>entre femmes</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>318967</th>\n",
-       "      <td>19095</td>\n",
-       "      <td>lastname19095</td>\n",
-       "      <td>firstname19095</td>\n",
-       "      <td>1979-07-16</td>\n",
-       "      <td>email19095</td>\n",
-       "      <td>6</td>\n",
-       "      <td>2021-04-22 15:06:30.120537+02:00</td>\n",
-       "      <td>2023-09-12 18:27:36.904104+02:00</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>False</td>\n",
-       "      <td>...</td>\n",
-       "      <td>1556</td>\n",
-       "      <td>1244277</td>\n",
-       "      <td>19095</td>\n",
-       "      <td>2019-12-31 11:04:07+01:00</td>\n",
-       "      <td>1</td>\n",
-       "      <td>False</td>\n",
-       "      <td>5.5</td>\n",
-       "      <td>False</td>\n",
-       "      <td>2020-02-03 20:00:00+01:00</td>\n",
-       "      <td>a boire et a manger</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>318968</th>\n",
-       "      <td>19095</td>\n",
-       "      <td>lastname19095</td>\n",
-       "      <td>firstname19095</td>\n",
-       "      <td>1979-07-16</td>\n",
-       "      <td>email19095</td>\n",
-       "      <td>6</td>\n",
-       "      <td>2021-04-22 15:06:30.120537+02:00</td>\n",
-       "      <td>2023-09-12 18:27:36.904104+02:00</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>False</td>\n",
-       "      <td>...</td>\n",
-       "      <td>1556</td>\n",
-       "      <td>1244277</td>\n",
-       "      <td>19095</td>\n",
-       "      <td>2019-12-31 11:04:07+01:00</td>\n",
-       "      <td>1</td>\n",
-       "      <td>False</td>\n",
-       "      <td>5.5</td>\n",
-       "      <td>False</td>\n",
-       "      <td>2020-02-03 20:00:00+01:00</td>\n",
-       "      <td>a boire et a manger</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>318969 rows × 52 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "            id        lastname       firstname   birthdate       email  \\\n",
-       "0       405082  lastname405082             NaN         NaN         NaN   \n",
-       "1       405082  lastname405082             NaN         NaN         NaN   \n",
-       "2       411168  lastname411168             NaN         NaN         NaN   \n",
-       "3       411168  lastname411168             NaN         NaN         NaN   \n",
-       "4         4380    lastname4380   firstname4380         NaN         NaN   \n",
-       "...        ...             ...             ...         ...         ...   \n",
-       "318964   19095   lastname19095  firstname19095  1979-07-16  email19095   \n",
-       "318965   19095   lastname19095  firstname19095  1979-07-16  email19095   \n",
-       "318966   19095   lastname19095  firstname19095  1979-07-16  email19095   \n",
-       "318967   19095   lastname19095  firstname19095  1979-07-16  email19095   \n",
-       "318968   19095   lastname19095  firstname19095  1979-07-16  email19095   \n",
-       "\n",
-       "        street_id                        created_at  \\\n",
-       "0               6  2023-01-12 06:30:31.197484+01:00   \n",
-       "1               6  2023-01-12 06:30:31.197484+01:00   \n",
-       "2               6  2023-03-17 06:30:35.431967+01:00   \n",
-       "3               6  2023-03-17 06:30:35.431967+01:00   \n",
-       "4               1  2021-04-22 14:51:55.432952+02:00   \n",
-       "...           ...                               ...   \n",
-       "318964          6  2021-04-22 15:06:30.120537+02:00   \n",
-       "318965          6  2021-04-22 15:06:30.120537+02:00   \n",
-       "318966          6  2021-04-22 15:06:30.120537+02:00   \n",
-       "318967          6  2021-04-22 15:06:30.120537+02:00   \n",
-       "318968          6  2021-04-22 15:06:30.120537+02:00   \n",
-       "\n",
-       "                              updated_at  civility  is_partner  ...  \\\n",
-       "0       2023-01-12 06:30:31.197484+01:00       NaN       False  ...   \n",
-       "1       2023-01-12 06:30:31.197484+01:00       NaN       False  ...   \n",
-       "2       2023-03-17 06:30:35.431967+01:00       NaN       False  ...   \n",
-       "3       2023-03-17 06:30:35.431967+01:00       NaN       False  ...   \n",
-       "4       2022-04-14 11:41:33.738500+02:00       NaN       False  ...   \n",
-       "...                                  ...       ...         ...  ...   \n",
-       "318964  2023-09-12 18:27:36.904104+02:00       NaN       False  ...   \n",
-       "318965  2023-09-12 18:27:36.904104+02:00       NaN       False  ...   \n",
-       "318966  2023-09-12 18:27:36.904104+02:00       NaN       False  ...   \n",
-       "318967  2023-09-12 18:27:36.904104+02:00       NaN       False  ...   \n",
-       "318968  2023-09-12 18:27:36.904104+02:00       NaN       False  ...   \n",
-       "\n",
-       "        tenant_id     id_x  customer_id              purchase_date  type_of  \\\n",
-       "0            1556   992423       405082  2023-01-11 17:08:41+01:00        3   \n",
-       "1            1556   992423       405082  2023-01-11 17:08:41+01:00        3   \n",
-       "2            1556  1053934       411168  2023-03-16 16:23:10+01:00        3   \n",
-       "3            1556  1053934       411168  2023-03-16 16:23:10+01:00        3   \n",
-       "4            1556  1189141         4380  2020-11-26 13:12:53+01:00        3   \n",
-       "...           ...      ...          ...                        ...      ...   \n",
-       "318964       1556  1090839        19095  2019-05-19 21:18:36+02:00        1   \n",
-       "318965       1556  1090839        19095  2019-05-19 21:18:36+02:00        1   \n",
-       "318966       1556  1090839        19095  2019-05-19 21:18:36+02:00        1   \n",
-       "318967       1556  1244277        19095  2019-12-31 11:04:07+01:00        1   \n",
-       "318968       1556  1244277        19095  2019-12-31 11:04:07+01:00        1   \n",
-       "\n",
-       "        is_from_subscription amount  is_full_price            start_date_time  \\\n",
-       "0                      False   13.0          False  2023-02-06 20:00:00+01:00   \n",
-       "1                      False   13.0          False  2023-02-06 20:00:00+01:00   \n",
-       "2                      False   62.0          False  2023-03-19 16:00:00+01:00   \n",
-       "3                      False   62.0          False  2023-03-19 16:00:00+01:00   \n",
-       "4                      False   51.3          False  2020-12-01 20:00:00+01:00   \n",
-       "...                      ...    ...            ...                        ...   \n",
-       "318964                 False    4.5          False  2019-05-27 20:00:00+02:00   \n",
-       "318965                 False    4.5          False  2019-05-27 20:00:00+02:00   \n",
-       "318966                 False    4.5          False  2019-05-27 20:00:00+02:00   \n",
-       "318967                 False    5.5          False  2020-02-03 20:00:00+01:00   \n",
-       "318968                 False    5.5          False  2020-02-03 20:00:00+01:00   \n",
-       "\n",
-       "                  event_name  \n",
-       "0                      zaide  \n",
-       "1                      zaide  \n",
-       "2               luisa miller  \n",
-       "3               luisa miller  \n",
-       "4       iphigenie en tauride  \n",
-       "...                      ...  \n",
-       "318964          entre femmes  \n",
-       "318965          entre femmes  \n",
-       "318966          entre femmes  \n",
-       "318967   a boire et a manger  \n",
-       "318968   a boire et a manger  \n",
-       "\n",
-       "[318969 rows x 52 columns]"
-      ]
-     },
-     "execution_count": 12,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Jointure\n",
-    "merge_1 = pd.merge(purchases, tickets, left_on='id', right_on='purchase_id', how='inner')[['id_x', 'customer_id','product_id', 'purchase_date', 'type_of', 'is_from_subscription']]\n",
-    "merge_2 = pd.merge(products, merge_1, left_on='id', right_on='product_id', how='inner')[['id_x', 'customer_id', 'representation_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price']]\n",
-    "merge_3 = pd.merge(representations, merge_2, left_on='id', right_on='representation_id', how='inner')[['id_x', 'customer_id', 'event_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time']]\n",
-    "merge_4 = pd.merge(events, merge_3, left_on='id', right_on='event_id', how='inner')[['id_x', 'customer_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'name']]\n",
-    "merge_4 = merge_4.rename(columns={'name': 'event_name'})\n",
-    "df_customer_event = pd.merge(customersplus, merge_4, left_on = 'id', right_on = 'customer_id', how = 'inner')[['id_x', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'event_name']]\n",
-    "df_customer_event"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.13"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
--- a/Notebook_AR.ipynb
+++ b/Notebook_AR.ipynb
@ -6103,6 +6103,403 @@
    "representation_theme.head()"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "e274e3cc-1b41-43e0-8412-1563166060cb",
+   "metadata": {},
+   "source": [
+    "## Price Table"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 112,
+   "id": "c52621e7-01de-48dc-b572-2974542a8be5",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "File path :  bdc2324-data/1/1product_packs.csv\n",
+      "Shape :  (1, 6)\n",
+      "Number of columns :  4\n",
+      "Columns :  Index(['id', 'identifier', 'name', 'type_of'], dtype='object')\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>id</th>\n",
+       "      <th>name</th>\n",
+       "      <th>type_of</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   id  name  type_of\n",
+       "0   1   NaN        0"
+      ]
+     },
+     "execution_count": 112,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "product_packs = load_dataset(\"1product_packs.csv\")\n",
+    "product_packs.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 114,
+   "id": "9e4f60ab-9a2c-4090-b0c4-f9a1530b2d39",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "File path :  bdc2324-data/1/1pricing_formulas.csv\n",
+      "Shape :  (556, 6)\n",
+      "Number of columns :  4\n",
+      "Columns :  Index(['id', 'identifier', 'name', 'extra_field'], dtype='object')\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>id</th>\n",
+       "      <th>name</th>\n",
+       "      <th>extra_field</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>41909</td>\n",
+       "      <td>visite mécènes 1h30</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>502</td>\n",
+       "      <td>entree mucem tp( expo picasso)</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>504</td>\n",
+       "      <td>nombre de personnes cinema</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>117</td>\n",
+       "      <td>spectacle tarif e famille tr</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>1496</td>\n",
+       "      <td>billet nb famille mecene 1a</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      id                            name  extra_field\n",
+       "0  41909             visite mécènes 1h30          NaN\n",
+       "1    502  entree mucem tp( expo picasso)          NaN\n",
+       "2    504      nombre de personnes cinema          NaN\n",
+       "3    117    spectacle tarif e famille tr          NaN\n",
+       "4   1496     billet nb famille mecene 1a          NaN"
+      ]
+     },
+     "execution_count": 114,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pricing_formula = load_dataset(\"1pricing_formulas.csv\")\n",
+    "pricing_formula.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 115,
+   "id": "247b5c45-a18a-4cfd-86b4-d3453e157bcd",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "File path :  bdc2324-data/1/1type_of_pricing_formulas.csv\n",
+      "Shape :  (568, 6)\n",
+      "Number of columns :  4\n",
+      "Columns :  Index(['id', 'type_of_id', 'pricing_formula_id', 'identifier'], dtype='object')\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>id</th>\n",
+       "      <th>type_of_id</th>\n",
+       "      <th>pricing_formula_id</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>127</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2425</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>3</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2937</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>4</td>\n",
+       "      <td>1</td>\n",
+       "      <td>48</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>5</td>\n",
+       "      <td>1</td>\n",
+       "      <td>7</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   id  type_of_id  pricing_formula_id\n",
+       "0   1           1                 127\n",
+       "1   2           1                2425\n",
+       "2   3           1                2937\n",
+       "3   4           1                  48\n",
+       "4   5           1                   7"
+      ]
+     },
+     "execution_count": 115,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "type_pricing_formula = load_dataset(\"1type_of_pricing_formulas.csv\")\n",
+    "type_pricing_formula.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 117,
+   "id": "4b48f7b3-0f06-4ef6-9355-5016af82f49c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "File path :  bdc2324-data/1/1products_groups.csv\n",
+      "Shape :  (92973, 9)\n",
+      "Number of columns :  7\n",
+      "Columns :  Index(['id', 'category_id', 'pricing_formula_id', 'representation_id',\n",
+      "       'percent_price', 'max_price', 'min_price'],\n",
+      "      dtype='object')\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>id</th>\n",
+       "      <th>category_id</th>\n",
+       "      <th>pricing_formula_id</th>\n",
+       "      <th>representation_id</th>\n",
+       "      <th>percent_price</th>\n",
+       "      <th>max_price</th>\n",
+       "      <th>min_price</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>2735</td>\n",
+       "      <td>8</td>\n",
+       "      <td>97</td>\n",
+       "      <td>1534</td>\n",
+       "      <td>100.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>156773</td>\n",
+       "      <td>5</td>\n",
+       "      <td>9</td>\n",
+       "      <td>82519</td>\n",
+       "      <td>100.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>14387</td>\n",
+       "      <td>16</td>\n",
+       "      <td>79</td>\n",
+       "      <td>8046</td>\n",
+       "      <td>100.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>2770</td>\n",
+       "      <td>2</td>\n",
+       "      <td>37</td>\n",
+       "      <td>1563</td>\n",
+       "      <td>100.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>27179</td>\n",
+       "      <td>13</td>\n",
+       "      <td>119</td>\n",
+       "      <td>14192</td>\n",
+       "      <td>100.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       id  category_id  pricing_formula_id  representation_id  percent_price  \\\n",
+       "0    2735            8                  97               1534          100.0   \n",
+       "1  156773            5                   9              82519          100.0   \n",
+       "2   14387           16                  79               8046          100.0   \n",
+       "3    2770            2                  37               1563          100.0   \n",
+       "4   27179           13                 119              14192          100.0   \n",
+       "\n",
+       "   max_price  min_price  \n",
+       "0        0.0        0.0  \n",
+       "1        0.0        0.0  \n",
+       "2        0.0        0.0  \n",
+       "3        0.0        0.0  \n",
+       "4        0.0        0.0  "
+      ]
+     },
+     "execution_count": 117,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "product_groups = load_dataset(\"1products_groups.csv\")\n",
+    "product_groups.head()"
+   ]
+  },
  {
   "cell_type": "markdown",
   "id": "71c26a38-6818-42df-8aee-0135681a5563",
@ -6741,6 +7138,9 @@
   "outputs": [],
   "source": [
    "def uniform_product_df():\n",
+    "    \"\"\"\n",
+    "    This function returns the uniform product dataset\n",
+    "    \"\"\"\n",
    "    print(\"Products theme columns : \", products_theme.columns)\n",
    "    print(\"\\n Representation theme columns : \", representation_theme.columns)\n",
    "    print(\"\\n Events theme columns : \", events_theme.columns)\n",
--- a/TP_merge_tables_clean.ipynb
+++ b/TP_merge_tables_clean.ipynb
--- a/Traitement_Fanta.ipynb
+++ b/Traitement_Fanta.ipynb