diff --git a/.gitignore b/.gitignore
index f540ee9..3993ec5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,187 @@
-.ipynb_checkpoints/Clean-Notebook-checkpoint.ipynb
+# Created by https://www.toptal.com/developers/gitignore/api/jupyternotebooks,python
+# Edit at https://www.toptal.com/developers/gitignore?templates=jupyternotebooks,python
+
+### JupyterNotebooks ###
+# gitignore template for Jupyter Notebooks
+# website: http://jupyter.org/
+
+.ipynb_checkpoints
+*/.ipynb_checkpoints/*
+
+# IPython
+profile_default/
+ipython_config.py
+
+# Remove previous ipynb_checkpoints
+# git rm -r .ipynb_checkpoints/
+
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+
+# IPython
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+### Python Patch ###
+# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
+poetry.toml
+
+# ruff
+.ruff_cache/
+
+# LSP config files
+pyrightconfig.json
+
+# End of https://www.toptal.com/developers/gitignore/api/jupyternotebooks,python
\ No newline at end of file
diff --git a/.gitignore.txt b/.gitignore.txt
deleted file mode 100644
index ff6eda2..0000000
--- a/.gitignore.txt
+++ /dev/null
@@ -1,187 +0,0 @@
-# Created by https://www.toptal.com/developers/gitignore/api/jupyternotebooks,python
-# Edit at https://www.toptal.com/developers/gitignore?templates=jupyternotebooks,python
-
-### JupyterNotebooks ###
-# gitignore template for Jupyter Notebooks
-# website: http://jupyter.org/
-
-.ipynb_checkpoints
-*/.ipynb_checkpoints/*
-
-# IPython
-profile_default/
-ipython_config.py
-
-# Remove previous ipynb_checkpoints
-# git rm -r .ipynb_checkpoints/
-
-### Python ###
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-*$py.class
-
-# C extensions
-*.so
-
-# Distribution / packaging
-.Python
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-share/python-wheels/
-*.egg-info/
-.installed.cfg
-*.egg
-MANIFEST
-
-# PyInstaller
-# Usually these files are written by a python script from a template
-# before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.nox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-*.py,cover
-.hypothesis/
-.pytest_cache/
-cover/
-
-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-db.sqlite3-journal
-
-# Flask stuff:
-instance/
-.webassets-cache
-
-# Scrapy stuff:
-.scrapy
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-.pybuilder/
-target/
-
-# Jupyter Notebook
-
-# IPython
-
-# pyenv
-# For a library or package, you might want to ignore these files since the code is
-# intended to run in multiple environments; otherwise, check them in:
-# .python-version
-
-# pipenv
-# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-# However, in case of collaboration, if having platform-specific dependencies or dependencies
-# having no cross-platform support, pipenv may install dependencies that don't work, or not
-# install all needed dependencies.
-#Pipfile.lock
-
-# poetry
-# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
-# This is especially recommended for binary packages to ensure reproducibility, and is more
-# commonly ignored for libraries.
-# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
-#poetry.lock
-
-# pdm
-# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
-#pdm.lock
-# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
-# in version control.
-# https://pdm.fming.dev/#use-with-ide
-.pdm.toml
-
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
-__pypackages__/
-
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid
-
-# SageMath parsed files
-*.sage.py
-
-# Environments
-.env
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-
-# Spyder project settings
-.spyderproject
-.spyproject
-
-# Rope project settings
-.ropeproject
-
-# mkdocs documentation
-/site
-
-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json
-
-# Pyre type checker
-.pyre/
-
-# pytype static type analyzer
-.pytype/
-
-# Cython debug symbols
-cython_debug/
-
-# PyCharm
-# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
-# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
-# and can be added to the global gitignore or merged into this file. For a more nuclear
-# option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
-
-### Python Patch ###
-# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
-poetry.toml
-
-# ruff
-.ruff_cache/
-
-# LSP config files
-pyrightconfig.json
-
-# End of https://www.toptal.com/developers/gitignore/api/jupyternotebooks,python
\ No newline at end of file
diff --git a/.ipynb_checkpoints/Notebook_AJ-checkpoint.ipynb b/.ipynb_checkpoints/Notebook_AJ-checkpoint.ipynb
deleted file mode 100644
index 2cdf609..0000000
--- a/.ipynb_checkpoints/Notebook_AJ-checkpoint.ipynb
+++ /dev/null
@@ -1,76 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "3ba1f385-2a2f-4b0c-be79-66f618469a9f",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['bdc2324-data/1',\n",
- " 'bdc2324-data/10',\n",
- " 'bdc2324-data/101',\n",
- " 'bdc2324-data/11',\n",
- " 'bdc2324-data/12',\n",
- " 'bdc2324-data/13',\n",
- " 'bdc2324-data/14',\n",
- " 'bdc2324-data/2',\n",
- " 'bdc2324-data/3',\n",
- " 'bdc2324-data/4',\n",
- " 'bdc2324-data/5',\n",
- " 'bdc2324-data/6',\n",
- " 'bdc2324-data/7',\n",
- " 'bdc2324-data/8',\n",
- " 'bdc2324-data/9']"
- ]
- },
- "execution_count": 1,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "import os\n",
- "import s3fs\n",
- "\n",
- "# Create filesystem object\n",
- "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
- "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n",
- "\n",
- "BUCKET = \"bdc2324-data\"\n",
- "fs.ls(BUCKET)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "023bfa2b-97c2-4d53-80fb-e2290c73b92f",
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.13"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/0_Cleaning_and_merge.ipynb b/0_Cleaning_and_merge.ipynb
new file mode 100644
index 0000000..3f3b639
--- /dev/null
+++ b/0_Cleaning_and_merge.ipynb
@@ -0,0 +1,1465 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "ad414c84-be46-4d2c-be8b-9fc4d24cc672",
+ "metadata": {},
+ "source": [
+ "# Business Data Challenge - Team 1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "15103481-8d74-404c-aa09-7601fe7730da",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import os\n",
+ "import s3fs\n",
+ "import re"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ee97665c-39af-4c1c-a62b-c9c79feae18f",
+ "metadata": {},
+ "source": [
+ "Configuration de l'accès aux données"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "5d83bb1a-d341-446e-91f6-1c428607f6d4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create filesystem object\n",
+ "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
+ "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9cbd72c5-6f8e-4366-ab66-96c32c6e963a",
+ "metadata": {},
+ "source": [
+ "# Exemple sur Company 1"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "db26e59a-927c-407e-b54b-1815473b0b34",
+ "metadata": {},
+ "source": [
+ "## Chargement données"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "699664b9-eee4-4f8d-a207-e524526560c5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "BUCKET = \"bdc2324-data/1\"\n",
+ "liste_database = fs.ls(BUCKET)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "aaf64d60-bf92-470c-8210-d09abd6a653e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['bdc2324-data/1/1campaign_stats.csv',\n",
+ " 'bdc2324-data/1/1campaigns.csv',\n",
+ " 'bdc2324-data/1/1categories.csv',\n",
+ " 'bdc2324-data/1/1countries.csv',\n",
+ " 'bdc2324-data/1/1currencies.csv',\n",
+ " 'bdc2324-data/1/1customer_target_mappings.csv',\n",
+ " 'bdc2324-data/1/1customersplus.csv',\n",
+ " 'bdc2324-data/1/1event_types.csv',\n",
+ " 'bdc2324-data/1/1events.csv',\n",
+ " 'bdc2324-data/1/1facilities.csv',\n",
+ " 'bdc2324-data/1/1link_stats.csv',\n",
+ " 'bdc2324-data/1/1pricing_formulas.csv',\n",
+ " 'bdc2324-data/1/1product_packs.csv',\n",
+ " 'bdc2324-data/1/1products.csv',\n",
+ " 'bdc2324-data/1/1products_groups.csv',\n",
+ " 'bdc2324-data/1/1purchases.csv',\n",
+ " 'bdc2324-data/1/1representation_category_capacities.csv',\n",
+ " 'bdc2324-data/1/1representations.csv',\n",
+ " 'bdc2324-data/1/1seasons.csv',\n",
+ " 'bdc2324-data/1/1structure_tag_mappings.csv',\n",
+ " 'bdc2324-data/1/1suppliers.csv',\n",
+ " 'bdc2324-data/1/1tags.csv',\n",
+ " 'bdc2324-data/1/1target_types.csv',\n",
+ " 'bdc2324-data/1/1targets.csv',\n",
+ " 'bdc2324-data/1/1tickets.csv',\n",
+ " 'bdc2324-data/1/1type_of_categories.csv',\n",
+ " 'bdc2324-data/1/1type_of_pricing_formulas.csv',\n",
+ " 'bdc2324-data/1/1type_ofs.csv']"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "liste_database"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "dd6a3518-b752-4a1e-b77b-9e03e853c3ed",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/tmp/ipykernel_50143/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+ " df = pd.read_csv(file_in)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# loop to create dataframes from liste\n",
+ "files_path = liste_database\n",
+ "\n",
+ "client_number = files_path[0].split(\"/\")[1]\n",
+ "df_prefix = \"df\" + str(client_number) + \"_\"\n",
+ "\n",
+ "for i in range(len(files_path)) :\n",
+ " current_path = files_path[i]\n",
+ " with fs.open(current_path, mode=\"rb\") as file_in:\n",
+ " df = pd.read_csv(file_in)\n",
+ " # the pattern of the name is df1_xxx\n",
+ " nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n",
+ " globals()[nom_dataframe] = df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4004c8bf-11d9-413d-bb42-2cb8ddde7716",
+ "metadata": {},
+ "source": [
+ "## Cleaning functions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "d237be96-8c86-4a91-b7a1-487e87a16c3d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def cleaning_date(df, column_name):\n",
+ " \"\"\"\n",
+ " Nettoie la colonne spécifiée du DataFrame en convertissant les valeurs en datetime avec le format ISO8601.\n",
+ "\n",
+ " Parameters:\n",
+ " - df: DataFrame\n",
+ " Le DataFrame contenant la colonne à nettoyer.\n",
+ " - column_name: str\n",
+ " Le nom de la colonne à nettoyer.\n",
+ "\n",
+ " Returns:\n",
+ " - DataFrame\n",
+ " Le DataFrame modifié avec la colonne nettoyée.\n",
+ " \"\"\"\n",
+ " df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
+ " return df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "398804d8-2225-4fd3-bceb-75ab1588e359",
+ "metadata": {},
+ "source": [
+ "## Preprocessing"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "568cb180-0dd9-4b27-aecb-05e4c3775ba6",
+ "metadata": {},
+ "source": [
+ "## customer_plus"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7e7b90ce-da54-4f00-bc34-64c543b0858f",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bade04b1-0cdf-4d10-bcca-7dc7e4831656",
+ "metadata": {},
+ "source": [
+ "## Ticket area"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "b95464b1-26bc-4aac-84b4-45da83b92251",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Fonction de nettoyage et selection\n",
+ "def preprocessing_tickets_area(tickets = None, purchases = None, suppliers = None, type_ofs = None):\n",
+ " # Base des tickets\n",
+ " tickets = tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]\n",
+ " tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n",
+ "\n",
+ " # Base des fournisseurs\n",
+ " suppliers = suppliers[['id', 'name']]\n",
+ " suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n",
+ "\n",
+ " # Base des types de billets\n",
+ " # type_ofs = type_ofs[['id', 'name', 'children']]\n",
+ " # type_ofs.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)\n",
+ "\n",
+ " # Base des achats\n",
+ " # Nettoyage de la date d'achat\n",
+ " cleaning_date(purchases, 'purchase_date')\n",
+ " # Selection des variables\n",
+ " purchases = purchases[['id', 'purchase_date', 'customer_id']]\n",
+ "\n",
+ " # Fusions \n",
+ " # Fusion avec fournisseurs\n",
+ " ticket_information = pd.merge(tickets, suppliers, left_on = 'supplier_id', right_on = 'id', how = 'inner')\n",
+ " ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)\n",
+ " \n",
+ " # # Fusion avec type de tickets\n",
+ " # ticket_information = pd.merge(ticket_information, type_ofs, left_on = 'type_of', right_on = 'id', how = 'inner')\n",
+ " # ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)\n",
+ " \n",
+ " # Fusion avec achats\n",
+ " ticket_information = pd.merge(ticket_information, purchases, left_on = 'purchase_id', right_on = 'id', how = 'inner')\n",
+ " ticket_information.drop(['purchase_id', 'id'], axis = 1, inplace=True)\n",
+ "\n",
+ " return ticket_information"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "3e1d2ba7-ff4f-48eb-93a8-2bb648c70396",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/tmp/ipykernel_50143/1320335767.py:5: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n",
+ "/tmp/ipykernel_50143/1320335767.py:9: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n"
+ ]
+ }
+ ],
+ "source": [
+ "df1_ticket_information = preprocessing_tickets_area(tickets = df1_tickets, purchases = df1_purchases, suppliers = df1_suppliers, type_ofs = df1_type_ofs)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 70,
+ "id": "4b18edfc-6450-4c6a-9e7b-ee5a5808c8c9",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ticket_id | \n",
+ " product_id | \n",
+ " is_from_subscription | \n",
+ " type_of | \n",
+ " supplier_name | \n",
+ " purchase_date | \n",
+ " customer_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 13070859 | \n",
+ " 225251 | \n",
+ " False | \n",
+ " 1 | \n",
+ " vente en ligne | \n",
+ " 2018-12-28 14:47:50+00:00 | \n",
+ " 48187 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 13070860 | \n",
+ " 224914 | \n",
+ " False | \n",
+ " 1 | \n",
+ " vente en ligne | \n",
+ " 2018-12-28 14:47:50+00:00 | \n",
+ " 48187 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 13070861 | \n",
+ " 224914 | \n",
+ " False | \n",
+ " 1 | \n",
+ " vente en ligne | \n",
+ " 2018-12-28 14:47:50+00:00 | \n",
+ " 48187 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 13070862 | \n",
+ " 224914 | \n",
+ " False | \n",
+ " 1 | \n",
+ " vente en ligne | \n",
+ " 2018-12-28 14:47:50+00:00 | \n",
+ " 48187 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 13070863 | \n",
+ " 224914 | \n",
+ " False | \n",
+ " 1 | \n",
+ " vente en ligne | \n",
+ " 2018-12-28 14:47:50+00:00 | \n",
+ " 48187 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 1826667 | \n",
+ " 20662815 | \n",
+ " 405689 | \n",
+ " False | \n",
+ " 1 | \n",
+ " vente en ligne | \n",
+ " 2023-11-08 17:23:54+00:00 | \n",
+ " 1256135 | \n",
+ "
\n",
+ " \n",
+ " 1826668 | \n",
+ " 20662816 | \n",
+ " 403658 | \n",
+ " False | \n",
+ " 1 | \n",
+ " vente en ligne | \n",
+ " 2023-11-08 18:32:18+00:00 | \n",
+ " 1256136 | \n",
+ "
\n",
+ " \n",
+ " 1826669 | \n",
+ " 20662817 | \n",
+ " 403658 | \n",
+ " False | \n",
+ " 1 | \n",
+ " vente en ligne | \n",
+ " 2023-11-08 18:32:18+00:00 | \n",
+ " 1256136 | \n",
+ "
\n",
+ " \n",
+ " 1826670 | \n",
+ " 20662818 | \n",
+ " 403658 | \n",
+ " False | \n",
+ " 1 | \n",
+ " vente en ligne | \n",
+ " 2023-11-08 19:30:28+00:00 | \n",
+ " 1256137 | \n",
+ "
\n",
+ " \n",
+ " 1826671 | \n",
+ " 20662819 | \n",
+ " 403658 | \n",
+ " False | \n",
+ " 1 | \n",
+ " vente en ligne | \n",
+ " 2023-11-08 19:30:28+00:00 | \n",
+ " 1256137 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
1826672 rows × 7 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ticket_id product_id is_from_subscription type_of supplier_name \\\n",
+ "0 13070859 225251 False 1 vente en ligne \n",
+ "1 13070860 224914 False 1 vente en ligne \n",
+ "2 13070861 224914 False 1 vente en ligne \n",
+ "3 13070862 224914 False 1 vente en ligne \n",
+ "4 13070863 224914 False 1 vente en ligne \n",
+ "... ... ... ... ... ... \n",
+ "1826667 20662815 405689 False 1 vente en ligne \n",
+ "1826668 20662816 403658 False 1 vente en ligne \n",
+ "1826669 20662817 403658 False 1 vente en ligne \n",
+ "1826670 20662818 403658 False 1 vente en ligne \n",
+ "1826671 20662819 403658 False 1 vente en ligne \n",
+ "\n",
+ " purchase_date customer_id \n",
+ "0 2018-12-28 14:47:50+00:00 48187 \n",
+ "1 2018-12-28 14:47:50+00:00 48187 \n",
+ "2 2018-12-28 14:47:50+00:00 48187 \n",
+ "3 2018-12-28 14:47:50+00:00 48187 \n",
+ "4 2018-12-28 14:47:50+00:00 48187 \n",
+ "... ... ... \n",
+ "1826667 2023-11-08 17:23:54+00:00 1256135 \n",
+ "1826668 2023-11-08 18:32:18+00:00 1256136 \n",
+ "1826669 2023-11-08 18:32:18+00:00 1256136 \n",
+ "1826670 2023-11-08 19:30:28+00:00 1256137 \n",
+ "1826671 2023-11-08 19:30:28+00:00 1256137 \n",
+ "\n",
+ "[1826672 rows x 7 columns]"
+ ]
+ },
+ "execution_count": 70,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df1_ticket_information"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "096e47f4-1d65-4575-989d-83227eedad2b",
+ "metadata": {},
+ "source": [
+ "## Target area"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "baed146a-9d3a-4397-a812-3d50c9a2f038",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def preprocessing_target_area(targets = None, target_types = None, customer_target_mappings = None):\n",
+ " # Target.csv cleaning\n",
+ " targets = targets[[\"id\", \"target_type_id\", \"name\"]]\n",
+ " targets.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n",
+ " \n",
+ " # target_type cleaning\n",
+ " target_types = target_types[[\"id\",\"is_import\",\"name\"]].add_prefix(\"target_type_\")\n",
+ " \n",
+ " #customer_target_mappings cleaning\n",
+ " customer_target_mappings = customer_target_mappings[[\"id\", \"customer_id\", \"target_id\"]]\n",
+ " \n",
+ " # Merge target et target_type\n",
+ " targets_full = pd.merge(targets, target_types, left_on='target_type_id', right_on='target_type_id', how='inner')\n",
+ " targets_full.drop(['target_type_id'], axis = 1, inplace=True)\n",
+ " \n",
+ " # Merge\n",
+ " targets_full = pd.merge(customer_target_mappings, targets_full, left_on='target_id', right_on='target_id', how='inner')\n",
+ " targets_full.drop(['target_id'], axis = 1, inplace=True)\n",
+ "\n",
+ " return targets_full"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "5fbfd88b-b94c-489c-9201-670e96e453e7",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/tmp/ipykernel_50143/3848597476.py:4: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " targets.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n"
+ ]
+ }
+ ],
+ "source": [
+ "df1_target_information = preprocessing_target_area(targets = df1_targets, target_types = df1_target_types, customer_target_mappings = df1_customer_target_mappings)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "b4f05142-2a22-42ef-a60d-f23cc4b5cb09",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " customer_id | \n",
+ "
\n",
+ " \n",
+ " target_name | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " consentement optin mediation specialisee | \n",
+ " 150000 | \n",
+ "
\n",
+ " \n",
+ " consentement optin jeune public | \n",
+ " 149979 | \n",
+ "
\n",
+ " \n",
+ " consentement optin b2c | \n",
+ " 108909 | \n",
+ "
\n",
+ " \n",
+ " Arenametrix_bascule tel vers sib | \n",
+ " 35216 | \n",
+ "
\n",
+ " \n",
+ " consentement optout b2c | \n",
+ " 34523 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " Automation_parrainage_newsletter_handicap_visuel | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " consentement optout mediation specialisee | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " Inscrits NL LSF formulaire | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " Market auto - contacts inactifs post-scénario | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " Inactifs - fin du scénario | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
283 rows × 1 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " customer_id\n",
+ "target_name \n",
+ "consentement optin mediation specialisee 150000\n",
+ "consentement optin jeune public 149979\n",
+ "consentement optin b2c 108909\n",
+ "Arenametrix_bascule tel vers sib 35216\n",
+ "consentement optout b2c 34523\n",
+ "... ...\n",
+ "Automation_parrainage_newsletter_handicap_visuel 1\n",
+ "consentement optout mediation specialisee 1\n",
+ "Inscrits NL LSF formulaire 1\n",
+ "Market auto - contacts inactifs post-scénario 1\n",
+ "Inactifs - fin du scénario 1\n",
+ "\n",
+ "[283 rows x 1 columns]"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df1_target_information[['target_name', 'customer_id']].groupby('target_name').count().sort_values(by='customer_id', ascending=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "id": "4417ff51-f501-4ab9-a192-4ab75764a8ed",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " customer_id | \n",
+ "
\n",
+ " \n",
+ " target_name | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Arenametrix_bascule tel vers sib | \n",
+ " 35216 | \n",
+ "
\n",
+ " \n",
+ " Autres_interet_exposition | \n",
+ " 1021 | \n",
+ "
\n",
+ " \n",
+ " COM Inscrits NL générale (historique) | \n",
+ " 23005 | \n",
+ "
\n",
+ " \n",
+ " Contacts_prenomsdoubles | \n",
+ " 11643 | \n",
+ "
\n",
+ " \n",
+ " DDCP MD Procès du Siècle | \n",
+ " 1684 | \n",
+ "
\n",
+ " \n",
+ " DDCP Newsletter centres de loisirs | \n",
+ " 1032 | \n",
+ "
\n",
+ " \n",
+ " DDCP Newsletter enseignants | \n",
+ " 4510 | \n",
+ "
\n",
+ " \n",
+ " DDCP Newsletter jeune public | \n",
+ " 3862 | \n",
+ "
\n",
+ " \n",
+ " DDCP Newsletter relais champ social | \n",
+ " 2270 | \n",
+ "
\n",
+ " \n",
+ " DDCP PROMO Participants ateliers (adultes et enfants) | \n",
+ " 1954 | \n",
+ "
\n",
+ " \n",
+ " DDCP billets famille | \n",
+ " 3609 | \n",
+ "
\n",
+ " \n",
+ " DDCP promo MD pass musées dps oct 2018 | \n",
+ " 1785 | \n",
+ "
\n",
+ " \n",
+ " DDCP promo Plan B 2019 (concerts) | \n",
+ " 1948 | \n",
+ "
\n",
+ " \n",
+ " DDCP promo spectateurs prog 21-22 (spectacles, ciné, ateliers) | \n",
+ " 1293 | \n",
+ "
\n",
+ " \n",
+ " DDCP rentrée culturelle 2023 | \n",
+ " 1757 | \n",
+ "
\n",
+ " \n",
+ " DDCP_marseille_jazz_2023 | \n",
+ " 1043 | \n",
+ "
\n",
+ " \n",
+ " DRE Festival Jean Rouch | \n",
+ " 1502 | \n",
+ "
\n",
+ " \n",
+ " DRE MucemLab | \n",
+ " 2302 | \n",
+ "
\n",
+ " \n",
+ " DRE chercheurs | \n",
+ " 1557 | \n",
+ "
\n",
+ " \n",
+ " DRE institutionnels | \n",
+ " 2229 | \n",
+ "
\n",
+ " \n",
+ " FORMATION _ acheteurs optin last year | \n",
+ " 10485 | \n",
+ "
\n",
+ " \n",
+ " Inscrits NL générale (export_291019 + operation_videomaton) | \n",
+ " 14086 | \n",
+ "
\n",
+ " \n",
+ " Inscrits NL générale site web | \n",
+ " 3732 | \n",
+ "
\n",
+ " \n",
+ " Inscrits NL jeune public site web | \n",
+ " 1249 | \n",
+ "
\n",
+ " \n",
+ " Votre première liste | \n",
+ " 3715 | \n",
+ "
\n",
+ " \n",
+ " consentement optin b2b | \n",
+ " 12735 | \n",
+ "
\n",
+ " \n",
+ " consentement optin b2c | \n",
+ " 108909 | \n",
+ "
\n",
+ " \n",
+ " consentement optin dre | \n",
+ " 4527 | \n",
+ "
\n",
+ " \n",
+ " consentement optin jeune public | \n",
+ " 149979 | \n",
+ "
\n",
+ " \n",
+ " consentement optin mediation specialisee | \n",
+ " 150000 | \n",
+ "
\n",
+ " \n",
+ " consentement optin newsletter generale | \n",
+ " 22095 | \n",
+ "
\n",
+ " \n",
+ " consentement optin scolaires | \n",
+ " 4849 | \n",
+ "
\n",
+ " \n",
+ " consentement optout b2b | \n",
+ " 14219 | \n",
+ "
\n",
+ " \n",
+ " consentement optout b2c | \n",
+ " 34523 | \n",
+ "
\n",
+ " \n",
+ " consentement optout dre | \n",
+ " 14328 | \n",
+ "
\n",
+ " \n",
+ " consentement optout newsletter generale | \n",
+ " 18855 | \n",
+ "
\n",
+ " \n",
+ " consentement optout scolaires | \n",
+ " 15744 | \n",
+ "
\n",
+ " \n",
+ " ddcp_md_scene_ouverte_au_talent | \n",
+ " 1577 | \n",
+ "
\n",
+ " \n",
+ " ddcp_promo_MD_billet_musée_oct_2019_agarder2 | \n",
+ " 5482 | \n",
+ "
\n",
+ " \n",
+ " ddcp_promo_md_musée_dps 011019 | \n",
+ " 6010 | \n",
+ "
\n",
+ " \n",
+ " ddcp_promo_visiteurs occasionnels_musee_8mois | \n",
+ " 6640 | \n",
+ "
\n",
+ " \n",
+ " ddcp_visiteurs dps 010622 | \n",
+ " 12355 | \n",
+ "
\n",
+ " \n",
+ " festival_jean_rouch | \n",
+ " 1502 | \n",
+ "
\n",
+ " \n",
+ " rappel po barvalo | \n",
+ " 1248 | \n",
+ "
\n",
+ " \n",
+ " structures_etiquette champ social | \n",
+ " 1488 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " customer_id\n",
+ "target_name \n",
+ "Arenametrix_bascule tel vers sib 35216\n",
+ "Autres_interet_exposition 1021\n",
+ "COM Inscrits NL générale (historique) 23005\n",
+ "Contacts_prenomsdoubles 11643\n",
+ "DDCP MD Procès du Siècle 1684\n",
+ "DDCP Newsletter centres de loisirs 1032\n",
+ "DDCP Newsletter enseignants 4510\n",
+ "DDCP Newsletter jeune public 3862\n",
+ "DDCP Newsletter relais champ social 2270\n",
+ "DDCP PROMO Participants ateliers (adultes et en... 1954\n",
+ "DDCP billets famille 3609\n",
+ "DDCP promo MD pass musées dps oct 2018 1785\n",
+ "DDCP promo Plan B 2019 (concerts) 1948\n",
+ "DDCP promo spectateurs prog 21-22 (spectacles, ... 1293\n",
+ "DDCP rentrée culturelle 2023 1757\n",
+ "DDCP_marseille_jazz_2023 1043\n",
+ "DRE Festival Jean Rouch 1502\n",
+ "DRE MucemLab 2302\n",
+ "DRE chercheurs 1557\n",
+ "DRE institutionnels 2229\n",
+ "FORMATION _ acheteurs optin last year 10485\n",
+ "Inscrits NL générale (export_291019 + operation... 14086\n",
+ "Inscrits NL générale site web 3732\n",
+ "Inscrits NL jeune public site web 1249\n",
+ "Votre première liste 3715\n",
+ "consentement optin b2b 12735\n",
+ "consentement optin b2c 108909\n",
+ "consentement optin dre 4527\n",
+ "consentement optin jeune public 149979\n",
+ "consentement optin mediation specialisee 150000\n",
+ "consentement optin newsletter generale 22095\n",
+ "consentement optin scolaires 4849\n",
+ "consentement optout b2b 14219\n",
+ "consentement optout b2c 34523\n",
+ "consentement optout dre 14328\n",
+ "consentement optout newsletter generale 18855\n",
+ "consentement optout scolaires 15744\n",
+ "ddcp_md_scene_ouverte_au_talent 1577\n",
+ "ddcp_promo_MD_billet_musée_oct_2019_agarder2 5482\n",
+ "ddcp_promo_md_musée_dps 011019 6010\n",
+ "ddcp_promo_visiteurs occasionnels_musee_8mois 6640\n",
+ "ddcp_visiteurs dps 010622 12355\n",
+ "festival_jean_rouch 1502\n",
+ "rappel po barvalo 1248\n",
+ "structures_etiquette champ social 1488"
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df1_target_information_reduced = df1_target_information[['target_name', 'customer_id']].groupby('target_name').count()\n",
+ "df1_target_information_reduced[df1_target_information_reduced['customer_id'] >= 1000]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cdbb48b4-5e16-4ef4-8791-ed213d68d52f",
+ "metadata": {},
+ "source": [
+    "## Campaigns area"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "d883cc7b-ac43-4485-b86f-eaf595fbad85",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def preprocessing_campaigns_area(campaign_stats = None, campaigns = None):\n",
+ " # campaign_stats cleaning \n",
+ " campaign_stats = campaign_stats[[\"id\", \"campaign_id\", \"customer_id\", \"opened_at\", \"sent_at\", \"delivered_at\"]]\n",
+ " cleaning_date(campaign_stats, 'opened_at')\n",
+ " cleaning_date(campaign_stats, 'sent_at')\n",
+ " cleaning_date(campaign_stats, 'delivered_at')\n",
+ " \n",
+ " # campaigns cleaning\n",
+ " campaigns = campaigns[[\"id\", \"name\", \"service_id\", \"sent_at\"]].add_prefix(\"campaign_\")\n",
+ " cleaning_date(campaigns, 'campaign_sent_at')\n",
+ " \n",
+ " # Merge \n",
+ " campaigns_full = pd.merge(campaign_stats, campaigns, on = \"campaign_id\", how = \"left\")\n",
+ " campaigns_full.drop(['campaign_id'], axis = 1, inplace=True)\n",
+ "\n",
+ " return campaigns_full"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "c8552dd6-52c5-4431-b43d-3cd6c578fd9f",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/tmp/ipykernel_50143/1967867975.py:15: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
+ "/tmp/ipykernel_50143/1967867975.py:15: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
+ "/tmp/ipykernel_50143/1967867975.py:15: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n"
+ ]
+ }
+ ],
+ "source": [
+ "df1_campaigns_information = preprocessing_campaigns_area(campaign_stats = df1_campaign_stats, campaigns = df1_campaigns)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "c24457e7-3cad-451a-a65b-7373b656bd6e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " customer_id | \n",
+ " opened_at | \n",
+ " sent_at | \n",
+ " delivered_at | \n",
+ " campaign_name | \n",
+ " campaign_service_id | \n",
+ " campaign_sent_at | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 19793 | \n",
+ " 112597 | \n",
+ " NaT | \n",
+ " 2021-03-28 16:01:09+00:00 | \n",
+ " 2021-03-28 16:24:18+00:00 | \n",
+ " Le Mucem chez vous, gardons le lien #22 | \n",
+ " 404 | \n",
+ " 2021-03-27 23:00:00+00:00 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 14211 | \n",
+ " 113666 | \n",
+ " NaT | \n",
+ " 2021-03-28 16:01:09+00:00 | \n",
+ " 2021-03-28 16:21:02+00:00 | \n",
+ " Le Mucem chez vous, gardons le lien #22 | \n",
+ " 404 | \n",
+ " 2021-03-27 23:00:00+00:00 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 13150 | \n",
+ " 280561 | \n",
+ " NaT | \n",
+ " 2021-03-28 16:00:59+00:00 | \n",
+ " 2021-03-28 16:08:45+00:00 | \n",
+ " Le Mucem chez vous, gardons le lien #22 | \n",
+ " 404 | \n",
+ " 2021-03-27 23:00:00+00:00 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 7073 | \n",
+ " 101007 | \n",
+ " 2021-03-28 18:11:06+00:00 | \n",
+ " 2021-03-28 16:00:59+00:00 | \n",
+ " 2021-03-28 16:09:47+00:00 | \n",
+ " Le Mucem chez vous, gardons le lien #22 | \n",
+ " 404 | \n",
+ " 2021-03-27 23:00:00+00:00 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5175 | \n",
+ " 103972 | \n",
+ " NaT | \n",
+ " 2021-03-28 16:01:06+00:00 | \n",
+ " 2021-03-28 16:05:03+00:00 | \n",
+ " Le Mucem chez vous, gardons le lien #22 | \n",
+ " 404 | \n",
+ " 2021-03-27 23:00:00+00:00 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 6214803 | \n",
+ " 8302994 | \n",
+ " 266155 | \n",
+ " 2023-10-23 09:43:25+00:00 | \n",
+ " 2023-10-23 09:32:33+00:00 | \n",
+ " 2023-10-23 09:32:34+00:00 | \n",
+ " dre_nov_2023 | \n",
+ " 1318 | \n",
+ " 2023-10-23 09:31:17+00:00 | \n",
+ "
\n",
+ " \n",
+ " 6214804 | \n",
+ " 8303307 | \n",
+ " 21355 | \n",
+ " 2023-10-23 09:44:02+00:00 | \n",
+ " 2023-10-23 09:32:49+00:00 | \n",
+ " 2023-10-23 09:32:49+00:00 | \n",
+ " dre_nov_2023 | \n",
+ " 1318 | \n",
+ " 2023-10-23 09:31:17+00:00 | \n",
+ "
\n",
+ " \n",
+ " 6214805 | \n",
+ " 8304346 | \n",
+ " 21849 | \n",
+ " 2023-10-23 09:45:52+00:00 | \n",
+ " 2023-10-23 09:33:28+00:00 | \n",
+ " 2023-10-23 09:33:29+00:00 | \n",
+ " dre_nov_2023 | \n",
+ " 1318 | \n",
+ " 2023-10-23 09:31:17+00:00 | \n",
+ "
\n",
+ " \n",
+ " 6214806 | \n",
+ " 8302037 | \n",
+ " 667789 | \n",
+ " 2023-10-23 09:47:32+00:00 | \n",
+ " 2023-10-23 09:31:53+00:00 | \n",
+ " 2023-10-23 09:31:54+00:00 | \n",
+ " dre_nov_2023 | \n",
+ " 1318 | \n",
+ " 2023-10-23 09:31:17+00:00 | \n",
+ "
\n",
+ " \n",
+ " 6214807 | \n",
+ " 8304939 | \n",
+ " 294154 | \n",
+ " NaT | \n",
+ " 2023-10-23 09:33:54+00:00 | \n",
+ " 2023-10-23 09:33:55+00:00 | \n",
+ " dre_nov_2023 | \n",
+ " 1318 | \n",
+ " 2023-10-23 09:31:17+00:00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
6214808 rows × 8 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id customer_id opened_at \\\n",
+ "0 19793 112597 NaT \n",
+ "1 14211 113666 NaT \n",
+ "2 13150 280561 NaT \n",
+ "3 7073 101007 2021-03-28 18:11:06+00:00 \n",
+ "4 5175 103972 NaT \n",
+ "... ... ... ... \n",
+ "6214803 8302994 266155 2023-10-23 09:43:25+00:00 \n",
+ "6214804 8303307 21355 2023-10-23 09:44:02+00:00 \n",
+ "6214805 8304346 21849 2023-10-23 09:45:52+00:00 \n",
+ "6214806 8302037 667789 2023-10-23 09:47:32+00:00 \n",
+ "6214807 8304939 294154 NaT \n",
+ "\n",
+ " sent_at delivered_at \\\n",
+ "0 2021-03-28 16:01:09+00:00 2021-03-28 16:24:18+00:00 \n",
+ "1 2021-03-28 16:01:09+00:00 2021-03-28 16:21:02+00:00 \n",
+ "2 2021-03-28 16:00:59+00:00 2021-03-28 16:08:45+00:00 \n",
+ "3 2021-03-28 16:00:59+00:00 2021-03-28 16:09:47+00:00 \n",
+ "4 2021-03-28 16:01:06+00:00 2021-03-28 16:05:03+00:00 \n",
+ "... ... ... \n",
+ "6214803 2023-10-23 09:32:33+00:00 2023-10-23 09:32:34+00:00 \n",
+ "6214804 2023-10-23 09:32:49+00:00 2023-10-23 09:32:49+00:00 \n",
+ "6214805 2023-10-23 09:33:28+00:00 2023-10-23 09:33:29+00:00 \n",
+ "6214806 2023-10-23 09:31:53+00:00 2023-10-23 09:31:54+00:00 \n",
+ "6214807 2023-10-23 09:33:54+00:00 2023-10-23 09:33:55+00:00 \n",
+ "\n",
+ " campaign_name campaign_service_id \\\n",
+ "0 Le Mucem chez vous, gardons le lien #22 404 \n",
+ "1 Le Mucem chez vous, gardons le lien #22 404 \n",
+ "2 Le Mucem chez vous, gardons le lien #22 404 \n",
+ "3 Le Mucem chez vous, gardons le lien #22 404 \n",
+ "4 Le Mucem chez vous, gardons le lien #22 404 \n",
+ "... ... ... \n",
+ "6214803 dre_nov_2023 1318 \n",
+ "6214804 dre_nov_2023 1318 \n",
+ "6214805 dre_nov_2023 1318 \n",
+ "6214806 dre_nov_2023 1318 \n",
+ "6214807 dre_nov_2023 1318 \n",
+ "\n",
+ " campaign_sent_at \n",
+ "0 2021-03-27 23:00:00+00:00 \n",
+ "1 2021-03-27 23:00:00+00:00 \n",
+ "2 2021-03-27 23:00:00+00:00 \n",
+ "3 2021-03-27 23:00:00+00:00 \n",
+ "4 2021-03-27 23:00:00+00:00 \n",
+ "... ... \n",
+ "6214803 2023-10-23 09:31:17+00:00 \n",
+ "6214804 2023-10-23 09:31:17+00:00 \n",
+ "6214805 2023-10-23 09:31:17+00:00 \n",
+ "6214806 2023-10-23 09:31:17+00:00 \n",
+ "6214807 2023-10-23 09:31:17+00:00 \n",
+ "\n",
+ "[6214808 rows x 8 columns]"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df1_campaigns_information"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 67,
+ "id": "e2c88552-b863-47a2-be23-8d2898fb28bc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def campaigns_kpi(campaigns_information = None):\n",
+ " # Nombre de campagnes de mails\n",
+ " nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()\n",
+ " nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)\n",
+ " # Temps d'ouverture en min moyen \n",
+ " campaigns_information['time_to_open'] = campaigns_information['opened_at'] - campaigns_information['delivered_at']\n",
+ " time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()\n",
+ "\n",
+ " # Nombre de mail ouvert \n",
+ " opened_campaign = campaigns_information[['customer_id', 'campaign_name', 'opened_at']]\n",
+ " opened_campaign.dropna(subset=['opened_at'], inplace=True)\n",
+ " opened_campaign = opened_campaign[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()\n",
+ " opened_campaign.rename(columns = {'campaign_name' : 'nb_campaigns_opened' }, inplace = True)\n",
+ "\n",
+ " # Fusion des indicateurs\n",
+ " campaigns_reduced = pd.merge(nb_campaigns, opened_campaign, on = 'customer_id', how = 'left')\n",
+ " campaigns_reduced = pd.merge(campaigns_reduced, time_to_open, on = 'customer_id', how = 'left')\n",
+ "\n",
+ " # Remplir les NaN : nb_campaigns_opened\n",
+ " campaigns_reduced['nb_campaigns_opened'].fillna(0, inplace=True)\n",
+ "\n",
+ " # Remplir les NaT : time_to_open (??)\n",
+ "\n",
+ " return campaigns_reduced\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 69,
+ "id": "24537647-bc29-4777-9848-ac4120a4aa60",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/tmp/ipykernel_50143/2679359833.py:11: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " opened_campaign.dropna(subset=['opened_at'], inplace=True)\n",
+ "/tmp/ipykernel_50143/2679359833.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n",
+ "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n",
+ "\n",
+ "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n",
+ "\n",
+ "\n",
+ " campaigns_reduced['nb_campaigns_opened'].fillna(0, inplace=True)\n"
+ ]
+ }
+ ],
+ "source": [
+ "df1_campaigns_kpi = campaigns_kpi(campaigns_information = df1_campaigns_information) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 66,
+ "id": "6be2a9a6-056b-4e19-8c26-a18ba3df36b3",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " customer_id | \n",
+ " nb_campaigns | \n",
+ " nb_campaigns_opened | \n",
+ " time_to_open | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 4 | \n",
+ " 0.0 | \n",
+ " NaT | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 222 | \n",
+ " 124.0 | \n",
+ " 1 days 00:28:30.169354838 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 4 | \n",
+ " 7 | \n",
+ " 7.0 | \n",
+ " 1 days 04:31:01.428571428 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 5 | \n",
+ " 4 | \n",
+ " 0.0 | \n",
+ " NaT | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 6 | \n",
+ " 20 | \n",
+ " 0.0 | \n",
+ " NaT | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 130467 | \n",
+ " 1256097 | \n",
+ " 1 | \n",
+ " 1.0 | \n",
+ " 0 days 02:11:15 | \n",
+ "
\n",
+ " \n",
+ " 130468 | \n",
+ " 1256098 | \n",
+ " 1 | \n",
+ " 0.0 | \n",
+ " NaT | \n",
+ "
\n",
+ " \n",
+ " 130469 | \n",
+ " 1256099 | \n",
+ " 1 | \n",
+ " 0.0 | \n",
+ " NaT | \n",
+ "
\n",
+ " \n",
+ " 130470 | \n",
+ " 1256100 | \n",
+ " 1 | \n",
+ " 0.0 | \n",
+ " NaT | \n",
+ "
\n",
+ " \n",
+ " 130471 | \n",
+ " 1256101 | \n",
+ " 1 | \n",
+ " 0.0 | \n",
+ " NaT | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
130472 rows × 4 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " customer_id nb_campaigns nb_campaigns_opened \\\n",
+ "0 2 4 0.0 \n",
+ "1 3 222 124.0 \n",
+ "2 4 7 7.0 \n",
+ "3 5 4 0.0 \n",
+ "4 6 20 0.0 \n",
+ "... ... ... ... \n",
+ "130467 1256097 1 1.0 \n",
+ "130468 1256098 1 0.0 \n",
+ "130469 1256099 1 0.0 \n",
+ "130470 1256100 1 0.0 \n",
+ "130471 1256101 1 0.0 \n",
+ "\n",
+ " time_to_open \n",
+ "0 NaT \n",
+ "1 1 days 00:28:30.169354838 \n",
+ "2 1 days 04:31:01.428571428 \n",
+ "3 NaT \n",
+ "4 NaT \n",
+ "... ... \n",
+ "130467 0 days 02:11:15 \n",
+ "130468 NaT \n",
+ "130469 NaT \n",
+ "130470 NaT \n",
+ "130471 NaT \n",
+ "\n",
+ "[130472 rows x 4 columns]"
+ ]
+ },
+ "execution_count": 66,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df1_campaigns_kpi"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/Brouillon_AJ.ipynb b/Brouillon_AJ.ipynb
new file mode 100644
index 0000000..8f5529a
--- /dev/null
+++ b/Brouillon_AJ.ipynb
@@ -0,0 +1,695 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "8c8e008c-9b92-41f1-88c1-8ec462e4ecab",
+ "metadata": {},
+ "source": [
+ "# Business Data Challenge - Team 1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "88af2795-8bf9-4df0-a059-be7c28fb4289",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e05cd2c9-3f76-48e3-b4a6-5055445af2e4",
+ "metadata": {},
+ "source": [
+ "Configuration de l'accès aux données"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3ba1f385-2a2f-4b0c-be79-66f618469a9f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import s3fs\n",
+ "# Create filesystem object\n",
+ "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
+ "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n",
+ "\n",
+ "BUCKET = \"bdc2324-data\"\n",
+ "fs.ls(BUCKET)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ba9d04ad-6cc1-4bac-b1a0-44bedfb09763",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Chargement des fichiers campaign_stats.csv\n",
+ "FILE_PATH_S3 = 'bdc2324-data/1/1campaign_stats.csv'\n",
+ "\n",
+ "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
+ " campaign_stats_1 = pd.read_csv(file_in, sep=\",\")\n",
+ "\n",
+ "FILE_PATH_S3 = 'bdc2324-data/2/2campaign_stats.csv'\n",
+ "\n",
+ "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
+ " campaign_stats_2 = pd.read_csv(file_in, sep=\",\")\n",
+ "\n",
+ "FILE_PATH_S3 = 'bdc2324-data/3/3campaign_stats.csv'\n",
+ "\n",
+ "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
+ " campaign_stats_3 = pd.read_csv(file_in, sep=\",\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cacaecc1-4d8a-4e20-8cd3-b452cf17db56",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Conversion des dates 'sent_at'\n",
+ "campaign_stats_1['sent_at'] = pd.to_datetime(campaign_stats_1['sent_at'], format = 'ISO8601', utc = True)\n",
+ "campaign_stats_2['sent_at'] = pd.to_datetime(campaign_stats_2['sent_at'], format = 'ISO8601', utc = True)\n",
+ "campaign_stats_3['sent_at'] = pd.to_datetime(campaign_stats_3['sent_at'], format = 'ISO8601', utc = True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2ec4b583-dc64-43e9-b3ae-6bbaee0bc135",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# Chaque unité correspond à une période ? --> Non, les dossiers ont juste pour but de réduire la taille des fichiers\n",
+ "print(campaign_stats_1['sent_at'].max())\n",
+ "print(campaign_stats_1['sent_at'].min())\n",
+ "\n",
+ "print(campaign_stats_2['sent_at'].max())\n",
+ "print(campaign_stats_2['sent_at'].min())\n",
+ "\n",
+ "print(campaign_stats_3['sent_at'].max())\n",
+ "print(campaign_stats_3['sent_at'].min())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "77894273-b3e5-4f29-bd63-9f4df8082b9b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "campaign_stats_1['sent_at']"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "31f2edbf-5661-4516-9835-06d4da615c13",
+ "metadata": {},
+ "source": [
+ "### Customersplus.csv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4223c873-cbd3-46d1-ac96-c9a3b9e97092",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "FILE_PATH_S3 = 'bdc2324-data/1/1customersplus.csv'\n",
+ "\n",
+ "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
+ " customers_plus_1 = pd.read_csv(file_in, sep=\",\")\n",
+ "\n",
+ "FILE_PATH_S3 = 'bdc2324-data/2/2customersplus.csv'\n",
+ "\n",
+ "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
+ " customers_plus_2 = pd.read_csv(file_in, sep=\",\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "460f853a-68c0-42a7-9877-b83d3aaec813",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "customers_plus_1.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d5a9398f-72fc-4548-9f53-b20b372144b2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "customers_plus_1.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7467ddbe-0bd4-44cc-8a16-84aa41853638",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "customers_plus_1['id'].nunique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e15f05f8-3a89-4fc3-84a9-dae70e168440",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "customers_plus_2['id'].nunique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b40a653e-013f-48d0-8b57-0284587b36c5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "32fa2215-3c79-40b5-8643-755865959fc7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])\n",
+ "# Exemple id commun = caractéristiques communes\n",
+ "print(customers_plus_2[customers_plus_2['id'] == list(common_id)[0]])\n",
+ "\n",
+ "print(customers_plus_1[customers_plus_1['id'] == list(common_id)[0]])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0eb345e4-69f5-4e16-ac57-e33674c6c43d",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "customers_plus_1.isna().mean()*100"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6f6ce60d-0912-497d-9108-330acccef394",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Chargement de toutes les données\n",
+ "liste_base = ['customer_target_mappings', 'customersplus', 'target_types', 'tags', 'events', 'tickets', 'representations', 'purchases', 'products']\n",
+ "\n",
+ "for nom_base in liste_base:\n",
+ " FILE_PATH_S3 = 'bdc2324-data/11/11' + nom_base + '.csv'\n",
+ " with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
+ " globals()[nom_base] = pd.read_csv(file_in, sep=\",\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fa8ee17d-5092-40ac-8a0a-3790b016dd4e",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "# Jointure\n",
+ "merge_1 = pd.merge(purchases, tickets, left_on='id', right_on='purchase_id', how='inner')[['id_x', 'customer_id','product_id', 'purchase_date', 'type_of', 'is_from_subscription']]\n",
+ "merge_2 = pd.merge(products, merge_1, left_on='id', right_on='product_id', how='inner')[['id_x', 'customer_id', 'representation_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price']]\n",
+ "merge_3 = pd.merge(representations, merge_2, left_on='id', right_on='representation_id', how='inner')[['id_x', 'customer_id', 'event_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time']]\n",
+ "merge_4 = pd.merge(events, merge_3, left_on='id', right_on='event_id', how='inner')[['id_x', 'customer_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'name']]\n",
+ "merge_4 = merge_4.rename(columns={'name': 'event_name'})\n",
+ "df_customer_event = pd.merge(customersplus, merge_4, left_on = 'id', right_on = 'customer_id', how = 'inner')[['id_x', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'event_name']]\n",
+ "df_customer_event"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f1d4aeb8-ec74-4d49-989a-9116e01afe2f",
+ "metadata": {},
+ "source": [
+ "# Fusion et exploration"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "22bfad2b-d52a-4077-9b39-bee35004e01c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Jointure\n",
+ "var_choosed = ['id_x', 'customer_id','product_id', 'purchase_date', 'type_of', 'is_from_subscription']\n",
+ "merge_1 = pd.merge(purchases, tickets, left_on='id', right_on='purchase_id', how='inner')[var_choosed]\n",
+ "\n",
+ "var_choosed.extend(['amount', 'is_full_price', 'representation_id'])\n",
+ "merge_2 = pd.merge(products, merge_1, left_on='id', right_on='product_id', how='inner')[var_choosed]\n",
+ "\n",
+ "var_choosed.remove('representation_id')\n",
+ "var_choosed.extend(['start_date_time', 'event_id'])\n",
+ "merge_3 = pd.merge(representations, merge_2, left_on='id', right_on='representation_id', how='inner')[var_choosed]\n",
+ "\n",
+ "var_choosed.remove('event_id')\n",
+ "var_choosed.extend(['name', 'customer_id'])\n",
+ "merge_4 = pd.merge(events, merge_3, left_on='id', right_on='event_id', how='inner')[var_choosed]\n",
+ "\n",
+ "# Changement de nom\n",
+ "merge_4 = merge_4.rename(columns={'name': 'event_name'})\n",
+ "var_choosed[var_choosed.index('name')] = \"event_name\"\n",
+ "\n",
+ "# Base finale\n",
+ "var_choosed.extend(['age', 'gender', 'country', 'fidelity', 'profession'])\n",
+ "df_customer_event = pd.merge(customersplus, merge_4, left_on = 'id', right_on = 'customer_id', how = 'inner')[var_choosed]\n",
+ "df_customer_event"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4cb08d7a-ff04-4951-863d-20aaf33f0b31",
+ "metadata": {},
+ "source": [
+    "## Type de client au global"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f47ba14a-8601-4b91-9712-223a5ed8a1d1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Client\n",
+ "print(customer_target_mappings.columns)\n",
+ "print(customer_target_mappings.shape)\n",
+ "customer_target_mappings.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f11f829e-66b1-4fd0-a46f-5ae7cb78073f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "customer_target_mappings['extra_field'].unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c240ab80-c746-4a64-ac6a-be8382c4f0ec",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "customer_target_mappings['name'].unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c03c0597-3f21-4673-8a0f-24d7d9bc5ce4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Segmentation existante\n",
+ "print(target_types.columns)\n",
+ "print(target_types.shape)\n",
+ "target_types.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5adb1773-648d-4683-bc08-d1f2298c1283",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "target_types"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3d65f74e-47fc-4296-b493-a1ebefb91cde",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Tags = clients\n",
+ "FILE_PATH_S3 = 'bdc2324-data/11/11tags.csv'\n",
+ "\n",
+ "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
+ " tags = pd.read_csv(file_in, sep=\",\")\n",
+ "\n",
+ "print(tags.columns)\n",
+ "print(tags.shape)\n",
+ "tags.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8a689a63-165b-4c4e-bbb0-695b661048d9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tags"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "69e38c52-0570-4531-aebb-9deb6db8c40b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Structure = clients\n",
+ "FILE_PATH_S3 = 'bdc2324-data/11/11structure_tag_mappings.csv'\n",
+ "\n",
+ "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
+ " structure_tag_mappings = pd.read_csv(file_in, sep=\",\")\n",
+ "\n",
+ "print(structure_tag_mappings.columns)\n",
+ "print(structure_tag_mappings.shape)\n",
+ "structure_tag_mappings.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "74dc34ad-375b-48df-a900-40d92c5fff13",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "structure_tag_mappings"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a479ceeb-0135-4899-9cbc-90ed7bf941fe",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# Customersplus = clients\n",
+ "FILE_PATH_S3 = 'bdc2324-data/11/11customersplus.csv'\n",
+ "\n",
+ "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
+ " customersplus = pd.read_csv(file_in, sep=\",\")\n",
+ "\n",
+ "print(customersplus.columns)\n",
+ "print(customersplus.shape)\n",
+ "customersplus.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "383e892c-606a-45ce-bdd6-b503b3e0be33",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "customersplus"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "70324d06-b855-4386-a7de-eef1eb13dfdf",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# But : lier les caractéristiques socio-demo et les comportements d'achat\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4bbd743d-51fe-4786-8ad3-5a4a4d09439c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# tickets\n",
+ "FILE_PATH_S3 = 'bdc2324-data/11/11tickets.csv'\n",
+ "\n",
+ "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
+ " tickets = pd.read_csv(file_in, sep=\",\")\n",
+ "\n",
+ "print(tickets.columns)\n",
+ "print(tickets.shape)\n",
+ "tickets.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ea83ea5c-3d47-4a66-a523-04b69b149a20",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tickets"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ba15708e-eb84-4b5d-a86c-05ebed188cf6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tickets['type_of'].unique()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bc192b08-30a5-486a-8bea-93e765dbfce6",
+ "metadata": {},
+ "source": [
+    "## Types d'événement et client"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e14dcf62-2def-4ed5-834b-cf21abbc2894",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Evenement = events.csv\n",
+ "FILE_PATH_S3 = 'bdc2324-data/11/11events.csv'\n",
+ "\n",
+ "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
+ " events = pd.read_csv(file_in, sep=\",\")\n",
+ "\n",
+ "print(events.columns)\n",
+ "print(events.shape)\n",
+ "events.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d1a1d63c-d7de-4b63-93a8-1c734eb5b316",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "events"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "af80eee8-f717-4159-a0fd-09d47ec96621",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "events['name'].nunique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6afc6f3d-4292-4a92-a4d6-14f1edc25df2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Représentation des évenements = representations.csv\n",
+ "FILE_PATH_S3 = 'bdc2324-data/11/11representations.csv'\n",
+ "\n",
+ "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
+ " representations = pd.read_csv(file_in, sep=\",\")\n",
+ "\n",
+ "print(representations.columns)\n",
+ "print(representations.shape)\n",
+ "representations.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1487402a-a49b-4737-b7d7-40c764d2f0b4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "representations"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "99b27418-2c15-4a6e-bcf5-d329ca492085",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Produits vendues = products.csv\n",
+ "FILE_PATH_S3 = 'bdc2324-data/11/11products.csv'\n",
+ "\n",
+ "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
+ " products = pd.read_csv(file_in, sep=\",\")\n",
+ "\n",
+ "print(products.columns)\n",
+ "print(products.shape)\n",
+ "products.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c49bcd47-672f-4e0f-aee9-a7475151b97f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "products"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a4aec5ce-d0c9-4625-bb29-9ac154818621",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Lieu = facilities.csv\n",
+ "FILE_PATH_S3 = 'bdc2324-data/11/11facilities.csv'\n",
+ "\n",
+ "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
+ " facilities = pd.read_csv(file_in, sep=\",\")\n",
+ "\n",
+ "print(facilities.columns)\n",
+ "print(facilities.shape)\n",
+ "facilities.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b3642483-2879-442a-ad69-efcd2331a200",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "facilities"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "da1e9807-2a8d-4be7-a785-55cffd734f36",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Saisons = seasons.csv période sur deux années consécutives\n",
+ "FILE_PATH_S3 = 'bdc2324-data/11/11seasons.csv'\n",
+ "\n",
+ "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
+ " seasons = pd.read_csv(file_in, sep=\",\")\n",
+ "\n",
+ "print(seasons.columns)\n",
+ "print(seasons.shape)\n",
+ "seasons.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ec8a37b5-2d78-4b1c-aa47-bd923fdc2ba9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "seasons['name'].unique()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "abb3aa20-774b-4761-983a-df5eb2bc51c6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Achats = purchases.csv \n",
+ "FILE_PATH_S3 = 'bdc2324-data/11/11purchases.csv'\n",
+ "\n",
+ "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
+ " purchases = pd.read_csv(file_in, sep=\",\")\n",
+ "\n",
+ "print(purchases.columns)\n",
+ "print(purchases.shape)\n",
+ "purchases.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "30e204ab-4f63-430c-a818-5c8035b6e17b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "purchases"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/Clean-Notebook.ipynb b/Clean-Notebook.ipynb
deleted file mode 100644
index 1f70494..0000000
--- a/Clean-Notebook.ipynb
+++ /dev/null
@@ -1,3921 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "56b3d44e-1e3f-4726-9916-0f9af107860e",
- "metadata": {},
- "source": [
- "# Business Data Challenge - Team 1"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "id": "15103481-8d74-404c-aa09-7601fe7730da",
- "metadata": {},
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "import numpy as np\n",
- "import os\n",
- "import s3fs\n",
- "import re"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "c3bb0d13-34b2-4e1c-9985-468cd87c5a0e",
- "metadata": {},
- "source": [
- "Configuration de l'accès aux données"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "id": "5d83bb1a-d341-446e-91f6-1c428607f6d4",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Create filesystem object\n",
- "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
- "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "f99da24f-0d93-4618-92bc-3ba81dc0445c",
- "metadata": {},
- "source": [
- "# Exemple sur Company 1"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "9d74b68f-ba07-4a15-9a27-dae931762d70",
- "metadata": {},
- "source": [
- "## Chargement données"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "id": "699664b9-eee4-4f8d-a207-e524526560c5",
- "metadata": {},
- "outputs": [],
- "source": [
- "BUCKET = \"bdc2324-data/1\"\n",
- "liste_database = fs.ls(BUCKET)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "id": "0cb92854-903b-4efd-ac1b-197e29f044b4",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "['bdc2324-data/1/1campaign_stats.csv', 'bdc2324-data/1/1campaigns.csv', 'bdc2324-data/1/1customer_target_mappings.csv', 'bdc2324-data/1/1customersplus.csv', 'bdc2324-data/1/1event_types.csv', 'bdc2324-data/1/1events.csv', 'bdc2324-data/1/1product_packs.csv', 'bdc2324-data/1/1products.csv', 'bdc2324-data/1/1products_groups.csv', 'bdc2324-data/1/1purchases.csv', 'bdc2324-data/1/1suppliers.csv', 'bdc2324-data/1/1target_types.csv', 'bdc2324-data/1/1targets.csv', 'bdc2324-data/1/1tickets.csv']\n"
- ]
- }
- ],
- "source": [
- "liste_database_select = ['suppliers', 'ticket', 'purchase', 'consumption', 'customer', 'event', 'target', 'prod', 'campa']\n",
- "\n",
- "# Filtrer la liste pour les éléments contenant au moins un élément de la liste à tester\n",
- "liste_database_filtered = [element for element in liste_database if any(element_part in element for element_part in liste_database_select)]\n",
- "\n",
- "# Afficher le résultat\n",
- "print(liste_database_filtered)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "id": "dd6a3518-b752-4a1e-b77b-9e03e853c3ed",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/tmp/ipykernel_4561/4135596479.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
- " df = pd.read_csv(file_in)\n"
- ]
- }
- ],
- "source": [
- "# loop to create dataframes from file 2\n",
- "files_path = liste_database_filtered\n",
- "\n",
- "client_number = files_path[0].split(\"/\")[1]\n",
- "df_prefix = \"df\" + str(client_number) + \"_\"\n",
- "\n",
- "for i in range(len(files_path)) :\n",
- " current_path = files_path[i]\n",
- " with fs.open(current_path, mode=\"rb\") as file_in:\n",
- " df = pd.read_csv(file_in)\n",
- " # the pattern of the name is df1xxx\n",
- " nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n",
- " globals()[nom_dataframe] = df"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "id": "7d1da9df-f423-4a9f-a2a6-6d8ceeab1c34",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "_\n",
- "__\n",
- "___\n",
- "df\n",
- "df1_purchases\n",
- "df1_suppliers\n",
- "df1_tickets\n",
- "dataframe\n",
- "_7\n",
- "_10\n",
- "_11\n",
- "_18\n",
- "_20\n",
- "df1_customer_target_mappings\n",
- "df1_customersplus\n",
- "df1_event_types\n",
- "df1_events\n",
- "df1_target_types\n",
- "df1_targets\n"
- ]
- }
- ],
- "source": [
- "# Obtenir toutes les variables globales\n",
- "variables_globales = globals()\n",
- "\n",
- "# Filtrer les variables pour obtenir uniquement les DataFrames\n",
- "dataframes = {nom: variable for nom, variable in variables_globales.items() if isinstance(variable, pd.DataFrame)}\n",
- "\n",
- "# Afficher les noms et les DataFrames\n",
- "for nom, dataframe in dataframes.items():\n",
- " print(f\"{nom}\")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "78453f3c-4f89-44ed-a6c6-2a7443b72b52",
- "metadata": {},
- "source": [
- "## suppliers.csv"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "id": "2e0dada0-9457-484c-aa55-77e44613ecca",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id | \n",
- " name | \n",
- " manually_added | \n",
- " label | \n",
- " itr | \n",
- " updated_at | \n",
- " created_at | \n",
- " commission | \n",
- " identifier | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 1617 | \n",
- " j4 administration | \n",
- " False | \n",
- " NaN | \n",
- " NaN | \n",
- " 2021-07-29 09:21:37.325772+02:00 | \n",
- " 2021-07-29 09:21:37.325772+02:00 | \n",
- " NaN | \n",
- " 5958b2a060ac3e31678b438892a1bd2e | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 8 | \n",
- " non défini | \n",
- " False | \n",
- " NaN | \n",
- " NaN | \n",
- " 2020-09-03 13:16:35.329062+02:00 | \n",
- " 2020-09-03 13:16:35.329062+02:00 | \n",
- " NaN | \n",
- " 52ff3466787b4d538407372e5f7afe0f | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 4 | \n",
- " vad | \n",
- " False | \n",
- " NaN | \n",
- " NaN | \n",
- " 2020-09-03 13:11:23.896992+02:00 | \n",
- " 2020-09-03 13:11:23.896992+02:00 | \n",
- " NaN | \n",
- " 1225483c97b36018cab2bea14ab78ea6 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 1 | \n",
- " fort saint jean | \n",
- " False | \n",
- " NaN | \n",
- " NaN | \n",
- " 2020-09-03 13:11:23.833073+02:00 | \n",
- " 2020-09-03 13:11:23.833073+02:00 | \n",
- " NaN | \n",
- " 001b9b4a524fe407150b8235b304d4ec | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 2 | \n",
- " j4 | \n",
- " False | \n",
- " NaN | \n",
- " NaN | \n",
- " 2020-09-03 13:11:23.888993+02:00 | \n",
- " 2020-09-03 13:11:23.888993+02:00 | \n",
- " NaN | \n",
- " 6a0cf6edf20060344b465706b61719aa | \n",
- "
\n",
- " \n",
- " 5 | \n",
- " 5 | \n",
- " revendeur | \n",
- " False | \n",
- " NaN | \n",
- " NaN | \n",
- " 2020-09-03 13:11:23.900987+02:00 | \n",
- " 2020-09-03 13:11:23.900987+02:00 | \n",
- " NaN | \n",
- " 931239d4acb6214d7e5c98edecfb4916 | \n",
- "
\n",
- " \n",
- " 6 | \n",
- " 3 | \n",
- " vente en ligne | \n",
- " False | \n",
- " NaN | \n",
- " NaN | \n",
- " 2020-09-03 13:11:23.893097+02:00 | \n",
- " 2020-09-03 13:11:23.893097+02:00 | \n",
- " NaN | \n",
- " bde8f2ccff510df8572d3214d86b837d | \n",
- "
\n",
- " \n",
- " 7 | \n",
- " 6 | \n",
- " ccr | \n",
- " False | \n",
- " NaN | \n",
- " NaN | \n",
- " 2020-09-03 13:11:23.904974+02:00 | \n",
- " 2020-09-03 13:11:23.904974+02:00 | \n",
- " NaN | \n",
- " b48ec279411f7dbbb68393c61a9724d9 | \n",
- "
\n",
- " \n",
- " 8 | \n",
- " 7 | \n",
- " dab | \n",
- " False | \n",
- " NaN | \n",
- " NaN | \n",
- " 2020-09-03 13:11:23.908970+02:00 | \n",
- " 2020-09-03 13:11:23.908970+02:00 | \n",
- " NaN | \n",
- " 11c6d471fa4e354e62e684d293694202 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " id name manually_added label itr \\\n",
- "0 1617 j4 administration False NaN NaN \n",
- "1 8 non défini False NaN NaN \n",
- "2 4 vad False NaN NaN \n",
- "3 1 fort saint jean False NaN NaN \n",
- "4 2 j4 False NaN NaN \n",
- "5 5 revendeur False NaN NaN \n",
- "6 3 vente en ligne False NaN NaN \n",
- "7 6 ccr False NaN NaN \n",
- "8 7 dab False NaN NaN \n",
- "\n",
- " updated_at created_at \\\n",
- "0 2021-07-29 09:21:37.325772+02:00 2021-07-29 09:21:37.325772+02:00 \n",
- "1 2020-09-03 13:16:35.329062+02:00 2020-09-03 13:16:35.329062+02:00 \n",
- "2 2020-09-03 13:11:23.896992+02:00 2020-09-03 13:11:23.896992+02:00 \n",
- "3 2020-09-03 13:11:23.833073+02:00 2020-09-03 13:11:23.833073+02:00 \n",
- "4 2020-09-03 13:11:23.888993+02:00 2020-09-03 13:11:23.888993+02:00 \n",
- "5 2020-09-03 13:11:23.900987+02:00 2020-09-03 13:11:23.900987+02:00 \n",
- "6 2020-09-03 13:11:23.893097+02:00 2020-09-03 13:11:23.893097+02:00 \n",
- "7 2020-09-03 13:11:23.904974+02:00 2020-09-03 13:11:23.904974+02:00 \n",
- "8 2020-09-03 13:11:23.908970+02:00 2020-09-03 13:11:23.908970+02:00 \n",
- "\n",
- " commission identifier \n",
- "0 NaN 5958b2a060ac3e31678b438892a1bd2e \n",
- "1 NaN 52ff3466787b4d538407372e5f7afe0f \n",
- "2 NaN 1225483c97b36018cab2bea14ab78ea6 \n",
- "3 NaN 001b9b4a524fe407150b8235b304d4ec \n",
- "4 NaN 6a0cf6edf20060344b465706b61719aa \n",
- "5 NaN 931239d4acb6214d7e5c98edecfb4916 \n",
- "6 NaN bde8f2ccff510df8572d3214d86b837d \n",
- "7 NaN b48ec279411f7dbbb68393c61a9724d9 \n",
- "8 NaN 11c6d471fa4e354e62e684d293694202 "
- ]
- },
- "execution_count": 18,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Restriction aux DataFrame : ticket, purchase, consumption, suppliers\n",
- "df1_suppliers"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "id": "b583be02-ab60-4e14-9325-0204f203a1af",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "RangeIndex: 9 entries, 0 to 8\n",
- "Data columns (total 9 columns):\n",
- " # Column Non-Null Count Dtype \n",
- "--- ------ -------------- ----- \n",
- " 0 id 9 non-null int64 \n",
- " 1 name 9 non-null object \n",
- " 2 manually_added 9 non-null bool \n",
- " 3 label 0 non-null float64\n",
- " 4 itr 0 non-null float64\n",
- " 5 updated_at 9 non-null object \n",
- " 6 created_at 9 non-null object \n",
- " 7 commission 0 non-null float64\n",
- " 8 identifier 9 non-null object \n",
- "dtypes: bool(1), float64(3), int64(1), object(4)\n",
- "memory usage: 713.0+ bytes\n"
- ]
- }
- ],
- "source": [
- "df1_suppliers.info()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "id": "6d7f338e-e4d3-422b-9cdc-dec967c0b28e",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " id | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " name | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " manually_added | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " label | \n",
- " 100.0 | \n",
- "
\n",
- " \n",
- " itr | \n",
- " 100.0 | \n",
- "
\n",
- " \n",
- " updated_at | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " created_at | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- " commission | \n",
- " 100.0 | \n",
- "
\n",
- " \n",
- " identifier | \n",
- " 0.0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " 0\n",
- "id 0.0\n",
- "name 0.0\n",
- "manually_added 0.0\n",
- "label 100.0\n",
- "itr 100.0\n",
- "updated_at 0.0\n",
- "created_at 0.0\n",
- "commission 100.0\n",
- "identifier 0.0"
- ]
- },
- "execution_count": 20,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "pd.DataFrame(df1_suppliers.isna().mean()*100)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "676a9869-9a8b-4cd2-8b1c-0644b5229c72",
- "metadata": {},
- "source": [
- "## purchases.csv"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "14f4158e-c9c0-4beb-826a-5e0f949434a4",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "markdown",
- "id": "79c9eb43-002e-460d-acb2-206ebb2ab6dd",
- "metadata": {},
- "source": [
- "## tickets.csv"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "f3c35394-b586-4ae4-b5ab-b03bb01bb618",
- "metadata": {},
- "outputs": [],
- "source": [
- "df1_purchases\n",
- "df1_tickets"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "355f5489-7904-4161-a85b-6eb70b3a4c89",
- "metadata": {
- "jp-MarkdownHeadingCollapsed": true
- },
- "source": [
- "# Fusion et exploration"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "id": "afe548fe-d93c-4634-9f53-881404ec4c6c",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id_x | \n",
- " purchase_date | \n",
- " type_of | \n",
- " is_from_subscription | \n",
- " amount | \n",
- " is_full_price | \n",
- " start_date_time | \n",
- " event_name | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 992423 | \n",
- " 2023-01-11 17:08:41+01:00 | \n",
- " 3 | \n",
- " False | \n",
- " 13.0 | \n",
- " False | \n",
- " 2023-02-06 20:00:00+01:00 | \n",
- " zaide | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 992423 | \n",
- " 2023-01-11 17:08:41+01:00 | \n",
- " 3 | \n",
- " False | \n",
- " 13.0 | \n",
- " False | \n",
- " 2023-02-06 20:00:00+01:00 | \n",
- " zaide | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 1053934 | \n",
- " 2023-03-16 16:23:10+01:00 | \n",
- " 3 | \n",
- " False | \n",
- " 62.0 | \n",
- " False | \n",
- " 2023-03-19 16:00:00+01:00 | \n",
- " luisa miller | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 1053934 | \n",
- " 2023-03-16 16:23:10+01:00 | \n",
- " 3 | \n",
- " False | \n",
- " 62.0 | \n",
- " False | \n",
- " 2023-03-19 16:00:00+01:00 | \n",
- " luisa miller | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 1189141 | \n",
- " 2020-11-26 13:12:53+01:00 | \n",
- " 3 | \n",
- " False | \n",
- " 51.3 | \n",
- " False | \n",
- " 2020-12-01 20:00:00+01:00 | \n",
- " iphigenie en tauride | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 318964 | \n",
- " 1090839 | \n",
- " 2019-05-19 21:18:36+02:00 | \n",
- " 1 | \n",
- " False | \n",
- " 4.5 | \n",
- " False | \n",
- " 2019-05-27 20:00:00+02:00 | \n",
- " entre femmes | \n",
- "
\n",
- " \n",
- " 318965 | \n",
- " 1090839 | \n",
- " 2019-05-19 21:18:36+02:00 | \n",
- " 1 | \n",
- " False | \n",
- " 4.5 | \n",
- " False | \n",
- " 2019-05-27 20:00:00+02:00 | \n",
- " entre femmes | \n",
- "
\n",
- " \n",
- " 318966 | \n",
- " 1090839 | \n",
- " 2019-05-19 21:18:36+02:00 | \n",
- " 1 | \n",
- " False | \n",
- " 4.5 | \n",
- " False | \n",
- " 2019-05-27 20:00:00+02:00 | \n",
- " entre femmes | \n",
- "
\n",
- " \n",
- " 318967 | \n",
- " 1244277 | \n",
- " 2019-12-31 11:04:07+01:00 | \n",
- " 1 | \n",
- " False | \n",
- " 5.5 | \n",
- " False | \n",
- " 2020-02-03 20:00:00+01:00 | \n",
- " a boire et a manger | \n",
- "
\n",
- " \n",
- " 318968 | \n",
- " 1244277 | \n",
- " 2019-12-31 11:04:07+01:00 | \n",
- " 1 | \n",
- " False | \n",
- " 5.5 | \n",
- " False | \n",
- " 2020-02-03 20:00:00+01:00 | \n",
- " a boire et a manger | \n",
- "
\n",
- " \n",
- "
\n",
- "
318969 rows × 8 columns
\n",
- "
"
- ],
- "text/plain": [
- " id_x purchase_date type_of is_from_subscription \\\n",
- "0 992423 2023-01-11 17:08:41+01:00 3 False \n",
- "1 992423 2023-01-11 17:08:41+01:00 3 False \n",
- "2 1053934 2023-03-16 16:23:10+01:00 3 False \n",
- "3 1053934 2023-03-16 16:23:10+01:00 3 False \n",
- "4 1189141 2020-11-26 13:12:53+01:00 3 False \n",
- "... ... ... ... ... \n",
- "318964 1090839 2019-05-19 21:18:36+02:00 1 False \n",
- "318965 1090839 2019-05-19 21:18:36+02:00 1 False \n",
- "318966 1090839 2019-05-19 21:18:36+02:00 1 False \n",
- "318967 1244277 2019-12-31 11:04:07+01:00 1 False \n",
- "318968 1244277 2019-12-31 11:04:07+01:00 1 False \n",
- "\n",
- " amount is_full_price start_date_time event_name \n",
- "0 13.0 False 2023-02-06 20:00:00+01:00 zaide \n",
- "1 13.0 False 2023-02-06 20:00:00+01:00 zaide \n",
- "2 62.0 False 2023-03-19 16:00:00+01:00 luisa miller \n",
- "3 62.0 False 2023-03-19 16:00:00+01:00 luisa miller \n",
- "4 51.3 False 2020-12-01 20:00:00+01:00 iphigenie en tauride \n",
- "... ... ... ... ... \n",
- "318964 4.5 False 2019-05-27 20:00:00+02:00 entre femmes \n",
- "318965 4.5 False 2019-05-27 20:00:00+02:00 entre femmes \n",
- "318966 4.5 False 2019-05-27 20:00:00+02:00 entre femmes \n",
- "318967 5.5 False 2020-02-03 20:00:00+01:00 a boire et a manger \n",
- "318968 5.5 False 2020-02-03 20:00:00+01:00 a boire et a manger \n",
- "\n",
- "[318969 rows x 8 columns]"
- ]
- },
- "execution_count": 24,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Jointure\n",
- "var_choosed = ['id_x', 'customer_id','product_id', 'purchase_date', 'type_of', 'is_from_subscription']\n",
- "merge_1 = pd.merge(purchases, tickets, left_on='id', right_on='purchase_id', how='inner')[var_choosed]\n",
- "\n",
- "var_choosed.extend(['amount', 'is_full_price', 'representation_id'])\n",
- "merge_2 = pd.merge(products, merge_1, left_on='id', right_on='product_id', how='inner')[var_choosed]\n",
- "\n",
- "var_choosed.remove('representation_id')\n",
- "var_choosed.extend(['start_date_time', 'event_id'])\n",
- "merge_3 = pd.merge(representations, merge_2, left_on='id', right_on='representation_id', how='inner')[var_choosed]\n",
- "\n",
- "var_choosed.remove('event_id')\n",
- "var_choosed.extend(['name', 'customer_id'])\n",
- "merge_4 = pd.merge(events, merge_3, left_on='id', right_on='event_id', how='inner')[var_choosed]\n",
- "\n",
- "# Changement de nom\n",
- "merge_4 = merge_4.rename(columns={'name': 'event_name'})\n",
- "var_choosed[var_choosed.index('name')] = \"event_name\"\n",
- "\n",
- "# Base finale\n",
- "var_choosed.extend(['age', 'gender', 'country', 'fidelity', 'profession'])\n",
- "df_customer_event = pd.merge(customersplus, merge_4, left_on = 'id', right_on = 'customer_id', how = 'inner')[var_choosed]\n",
- "df_customer_event"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "779da86b-ac61-4c61-88d2-fa1c0c19efce",
- "metadata": {},
- "source": [
- "## Type de client au globale"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "id": "7c89d25f-ee42-4478-9ff0-ee64b781d5c8",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Index(['id', 'customer_id', 'target_id', 'created_at', 'updated_at', 'name',\n",
- " 'extra_field'],\n",
- " dtype='object')\n",
- "(124302, 7)\n",
- "\n",
- "RangeIndex: 124302 entries, 0 to 124301\n",
- "Data columns (total 7 columns):\n",
- " # Column Non-Null Count Dtype \n",
- "--- ------ -------------- ----- \n",
- " 0 id 124302 non-null int64 \n",
- " 1 customer_id 124302 non-null int64 \n",
- " 2 target_id 124302 non-null int64 \n",
- " 3 created_at 124296 non-null object \n",
- " 4 updated_at 124296 non-null object \n",
- " 5 name 0 non-null float64\n",
- " 6 extra_field 0 non-null float64\n",
- "dtypes: float64(2), int64(3), object(2)\n",
- "memory usage: 6.6+ MB\n"
- ]
- }
- ],
- "source": [
- "# Client\n",
- "print(customer_target_mappings.columns)\n",
- "print(customer_target_mappings.shape)\n",
- "customer_target_mappings.info()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "id": "c4b6bdcc-9f13-449b-9a8b-c5ca794637be",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([nan])"
- ]
- },
- "execution_count": 26,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "customer_target_mappings['extra_field'].unique()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "id": "47bc8453-0693-4838-8bd8-4d800a82c496",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([nan])"
- ]
- },
- "execution_count": 27,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "customer_target_mappings['name'].unique()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "id": "ab3f937b-ef62-499a-8ee2-d47d1d988ace",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Index(['id', 'is_import', 'name', 'created_at', 'updated_at', 'identifier'], dtype='object')\n",
- "(4, 6)\n",
- "\n",
- "RangeIndex: 4 entries, 0 to 3\n",
- "Data columns (total 6 columns):\n",
- " # Column Non-Null Count Dtype \n",
- "--- ------ -------------- ----- \n",
- " 0 id 4 non-null int64 \n",
- " 1 is_import 4 non-null bool \n",
- " 2 name 4 non-null object\n",
- " 3 created_at 4 non-null object\n",
- " 4 updated_at 4 non-null object\n",
- " 5 identifier 4 non-null object\n",
- "dtypes: bool(1), int64(1), object(4)\n",
- "memory usage: 292.0+ bytes\n"
- ]
- }
- ],
- "source": [
- "# Segmentation existante\n",
- "print(target_types.columns)\n",
- "print(target_types.shape)\n",
- "target_types.info()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "id": "b70488b9-38fc-40a8-9e2f-3330b3f9eef5",
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id | \n",
- " is_import | \n",
- " name | \n",
- " created_at | \n",
- " updated_at | \n",
- " identifier | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 1 | \n",
- " False | \n",
- " manual_static_filter | \n",
- " 2021-04-29 13:42:14.111085+02:00 | \n",
- " 2021-04-29 13:42:14.111085+02:00 | \n",
- " fb27e81baa4debc6a4e1a8639c20e808 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 3 | \n",
- " True | \n",
- " manual_structure | \n",
- " 2021-05-07 15:20:00.626650+02:00 | \n",
- " 2021-05-07 15:20:00.626650+02:00 | \n",
- " 382bca214204a2d3462f5ec2728d5d1e | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 6 | \n",
- " False | \n",
- " manual_dynamic_filter | \n",
- " 2021-09-09 14:27:47.641302+02:00 | \n",
- " 2021-09-09 14:27:47.641302+02:00 | \n",
- " e0f4b8693184850fefd6d2a38f10584e | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 2 | \n",
- " True | \n",
- " manual_import | \n",
- " 2021-04-29 13:49:30.107110+02:00 | \n",
- " 2021-04-29 13:49:30.107110+02:00 | \n",
- " 12213df2ce68a624e4c0070521437bac | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " id is_import name created_at \\\n",
- "0 1 False manual_static_filter 2021-04-29 13:42:14.111085+02:00 \n",
- "1 3 True manual_structure 2021-05-07 15:20:00.626650+02:00 \n",
- "2 6 False manual_dynamic_filter 2021-09-09 14:27:47.641302+02:00 \n",
- "3 2 True manual_import 2021-04-29 13:49:30.107110+02:00 \n",
- "\n",
- " updated_at identifier \n",
- "0 2021-04-29 13:42:14.111085+02:00 fb27e81baa4debc6a4e1a8639c20e808 \n",
- "1 2021-05-07 15:20:00.626650+02:00 382bca214204a2d3462f5ec2728d5d1e \n",
- "2 2021-09-09 14:27:47.641302+02:00 e0f4b8693184850fefd6d2a38f10584e \n",
- "3 2021-04-29 13:49:30.107110+02:00 12213df2ce68a624e4c0070521437bac "
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "target_types"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "id": "8dd74e87-97c2-493d-b19f-971b684078d3",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Index(['id', 'name', 'created_at', 'updated_at', 'identifier'], dtype='object')\n",
- "(20, 5)\n",
- "\n",
- "RangeIndex: 20 entries, 0 to 19\n",
- "Data columns (total 5 columns):\n",
- " # Column Non-Null Count Dtype \n",
- "--- ------ -------------- ----- \n",
- " 0 id 20 non-null int64 \n",
- " 1 name 19 non-null object\n",
- " 2 created_at 20 non-null object\n",
- " 3 updated_at 20 non-null object\n",
- " 4 identifier 20 non-null object\n",
- "dtypes: int64(1), object(4)\n",
- "memory usage: 928.0+ bytes\n"
- ]
- }
- ],
- "source": [
- "# Tags = clients\n",
- "FILE_PATH_S3 = 'bdc2324-data/11/11tags.csv'\n",
- "\n",
- "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
- " tags = pd.read_csv(file_in, sep=\",\")\n",
- "\n",
- "print(tags.columns)\n",
- "print(tags.shape)\n",
- "tags.info()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "id": "91d54732-666c-4250-ba91-5c9b83d4712a",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id | \n",
- " name | \n",
- " created_at | \n",
- " updated_at | \n",
- " identifier | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 2 | \n",
- " ens-écoles | \n",
- " 2021-05-07 15:24:19.808501+02:00 | \n",
- " 2021-05-07 15:24:19.808501+02:00 | \n",
- " b6a360c5f84595940c5774f13fd39cc3 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 1 | \n",
- " NaN | \n",
- " 2021-05-07 15:24:19.805589+02:00 | \n",
- " 2021-05-07 15:24:19.805589+02:00 | \n",
- " d41d8cd98f00b204e9800998ecf8427e | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 4 | \n",
- " ecoles primaires rennes | \n",
- " 2021-05-07 15:29:06.388415+02:00 | \n",
- " 2021-05-07 15:29:06.388415+02:00 | \n",
- " ca8649dd64c240d118f60b07d11a7053 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 5 | \n",
- " Angers Nantes Opéra | \n",
- " 2023-01-27 15:59:58.187557+01:00 | \n",
- " 2023-01-27 15:59:58.187557+01:00 | \n",
- " f8f500f937fe312542399299cdc13f7e | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 6 | \n",
- " Opéras | \n",
- " 2023-01-27 16:03:59.654938+01:00 | \n",
- " 2023-01-27 16:03:59.654938+01:00 | \n",
- " 22eb2c616983ec7b54a093f84b230505 | \n",
- "
\n",
- " \n",
- " 5 | \n",
- " 7 | \n",
- " Ministère de la Culture | \n",
- " 2023-01-30 11:22:29.636813+01:00 | \n",
- " 2023-01-30 11:22:29.636813+01:00 | \n",
- " 1b8c5c08fde000d90905a3d14af7763d | \n",
- "
\n",
- " \n",
- " 6 | \n",
- " 8 | \n",
- " Orchestres | \n",
- " 2023-01-30 11:33:56.392799+01:00 | \n",
- " 2023-01-30 11:33:56.392799+01:00 | \n",
- " 7c2aee0c80642d7e325a450f2dec45e5 | \n",
- "
\n",
- " \n",
- " 7 | \n",
- " 9 | \n",
- " Cooperative | \n",
- " 2023-01-31 14:44:38.471146+01:00 | \n",
- " 2023-01-31 14:44:38.471146+01:00 | \n",
- " 6c88c36ffaab88d255865aa3111d7686 | \n",
- "
\n",
- " \n",
- " 8 | \n",
- " 10 | \n",
- " Théâtres | \n",
- " 2023-01-31 14:45:17.804428+01:00 | \n",
- " 2023-01-31 14:45:17.804428+01:00 | \n",
- " b2c19672df82021702b79482c8cda85a | \n",
- "
\n",
- " \n",
- " 9 | \n",
- " 11 | \n",
- " La co[opera]tive | \n",
- " 2023-02-16 17:11:35.004478+01:00 | \n",
- " 2023-02-16 17:11:35.004478+01:00 | \n",
- " 5dbaa3a1f278c0fcf981d447ad20957a | \n",
- "
\n",
- " \n",
- " 10 | \n",
- " 12 | \n",
- " Ville de Rennes | \n",
- " 2023-02-16 17:37:13.816196+01:00 | \n",
- " 2023-02-16 17:37:13.816196+01:00 | \n",
- " bc483d04d9c3a08f167a3ce64366ca72 | \n",
- "
\n",
- " \n",
- " 11 | \n",
- " 13 | \n",
- " Ensembles en résidence | \n",
- " 2023-02-16 17:55:54.877374+01:00 | \n",
- " 2023-02-16 17:55:54.877374+01:00 | \n",
- " e70635e771de13268dccf02bb2abfaf9 | \n",
- "
\n",
- " \n",
- " 12 | \n",
- " 14 | \n",
- " Ministère | \n",
- " 2023-02-17 11:17:54.429462+01:00 | \n",
- " 2023-02-17 11:17:54.429462+01:00 | \n",
- " a3f0582853fd19f5b57e3651f8a20e7a | \n",
- "
\n",
- " \n",
- " 13 | \n",
- " 15 | \n",
- " Rennes métropole | \n",
- " 2023-02-17 11:53:24.490786+01:00 | \n",
- " 2023-02-17 11:53:24.490786+01:00 | \n",
- " e98b8db5941b96c29c353b6f2f502055 | \n",
- "
\n",
- " \n",
- " 14 | \n",
- " 16 | \n",
- " Ville de Rennes - équipements culturels | \n",
- " 2023-02-17 12:00:10.649104+01:00 | \n",
- " 2023-02-17 12:00:10.649104+01:00 | \n",
- " a44edffc7edb852982efa7f4aa6d0e25 | \n",
- "
\n",
- " \n",
- " 15 | \n",
- " 17 | \n",
- " Structures culturelles rennaises | \n",
- " 2023-02-17 12:05:55.583016+01:00 | \n",
- " 2023-02-17 12:05:55.583016+01:00 | \n",
- " 241550517e4e3b1c926e9aeab0f621cd | \n",
- "
\n",
- " \n",
- " 16 | \n",
- " 18 | \n",
- " Université Rennes 2 | \n",
- " 2023-02-17 14:23:44.832959+01:00 | \n",
- " 2023-02-17 14:23:44.832959+01:00 | \n",
- " 4057c5cee51c4e10aa819f0cf48adc3f | \n",
- "
\n",
- " \n",
- " 17 | \n",
- " 19 | \n",
- " Centres chorégraphiques nationaux | \n",
- " 2023-02-17 15:29:41.827321+01:00 | \n",
- " 2023-02-17 15:29:41.827321+01:00 | \n",
- " 41e75941dfb766365498d917abe0102f | \n",
- "
\n",
- " \n",
- " 18 | \n",
- " 20 | \n",
- " Télévision | \n",
- " 2023-02-17 15:46:13.746092+01:00 | \n",
- " 2023-02-17 15:46:13.746092+01:00 | \n",
- " 36d6409c539dd79c1f3af8c5948603eb | \n",
- "
\n",
- " \n",
- " 19 | \n",
- " 21 | \n",
- " structures culturelles nationales | \n",
- " 2023-02-17 15:56:00.555722+01:00 | \n",
- " 2023-02-17 15:56:00.555722+01:00 | \n",
- " 5311cf7e42aac53289e1c4a338d5cfa4 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " id name \\\n",
- "0 2 ens-écoles \n",
- "1 1 NaN \n",
- "2 4 ecoles primaires rennes \n",
- "3 5 Angers Nantes Opéra \n",
- "4 6 Opéras \n",
- "5 7 Ministère de la Culture \n",
- "6 8 Orchestres \n",
- "7 9 Cooperative \n",
- "8 10 Théâtres \n",
- "9 11 La co[opera]tive \n",
- "10 12 Ville de Rennes \n",
- "11 13 Ensembles en résidence \n",
- "12 14 Ministère \n",
- "13 15 Rennes métropole \n",
- "14 16 Ville de Rennes - équipements culturels \n",
- "15 17 Structures culturelles rennaises \n",
- "16 18 Université Rennes 2 \n",
- "17 19 Centres chorégraphiques nationaux \n",
- "18 20 Télévision \n",
- "19 21 structures culturelles nationales \n",
- "\n",
- " created_at updated_at \\\n",
- "0 2021-05-07 15:24:19.808501+02:00 2021-05-07 15:24:19.808501+02:00 \n",
- "1 2021-05-07 15:24:19.805589+02:00 2021-05-07 15:24:19.805589+02:00 \n",
- "2 2021-05-07 15:29:06.388415+02:00 2021-05-07 15:29:06.388415+02:00 \n",
- "3 2023-01-27 15:59:58.187557+01:00 2023-01-27 15:59:58.187557+01:00 \n",
- "4 2023-01-27 16:03:59.654938+01:00 2023-01-27 16:03:59.654938+01:00 \n",
- "5 2023-01-30 11:22:29.636813+01:00 2023-01-30 11:22:29.636813+01:00 \n",
- "6 2023-01-30 11:33:56.392799+01:00 2023-01-30 11:33:56.392799+01:00 \n",
- "7 2023-01-31 14:44:38.471146+01:00 2023-01-31 14:44:38.471146+01:00 \n",
- "8 2023-01-31 14:45:17.804428+01:00 2023-01-31 14:45:17.804428+01:00 \n",
- "9 2023-02-16 17:11:35.004478+01:00 2023-02-16 17:11:35.004478+01:00 \n",
- "10 2023-02-16 17:37:13.816196+01:00 2023-02-16 17:37:13.816196+01:00 \n",
- "11 2023-02-16 17:55:54.877374+01:00 2023-02-16 17:55:54.877374+01:00 \n",
- "12 2023-02-17 11:17:54.429462+01:00 2023-02-17 11:17:54.429462+01:00 \n",
- "13 2023-02-17 11:53:24.490786+01:00 2023-02-17 11:53:24.490786+01:00 \n",
- "14 2023-02-17 12:00:10.649104+01:00 2023-02-17 12:00:10.649104+01:00 \n",
- "15 2023-02-17 12:05:55.583016+01:00 2023-02-17 12:05:55.583016+01:00 \n",
- "16 2023-02-17 14:23:44.832959+01:00 2023-02-17 14:23:44.832959+01:00 \n",
- "17 2023-02-17 15:29:41.827321+01:00 2023-02-17 15:29:41.827321+01:00 \n",
- "18 2023-02-17 15:46:13.746092+01:00 2023-02-17 15:46:13.746092+01:00 \n",
- "19 2023-02-17 15:56:00.555722+01:00 2023-02-17 15:56:00.555722+01:00 \n",
- "\n",
- " identifier \n",
- "0 b6a360c5f84595940c5774f13fd39cc3 \n",
- "1 d41d8cd98f00b204e9800998ecf8427e \n",
- "2 ca8649dd64c240d118f60b07d11a7053 \n",
- "3 f8f500f937fe312542399299cdc13f7e \n",
- "4 22eb2c616983ec7b54a093f84b230505 \n",
- "5 1b8c5c08fde000d90905a3d14af7763d \n",
- "6 7c2aee0c80642d7e325a450f2dec45e5 \n",
- "7 6c88c36ffaab88d255865aa3111d7686 \n",
- "8 b2c19672df82021702b79482c8cda85a \n",
- "9 5dbaa3a1f278c0fcf981d447ad20957a \n",
- "10 bc483d04d9c3a08f167a3ce64366ca72 \n",
- "11 e70635e771de13268dccf02bb2abfaf9 \n",
- "12 a3f0582853fd19f5b57e3651f8a20e7a \n",
- "13 e98b8db5941b96c29c353b6f2f502055 \n",
- "14 a44edffc7edb852982efa7f4aa6d0e25 \n",
- "15 241550517e4e3b1c926e9aeab0f621cd \n",
- "16 4057c5cee51c4e10aa819f0cf48adc3f \n",
- "17 41e75941dfb766365498d917abe0102f \n",
- "18 36d6409c539dd79c1f3af8c5948603eb \n",
- "19 5311cf7e42aac53289e1c4a338d5cfa4 "
- ]
- },
- "execution_count": 18,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "tags"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "id": "4cc9f444-b7e6-4ee5-8ce8-64c63ab7825a",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Index(['id', 'structure_id', 'tag_id', 'created_at', 'updated_at'], dtype='object')\n",
- "(179, 5)\n",
- "\n",
- "RangeIndex: 179 entries, 0 to 178\n",
- "Data columns (total 5 columns):\n",
- " # Column Non-Null Count Dtype \n",
- "--- ------ -------------- ----- \n",
- " 0 id 179 non-null int64 \n",
- " 1 structure_id 179 non-null int64 \n",
- " 2 tag_id 179 non-null int64 \n",
- " 3 created_at 179 non-null object\n",
- " 4 updated_at 179 non-null object\n",
- "dtypes: int64(3), object(2)\n",
- "memory usage: 7.1+ KB\n"
- ]
- }
- ],
- "source": [
- "# Structure = clients\n",
- "FILE_PATH_S3 = 'bdc2324-data/11/11structure_tag_mappings.csv'\n",
- "\n",
- "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
- " structure_tag_mappings = pd.read_csv(file_in, sep=\",\")\n",
- "\n",
- "print(structure_tag_mappings.columns)\n",
- "print(structure_tag_mappings.shape)\n",
- "structure_tag_mappings.info()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "id": "dcf776df-5c8e-4972-b2c1-b41291ba7e66",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id | \n",
- " structure_id | \n",
- " tag_id | \n",
- " created_at | \n",
- " updated_at | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 123 | \n",
- " 187 | \n",
- " 6 | \n",
- " 2023-01-27 16:03:59.680222+01:00 | \n",
- " 2023-01-27 16:03:59.680222+01:00 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 2 | \n",
- " 2 | \n",
- " 2 | \n",
- " 2021-05-07 15:24:19.872895+02:00 | \n",
- " 2021-05-07 15:24:19.872895+02:00 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 3 | \n",
- " 3 | \n",
- " 2 | \n",
- " 2021-05-07 15:24:19.873830+02:00 | \n",
- " 2021-05-07 15:24:19.873830+02:00 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 4 | \n",
- " 4 | \n",
- " 2 | \n",
- " 2021-05-07 15:24:19.874628+02:00 | \n",
- " 2021-05-07 15:24:19.874628+02:00 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 5 | \n",
- " 5 | \n",
- " 2 | \n",
- " 2021-05-07 15:24:19.875421+02:00 | \n",
- " 2021-05-07 15:24:19.875421+02:00 | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 174 | \n",
- " 184 | \n",
- " 236 | \n",
- " 10 | \n",
- " 2023-02-17 16:35:25.041114+01:00 | \n",
- " 2023-02-17 16:35:25.041114+01:00 | \n",
- "
\n",
- " \n",
- " 175 | \n",
- " 185 | \n",
- " 237 | \n",
- " 17 | \n",
- " 2023-02-17 16:39:10.799478+01:00 | \n",
- " 2023-02-17 16:39:10.799478+01:00 | \n",
- "
\n",
- " \n",
- " 176 | \n",
- " 186 | \n",
- " 238 | \n",
- " 19 | \n",
- " 2023-02-17 16:53:21.098690+01:00 | \n",
- " 2023-02-17 16:53:21.098690+01:00 | \n",
- "
\n",
- " \n",
- " 177 | \n",
- " 187 | \n",
- " 239 | \n",
- " 10 | \n",
- " 2023-02-17 16:57:42.623481+01:00 | \n",
- " 2023-02-17 16:57:42.623481+01:00 | \n",
- "
\n",
- " \n",
- " 178 | \n",
- " 188 | \n",
- " 240 | \n",
- " 10 | \n",
- " 2023-02-17 16:59:22.067723+01:00 | \n",
- " 2023-02-17 16:59:22.067723+01:00 | \n",
- "
\n",
- " \n",
- "
\n",
- "
179 rows × 5 columns
\n",
- "
"
- ],
- "text/plain": [
- " id structure_id tag_id created_at \\\n",
- "0 123 187 6 2023-01-27 16:03:59.680222+01:00 \n",
- "1 2 2 2 2021-05-07 15:24:19.872895+02:00 \n",
- "2 3 3 2 2021-05-07 15:24:19.873830+02:00 \n",
- "3 4 4 2 2021-05-07 15:24:19.874628+02:00 \n",
- "4 5 5 2 2021-05-07 15:24:19.875421+02:00 \n",
- ".. ... ... ... ... \n",
- "174 184 236 10 2023-02-17 16:35:25.041114+01:00 \n",
- "175 185 237 17 2023-02-17 16:39:10.799478+01:00 \n",
- "176 186 238 19 2023-02-17 16:53:21.098690+01:00 \n",
- "177 187 239 10 2023-02-17 16:57:42.623481+01:00 \n",
- "178 188 240 10 2023-02-17 16:59:22.067723+01:00 \n",
- "\n",
- " updated_at \n",
- "0 2023-01-27 16:03:59.680222+01:00 \n",
- "1 2021-05-07 15:24:19.872895+02:00 \n",
- "2 2021-05-07 15:24:19.873830+02:00 \n",
- "3 2021-05-07 15:24:19.874628+02:00 \n",
- "4 2021-05-07 15:24:19.875421+02:00 \n",
- ".. ... \n",
- "174 2023-02-17 16:35:25.041114+01:00 \n",
- "175 2023-02-17 16:39:10.799478+01:00 \n",
- "176 2023-02-17 16:53:21.098690+01:00 \n",
- "177 2023-02-17 16:57:42.623481+01:00 \n",
- "178 2023-02-17 16:59:22.067723+01:00 \n",
- "\n",
- "[179 rows x 5 columns]"
- ]
- },
- "execution_count": 20,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "structure_tag_mappings"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "id": "41bf1529-5a7c-409e-9791-2024c08c11f0",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Index(['id', 'lastname', 'firstname', 'birthdate', 'email', 'street_id',\n",
- " 'created_at', 'updated_at', 'civility', 'is_partner', 'extra',\n",
- " 'deleted_at', 'reference', 'gender', 'is_email_true', 'extra_field',\n",
- " 'identifier', 'opt_in', 'structure_id', 'note', 'profession',\n",
- " 'language', 'mcp_contact_id', 'need_reload', 'last_buying_date',\n",
- " 'max_price', 'ticket_sum', 'average_price', 'fidelity',\n",
- " 'average_purchase_delay', 'average_price_basket',\n",
- " 'average_ticket_basket', 'total_price', 'preferred_category',\n",
- " 'preferred_supplier', 'preferred_formula', 'purchase_count',\n",
- " 'first_buying_date', 'last_visiting_date', 'zipcode', 'country', 'age',\n",
- " 'tenant_id'],\n",
- " dtype='object')\n",
- "(71307, 43)\n",
- "\n",
- "RangeIndex: 71307 entries, 0 to 71306\n",
- "Data columns (total 43 columns):\n",
- " # Column Non-Null Count Dtype \n",
- "--- ------ -------------- ----- \n",
- " 0 id 71307 non-null int64 \n",
- " 1 lastname 41045 non-null object \n",
- " 2 firstname 39140 non-null object \n",
- " 3 birthdate 18174 non-null object \n",
- " 4 email 58203 non-null object \n",
- " 5 street_id 71307 non-null int64 \n",
- " 6 created_at 71307 non-null object \n",
- " 7 updated_at 71307 non-null object \n",
- " 8 civility 0 non-null float64\n",
- " 9 is_partner 71307 non-null bool \n",
- " 10 extra 0 non-null float64\n",
- " 11 deleted_at 0 non-null float64\n",
- " 12 reference 0 non-null float64\n",
- " 13 gender 71307 non-null int64 \n",
- " 14 is_email_true 71307 non-null bool \n",
- " 15 extra_field 0 non-null float64\n",
- " 16 identifier 71307 non-null object \n",
- " 17 opt_in 71307 non-null bool \n",
- " 18 structure_id 616 non-null float64\n",
- " 19 note 451 non-null object \n",
- " 20 profession 812 non-null object \n",
- " 21 language 0 non-null float64\n",
- " 22 mcp_contact_id 22417 non-null float64\n",
- " 23 need_reload 71307 non-null bool \n",
- " 24 last_buying_date 34040 non-null object \n",
- " 25 max_price 34040 non-null float64\n",
- " 26 ticket_sum 71307 non-null int64 \n",
- " 27 average_price 68694 non-null float64\n",
- " 28 fidelity 71307 non-null int64 \n",
- " 29 average_purchase_delay 34040 non-null float64\n",
- " 30 average_price_basket 34040 non-null float64\n",
- " 31 average_ticket_basket 34040 non-null float64\n",
- " 32 total_price 36653 non-null float64\n",
- " 33 preferred_category 0 non-null float64\n",
- " 34 preferred_supplier 0 non-null float64\n",
- " 35 preferred_formula 0 non-null float64\n",
- " 36 purchase_count 71307 non-null int64 \n",
- " 37 first_buying_date 34040 non-null object \n",
- " 38 last_visiting_date 0 non-null float64\n",
- " 39 zipcode 33756 non-null object \n",
- " 40 country 39910 non-null object \n",
- " 41 age 18174 non-null float64\n",
- " 42 tenant_id 71307 non-null int64 \n",
- "dtypes: bool(4), float64(19), int64(7), object(13)\n",
- "memory usage: 21.5+ MB\n"
- ]
- }
- ],
- "source": [
- "# Tags = clients\n",
- "FILE_PATH_S3 = 'bdc2324-data/11/11customersplus.csv'\n",
- "\n",
- "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
- " customersplus = pd.read_csv(file_in, sep=\",\")\n",
- "\n",
- "print(customersplus.columns)\n",
- "print(customersplus.shape)\n",
- "customersplus.info()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "id": "948a0b2b-8d1c-4afb-802e-670d67dd8c20",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id | \n",
- " lastname | \n",
- " firstname | \n",
- " birthdate | \n",
- " email | \n",
- " street_id | \n",
- " created_at | \n",
- " updated_at | \n",
- " civility | \n",
- " is_partner | \n",
- " ... | \n",
- " preferred_category | \n",
- " preferred_supplier | \n",
- " preferred_formula | \n",
- " purchase_count | \n",
- " first_buying_date | \n",
- " last_visiting_date | \n",
- " zipcode | \n",
- " country | \n",
- " age | \n",
- " tenant_id | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 286834 | \n",
- " lastname286834 | \n",
- " firstname286834 | \n",
- " NaN | \n",
- " email286834 | \n",
- " 6 | \n",
- " 2022-05-19 10:09:09.361137+02:00 | \n",
- " 2022-05-19 10:09:09.361137+02:00 | \n",
- " NaN | \n",
- " False | \n",
- " ... | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 0 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " fr | \n",
- " NaN | \n",
- " 1556 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 330695 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " email330695 | \n",
- " 1 | \n",
- " 2022-07-16 04:10:34.135134+02:00 | \n",
- " 2022-07-16 04:10:34.156704+02:00 | \n",
- " NaN | \n",
- " False | \n",
- " ... | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 0 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 1556 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 330978 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " email330978 | \n",
- " 1 | \n",
- " 2022-07-21 22:14:09.811721+02:00 | \n",
- " 2022-07-21 22:14:09.836051+02:00 | \n",
- " NaN | \n",
- " False | \n",
- " ... | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 0 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 1556 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 338697 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " email338697 | \n",
- " 1 | \n",
- " 2022-09-15 19:02:03.950536+02:00 | \n",
- " 2022-09-15 19:02:03.985642+02:00 | \n",
- " NaN | \n",
- " False | \n",
- " ... | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 0 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 1556 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 338726 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " email338726 | \n",
- " 1 | \n",
- " 2022-09-16 01:24:40.719882+02:00 | \n",
- " 2022-09-16 01:24:40.742753+02:00 | \n",
- " NaN | \n",
- " False | \n",
- " ... | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 0 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 1556 | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 71302 | \n",
- " 27105 | \n",
- " lastname27105 | \n",
- " firstname27105 | \n",
- " 1957-01-26 | \n",
- " email27105 | \n",
- " 205024 | \n",
- " 2021-04-22 15:12:59.986534+02:00 | \n",
- " 2023-09-12 18:59:31.613235+02:00 | \n",
- " NaN | \n",
- " False | \n",
- " ... | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 2 | \n",
- " 2018-12-31 18:56:57+01:00 | \n",
- " NaN | \n",
- " 35700 | \n",
- " fr | \n",
- " 66.0 | \n",
- " 1556 | \n",
- "
\n",
- " \n",
- " 71303 | \n",
- " 27108 | \n",
- " lastname27108 | \n",
- " firstname27108 | \n",
- " NaN | \n",
- " NaN | \n",
- " 205024 | \n",
- " 2021-04-22 15:12:59.989197+02:00 | \n",
- " 2023-09-12 18:27:34.380843+02:00 | \n",
- " NaN | \n",
- " False | \n",
- " ... | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 6 | \n",
- " 2015-12-29 14:51:46+01:00 | \n",
- " NaN | \n",
- " 35700 | \n",
- " fr | \n",
- " NaN | \n",
- " 1556 | \n",
- "
\n",
- " \n",
- " 71304 | \n",
- " 27110 | \n",
- " lastname27110 | \n",
- " firstname27110 | \n",
- " NaN | \n",
- " NaN | \n",
- " 6 | \n",
- " 2021-04-22 15:12:59.991029+02:00 | \n",
- " 2022-04-14 11:41:33.738500+02:00 | \n",
- " NaN | \n",
- " False | \n",
- " ... | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 1 | \n",
- " 2018-12-31 19:12:59+01:00 | \n",
- " NaN | \n",
- " NaN | \n",
- " fr | \n",
- " NaN | \n",
- " 1556 | \n",
- "
\n",
- " \n",
- " 71305 | \n",
- " 10607 | \n",
- " lastname10607 | \n",
- " firstname10607 | \n",
- " 1963-01-04 | \n",
- " email10607 | \n",
- " 313332 | \n",
- " 2021-04-22 14:56:45.742226+02:00 | \n",
- " 2023-09-12 17:55:17.723195+02:00 | \n",
- " NaN | \n",
- " False | \n",
- " ... | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 26 | \n",
- " 2015-10-10 14:11:21+02:00 | \n",
- " NaN | \n",
- " 35850 | \n",
- " fr | \n",
- " 60.0 | \n",
- " 1556 | \n",
- "
\n",
- " \n",
- " 71306 | \n",
- " 19095 | \n",
- " lastname19095 | \n",
- " firstname19095 | \n",
- " 1979-07-16 | \n",
- " email19095 | \n",
- " 6 | \n",
- " 2021-04-22 15:06:30.120537+02:00 | \n",
- " 2023-09-12 18:27:36.904104+02:00 | \n",
- " NaN | \n",
- " False | \n",
- " ... | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 2 | \n",
- " 2019-05-19 21:18:36+02:00 | \n",
- " NaN | \n",
- " NaN | \n",
- " fr | \n",
- " 44.0 | \n",
- " 1556 | \n",
- "
\n",
- " \n",
- "
\n",
- "
71307 rows × 43 columns
\n",
- "
"
- ],
- "text/plain": [
- " id lastname firstname birthdate email \\\n",
- "0 286834 lastname286834 firstname286834 NaN email286834 \n",
- "1 330695 NaN NaN NaN email330695 \n",
- "2 330978 NaN NaN NaN email330978 \n",
- "3 338697 NaN NaN NaN email338697 \n",
- "4 338726 NaN NaN NaN email338726 \n",
- "... ... ... ... ... ... \n",
- "71302 27105 lastname27105 firstname27105 1957-01-26 email27105 \n",
- "71303 27108 lastname27108 firstname27108 NaN NaN \n",
- "71304 27110 lastname27110 firstname27110 NaN NaN \n",
- "71305 10607 lastname10607 firstname10607 1963-01-04 email10607 \n",
- "71306 19095 lastname19095 firstname19095 1979-07-16 email19095 \n",
- "\n",
- " street_id created_at \\\n",
- "0 6 2022-05-19 10:09:09.361137+02:00 \n",
- "1 1 2022-07-16 04:10:34.135134+02:00 \n",
- "2 1 2022-07-21 22:14:09.811721+02:00 \n",
- "3 1 2022-09-15 19:02:03.950536+02:00 \n",
- "4 1 2022-09-16 01:24:40.719882+02:00 \n",
- "... ... ... \n",
- "71302 205024 2021-04-22 15:12:59.986534+02:00 \n",
- "71303 205024 2021-04-22 15:12:59.989197+02:00 \n",
- "71304 6 2021-04-22 15:12:59.991029+02:00 \n",
- "71305 313332 2021-04-22 14:56:45.742226+02:00 \n",
- "71306 6 2021-04-22 15:06:30.120537+02:00 \n",
- "\n",
- " updated_at civility is_partner ... \\\n",
- "0 2022-05-19 10:09:09.361137+02:00 NaN False ... \n",
- "1 2022-07-16 04:10:34.156704+02:00 NaN False ... \n",
- "2 2022-07-21 22:14:09.836051+02:00 NaN False ... \n",
- "3 2022-09-15 19:02:03.985642+02:00 NaN False ... \n",
- "4 2022-09-16 01:24:40.742753+02:00 NaN False ... \n",
- "... ... ... ... ... \n",
- "71302 2023-09-12 18:59:31.613235+02:00 NaN False ... \n",
- "71303 2023-09-12 18:27:34.380843+02:00 NaN False ... \n",
- "71304 2022-04-14 11:41:33.738500+02:00 NaN False ... \n",
- "71305 2023-09-12 17:55:17.723195+02:00 NaN False ... \n",
- "71306 2023-09-12 18:27:36.904104+02:00 NaN False ... \n",
- "\n",
- " preferred_category preferred_supplier preferred_formula \\\n",
- "0 NaN NaN NaN \n",
- "1 NaN NaN NaN \n",
- "2 NaN NaN NaN \n",
- "3 NaN NaN NaN \n",
- "4 NaN NaN NaN \n",
- "... ... ... ... \n",
- "71302 NaN NaN NaN \n",
- "71303 NaN NaN NaN \n",
- "71304 NaN NaN NaN \n",
- "71305 NaN NaN NaN \n",
- "71306 NaN NaN NaN \n",
- "\n",
- " purchase_count first_buying_date last_visiting_date zipcode \\\n",
- "0 0 NaN NaN NaN \n",
- "1 0 NaN NaN NaN \n",
- "2 0 NaN NaN NaN \n",
- "3 0 NaN NaN NaN \n",
- "4 0 NaN NaN NaN \n",
- "... ... ... ... ... \n",
- "71302 2 2018-12-31 18:56:57+01:00 NaN 35700 \n",
- "71303 6 2015-12-29 14:51:46+01:00 NaN 35700 \n",
- "71304 1 2018-12-31 19:12:59+01:00 NaN NaN \n",
- "71305 26 2015-10-10 14:11:21+02:00 NaN 35850 \n",
- "71306 2 2019-05-19 21:18:36+02:00 NaN NaN \n",
- "\n",
- " country age tenant_id \n",
- "0 fr NaN 1556 \n",
- "1 NaN NaN 1556 \n",
- "2 NaN NaN 1556 \n",
- "3 NaN NaN 1556 \n",
- "4 NaN NaN 1556 \n",
- "... ... ... ... \n",
- "71302 fr 66.0 1556 \n",
- "71303 fr NaN 1556 \n",
- "71304 fr NaN 1556 \n",
- "71305 fr 60.0 1556 \n",
- "71306 fr 44.0 1556 \n",
- "\n",
- "[71307 rows x 43 columns]"
- ]
- },
- "execution_count": 25,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "customersplus"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "id": "c40c44a0-e7c2-4ad1-b700-0d6ea05d62b2",
- "metadata": {},
- "outputs": [],
- "source": [
- "# But : lier les caractéristiques socio-demo et les comportements d'achat\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "id": "8259ae6c-353f-43a6-add3-f974fac6e5d4",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Index(['id', 'number', 'created_at', 'updated_at', 'purchase_id', 'product_id',\n",
- " 'is_from_subscription', 'type_of', 'supplier_id', 'barcode',\n",
- " 'identifier'],\n",
- " dtype='object')\n",
- "(318969, 11)\n",
- "\n",
- "RangeIndex: 318969 entries, 0 to 318968\n",
- "Data columns (total 11 columns):\n",
- " # Column Non-Null Count Dtype \n",
- "--- ------ -------------- ----- \n",
- " 0 id 318969 non-null int64 \n",
- " 1 number 318969 non-null object \n",
- " 2 created_at 318969 non-null object \n",
- " 3 updated_at 318969 non-null object \n",
- " 4 purchase_id 318969 non-null int64 \n",
- " 5 product_id 318969 non-null int64 \n",
- " 6 is_from_subscription 318969 non-null bool \n",
- " 7 type_of 318969 non-null int64 \n",
- " 8 supplier_id 318969 non-null int64 \n",
- " 9 barcode 0 non-null float64\n",
- " 10 identifier 318969 non-null object \n",
- "dtypes: bool(1), float64(1), int64(5), object(4)\n",
- "memory usage: 24.6+ MB\n"
- ]
- }
- ],
- "source": [
- "# tickets\n",
- "FILE_PATH_S3 = 'bdc2324-data/11/11tickets.csv'\n",
- "\n",
- "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
- " tickets = pd.read_csv(file_in, sep=\",\")\n",
- "\n",
- "print(tickets.columns)\n",
- "print(tickets.shape)\n",
- "tickets.info()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "id": "f54830cb-1f95-4f71-9b04-358c745fb454",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id | \n",
- " number | \n",
- " created_at | \n",
- " updated_at | \n",
- " purchase_id | \n",
- " product_id | \n",
- " is_from_subscription | \n",
- " type_of | \n",
- " supplier_id | \n",
- " barcode | \n",
- " identifier | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 2119081 | \n",
- " 1433_136_212_68356 | \n",
- " 2023-09-12 17:42:45.396336+02:00 | \n",
- " 2023-09-12 17:42:45.396336+02:00 | \n",
- " 861764 | \n",
- " 209879 | \n",
- " False | \n",
- " 1 | \n",
- " 1702 | \n",
- " NaN | \n",
- " f694c255855ce5643c6fcc7fed5e9237 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 2119082 | \n",
- " 1433_136_194_68356 | \n",
- " 2023-09-12 17:42:45.409056+02:00 | \n",
- " 2023-09-12 17:42:45.409056+02:00 | \n",
- " 861763 | \n",
- " 209879 | \n",
- " False | \n",
- " 1 | \n",
- " 1702 | \n",
- " NaN | \n",
- " 838d6101db2fc8bc80536d8b91b49859 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 2119083 | \n",
- " 33158_158_343_68357 | \n",
- " 2023-09-12 17:42:45.409824+02:00 | \n",
- " 2023-09-12 17:42:45.409824+02:00 | \n",
- " 861769 | \n",
- " 209880 | \n",
- " False | \n",
- " 1 | \n",
- " 1702 | \n",
- " NaN | \n",
- " 8a8d938d66a4dc57bcb44c2773c6fdfa | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 2119084 | \n",
- " 33158_158_297_68357 | \n",
- " 2023-09-12 17:42:45.410447+02:00 | \n",
- " 2023-09-12 17:42:45.410447+02:00 | \n",
- " 861767 | \n",
- " 209880 | \n",
- " False | \n",
- " 1 | \n",
- " 1702 | \n",
- " NaN | \n",
- " b7a3dd0794c0957c942d45b8913e5b96 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 2119085 | \n",
- " 33158_158_318_68357 | \n",
- " 2023-09-12 17:42:45.411059+02:00 | \n",
- " 2023-09-12 17:42:45.411059+02:00 | \n",
- " 861768 | \n",
- " 209880 | \n",
- " False | \n",
- " 1 | \n",
- " 1702 | \n",
- " NaN | \n",
- " d7ea7e443581ebe520dd13f6cad31af7 | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 318964 | \n",
- " 2564021 | \n",
- " 44247_204_239_89278 | \n",
- " 2023-09-12 18:59:48.750953+02:00 | \n",
- " 2023-09-12 18:59:48.750953+02:00 | \n",
- " 1244281 | \n",
- " 210158 | \n",
- " False | \n",
- " 1 | \n",
- " 1702 | \n",
- " NaN | \n",
- " 82c9af8b2167f7ac34a5e834242b0239 | \n",
- "
\n",
- " \n",
- " 318965 | \n",
- " 2564022 | \n",
- " 44247_204_299_89278 | \n",
- " 2023-09-12 18:59:48.751441+02:00 | \n",
- " 2023-09-12 18:59:48.751441+02:00 | \n",
- " 1244284 | \n",
- " 210158 | \n",
- " False | \n",
- " 1 | \n",
- " 1702 | \n",
- " NaN | \n",
- " 235e8e608f066cb72949bbd397d0a76f | \n",
- "
\n",
- " \n",
- " 318966 | \n",
- " 2564023 | \n",
- " 44247_204_259_89278 | \n",
- " 2023-09-12 18:59:48.751924+02:00 | \n",
- " 2023-09-12 18:59:48.751924+02:00 | \n",
- " 1244282 | \n",
- " 210158 | \n",
- " False | \n",
- " 1 | \n",
- " 1702 | \n",
- " NaN | \n",
- " ec22fa828931f030f7e79a4cc5478c4b | \n",
- "
\n",
- " \n",
- " 318967 | \n",
- " 2564024 | \n",
- " 44247_204_279_89278 | \n",
- " 2023-09-12 18:59:48.752425+02:00 | \n",
- " 2023-09-12 18:59:48.752425+02:00 | \n",
- " 1244283 | \n",
- " 210158 | \n",
- " False | \n",
- " 1 | \n",
- " 1702 | \n",
- " NaN | \n",
- " 31ec4deaf718e04caf193e1ff8d621ef | \n",
- "
\n",
- " \n",
- " 318968 | \n",
- " 2513156 | \n",
- " 4854_178_2847_89170 | \n",
- " 2023-09-12 18:52:20.331807+02:00 | \n",
- " 2023-09-12 18:59:48.752904+02:00 | \n",
- " 1244285 | \n",
- " 261922 | \n",
- " False | \n",
- " 3 | \n",
- " 1702 | \n",
- " NaN | \n",
- " 48aef9efab29bfb1537656908863bcc1 | \n",
- "
\n",
- " \n",
- "
\n",
- "
318969 rows × 11 columns
\n",
- "
"
- ],
- "text/plain": [
- " id number created_at \\\n",
- "0 2119081 1433_136_212_68356 2023-09-12 17:42:45.396336+02:00 \n",
- "1 2119082 1433_136_194_68356 2023-09-12 17:42:45.409056+02:00 \n",
- "2 2119083 33158_158_343_68357 2023-09-12 17:42:45.409824+02:00 \n",
- "3 2119084 33158_158_297_68357 2023-09-12 17:42:45.410447+02:00 \n",
- "4 2119085 33158_158_318_68357 2023-09-12 17:42:45.411059+02:00 \n",
- "... ... ... ... \n",
- "318964 2564021 44247_204_239_89278 2023-09-12 18:59:48.750953+02:00 \n",
- "318965 2564022 44247_204_299_89278 2023-09-12 18:59:48.751441+02:00 \n",
- "318966 2564023 44247_204_259_89278 2023-09-12 18:59:48.751924+02:00 \n",
- "318967 2564024 44247_204_279_89278 2023-09-12 18:59:48.752425+02:00 \n",
- "318968 2513156 4854_178_2847_89170 2023-09-12 18:52:20.331807+02:00 \n",
- "\n",
- " updated_at purchase_id product_id \\\n",
- "0 2023-09-12 17:42:45.396336+02:00 861764 209879 \n",
- "1 2023-09-12 17:42:45.409056+02:00 861763 209879 \n",
- "2 2023-09-12 17:42:45.409824+02:00 861769 209880 \n",
- "3 2023-09-12 17:42:45.410447+02:00 861767 209880 \n",
- "4 2023-09-12 17:42:45.411059+02:00 861768 209880 \n",
- "... ... ... ... \n",
- "318964 2023-09-12 18:59:48.750953+02:00 1244281 210158 \n",
- "318965 2023-09-12 18:59:48.751441+02:00 1244284 210158 \n",
- "318966 2023-09-12 18:59:48.751924+02:00 1244282 210158 \n",
- "318967 2023-09-12 18:59:48.752425+02:00 1244283 210158 \n",
- "318968 2023-09-12 18:59:48.752904+02:00 1244285 261922 \n",
- "\n",
- " is_from_subscription type_of supplier_id barcode \\\n",
- "0 False 1 1702 NaN \n",
- "1 False 1 1702 NaN \n",
- "2 False 1 1702 NaN \n",
- "3 False 1 1702 NaN \n",
- "4 False 1 1702 NaN \n",
- "... ... ... ... ... \n",
- "318964 False 1 1702 NaN \n",
- "318965 False 1 1702 NaN \n",
- "318966 False 1 1702 NaN \n",
- "318967 False 1 1702 NaN \n",
- "318968 False 3 1702 NaN \n",
- "\n",
- " identifier \n",
- "0 f694c255855ce5643c6fcc7fed5e9237 \n",
- "1 838d6101db2fc8bc80536d8b91b49859 \n",
- "2 8a8d938d66a4dc57bcb44c2773c6fdfa \n",
- "3 b7a3dd0794c0957c942d45b8913e5b96 \n",
- "4 d7ea7e443581ebe520dd13f6cad31af7 \n",
- "... ... \n",
- "318964 82c9af8b2167f7ac34a5e834242b0239 \n",
- "318965 235e8e608f066cb72949bbd397d0a76f \n",
- "318966 ec22fa828931f030f7e79a4cc5478c4b \n",
- "318967 31ec4deaf718e04caf193e1ff8d621ef \n",
- "318968 48aef9efab29bfb1537656908863bcc1 \n",
- "\n",
- "[318969 rows x 11 columns]"
- ]
- },
- "execution_count": 30,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "tickets"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 33,
- "id": "ad743347-33d1-41f0-852d-f9e6354f82ed",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array([1, 3, 0])"
- ]
- },
- "execution_count": 33,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "tickets['type_of'].unique()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "b88808fe-3b4e-49ed-9885-d52910b6f211",
- "metadata": {},
- "source": [
- "## Types d'évenement et client"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "id": "ecb03a47-1418-4fb1-8c78-cd222d38b7fd",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Index(['id', 'created_at', 'updated_at', 'season_id', 'facility_id', 'name',\n",
- " 'event_type_id', 'manual_added', 'is_display', 'event_type_key_id',\n",
- " 'facility_key_id', 'identifier'],\n",
- " dtype='object')\n",
- "(403, 12)\n",
- "\n",
- "RangeIndex: 403 entries, 0 to 402\n",
- "Data columns (total 12 columns):\n",
- " # Column Non-Null Count Dtype \n",
- "--- ------ -------------- ----- \n",
- " 0 id 403 non-null int64 \n",
- " 1 created_at 403 non-null object\n",
- " 2 updated_at 403 non-null object\n",
- " 3 season_id 403 non-null int64 \n",
- " 4 facility_id 403 non-null int64 \n",
- " 5 name 403 non-null object\n",
- " 6 event_type_id 403 non-null int64 \n",
- " 7 manual_added 403 non-null bool \n",
- " 8 is_display 403 non-null bool \n",
- " 9 event_type_key_id 403 non-null int64 \n",
- " 10 facility_key_id 403 non-null int64 \n",
- " 11 identifier 403 non-null object\n",
- "dtypes: bool(2), int64(6), object(4)\n",
- "memory usage: 32.4+ KB\n"
- ]
- }
- ],
- "source": [
- "# Evenement = events.csv\n",
- "FILE_PATH_S3 = 'bdc2324-data/11/11events.csv'\n",
- "\n",
- "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
- " events = pd.read_csv(file_in, sep=\",\")\n",
- "\n",
- "print(events.columns)\n",
- "print(events.shape)\n",
- "events.info()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "id": "19706610-9e90-4e6f-8bd0-da124b87cff7",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id | \n",
- " created_at | \n",
- " updated_at | \n",
- " season_id | \n",
- " facility_id | \n",
- " name | \n",
- " event_type_id | \n",
- " manual_added | \n",
- " is_display | \n",
- " event_type_key_id | \n",
- " facility_key_id | \n",
- " identifier | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 20367 | \n",
- " 2023-09-13 03:42:45.214293+02:00 | \n",
- " 2023-09-13 03:54:30.086969+02:00 | \n",
- " 1865 | \n",
- " 1054 | \n",
- " marelle | \n",
- " 1055 | \n",
- " False | \n",
- " True | \n",
- " 1055 | \n",
- " 1054 | \n",
- " 26d1e9a4acad18b9cf79244334c86c93 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 20371 | \n",
- " 2023-09-13 03:42:45.218728+02:00 | \n",
- " 2023-09-13 03:54:30.103943+02:00 | \n",
- " 1865 | \n",
- " 1054 | \n",
- " dialogues | \n",
- " 1055 | \n",
- " False | \n",
- " True | \n",
- " 1055 | \n",
- " 1054 | \n",
- " 60356fc5e8ed6c9c1be9c5ec67e77766 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 20570 | \n",
- " 2023-10-05 04:48:29.374504+02:00 | \n",
- " 2023-10-05 04:48:36.562528+02:00 | \n",
- " 1865 | \n",
- " 1054 | \n",
- " les grandes epopees | \n",
- " 1055 | \n",
- " False | \n",
- " True | \n",
- " 1055 | \n",
- " 1054 | \n",
- " f8ab088e06252bf34e1b12ad2ce1a403 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 20757 | \n",
- " 2023-11-01 03:55:20.846196+01:00 | \n",
- " 2023-11-01 03:55:28.412457+01:00 | \n",
- " 1865 | \n",
- " 1054 | \n",
- " scolaire marelle | \n",
- " 1055 | \n",
- " False | \n",
- " True | \n",
- " 1055 | \n",
- " 1054 | \n",
- " 447fa80f9a793b7587bb85ebbda6442c | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 20364 | \n",
- " 2023-09-13 03:42:45.196791+02:00 | \n",
- " 2023-09-13 03:54:30.075456+02:00 | \n",
- " 1865 | \n",
- " 1054 | \n",
- " le couronnement de poppee | \n",
- " 1055 | \n",
- " False | \n",
- " True | \n",
- " 1055 | \n",
- " 1054 | \n",
- " 3b37f5d2cd354cbc422868621ac7ebc2 | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 398 | \n",
- " 15603 | \n",
- " 2023-09-12 17:42:25.327618+02:00 | \n",
- " 2023-09-12 19:00:00.893400+02:00 | \n",
- " 1706 | \n",
- " 1054 | \n",
- " marelle | \n",
- " 1055 | \n",
- " False | \n",
- " True | \n",
- " 1055 | \n",
- " 1054 | \n",
- " fde88b72fb82b1fe42fbbfbfc3d6b4d3 | \n",
- "
\n",
- " \n",
- " 399 | \n",
- " 15621 | \n",
- " 2023-09-12 17:42:25.335792+02:00 | \n",
- " 2023-09-12 19:00:00.899622+02:00 | \n",
- " 1708 | \n",
- " 1054 | \n",
- " cartes d'adhesion | \n",
- " 1055 | \n",
- " False | \n",
- " True | \n",
- " 1055 | \n",
- " 1054 | \n",
- " 051b96aad2b720bad4450a59ed7dfbf6 | \n",
- "
\n",
- " \n",
- " 400 | \n",
- " 15740 | \n",
- " 2023-09-12 17:47:05.112101+02:00 | \n",
- " 2023-09-12 19:00:00.906123+02:00 | \n",
- " 1711 | \n",
- " 1054 | \n",
- " repetition le medecin malgre lui | \n",
- " 1055 | \n",
- " False | \n",
- " True | \n",
- " 1055 | \n",
- " 1054 | \n",
- " addd6885bea5ddf60ec3539dfc3e79e8 | \n",
- "
\n",
- " \n",
- " 401 | \n",
- " 15520 | \n",
- " 2023-09-12 17:42:25.290280+02:00 | \n",
- " 2023-09-12 19:00:00.835625+02:00 | \n",
- " 1708 | \n",
- " 1054 | \n",
- " opera au village | \n",
- " 1055 | \n",
- " False | \n",
- " True | \n",
- " 1055 | \n",
- " 1054 | \n",
- " 94f250d10d4a56358ceab23b384439ff | \n",
- "
\n",
- " \n",
- " 402 | \n",
- " 15439 | \n",
- " 2023-09-12 17:42:25.252747+02:00 | \n",
- " 2023-09-12 19:00:00.735990+02:00 | \n",
- " 1708 | \n",
- " 1054 | \n",
- " florilege | \n",
- " 1055 | \n",
- " False | \n",
- " True | \n",
- " 1055 | \n",
- " 1054 | \n",
- " 4f015946bcbd856aa573cadb7ac42b9f | \n",
- "
\n",
- " \n",
- "
\n",
- "
403 rows × 12 columns
\n",
- "
"
- ],
- "text/plain": [
- " id created_at \\\n",
- "0 20367 2023-09-13 03:42:45.214293+02:00 \n",
- "1 20371 2023-09-13 03:42:45.218728+02:00 \n",
- "2 20570 2023-10-05 04:48:29.374504+02:00 \n",
- "3 20757 2023-11-01 03:55:20.846196+01:00 \n",
- "4 20364 2023-09-13 03:42:45.196791+02:00 \n",
- ".. ... ... \n",
- "398 15603 2023-09-12 17:42:25.327618+02:00 \n",
- "399 15621 2023-09-12 17:42:25.335792+02:00 \n",
- "400 15740 2023-09-12 17:47:05.112101+02:00 \n",
- "401 15520 2023-09-12 17:42:25.290280+02:00 \n",
- "402 15439 2023-09-12 17:42:25.252747+02:00 \n",
- "\n",
- " updated_at season_id facility_id \\\n",
- "0 2023-09-13 03:54:30.086969+02:00 1865 1054 \n",
- "1 2023-09-13 03:54:30.103943+02:00 1865 1054 \n",
- "2 2023-10-05 04:48:36.562528+02:00 1865 1054 \n",
- "3 2023-11-01 03:55:28.412457+01:00 1865 1054 \n",
- "4 2023-09-13 03:54:30.075456+02:00 1865 1054 \n",
- ".. ... ... ... \n",
- "398 2023-09-12 19:00:00.893400+02:00 1706 1054 \n",
- "399 2023-09-12 19:00:00.899622+02:00 1708 1054 \n",
- "400 2023-09-12 19:00:00.906123+02:00 1711 1054 \n",
- "401 2023-09-12 19:00:00.835625+02:00 1708 1054 \n",
- "402 2023-09-12 19:00:00.735990+02:00 1708 1054 \n",
- "\n",
- " name event_type_id manual_added \\\n",
- "0 marelle 1055 False \n",
- "1 dialogues 1055 False \n",
- "2 les grandes epopees 1055 False \n",
- "3 scolaire marelle 1055 False \n",
- "4 le couronnement de poppee 1055 False \n",
- ".. ... ... ... \n",
- "398 marelle 1055 False \n",
- "399 cartes d'adhesion 1055 False \n",
- "400 repetition le medecin malgre lui 1055 False \n",
- "401 opera au village 1055 False \n",
- "402 florilege 1055 False \n",
- "\n",
- " is_display event_type_key_id facility_key_id \\\n",
- "0 True 1055 1054 \n",
- "1 True 1055 1054 \n",
- "2 True 1055 1054 \n",
- "3 True 1055 1054 \n",
- "4 True 1055 1054 \n",
- ".. ... ... ... \n",
- "398 True 1055 1054 \n",
- "399 True 1055 1054 \n",
- "400 True 1055 1054 \n",
- "401 True 1055 1054 \n",
- "402 True 1055 1054 \n",
- "\n",
- " identifier \n",
- "0 26d1e9a4acad18b9cf79244334c86c93 \n",
- "1 60356fc5e8ed6c9c1be9c5ec67e77766 \n",
- "2 f8ab088e06252bf34e1b12ad2ce1a403 \n",
- "3 447fa80f9a793b7587bb85ebbda6442c \n",
- "4 3b37f5d2cd354cbc422868621ac7ebc2 \n",
- ".. ... \n",
- "398 fde88b72fb82b1fe42fbbfbfc3d6b4d3 \n",
- "399 051b96aad2b720bad4450a59ed7dfbf6 \n",
- "400 addd6885bea5ddf60ec3539dfc3e79e8 \n",
- "401 94f250d10d4a56358ceab23b384439ff \n",
- "402 4f015946bcbd856aa573cadb7ac42b9f \n",
- "\n",
- "[403 rows x 12 columns]"
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "events"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "id": "6cb04679-26e7-4ed8-bfc1-42285da96374",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "357"
- ]
- },
- "execution_count": 15,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "events['name'].nunique()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "id": "c10297e8-a8f9-45f9-8553-17e3fdb6f8c1",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Index(['id', 'serial', 'event_id', 'created_at', 'updated_at',\n",
- " 'start_date_time', 'open', 'satisfaction', 'end_date_time', 'name',\n",
- " 'is_display', 'representation_type_id', 'expected_filling',\n",
- " 'max_filling', 'extra_field', 'identifier'],\n",
- " dtype='object')\n",
- "(996, 16)\n",
- "\n",
- "RangeIndex: 996 entries, 0 to 995\n",
- "Data columns (total 16 columns):\n",
- " # Column Non-Null Count Dtype \n",
- "--- ------ -------------- ----- \n",
- " 0 id 996 non-null int64 \n",
- " 1 serial 0 non-null float64\n",
- " 2 event_id 996 non-null int64 \n",
- " 3 created_at 996 non-null object \n",
- " 4 updated_at 996 non-null object \n",
- " 5 start_date_time 996 non-null object \n",
- " 6 open 996 non-null bool \n",
- " 7 satisfaction 0 non-null float64\n",
- " 8 end_date_time 996 non-null object \n",
- " 9 name 0 non-null float64\n",
- " 10 is_display 996 non-null bool \n",
- " 11 representation_type_id 0 non-null float64\n",
- " 12 expected_filling 24 non-null float64\n",
- " 13 max_filling 24 non-null float64\n",
- " 14 extra_field 0 non-null float64\n",
- " 15 identifier 996 non-null object \n",
- "dtypes: bool(2), float64(7), int64(2), object(5)\n",
- "memory usage: 111.0+ KB\n"
- ]
- }
- ],
- "source": [
- "# Représentation des évenements = representations.csv\n",
- "FILE_PATH_S3 = 'bdc2324-data/11/11representations.csv'\n",
- "\n",
- "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
- " representations = pd.read_csv(file_in, sep=\",\")\n",
- "\n",
- "print(representations.columns)\n",
- "print(representations.shape)\n",
- "representations.info()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "id": "41ef6a1b-e99e-4c73-a2ae-ba7d438d90c2",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id | \n",
- " serial | \n",
- " event_id | \n",
- " created_at | \n",
- " updated_at | \n",
- " start_date_time | \n",
- " open | \n",
- " satisfaction | \n",
- " end_date_time | \n",
- " name | \n",
- " is_display | \n",
- " representation_type_id | \n",
- " expected_filling | \n",
- " max_filling | \n",
- " extra_field | \n",
- " identifier | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 44351 | \n",
- " NaN | \n",
- " 20371 | \n",
- " 2023-09-13 03:42:45.245879+02:00 | \n",
- " 2023-09-13 03:42:45.245879+02:00 | \n",
- " 2023-12-21 20:00:00+01:00 | \n",
- " True | \n",
- " NaN | \n",
- " 1901-01-01 00:09:21+00:09 | \n",
- " NaN | \n",
- " True | \n",
- " NaN | \n",
- " 550.0 | \n",
- " 550.0 | \n",
- " NaN | \n",
- " 33520762e8cc28982e3841cbc2be8ce2 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 45497 | \n",
- " NaN | \n",
- " 20757 | \n",
- " 2023-11-01 03:55:20.875712+01:00 | \n",
- " 2023-11-01 03:55:20.875712+01:00 | \n",
- " 2023-11-28 10:00:00+01:00 | \n",
- " True | \n",
- " NaN | \n",
- " 1901-01-01 00:09:21+00:09 | \n",
- " NaN | \n",
- " True | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 5c34b84e3d11276e0995d984c94cd28d | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 44383 | \n",
- " NaN | \n",
- " 20383 | \n",
- " 2023-09-13 10:41:08.964302+02:00 | \n",
- " 2023-09-13 10:41:08.964302+02:00 | \n",
- " 2023-06-04 17:00:00+02:00 | \n",
- " True | \n",
- " NaN | \n",
- " 1901-01-01 00:09:21+00:09 | \n",
- " NaN | \n",
- " True | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " bf3c65a1dfefbd747dcc2360e6887eac | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 44384 | \n",
- " NaN | \n",
- " 20383 | \n",
- " 2023-09-13 10:41:08.972401+02:00 | \n",
- " 2023-09-13 10:41:08.972401+02:00 | \n",
- " 2023-06-03 17:30:00+02:00 | \n",
- " True | \n",
- " NaN | \n",
- " 1901-01-01 00:09:21+00:09 | \n",
- " NaN | \n",
- " True | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " b0e69ae8b78ebab3066aac83de22d239 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 44385 | \n",
- " NaN | \n",
- " 20384 | \n",
- " 2023-09-13 10:41:08.973290+02:00 | \n",
- " 2023-09-13 10:41:08.973290+02:00 | \n",
- " 2023-06-03 16:15:00+02:00 | \n",
- " True | \n",
- " NaN | \n",
- " 1901-01-01 00:09:21+00:09 | \n",
- " NaN | \n",
- " True | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 9fb91c8b1cf9e444111c511e212ac5c1 | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 991 | \n",
- " 33894 | \n",
- " NaN | \n",
- " 15647 | \n",
- " 2023-09-12 17:42:25.564297+02:00 | \n",
- " 2023-09-12 17:42:25.564297+02:00 | \n",
- " 2022-11-08 20:00:00+01:00 | \n",
- " True | \n",
- " NaN | \n",
- " 1901-01-01 00:09:21+00:09 | \n",
- " NaN | \n",
- " True | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 44bbcecfd007ceaad05805391beccabb | \n",
- "
\n",
- " \n",
- " 992 | \n",
- " 33873 | \n",
- " NaN | \n",
- " 15640 | \n",
- " 2023-09-12 17:42:25.554863+02:00 | \n",
- " 2023-09-12 17:42:25.554863+02:00 | \n",
- " 2022-11-14 20:00:00+01:00 | \n",
- " True | \n",
- " NaN | \n",
- " 1901-01-01 00:09:21+00:09 | \n",
- " NaN | \n",
- " True | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 151edbec8e0a3cd80071038e857f3493 | \n",
- "
\n",
- " \n",
- " 993 | \n",
- " 33610 | \n",
- " NaN | \n",
- " 15520 | \n",
- " 2023-09-12 17:42:25.442979+02:00 | \n",
- " 2023-09-12 17:42:25.442979+02:00 | \n",
- " 2023-06-19 18:00:00+02:00 | \n",
- " True | \n",
- " NaN | \n",
- " 1901-01-01 00:09:21+00:09 | \n",
- " NaN | \n",
- " True | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 9e9e38d527427e1b6f67e0c3f12b82fc | \n",
- "
\n",
- " \n",
- " 994 | \n",
- " 33953 | \n",
- " NaN | \n",
- " 15520 | \n",
- " 2023-09-12 17:42:25.590746+02:00 | \n",
- " 2023-09-12 17:42:25.590746+02:00 | \n",
- " 2023-06-19 20:00:00+02:00 | \n",
- " True | \n",
- " NaN | \n",
- " 1901-01-01 00:09:21+00:09 | \n",
- " NaN | \n",
- " True | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 7bf0978aabb6cac1bb4cd2784afb2b6b | \n",
- "
\n",
- " \n",
- " 995 | \n",
- " 33639 | \n",
- " NaN | \n",
- " 15533 | \n",
- " 2023-09-12 17:42:25.455708+02:00 | \n",
- " 2023-09-12 17:42:25.455708+02:00 | \n",
- " 2023-04-15 17:30:00+02:00 | \n",
- " True | \n",
- " NaN | \n",
- " 1901-01-01 00:09:21+00:09 | \n",
- " NaN | \n",
- " True | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " fae68f1e09710ec8747957af6e22f61d | \n",
- "
\n",
- " \n",
- "
\n",
- "
996 rows × 16 columns
\n",
- "
"
- ],
- "text/plain": [
- " id serial event_id created_at \\\n",
- "0 44351 NaN 20371 2023-09-13 03:42:45.245879+02:00 \n",
- "1 45497 NaN 20757 2023-11-01 03:55:20.875712+01:00 \n",
- "2 44383 NaN 20383 2023-09-13 10:41:08.964302+02:00 \n",
- "3 44384 NaN 20383 2023-09-13 10:41:08.972401+02:00 \n",
- "4 44385 NaN 20384 2023-09-13 10:41:08.973290+02:00 \n",
- ".. ... ... ... ... \n",
- "991 33894 NaN 15647 2023-09-12 17:42:25.564297+02:00 \n",
- "992 33873 NaN 15640 2023-09-12 17:42:25.554863+02:00 \n",
- "993 33610 NaN 15520 2023-09-12 17:42:25.442979+02:00 \n",
- "994 33953 NaN 15520 2023-09-12 17:42:25.590746+02:00 \n",
- "995 33639 NaN 15533 2023-09-12 17:42:25.455708+02:00 \n",
- "\n",
- " updated_at start_date_time open \\\n",
- "0 2023-09-13 03:42:45.245879+02:00 2023-12-21 20:00:00+01:00 True \n",
- "1 2023-11-01 03:55:20.875712+01:00 2023-11-28 10:00:00+01:00 True \n",
- "2 2023-09-13 10:41:08.964302+02:00 2023-06-04 17:00:00+02:00 True \n",
- "3 2023-09-13 10:41:08.972401+02:00 2023-06-03 17:30:00+02:00 True \n",
- "4 2023-09-13 10:41:08.973290+02:00 2023-06-03 16:15:00+02:00 True \n",
- ".. ... ... ... \n",
- "991 2023-09-12 17:42:25.564297+02:00 2022-11-08 20:00:00+01:00 True \n",
- "992 2023-09-12 17:42:25.554863+02:00 2022-11-14 20:00:00+01:00 True \n",
- "993 2023-09-12 17:42:25.442979+02:00 2023-06-19 18:00:00+02:00 True \n",
- "994 2023-09-12 17:42:25.590746+02:00 2023-06-19 20:00:00+02:00 True \n",
- "995 2023-09-12 17:42:25.455708+02:00 2023-04-15 17:30:00+02:00 True \n",
- "\n",
- " satisfaction end_date_time name is_display \\\n",
- "0 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
- "1 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
- "2 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
- "3 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
- "4 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
- ".. ... ... ... ... \n",
- "991 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
- "992 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
- "993 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
- "994 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
- "995 NaN 1901-01-01 00:09:21+00:09 NaN True \n",
- "\n",
- " representation_type_id expected_filling max_filling extra_field \\\n",
- "0 NaN 550.0 550.0 NaN \n",
- "1 NaN NaN NaN NaN \n",
- "2 NaN NaN NaN NaN \n",
- "3 NaN NaN NaN NaN \n",
- "4 NaN NaN NaN NaN \n",
- ".. ... ... ... ... \n",
- "991 NaN NaN NaN NaN \n",
- "992 NaN NaN NaN NaN \n",
- "993 NaN NaN NaN NaN \n",
- "994 NaN NaN NaN NaN \n",
- "995 NaN NaN NaN NaN \n",
- "\n",
- " identifier \n",
- "0 33520762e8cc28982e3841cbc2be8ce2 \n",
- "1 5c34b84e3d11276e0995d984c94cd28d \n",
- "2 bf3c65a1dfefbd747dcc2360e6887eac \n",
- "3 b0e69ae8b78ebab3066aac83de22d239 \n",
- "4 9fb91c8b1cf9e444111c511e212ac5c1 \n",
- ".. ... \n",
- "991 44bbcecfd007ceaad05805391beccabb \n",
- "992 151edbec8e0a3cd80071038e857f3493 \n",
- "993 9e9e38d527427e1b6f67e0c3f12b82fc \n",
- "994 7bf0978aabb6cac1bb4cd2784afb2b6b \n",
- "995 fae68f1e09710ec8747957af6e22f61d \n",
- "\n",
- "[996 rows x 16 columns]"
- ]
- },
- "execution_count": 17,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "representations"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "id": "ae6cdad3-2184-4ae7-928c-2f8bd7769a5b",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Index(['id', 'amount', 'is_full_price', 'representation_id',\n",
- " 'pricing_formula_id', 'created_at', 'updated_at', 'category_id',\n",
- " 'apply_price', 'products_group_id', 'product_pack_id', 'extra_field',\n",
- " 'amount_consumption', 'identifier'],\n",
- " dtype='object')\n",
- "(14648, 14)\n",
- "\n",
- "RangeIndex: 14648 entries, 0 to 14647\n",
- "Data columns (total 14 columns):\n",
- " # Column Non-Null Count Dtype \n",
- "--- ------ -------------- ----- \n",
- " 0 id 14648 non-null int64 \n",
- " 1 amount 14648 non-null float64\n",
- " 2 is_full_price 14648 non-null bool \n",
- " 3 representation_id 14648 non-null int64 \n",
- " 4 pricing_formula_id 14648 non-null int64 \n",
- " 5 created_at 14648 non-null object \n",
- " 6 updated_at 14648 non-null object \n",
- " 7 category_id 14648 non-null int64 \n",
- " 8 apply_price 14648 non-null float64\n",
- " 9 products_group_id 14648 non-null int64 \n",
- " 10 product_pack_id 14648 non-null int64 \n",
- " 11 extra_field 0 non-null float64\n",
- " 12 amount_consumption 0 non-null float64\n",
- " 13 identifier 14648 non-null object \n",
- "dtypes: bool(1), float64(4), int64(6), object(3)\n",
- "memory usage: 1.5+ MB\n"
- ]
- }
- ],
- "source": [
- "# Produits vendues = products.csv\n",
- "FILE_PATH_S3 = 'bdc2324-data/11/11products.csv'\n",
- "\n",
- "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
- " products = pd.read_csv(file_in, sep=\",\")\n",
- "\n",
- "print(products.columns)\n",
- "print(products.shape)\n",
- "products.info()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "id": "34f1825d-148a-4a6e-88d6-61449fee3ee4",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id | \n",
- " amount | \n",
- " is_full_price | \n",
- " representation_id | \n",
- " pricing_formula_id | \n",
- " created_at | \n",
- " updated_at | \n",
- " category_id | \n",
- " apply_price | \n",
- " products_group_id | \n",
- " product_pack_id | \n",
- " extra_field | \n",
- " amount_consumption | \n",
- " identifier | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 268325 | \n",
- " 18.0 | \n",
- " False | \n",
- " 44332 | \n",
- " 20477 | \n",
- " 2023-09-13 03:42:45.415594+02:00 | \n",
- " 2023-09-13 03:42:45.415594+02:00 | \n",
- " 4972 | \n",
- " 0.0 | \n",
- " 268108 | \n",
- " 1 | \n",
- " NaN | \n",
- " NaN | \n",
- " b823bbea3ba837da2ef8efaf1287272d | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 274118 | \n",
- " 36.8 | \n",
- " False | \n",
- " 44340 | \n",
- " 20502 | \n",
- " 2023-10-25 03:26:57.430694+02:00 | \n",
- " 2023-10-25 03:26:57.430694+02:00 | \n",
- " 4969 | \n",
- " 0.0 | \n",
- " 273901 | \n",
- " 1 | \n",
- " NaN | \n",
- " NaN | \n",
- " 81e8b7991f6948e3ef7cfe5011d13532 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 268338 | \n",
- " 39.1 | \n",
- " False | \n",
- " 44340 | \n",
- " 20497 | \n",
- " 2023-09-13 03:42:45.430942+02:00 | \n",
- " 2023-09-13 03:42:45.430942+02:00 | \n",
- " 4969 | \n",
- " 0.0 | \n",
- " 268121 | \n",
- " 1 | \n",
- " NaN | \n",
- " NaN | \n",
- " be8bc0399db4d04aefa9f44afd4d5efa | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 209883 | \n",
- " 0.0 | \n",
- " False | \n",
- " 33443 | \n",
- " 20475 | \n",
- " 2023-09-12 17:42:27.595998+02:00 | \n",
- " 2023-09-12 17:42:27.595998+02:00 | \n",
- " 4970 | \n",
- " 0.0 | \n",
- " 209706 | \n",
- " 1 | \n",
- " NaN | \n",
- " NaN | \n",
- " 01a9eea5f8ad53491faa864bfac44183 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 268326 | \n",
- " 63.0 | \n",
- " False | \n",
- " 44333 | \n",
- " 20477 | \n",
- " 2023-09-13 03:42:45.417283+02:00 | \n",
- " 2023-09-13 03:42:45.417283+02:00 | \n",
- " 4969 | \n",
- " 0.0 | \n",
- " 268109 | \n",
- " 1 | \n",
- " NaN | \n",
- " NaN | \n",
- " 781a917ecfdabb14169701d7b143bbe4 | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 14643 | \n",
- " 217878 | \n",
- " 33.6 | \n",
- " False | \n",
- " 33919 | \n",
- " 20489 | \n",
- " 2023-09-12 17:51:11.572882+02:00 | \n",
- " 2023-09-12 17:51:11.572882+02:00 | \n",
- " 4971 | \n",
- " 0.0 | \n",
- " 217695 | \n",
- " 1 | \n",
- " NaN | \n",
- " NaN | \n",
- " 82bba69321466069411b3023343b44a4 | \n",
- "
\n",
- " \n",
- " 14644 | \n",
- " 268315 | \n",
- " 10.0 | \n",
- " False | \n",
- " 33919 | \n",
- " 20504 | \n",
- " 2023-09-12 18:59:29.995176+02:00 | \n",
- " 2023-09-12 18:59:29.995176+02:00 | \n",
- " 4969 | \n",
- " 0.0 | \n",
- " 268098 | \n",
- " 1 | \n",
- " NaN | \n",
- " NaN | \n",
- " eae56a8eb0a4315c5713b2053103d595 | \n",
- "
\n",
- " \n",
- " 14645 | \n",
- " 210148 | \n",
- " 5.0 | \n",
- " False | \n",
- " 33531 | \n",
- " 20473 | \n",
- " 2023-09-12 17:42:27.733260+02:00 | \n",
- " 2023-09-12 17:42:27.733260+02:00 | \n",
- " 4975 | \n",
- " 0.0 | \n",
- " 209971 | \n",
- " 1 | \n",
- " NaN | \n",
- " NaN | \n",
- " 449f86c1ef2b478d3389f7d0e27d0e6b | \n",
- "
\n",
- " \n",
- " 14646 | \n",
- " 212054 | \n",
- " 30.0 | \n",
- " False | \n",
- " 33810 | \n",
- " 20473 | \n",
- " 2023-09-12 17:42:28.724681+02:00 | \n",
- " 2023-09-12 17:42:28.724681+02:00 | \n",
- " 4972 | \n",
- " 0.0 | \n",
- " 211876 | \n",
- " 1 | \n",
- " NaN | \n",
- " NaN | \n",
- " 2090203e2c0b58ea8f505089faee6d62 | \n",
- "
\n",
- " \n",
- " 14647 | \n",
- " 261922 | \n",
- " 21.0 | \n",
- " False | \n",
- " 33766 | \n",
- " 20488 | \n",
- " 2023-09-12 18:52:00.519838+02:00 | \n",
- " 2023-09-12 18:52:00.519838+02:00 | \n",
- " 4972 | \n",
- " 0.0 | \n",
- " 261709 | \n",
- " 1 | \n",
- " NaN | \n",
- " NaN | \n",
- " 9139ee36a92bed766ae95372cca77336 | \n",
- "
\n",
- " \n",
- "
\n",
- "
14648 rows × 14 columns
\n",
- "
"
- ],
- "text/plain": [
- " id amount is_full_price representation_id pricing_formula_id \\\n",
- "0 268325 18.0 False 44332 20477 \n",
- "1 274118 36.8 False 44340 20502 \n",
- "2 268338 39.1 False 44340 20497 \n",
- "3 209883 0.0 False 33443 20475 \n",
- "4 268326 63.0 False 44333 20477 \n",
- "... ... ... ... ... ... \n",
- "14643 217878 33.6 False 33919 20489 \n",
- "14644 268315 10.0 False 33919 20504 \n",
- "14645 210148 5.0 False 33531 20473 \n",
- "14646 212054 30.0 False 33810 20473 \n",
- "14647 261922 21.0 False 33766 20488 \n",
- "\n",
- " created_at updated_at \\\n",
- "0 2023-09-13 03:42:45.415594+02:00 2023-09-13 03:42:45.415594+02:00 \n",
- "1 2023-10-25 03:26:57.430694+02:00 2023-10-25 03:26:57.430694+02:00 \n",
- "2 2023-09-13 03:42:45.430942+02:00 2023-09-13 03:42:45.430942+02:00 \n",
- "3 2023-09-12 17:42:27.595998+02:00 2023-09-12 17:42:27.595998+02:00 \n",
- "4 2023-09-13 03:42:45.417283+02:00 2023-09-13 03:42:45.417283+02:00 \n",
- "... ... ... \n",
- "14643 2023-09-12 17:51:11.572882+02:00 2023-09-12 17:51:11.572882+02:00 \n",
- "14644 2023-09-12 18:59:29.995176+02:00 2023-09-12 18:59:29.995176+02:00 \n",
- "14645 2023-09-12 17:42:27.733260+02:00 2023-09-12 17:42:27.733260+02:00 \n",
- "14646 2023-09-12 17:42:28.724681+02:00 2023-09-12 17:42:28.724681+02:00 \n",
- "14647 2023-09-12 18:52:00.519838+02:00 2023-09-12 18:52:00.519838+02:00 \n",
- "\n",
- " category_id apply_price products_group_id product_pack_id \\\n",
- "0 4972 0.0 268108 1 \n",
- "1 4969 0.0 273901 1 \n",
- "2 4969 0.0 268121 1 \n",
- "3 4970 0.0 209706 1 \n",
- "4 4969 0.0 268109 1 \n",
- "... ... ... ... ... \n",
- "14643 4971 0.0 217695 1 \n",
- "14644 4969 0.0 268098 1 \n",
- "14645 4975 0.0 209971 1 \n",
- "14646 4972 0.0 211876 1 \n",
- "14647 4972 0.0 261709 1 \n",
- "\n",
- " extra_field amount_consumption identifier \n",
- "0 NaN NaN b823bbea3ba837da2ef8efaf1287272d \n",
- "1 NaN NaN 81e8b7991f6948e3ef7cfe5011d13532 \n",
- "2 NaN NaN be8bc0399db4d04aefa9f44afd4d5efa \n",
- "3 NaN NaN 01a9eea5f8ad53491faa864bfac44183 \n",
- "4 NaN NaN 781a917ecfdabb14169701d7b143bbe4 \n",
- "... ... ... ... \n",
- "14643 NaN NaN 82bba69321466069411b3023343b44a4 \n",
- "14644 NaN NaN eae56a8eb0a4315c5713b2053103d595 \n",
- "14645 NaN NaN 449f86c1ef2b478d3389f7d0e27d0e6b \n",
- "14646 NaN NaN 2090203e2c0b58ea8f505089faee6d62 \n",
- "14647 NaN NaN 9139ee36a92bed766ae95372cca77336 \n",
- "\n",
- "[14648 rows x 14 columns]"
- ]
- },
- "execution_count": 19,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "products"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "id": "6735b338-26b5-479d-825d-677ea533dad5",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Index(['id', 'name', 'created_at', 'updated_at', 'street_id', 'fixed_capacity',\n",
- " 'identifier'],\n",
- " dtype='object')\n",
- "(1, 7)\n",
- "\n",
- "RangeIndex: 1 entries, 0 to 0\n",
- "Data columns (total 7 columns):\n",
- " # Column Non-Null Count Dtype \n",
- "--- ------ -------------- ----- \n",
- " 0 id 1 non-null int64 \n",
- " 1 name 0 non-null float64\n",
- " 2 created_at 1 non-null object \n",
- " 3 updated_at 1 non-null object \n",
- " 4 street_id 1 non-null int64 \n",
- " 5 fixed_capacity 0 non-null float64\n",
- " 6 identifier 1 non-null object \n",
- "dtypes: float64(2), int64(2), object(3)\n",
- "memory usage: 184.0+ bytes\n"
- ]
- }
- ],
- "source": [
- "# Lieu = facilities.csv\n",
- "FILE_PATH_S3 = 'bdc2324-data/11/11facilities.csv'\n",
- "\n",
- "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
- " facilities = pd.read_csv(file_in, sep=\",\")\n",
- "\n",
- "print(facilities.columns)\n",
- "print(facilities.shape)\n",
- "facilities.info()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "id": "428b86c2-50f4-42a5-9bbb-a17ffe820bf9",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id | \n",
- " name | \n",
- " created_at | \n",
- " updated_at | \n",
- " street_id | \n",
- " fixed_capacity | \n",
- " identifier | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 1054 | \n",
- " NaN | \n",
- " 2023-09-12 17:42:25.223064+02:00 | \n",
- " 2023-09-12 17:42:25.223064+02:00 | \n",
- " 1 | \n",
- " NaN | \n",
- " d41d8cd98f00b204e9800998ecf8427e | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " id name created_at \\\n",
- "0 1054 NaN 2023-09-12 17:42:25.223064+02:00 \n",
- "\n",
- " updated_at street_id fixed_capacity \\\n",
- "0 2023-09-12 17:42:25.223064+02:00 1 NaN \n",
- "\n",
- " identifier \n",
- "0 d41d8cd98f00b204e9800998ecf8427e "
- ]
- },
- "execution_count": 21,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "facilities"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 22,
- "id": "f6b26ad5-a4cc-4219-a0b0-406d9b025458",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Index(['id', 'name', 'created_at', 'updated_at', 'start_date_time',\n",
- " 'identifier'],\n",
- " dtype='object')\n",
- "(9, 6)\n",
- "\n",
- "RangeIndex: 9 entries, 0 to 8\n",
- "Data columns (total 6 columns):\n",
- " # Column Non-Null Count Dtype \n",
- "--- ------ -------------- ----- \n",
- " 0 id 9 non-null int64 \n",
- " 1 name 9 non-null object \n",
- " 2 created_at 9 non-null object \n",
- " 3 updated_at 9 non-null object \n",
- " 4 start_date_time 0 non-null float64\n",
- " 5 identifier 9 non-null object \n",
- "dtypes: float64(1), int64(1), object(4)\n",
- "memory usage: 560.0+ bytes\n"
- ]
- }
- ],
- "source": [
- "# Saisons = seasons.csv période sur deux années consécutives\n",
- "FILE_PATH_S3 = 'bdc2324-data/11/11seasons.csv'\n",
- "\n",
- "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
- " seasons = pd.read_csv(file_in, sep=\",\")\n",
- "\n",
- "print(seasons.columns)\n",
- "print(seasons.shape)\n",
- "seasons.info()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "id": "75c8c0ef-4ff5-45b1-a791-8ba2e9a4437e",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "array(['saison 2023-2024', 'saison 2021-2022', 'saison 2015-2016',\n",
- " 'saison 2016-2017', 'saison 2017-2018', 'saison 2018-2019',\n",
- " 'saison 2020-2021', 'saison 2019-2020', 'saison 2022-2023'],\n",
- " dtype=object)"
- ]
- },
- "execution_count": 24,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "seasons['name'].unique()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "id": "cd0d10df-10cc-4f75-8b88-35f676c91f5b",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Index(['id', 'purchase_date', 'customer_id', 'created_at', 'updated_at',\n",
- " 'number', 'identifier'],\n",
- " dtype='object')\n",
- "(410695, 7)\n",
- "\n",
- "RangeIndex: 410695 entries, 0 to 410694\n",
- "Data columns (total 7 columns):\n",
- " # Column Non-Null Count Dtype \n",
- "--- ------ -------------- ----- \n",
- " 0 id 410695 non-null int64 \n",
- " 1 purchase_date 410695 non-null object \n",
- " 2 customer_id 410695 non-null int64 \n",
- " 3 created_at 410695 non-null object \n",
- " 4 updated_at 410695 non-null object \n",
- " 5 number 0 non-null float64\n",
- " 6 identifier 410695 non-null object \n",
- "dtypes: float64(1), int64(2), object(4)\n",
- "memory usage: 21.9+ MB\n"
- ]
- }
- ],
- "source": [
- "# Achats = purchases.csv \n",
- "FILE_PATH_S3 = 'bdc2324-data/11/11purchases.csv'\n",
- "\n",
- "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
- " purchases = pd.read_csv(file_in, sep=\",\")\n",
- "\n",
- "print(purchases.columns)\n",
- "print(purchases.shape)\n",
- "purchases.info()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "id": "8f986fdb-ca37-4cbb-b526-2a6d0ce7ca2c",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id | \n",
- " purchase_date | \n",
- " customer_id | \n",
- " created_at | \n",
- " updated_at | \n",
- " number | \n",
- " identifier | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 861761 | \n",
- " 2019-03-01 16:28:49+01:00 | \n",
- " 4966 | \n",
- " 2023-09-12 17:42:37.564150+02:00 | \n",
- " 2023-09-12 17:42:37.564150+02:00 | \n",
- " NaN | \n",
- " d20eb0c3a7efec0bbe338dee40dc3378 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 861762 | \n",
- " 2019-03-01 16:29:11+01:00 | \n",
- " 4966 | \n",
- " 2023-09-12 17:42:37.571159+02:00 | \n",
- " 2023-09-12 17:42:37.571159+02:00 | \n",
- " NaN | \n",
- " cff3abfc018517bce5ccfc58f5cacf40 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 861763 | \n",
- " 2019-03-01 16:29:17+01:00 | \n",
- " 4966 | \n",
- " 2023-09-12 17:42:37.571646+02:00 | \n",
- " 2023-09-12 17:42:37.571646+02:00 | \n",
- " NaN | \n",
- " e1155cf26b34f792bdb23e49244d7264 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 861764 | \n",
- " 2019-03-01 16:29:19+01:00 | \n",
- " 4966 | \n",
- " 2023-09-12 17:42:37.572063+02:00 | \n",
- " 2023-09-12 17:42:37.572063+02:00 | \n",
- " NaN | \n",
- " e8b95cc6a1a8b103ffa39755ce3bfc4d | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 861765 | \n",
- " 2019-03-01 16:32:08+01:00 | \n",
- " 405994 | \n",
- " 2023-09-12 17:42:37.572470+02:00 | \n",
- " 2023-09-12 17:42:37.572470+02:00 | \n",
- " NaN | \n",
- " 1b763278914f1309e357abe5033a3f0f | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 410690 | \n",
- " 1285964 | \n",
- " 2023-10-21 21:46:41+02:00 | \n",
- " 517309 | \n",
- " 2023-10-23 03:43:16.457501+02:00 | \n",
- " 2023-10-23 03:43:16.457501+02:00 | \n",
- " NaN | \n",
- " 72c4e90c2b151dcffc87b19ea8a0c4f1 | \n",
- "
\n",
- " \n",
- " 410691 | \n",
- " 1285965 | \n",
- " 2023-10-21 21:47:07+02:00 | \n",
- " 517309 | \n",
- " 2023-10-23 03:43:16.458458+02:00 | \n",
- " 2023-10-23 03:43:16.458458+02:00 | \n",
- " NaN | \n",
- " ee65532087132145daa6154fbae050ea | \n",
- "
\n",
- " \n",
- " 410692 | \n",
- " 1285966 | \n",
- " 2023-10-21 21:47:20+02:00 | \n",
- " 517309 | \n",
- " 2023-10-23 03:43:16.458811+02:00 | \n",
- " 2023-10-23 03:43:16.458811+02:00 | \n",
- " NaN | \n",
- " 7e825dd352bc6a11ab81cb8068e325e6 | \n",
- "
\n",
- " \n",
- " 410693 | \n",
- " 1285967 | \n",
- " 2023-10-21 23:07:06+02:00 | \n",
- " 399969 | \n",
- " 2023-10-23 03:43:16.459738+02:00 | \n",
- " 2023-10-23 03:43:16.459738+02:00 | \n",
- " NaN | \n",
- " fdb92627a48d6ba8fa817d60a83dbea8 | \n",
- "
\n",
- " \n",
- " 410694 | \n",
- " 1285968 | \n",
- " 2023-10-21 23:07:39+02:00 | \n",
- " 399969 | \n",
- " 2023-10-23 03:43:16.462409+02:00 | \n",
- " 2023-10-23 03:43:16.462409+02:00 | \n",
- " NaN | \n",
- " e9dbaff4f7037a5b0efa11263584dfad | \n",
- "
\n",
- " \n",
- "
\n",
- "
410695 rows × 7 columns
\n",
- "
"
- ],
- "text/plain": [
- " id purchase_date customer_id \\\n",
- "0 861761 2019-03-01 16:28:49+01:00 4966 \n",
- "1 861762 2019-03-01 16:29:11+01:00 4966 \n",
- "2 861763 2019-03-01 16:29:17+01:00 4966 \n",
- "3 861764 2019-03-01 16:29:19+01:00 4966 \n",
- "4 861765 2019-03-01 16:32:08+01:00 405994 \n",
- "... ... ... ... \n",
- "410690 1285964 2023-10-21 21:46:41+02:00 517309 \n",
- "410691 1285965 2023-10-21 21:47:07+02:00 517309 \n",
- "410692 1285966 2023-10-21 21:47:20+02:00 517309 \n",
- "410693 1285967 2023-10-21 23:07:06+02:00 399969 \n",
- "410694 1285968 2023-10-21 23:07:39+02:00 399969 \n",
- "\n",
- " created_at updated_at \\\n",
- "0 2023-09-12 17:42:37.564150+02:00 2023-09-12 17:42:37.564150+02:00 \n",
- "1 2023-09-12 17:42:37.571159+02:00 2023-09-12 17:42:37.571159+02:00 \n",
- "2 2023-09-12 17:42:37.571646+02:00 2023-09-12 17:42:37.571646+02:00 \n",
- "3 2023-09-12 17:42:37.572063+02:00 2023-09-12 17:42:37.572063+02:00 \n",
- "4 2023-09-12 17:42:37.572470+02:00 2023-09-12 17:42:37.572470+02:00 \n",
- "... ... ... \n",
- "410690 2023-10-23 03:43:16.457501+02:00 2023-10-23 03:43:16.457501+02:00 \n",
- "410691 2023-10-23 03:43:16.458458+02:00 2023-10-23 03:43:16.458458+02:00 \n",
- "410692 2023-10-23 03:43:16.458811+02:00 2023-10-23 03:43:16.458811+02:00 \n",
- "410693 2023-10-23 03:43:16.459738+02:00 2023-10-23 03:43:16.459738+02:00 \n",
- "410694 2023-10-23 03:43:16.462409+02:00 2023-10-23 03:43:16.462409+02:00 \n",
- "\n",
- " number identifier \n",
- "0 NaN d20eb0c3a7efec0bbe338dee40dc3378 \n",
- "1 NaN cff3abfc018517bce5ccfc58f5cacf40 \n",
- "2 NaN e1155cf26b34f792bdb23e49244d7264 \n",
- "3 NaN e8b95cc6a1a8b103ffa39755ce3bfc4d \n",
- "4 NaN 1b763278914f1309e357abe5033a3f0f \n",
- "... ... ... \n",
- "410690 NaN 72c4e90c2b151dcffc87b19ea8a0c4f1 \n",
- "410691 NaN ee65532087132145daa6154fbae050ea \n",
- "410692 NaN 7e825dd352bc6a11ab81cb8068e325e6 \n",
- "410693 NaN fdb92627a48d6ba8fa817d60a83dbea8 \n",
- "410694 NaN e9dbaff4f7037a5b0efa11263584dfad \n",
- "\n",
- "[410695 rows x 7 columns]"
- ]
- },
- "execution_count": 28,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "purchases"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.13"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/Exploration_billet_AJ.ipynb b/Exploration_billet_AJ.ipynb
new file mode 100644
index 0000000..e1802cd
--- /dev/null
+++ b/Exploration_billet_AJ.ipynb
@@ -0,0 +1,3406 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "56b3d44e-1e3f-4726-9916-0f9af107860e",
+ "metadata": {},
+ "source": [
+ "# Business Data Challenge - Team 1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "15103481-8d74-404c-aa09-7601fe7730da",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import os\n",
+ "import s3fs\n",
+ "import re"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c3bb0d13-34b2-4e1c-9985-468cd87c5a0e",
+ "metadata": {},
+ "source": [
+ "Configuration de l'accès aux données"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "5d83bb1a-d341-446e-91f6-1c428607f6d4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create filesystem object\n",
+ "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
+ "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f99da24f-0d93-4618-92bc-3ba81dc0445c",
+ "metadata": {},
+ "source": [
+ "# Exemple sur Company 1"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9d74b68f-ba07-4a15-9a27-dae931762d70",
+ "metadata": {},
+ "source": [
+ "## Chargement des données"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "699664b9-eee4-4f8d-a207-e524526560c5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "BUCKET = \"bdc2324-data/1\"\n",
+ "liste_database = fs.ls(BUCKET)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "id": "aaf64d60-bf92-470c-8210-d09abd6a653e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['bdc2324-data/1/1campaign_stats.csv',\n",
+ " 'bdc2324-data/1/1campaigns.csv',\n",
+ " 'bdc2324-data/1/1categories.csv',\n",
+ " 'bdc2324-data/1/1countries.csv',\n",
+ " 'bdc2324-data/1/1currencies.csv',\n",
+ " 'bdc2324-data/1/1customer_target_mappings.csv',\n",
+ " 'bdc2324-data/1/1customersplus.csv',\n",
+ " 'bdc2324-data/1/1event_types.csv',\n",
+ " 'bdc2324-data/1/1events.csv',\n",
+ " 'bdc2324-data/1/1facilities.csv',\n",
+ " 'bdc2324-data/1/1link_stats.csv',\n",
+ " 'bdc2324-data/1/1pricing_formulas.csv',\n",
+ " 'bdc2324-data/1/1product_packs.csv',\n",
+ " 'bdc2324-data/1/1products.csv',\n",
+ " 'bdc2324-data/1/1products_groups.csv',\n",
+ " 'bdc2324-data/1/1purchases.csv',\n",
+ " 'bdc2324-data/1/1representation_category_capacities.csv',\n",
+ " 'bdc2324-data/1/1representations.csv',\n",
+ " 'bdc2324-data/1/1seasons.csv',\n",
+ " 'bdc2324-data/1/1structure_tag_mappings.csv',\n",
+ " 'bdc2324-data/1/1suppliers.csv',\n",
+ " 'bdc2324-data/1/1tags.csv',\n",
+ " 'bdc2324-data/1/1target_types.csv',\n",
+ " 'bdc2324-data/1/1targets.csv',\n",
+ " 'bdc2324-data/1/1tickets.csv',\n",
+ " 'bdc2324-data/1/1type_of_categories.csv',\n",
+ " 'bdc2324-data/1/1type_of_pricing_formulas.csv',\n",
+ " 'bdc2324-data/1/1type_ofs.csv']"
+ ]
+ },
+ "execution_count": 39,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "liste_database"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "0cb92854-903b-4efd-ac1b-197e29f044b4",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['bdc2324-data/1/1purchases.csv', 'bdc2324-data/1/1suppliers.csv', 'bdc2324-data/1/1tickets.csv', 'bdc2324-data/1/1type_ofs.csv']\n"
+ ]
+ }
+ ],
+ "source": [
+ "liste_database_select = ['suppliers', 'ticket', 'purchase', 'consumption', 'type_ofs']\n",
+ "\n",
+ "# Filtrer la liste pour les éléments contenant au moins un élément de la liste à tester\n",
+ "liste_database_filtered = [element for element in liste_database if any(element_part in element for element_part in liste_database_select)]\n",
+ "\n",
+ "# Afficher le résultat\n",
+ "print(liste_database_filtered)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "dd6a3518-b752-4a1e-b77b-9e03e853c3ed",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/tmp/ipykernel_9792/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+ " df = pd.read_csv(file_in)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Load each CSV listed in files_path into its own DataFrame\n",
+ "files_path = liste_database\n",
+ "\n",
+ "client_number = files_path[0].split(\"/\")[1]\n",
+ "df_prefix = \"df\" + str(client_number) + \"_\"\n",
+ "\n",
+ "for i in range(len(files_path)) :\n",
+ " current_path = files_path[i]\n",
+ " with fs.open(current_path, mode=\"rb\") as file_in:\n",
+ " df = pd.read_csv(file_in)\n",
+ " # the DataFrame is named df<client_number>_<table>, e.g. df1_tickets\n",
+ " nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n",
+ " globals()[nom_dataframe] = df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e908f516-2a74-45d6-8492-7dcdc3afbe1f",
+ "metadata": {
+ "jp-MarkdownHeadingCollapsed": true
+ },
+ "source": [
+ "## tickets.csv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "14f4158e-c9c0-4beb-826a-5e0f949434a4",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " number | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " purchase_id | \n",
+ " product_id | \n",
+ " is_from_subscription | \n",
+ " type_of | \n",
+ " supplier_id | \n",
+ " barcode | \n",
+ " identifier | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 13070859 | \n",
+ " 13593002661288 | \n",
+ " 2021-12-28 20:47:10.320641+01:00 | \n",
+ " 2022-02-14 18:46:53.614229+01:00 | \n",
+ " 5107462 | \n",
+ " 225251 | \n",
+ " False | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " NaN | \n",
+ " b6ad7fc36f33b5e05f58c7fca06688a6 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 13070860 | \n",
+ " 13593002661399 | \n",
+ " 2021-12-28 20:47:10.321037+01:00 | \n",
+ " 2022-02-14 18:46:53.614761+01:00 | \n",
+ " 5107462 | \n",
+ " 224914 | \n",
+ " False | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " NaN | \n",
+ " b0903af480266f27802fe5c38c277c9e | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 13070861 | \n",
+ " 13593002661419 | \n",
+ " 2021-12-28 20:47:10.321629+01:00 | \n",
+ " 2022-02-14 18:46:53.615521+01:00 | \n",
+ " 5107462 | \n",
+ " 224914 | \n",
+ " False | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " NaN | \n",
+ " 64ca12b7e26a65b90335c0702ea0faba | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 13070862 | \n",
+ " 13593002661508 | \n",
+ " 2021-12-28 20:47:10.322029+01:00 | \n",
+ " 2022-02-14 18:46:53.616000+01:00 | \n",
+ " 5107462 | \n",
+ " 224914 | \n",
+ " False | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " NaN | \n",
+ " 5ac2f8150aa9f3a6b1599df08cc2f0c7 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 13070863 | \n",
+ " 13593002661689 | \n",
+ " 2021-12-28 20:47:10.322449+01:00 | \n",
+ " 2022-02-14 18:46:53.616447+01:00 | \n",
+ " 5107462 | \n",
+ " 224914 | \n",
+ " False | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " NaN | \n",
+ " dfe30081bae020d12094279926136b9c | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 1826667 | \n",
+ " 20662815 | \n",
+ " 13593016154390 | \n",
+ " 2023-11-09 07:51:34.935983+01:00 | \n",
+ " 2023-11-09 07:51:34.935983+01:00 | \n",
+ " 8007697 | \n",
+ " 405689 | \n",
+ " False | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " NaN | \n",
+ " dba9aa428f843b79ae69dfacfe8fc579 | \n",
+ "
\n",
+ " \n",
+ " 1826668 | \n",
+ " 20662816 | \n",
+ " 13593016154501 | \n",
+ " 2023-11-09 07:51:34.937038+01:00 | \n",
+ " 2023-11-09 07:51:34.937038+01:00 | \n",
+ " 8007698 | \n",
+ " 403658 | \n",
+ " False | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " NaN | \n",
+ " 93f1fcfc6ba4fa68f92eb4b4a619fcf0 | \n",
+ "
\n",
+ " \n",
+ " 1826669 | \n",
+ " 20662817 | \n",
+ " 13593016154680 | \n",
+ " 2023-11-09 07:51:34.938224+01:00 | \n",
+ " 2023-11-09 07:51:34.938224+01:00 | \n",
+ " 8007698 | \n",
+ " 403658 | \n",
+ " False | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " NaN | \n",
+ " c8bbbd25df2c158767ceef42c3237f23 | \n",
+ "
\n",
+ " \n",
+ " 1826670 | \n",
+ " 20662818 | \n",
+ " 13593016154899 | \n",
+ " 2023-11-09 07:51:34.939328+01:00 | \n",
+ " 2023-11-09 07:51:34.939328+01:00 | \n",
+ " 8007699 | \n",
+ " 403658 | \n",
+ " False | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " NaN | \n",
+ " 738f0a8b5088b5056bc3b32eff2dca1f | \n",
+ "
\n",
+ " \n",
+ " 1826671 | \n",
+ " 20662819 | \n",
+ " 13593016154988 | \n",
+ " 2023-11-09 07:51:34.940680+01:00 | \n",
+ " 2023-11-09 07:51:34.940680+01:00 | \n",
+ " 8007699 | \n",
+ " 403658 | \n",
+ " False | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " NaN | \n",
+ " 4c5a6195434377380b4e6ae63b2e9cf6 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
1826672 rows × 11 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id number created_at \\\n",
+ "0 13070859 13593002661288 2021-12-28 20:47:10.320641+01:00 \n",
+ "1 13070860 13593002661399 2021-12-28 20:47:10.321037+01:00 \n",
+ "2 13070861 13593002661419 2021-12-28 20:47:10.321629+01:00 \n",
+ "3 13070862 13593002661508 2021-12-28 20:47:10.322029+01:00 \n",
+ "4 13070863 13593002661689 2021-12-28 20:47:10.322449+01:00 \n",
+ "... ... ... ... \n",
+ "1826667 20662815 13593016154390 2023-11-09 07:51:34.935983+01:00 \n",
+ "1826668 20662816 13593016154501 2023-11-09 07:51:34.937038+01:00 \n",
+ "1826669 20662817 13593016154680 2023-11-09 07:51:34.938224+01:00 \n",
+ "1826670 20662818 13593016154899 2023-11-09 07:51:34.939328+01:00 \n",
+ "1826671 20662819 13593016154988 2023-11-09 07:51:34.940680+01:00 \n",
+ "\n",
+ " updated_at purchase_id product_id \\\n",
+ "0 2022-02-14 18:46:53.614229+01:00 5107462 225251 \n",
+ "1 2022-02-14 18:46:53.614761+01:00 5107462 224914 \n",
+ "2 2022-02-14 18:46:53.615521+01:00 5107462 224914 \n",
+ "3 2022-02-14 18:46:53.616000+01:00 5107462 224914 \n",
+ "4 2022-02-14 18:46:53.616447+01:00 5107462 224914 \n",
+ "... ... ... ... \n",
+ "1826667 2023-11-09 07:51:34.935983+01:00 8007697 405689 \n",
+ "1826668 2023-11-09 07:51:34.937038+01:00 8007698 403658 \n",
+ "1826669 2023-11-09 07:51:34.938224+01:00 8007698 403658 \n",
+ "1826670 2023-11-09 07:51:34.939328+01:00 8007699 403658 \n",
+ "1826671 2023-11-09 07:51:34.940680+01:00 8007699 403658 \n",
+ "\n",
+ " is_from_subscription type_of supplier_id barcode \\\n",
+ "0 False 1 3 NaN \n",
+ "1 False 1 3 NaN \n",
+ "2 False 1 3 NaN \n",
+ "3 False 1 3 NaN \n",
+ "4 False 1 3 NaN \n",
+ "... ... ... ... ... \n",
+ "1826667 False 1 3 NaN \n",
+ "1826668 False 1 3 NaN \n",
+ "1826669 False 1 3 NaN \n",
+ "1826670 False 1 3 NaN \n",
+ "1826671 False 1 3 NaN \n",
+ "\n",
+ " identifier \n",
+ "0 b6ad7fc36f33b5e05f58c7fca06688a6 \n",
+ "1 b0903af480266f27802fe5c38c277c9e \n",
+ "2 64ca12b7e26a65b90335c0702ea0faba \n",
+ "3 5ac2f8150aa9f3a6b1599df08cc2f0c7 \n",
+ "4 dfe30081bae020d12094279926136b9c \n",
+ "... ... \n",
+ "1826667 dba9aa428f843b79ae69dfacfe8fc579 \n",
+ "1826668 93f1fcfc6ba4fa68f92eb4b4a619fcf0 \n",
+ "1826669 c8bbbd25df2c158767ceef42c3237f23 \n",
+ "1826670 738f0a8b5088b5056bc3b32eff2dca1f \n",
+ "1826671 4c5a6195434377380b4e6ae63b2e9cf6 \n",
+ "\n",
+ "[1826672 rows x 11 columns]"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df1_tickets"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "f3c35394-b586-4ae4-b5ab-b03bb01bb618",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 1826672 entries, 0 to 1826671\n",
+ "Data columns (total 11 columns):\n",
+ " # Column Dtype \n",
+ "--- ------ ----- \n",
+ " 0 id int64 \n",
+ " 1 number object \n",
+ " 2 created_at object \n",
+ " 3 updated_at object \n",
+ " 4 purchase_id int64 \n",
+ " 5 product_id int64 \n",
+ " 6 is_from_subscription bool \n",
+ " 7 type_of int64 \n",
+ " 8 supplier_id int64 \n",
+ " 9 barcode float64\n",
+ " 10 identifier object \n",
+ "dtypes: bool(1), float64(1), int64(5), object(4)\n",
+ "memory usage: 141.1+ MB\n"
+ ]
+ }
+ ],
+ "source": [
+ "df1_tickets.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "c1b42769-03c7-4785-92ce-5e1e6b41908d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "id 0.0\n",
+ "number 0.0\n",
+ "created_at 0.0\n",
+ "updated_at 0.0\n",
+ "purchase_id 0.0\n",
+ "product_id 0.0\n",
+ "is_from_subscription 0.0\n",
+ "type_of 0.0\n",
+ "supplier_id 0.0\n",
+ "barcode 100.0\n",
+ "identifier 0.0\n",
+ "dtype: float64"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df1_tickets.isna().sum()/len(df1_tickets)*100"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "42896791-2d93-4725-a50b-6c7cbe535ec7",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/tmp/ipykernel_619/232847087.py:3: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " df1_tickets_clean.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Sélection des variables\n",
+ "df1_tickets_clean = df1_tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]\n",
+ "df1_tickets_clean.rename(columns = {'id' : 'ticket_id'}, inplace = True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "78453f3c-4f89-44ed-a6c6-2a7443b72b52",
+ "metadata": {
+ "jp-MarkdownHeadingCollapsed": true
+ },
+ "source": [
+ "## suppliers.csv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "2e0dada0-9457-484c-aa55-77e44613ecca",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " name | \n",
+ " manually_added | \n",
+ " label | \n",
+ " itr | \n",
+ " updated_at | \n",
+ " created_at | \n",
+ " commission | \n",
+ " identifier | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1617 | \n",
+ " j4 administration | \n",
+ " False | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2021-07-29 09:21:37.325772+02:00 | \n",
+ " 2021-07-29 09:21:37.325772+02:00 | \n",
+ " NaN | \n",
+ " 5958b2a060ac3e31678b438892a1bd2e | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 8 | \n",
+ " non défini | \n",
+ " False | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2020-09-03 13:16:35.329062+02:00 | \n",
+ " 2020-09-03 13:16:35.329062+02:00 | \n",
+ " NaN | \n",
+ " 52ff3466787b4d538407372e5f7afe0f | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 4 | \n",
+ " vad | \n",
+ " False | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2020-09-03 13:11:23.896992+02:00 | \n",
+ " 2020-09-03 13:11:23.896992+02:00 | \n",
+ " NaN | \n",
+ " 1225483c97b36018cab2bea14ab78ea6 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1 | \n",
+ " fort saint jean | \n",
+ " False | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2020-09-03 13:11:23.833073+02:00 | \n",
+ " 2020-09-03 13:11:23.833073+02:00 | \n",
+ " NaN | \n",
+ " 001b9b4a524fe407150b8235b304d4ec | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 2 | \n",
+ " j4 | \n",
+ " False | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2020-09-03 13:11:23.888993+02:00 | \n",
+ " 2020-09-03 13:11:23.888993+02:00 | \n",
+ " NaN | \n",
+ " 6a0cf6edf20060344b465706b61719aa | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 5 | \n",
+ " revendeur | \n",
+ " False | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2020-09-03 13:11:23.900987+02:00 | \n",
+ " 2020-09-03 13:11:23.900987+02:00 | \n",
+ " NaN | \n",
+ " 931239d4acb6214d7e5c98edecfb4916 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 3 | \n",
+ " vente en ligne | \n",
+ " False | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2020-09-03 13:11:23.893097+02:00 | \n",
+ " 2020-09-03 13:11:23.893097+02:00 | \n",
+ " NaN | \n",
+ " bde8f2ccff510df8572d3214d86b837d | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 6 | \n",
+ " ccr | \n",
+ " False | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2020-09-03 13:11:23.904974+02:00 | \n",
+ " 2020-09-03 13:11:23.904974+02:00 | \n",
+ " NaN | \n",
+ " b48ec279411f7dbbb68393c61a9724d9 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 7 | \n",
+ " dab | \n",
+ " False | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2020-09-03 13:11:23.908970+02:00 | \n",
+ " 2020-09-03 13:11:23.908970+02:00 | \n",
+ " NaN | \n",
+ " 11c6d471fa4e354e62e684d293694202 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id name manually_added label itr \\\n",
+ "0 1617 j4 administration False NaN NaN \n",
+ "1 8 non défini False NaN NaN \n",
+ "2 4 vad False NaN NaN \n",
+ "3 1 fort saint jean False NaN NaN \n",
+ "4 2 j4 False NaN NaN \n",
+ "5 5 revendeur False NaN NaN \n",
+ "6 3 vente en ligne False NaN NaN \n",
+ "7 6 ccr False NaN NaN \n",
+ "8 7 dab False NaN NaN \n",
+ "\n",
+ " updated_at created_at \\\n",
+ "0 2021-07-29 09:21:37.325772+02:00 2021-07-29 09:21:37.325772+02:00 \n",
+ "1 2020-09-03 13:16:35.329062+02:00 2020-09-03 13:16:35.329062+02:00 \n",
+ "2 2020-09-03 13:11:23.896992+02:00 2020-09-03 13:11:23.896992+02:00 \n",
+ "3 2020-09-03 13:11:23.833073+02:00 2020-09-03 13:11:23.833073+02:00 \n",
+ "4 2020-09-03 13:11:23.888993+02:00 2020-09-03 13:11:23.888993+02:00 \n",
+ "5 2020-09-03 13:11:23.900987+02:00 2020-09-03 13:11:23.900987+02:00 \n",
+ "6 2020-09-03 13:11:23.893097+02:00 2020-09-03 13:11:23.893097+02:00 \n",
+ "7 2020-09-03 13:11:23.904974+02:00 2020-09-03 13:11:23.904974+02:00 \n",
+ "8 2020-09-03 13:11:23.908970+02:00 2020-09-03 13:11:23.908970+02:00 \n",
+ "\n",
+ " commission identifier \n",
+ "0 NaN 5958b2a060ac3e31678b438892a1bd2e \n",
+ "1 NaN 52ff3466787b4d538407372e5f7afe0f \n",
+ "2 NaN 1225483c97b36018cab2bea14ab78ea6 \n",
+ "3 NaN 001b9b4a524fe407150b8235b304d4ec \n",
+ "4 NaN 6a0cf6edf20060344b465706b61719aa \n",
+ "5 NaN 931239d4acb6214d7e5c98edecfb4916 \n",
+ "6 NaN bde8f2ccff510df8572d3214d86b837d \n",
+ "7 NaN b48ec279411f7dbbb68393c61a9724d9 \n",
+ "8 NaN 11c6d471fa4e354e62e684d293694202 "
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df1_suppliers"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "b583be02-ab60-4e14-9325-0204f203a1af",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 9 entries, 0 to 8\n",
+ "Data columns (total 9 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 id 9 non-null int64 \n",
+ " 1 name 9 non-null object \n",
+ " 2 manually_added 9 non-null bool \n",
+ " 3 label 0 non-null float64\n",
+ " 4 itr 0 non-null float64\n",
+ " 5 updated_at 9 non-null object \n",
+ " 6 created_at 9 non-null object \n",
+ " 7 commission 0 non-null float64\n",
+ " 8 identifier 9 non-null object \n",
+ "dtypes: bool(1), float64(3), int64(1), object(4)\n",
+ "memory usage: 713.0+ bytes\n"
+ ]
+ }
+ ],
+ "source": [
+ "df1_suppliers.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "6d7f338e-e4d3-422b-9cdc-dec967c0b28e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "id 0.0\n",
+ "name 0.0\n",
+ "manually_added 0.0\n",
+ "label 100.0\n",
+ "itr 100.0\n",
+ "updated_at 0.0\n",
+ "created_at 0.0\n",
+ "commission 100.0\n",
+ "identifier 0.0\n",
+ "dtype: float64"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df1_suppliers.isna().sum()/len(df1_suppliers)*100"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "3c645ab7-16bf-4054-9ae2-15a8c32e29c6",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/tmp/ipykernel_619/302783287.py:3: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " df1_suppliers_clean.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Sélection des variables\n",
+ "df1_suppliers_clean = df1_suppliers[['id', 'name']]\n",
+ "df1_suppliers_clean.rename(columns = {'name' : 'supplier_name'}, inplace = True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "4de7e2e2-6da4-4618-8444-b524399c5493",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " supplier_name | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1617 | \n",
+ " j4 administration | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 8 | \n",
+ " non défini | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 4 | \n",
+ " vad | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1 | \n",
+ " fort saint jean | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 2 | \n",
+ " j4 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 5 | \n",
+ " revendeur | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 3 | \n",
+ " vente en ligne | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 6 | \n",
+ " ccr | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 7 | \n",
+ " dab | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id supplier_name\n",
+ "0 1617 j4 administration\n",
+ "1 8 non défini\n",
+ "2 4 vad\n",
+ "3 1 fort saint jean\n",
+ "4 2 j4\n",
+ "5 5 revendeur\n",
+ "6 3 vente en ligne\n",
+ "7 6 ccr\n",
+ "8 7 dab"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df1_suppliers_clean"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0a6df975-c7fc-45bc-92af-a0bdab17d795",
+ "metadata": {
+ "jp-MarkdownHeadingCollapsed": true
+ },
+ "source": [
+ "## type_ofs.csv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "a02f6594-3e91-4e87-bbb6-649c28d4f7e9",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " name | \n",
+ " children | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " identifier | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " Atelier | \n",
+ " pricing_formula | \n",
+ " 2021-01-05 11:55:51.188106+01:00 | \n",
+ " 2021-01-05 11:55:51.188106+01:00 | \n",
+ " 623ec4067827558b28972cf39fe81ee7 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " Billet en nombre | \n",
+ " pricing_formula | \n",
+ " 2021-01-11 12:13:19.286301+01:00 | \n",
+ " 2021-01-11 12:13:19.286301+01:00 | \n",
+ " a53d313a97296ee37caa066dbfe7a45c | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " Groupe | \n",
+ " pricing_formula | \n",
+ " 2021-01-11 12:19:22.842917+01:00 | \n",
+ " 2021-01-11 12:19:22.842917+01:00 | \n",
+ " 1ab143efc3b85acbbc752fe8eb2b0b86 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4 | \n",
+ " Revendeur | \n",
+ " pricing_formula | \n",
+ " 2021-01-12 12:34:20.481236+01:00 | \n",
+ " 2021-01-12 12:34:20.481236+01:00 | \n",
+ " 8b332723366a07e1eef5f1c92f9ae067 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5 | \n",
+ " Cinéma scolaire | \n",
+ " pricing_formula | \n",
+ " 2021-01-25 19:16:05.141719+01:00 | \n",
+ " 2021-01-25 19:16:05.141719+01:00 | \n",
+ " a12e62cb4c4f47e7406bd8fbff2bfe30 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 6 | \n",
+ " Musée famille | \n",
+ " pricing_formula | \n",
+ " 2021-01-25 19:23:06.692627+01:00 | \n",
+ " 2021-01-25 19:23:06.692627+01:00 | \n",
+ " 1ec6c19283111ccb3ed67f52d414470e | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 7 | \n",
+ " Spectacle famille | \n",
+ " pricing_formula | \n",
+ " 2021-01-25 19:28:21.390016+01:00 | \n",
+ " 2021-01-25 19:28:21.390016+01:00 | \n",
+ " 05e2104f1b74ced229c06847d6e91938 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 8 | \n",
+ " Masterclass | \n",
+ " pricing_formula | \n",
+ " 2021-01-25 19:31:05.076904+01:00 | \n",
+ " 2021-01-25 19:31:05.076904+01:00 | \n",
+ " 9cc946edfb25e11b4282f58db16e6ae9 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 9 | \n",
+ " Spectacle | \n",
+ " pricing_formula | \n",
+ " 2021-01-25 19:38:41.260535+01:00 | \n",
+ " 2021-01-25 19:38:41.260535+01:00 | \n",
+ " d88321c347f0e0ab101184cdf25c94bf | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 10 | \n",
+ " Cinema | \n",
+ " pricing_formula | \n",
+ " 2021-02-05 11:12:31.932576+01:00 | \n",
+ " 2021-02-05 11:12:31.932576+01:00 | \n",
+ " 0870fef2bfcd5b30a12e4f5c7f4aaba7 | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " 11 | \n",
+ " Musee | \n",
+ " pricing_formula | \n",
+ " 2021-02-05 11:52:05.468207+01:00 | \n",
+ " 2021-02-05 11:52:05.468207+01:00 | \n",
+ " 8ba8934454cc62c7cdb3eb6e1b39df0c | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " 12 | \n",
+ " Tarifs plein | \n",
+ " category | \n",
+ " 2023-03-13 11:31:50.528331+01:00 | \n",
+ " 2023-03-13 11:31:50.528331+01:00 | \n",
+ " a6969df76efc15d157be48e87a7bcf9a | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id name children created_at \\\n",
+ "0 1 Atelier pricing_formula 2021-01-05 11:55:51.188106+01:00 \n",
+ "1 2 Billet en nombre pricing_formula 2021-01-11 12:13:19.286301+01:00 \n",
+ "2 3 Groupe pricing_formula 2021-01-11 12:19:22.842917+01:00 \n",
+ "3 4 Revendeur pricing_formula 2021-01-12 12:34:20.481236+01:00 \n",
+ "4 5 Cinéma scolaire pricing_formula 2021-01-25 19:16:05.141719+01:00 \n",
+ "5 6 Musée famille pricing_formula 2021-01-25 19:23:06.692627+01:00 \n",
+ "6 7 Spectacle famille pricing_formula 2021-01-25 19:28:21.390016+01:00 \n",
+ "7 8 Masterclass pricing_formula 2021-01-25 19:31:05.076904+01:00 \n",
+ "8 9 Spectacle pricing_formula 2021-01-25 19:38:41.260535+01:00 \n",
+ "9 10 Cinema pricing_formula 2021-02-05 11:12:31.932576+01:00 \n",
+ "10 11 Musee pricing_formula 2021-02-05 11:52:05.468207+01:00 \n",
+ "11 12 Tarifs plein category 2023-03-13 11:31:50.528331+01:00 \n",
+ "\n",
+ " updated_at identifier \n",
+ "0 2021-01-05 11:55:51.188106+01:00 623ec4067827558b28972cf39fe81ee7 \n",
+ "1 2021-01-11 12:13:19.286301+01:00 a53d313a97296ee37caa066dbfe7a45c \n",
+ "2 2021-01-11 12:19:22.842917+01:00 1ab143efc3b85acbbc752fe8eb2b0b86 \n",
+ "3 2021-01-12 12:34:20.481236+01:00 8b332723366a07e1eef5f1c92f9ae067 \n",
+ "4 2021-01-25 19:16:05.141719+01:00 a12e62cb4c4f47e7406bd8fbff2bfe30 \n",
+ "5 2021-01-25 19:23:06.692627+01:00 1ec6c19283111ccb3ed67f52d414470e \n",
+ "6 2021-01-25 19:28:21.390016+01:00 05e2104f1b74ced229c06847d6e91938 \n",
+ "7 2021-01-25 19:31:05.076904+01:00 9cc946edfb25e11b4282f58db16e6ae9 \n",
+ "8 2021-01-25 19:38:41.260535+01:00 d88321c347f0e0ab101184cdf25c94bf \n",
+ "9 2021-02-05 11:12:31.932576+01:00 0870fef2bfcd5b30a12e4f5c7f4aaba7 \n",
+ "10 2021-02-05 11:52:05.468207+01:00 8ba8934454cc62c7cdb3eb6e1b39df0c \n",
+ "11 2023-03-13 11:31:50.528331+01:00 a6969df76efc15d157be48e87a7bcf9a "
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df1_type_ofs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "e9c8d32b-22f4-4581-8af7-31cc1c31fa0e",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 12 entries, 0 to 11\n",
+ "Data columns (total 6 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 id 12 non-null int64 \n",
+ " 1 name 12 non-null object\n",
+ " 2 children 12 non-null object\n",
+ " 3 created_at 12 non-null object\n",
+ " 4 updated_at 12 non-null object\n",
+ " 5 identifier 12 non-null object\n",
+ "dtypes: int64(1), object(5)\n",
+ "memory usage: 704.0+ bytes\n"
+ ]
+ }
+ ],
+ "source": [
+ "df1_type_ofs.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "cbb5e614-1fe5-4da0-bca0-8a242e0885da",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/tmp/ipykernel_619/81842251.py:3: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " df1_type_ofs_clean.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Sélection des variables\n",
+ "df1_type_ofs_clean = df1_type_ofs[['id', 'name', 'children']]\n",
+ "df1_type_ofs_clean.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "676a9869-9a8b-4cd2-8b1c-0644b5229c72",
+ "metadata": {
+ "jp-MarkdownHeadingCollapsed": true
+ },
+ "source": [
+ "## purchases.csv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "f8d36b72-f8e7-45e5-b4fa-e0803493fd3c",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " purchase_date | \n",
+ " customer_id | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " number | \n",
+ " identifier | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 5145662 | \n",
+ " 2019-07-17 11:17:53+02:00 | \n",
+ " 6632 | \n",
+ " 2021-12-28 20:48:51.569237+01:00 | \n",
+ " 2021-12-28 20:48:51.569237+01:00 | \n",
+ " fa80c83b29a268b45728c910a8afcf79 | \n",
+ " 82877c41df26f832eb823a83acd1a172 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 4941642 | \n",
+ " 2018-10-31 11:59:00+01:00 | \n",
+ " 1 | \n",
+ " 2021-12-28 20:31:48.196681+01:00 | \n",
+ " 2022-03-03 17:52:21.958861+01:00 | \n",
+ " 597b6c06adfe6acc539b29b657b80da0 | \n",
+ " e7102ebe65526c427245533ebabe66e5 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 5088860 | \n",
+ " 2018-10-31 12:45:12+01:00 | \n",
+ " 1 | \n",
+ " 2021-12-28 20:46:34.703542+01:00 | \n",
+ " 2021-12-28 20:46:34.703542+01:00 | \n",
+ " 4a7f6baaf9be6a99e3fead7f7e981fa8 | \n",
+ " af75c4ae53d1b6957875538355b162e1 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 5088862 | \n",
+ " 2018-10-31 13:07:12+01:00 | \n",
+ " 1 | \n",
+ " 2021-12-28 20:46:34.704773+01:00 | \n",
+ " 2021-12-28 20:46:34.704773+01:00 | \n",
+ " 1d83dfad44b73070d1c6d5875d0edd2d | \n",
+ " 4b2fe34659b177209b07270ae1043b40 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5088863 | \n",
+ " 2018-10-31 13:08:50+01:00 | \n",
+ " 1 | \n",
+ " 2021-12-28 20:46:34.705453+01:00 | \n",
+ " 2021-12-28 20:46:34.705453+01:00 | \n",
+ " 7bfe2bc9c1670c973d0960e3fd408cf8 | \n",
+ " b115f04a99b94df9e4a32185844f0998 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 742245 | \n",
+ " 8007695 | \n",
+ " 2023-11-08 17:51:19+01:00 | \n",
+ " 1256133 | \n",
+ " 2023-11-09 07:51:33.920187+01:00 | \n",
+ " 2023-11-09 07:51:33.920187+01:00 | \n",
+ " 99ad774dedbad43feb73514765d2f0ba | \n",
+ " d68558180b4bf2e8a945724843655775 | \n",
+ "
\n",
+ " \n",
+ " 742246 | \n",
+ " 8007696 | \n",
+ " 2023-11-08 18:17:51+01:00 | \n",
+ " 1256134 | \n",
+ " 2023-11-09 07:51:33.921967+01:00 | \n",
+ " 2023-11-09 07:51:33.921967+01:00 | \n",
+ " c1511614c511c5f95980172690179102 | \n",
+ " f5102d910a7731091f239ad7b0df35b4 | \n",
+ "
\n",
+ " \n",
+ " 742247 | \n",
+ " 8007697 | \n",
+ " 2023-11-08 18:23:54+01:00 | \n",
+ " 1256135 | \n",
+ " 2023-11-09 07:51:33.923034+01:00 | \n",
+ " 2023-11-09 07:51:33.923034+01:00 | \n",
+ " 33b64b39cc53428b4f17d65ff5b93104 | \n",
+ " e2b917626be60cc2c3207cc037fe69e4 | \n",
+ "
\n",
+ " \n",
+ " 742248 | \n",
+ " 8007698 | \n",
+ " 2023-11-08 19:32:18+01:00 | \n",
+ " 1256136 | \n",
+ " 2023-11-09 07:51:33.924135+01:00 | \n",
+ " 2023-11-09 07:51:33.924135+01:00 | \n",
+ " 9ae0b129e704b3d9c093ce9c7c4e5039 | \n",
+ " 5bfa23236c31f8562c3a0233c1b53b31 | \n",
+ "
\n",
+ " \n",
+ " 742249 | \n",
+ " 8007699 | \n",
+ " 2023-11-08 20:30:28+01:00 | \n",
+ " 1256137 | \n",
+ " 2023-11-09 07:51:33.925382+01:00 | \n",
+ " 2023-11-09 07:51:33.925382+01:00 | \n",
+ " d31ced089c2b1f90479257a4686f9306 | \n",
+ " d86b1e0de3ff01eaf04fbcd031ac5fef | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
742250 rows × 7 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id purchase_date customer_id \\\n",
+ "0 5145662 2019-07-17 11:17:53+02:00 6632 \n",
+ "1 4941642 2018-10-31 11:59:00+01:00 1 \n",
+ "2 5088860 2018-10-31 12:45:12+01:00 1 \n",
+ "3 5088862 2018-10-31 13:07:12+01:00 1 \n",
+ "4 5088863 2018-10-31 13:08:50+01:00 1 \n",
+ "... ... ... ... \n",
+ "742245 8007695 2023-11-08 17:51:19+01:00 1256133 \n",
+ "742246 8007696 2023-11-08 18:17:51+01:00 1256134 \n",
+ "742247 8007697 2023-11-08 18:23:54+01:00 1256135 \n",
+ "742248 8007698 2023-11-08 19:32:18+01:00 1256136 \n",
+ "742249 8007699 2023-11-08 20:30:28+01:00 1256137 \n",
+ "\n",
+ " created_at updated_at \\\n",
+ "0 2021-12-28 20:48:51.569237+01:00 2021-12-28 20:48:51.569237+01:00 \n",
+ "1 2021-12-28 20:31:48.196681+01:00 2022-03-03 17:52:21.958861+01:00 \n",
+ "2 2021-12-28 20:46:34.703542+01:00 2021-12-28 20:46:34.703542+01:00 \n",
+ "3 2021-12-28 20:46:34.704773+01:00 2021-12-28 20:46:34.704773+01:00 \n",
+ "4 2021-12-28 20:46:34.705453+01:00 2021-12-28 20:46:34.705453+01:00 \n",
+ "... ... ... \n",
+ "742245 2023-11-09 07:51:33.920187+01:00 2023-11-09 07:51:33.920187+01:00 \n",
+ "742246 2023-11-09 07:51:33.921967+01:00 2023-11-09 07:51:33.921967+01:00 \n",
+ "742247 2023-11-09 07:51:33.923034+01:00 2023-11-09 07:51:33.923034+01:00 \n",
+ "742248 2023-11-09 07:51:33.924135+01:00 2023-11-09 07:51:33.924135+01:00 \n",
+ "742249 2023-11-09 07:51:33.925382+01:00 2023-11-09 07:51:33.925382+01:00 \n",
+ "\n",
+ " number identifier \n",
+ "0 fa80c83b29a268b45728c910a8afcf79 82877c41df26f832eb823a83acd1a172 \n",
+ "1 597b6c06adfe6acc539b29b657b80da0 e7102ebe65526c427245533ebabe66e5 \n",
+ "2 4a7f6baaf9be6a99e3fead7f7e981fa8 af75c4ae53d1b6957875538355b162e1 \n",
+ "3 1d83dfad44b73070d1c6d5875d0edd2d 4b2fe34659b177209b07270ae1043b40 \n",
+ "4 7bfe2bc9c1670c973d0960e3fd408cf8 b115f04a99b94df9e4a32185844f0998 \n",
+ "... ... ... \n",
+ "742245 99ad774dedbad43feb73514765d2f0ba d68558180b4bf2e8a945724843655775 \n",
+ "742246 c1511614c511c5f95980172690179102 f5102d910a7731091f239ad7b0df35b4 \n",
+ "742247 33b64b39cc53428b4f17d65ff5b93104 e2b917626be60cc2c3207cc037fe69e4 \n",
+ "742248 9ae0b129e704b3d9c093ce9c7c4e5039 5bfa23236c31f8562c3a0233c1b53b31 \n",
+ "742249 d31ced089c2b1f90479257a4686f9306 d86b1e0de3ff01eaf04fbcd031ac5fef \n",
+ "\n",
+ "[742250 rows x 7 columns]"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df1_purchases"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "3f266a9d-6eee-4b27-b6cc-d401bc2fa0b8",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 742250 entries, 0 to 742249\n",
+ "Data columns (total 7 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 id 742250 non-null int64 \n",
+ " 1 purchase_date 742250 non-null object\n",
+ " 2 customer_id 742250 non-null int64 \n",
+ " 3 created_at 742250 non-null object\n",
+ " 4 updated_at 742250 non-null object\n",
+ " 5 number 742250 non-null object\n",
+ " 6 identifier 742250 non-null object\n",
+ "dtypes: int64(2), object(5)\n",
+ "memory usage: 39.6+ MB\n"
+ ]
+ }
+ ],
+ "source": [
+ "df1_purchases.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "8b24ccbc-ccf0-4722-8cd9-8ee8aa90d1fd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Clean purchase_date: parse the ISO 8601 strings (they carry mixed +01:00/+02:00 offsets) in one pass\n",
+ "# and normalise them to timezone-aware UTC; a second to_datetime call on the parsed column is unnecessary.\n",
+ "df1_purchases['purchase_date'] = pd.to_datetime(df1_purchases['purchase_date'], utc=True, format='ISO8601')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "27d18584-228f-4698-85d6-4d23151ea5ed",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 742250 entries, 0 to 742249\n",
+ "Data columns (total 7 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 id 742250 non-null int64 \n",
+ " 1 purchase_date 742250 non-null datetime64[ns, UTC]\n",
+ " 2 customer_id 742250 non-null int64 \n",
+ " 3 created_at 742250 non-null object \n",
+ " 4 updated_at 742250 non-null object \n",
+ " 5 number 742250 non-null object \n",
+ " 6 identifier 742250 non-null object \n",
+ "dtypes: datetime64[ns, UTC](1), int64(2), object(4)\n",
+ "memory usage: 39.6+ MB\n"
+ ]
+ }
+ ],
+ "source": [
+ "df1_purchases.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "ea22e3a2-2b25-481d-8ebc-194e11a06cd9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Variable selection: keep only the purchase id, its date and the customer id\n",
+ "df1_purchases_clean = df1_purchases.loc[:, ['id', 'purchase_date', 'customer_id']]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "53227600-c1c5-48aa-9f5d-db5a23a8a22a",
+ "metadata": {},
+ "source": [
+ "## Fusion de l'ensemble des données billétiques"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "e0b8b47a-b321-4a79-823c-36a131a78ac7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Build the ticket-level table with three chained inner joins; after each join the\n",
+ "# foreign key and the joined table's 'id' column are dropped.\n",
+ "df1_ticket_information = (\n",
+ "    df1_tickets_clean\n",
+ "    .merge(df1_suppliers_clean, left_on='supplier_id', right_on='id', how='inner')  # suppliers\n",
+ "    .drop(columns=['supplier_id', 'id'])\n",
+ "    .merge(df1_type_ofs_clean, left_on='type_of', right_on='id', how='inner')  # ticket types\n",
+ "    .drop(columns=['type_of', 'id'])\n",
+ "    .merge(df1_purchases_clean, left_on='purchase_id', right_on='id', how='inner')  # purchases\n",
+ "    .drop(columns=['purchase_id', 'id'])\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "7572e6e7-f28d-43ba-b045-b9fa09e68e1d",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ticket_id | \n",
+ " product_id | \n",
+ " is_from_subscription | \n",
+ " supplier_name | \n",
+ " type_of_ticket_name | \n",
+ " children | \n",
+ " purchase_date | \n",
+ " customer_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 13070859 | \n",
+ " 225251 | \n",
+ " False | \n",
+ " vente en ligne | \n",
+ " Atelier | \n",
+ " pricing_formula | \n",
+ " 2018-12-28 14:47:50+00:00 | \n",
+ " 48187 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 13070860 | \n",
+ " 224914 | \n",
+ " False | \n",
+ " vente en ligne | \n",
+ " Atelier | \n",
+ " pricing_formula | \n",
+ " 2018-12-28 14:47:50+00:00 | \n",
+ " 48187 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 13070861 | \n",
+ " 224914 | \n",
+ " False | \n",
+ " vente en ligne | \n",
+ " Atelier | \n",
+ " pricing_formula | \n",
+ " 2018-12-28 14:47:50+00:00 | \n",
+ " 48187 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 13070862 | \n",
+ " 224914 | \n",
+ " False | \n",
+ " vente en ligne | \n",
+ " Atelier | \n",
+ " pricing_formula | \n",
+ " 2018-12-28 14:47:50+00:00 | \n",
+ " 48187 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 13070863 | \n",
+ " 224914 | \n",
+ " False | \n",
+ " vente en ligne | \n",
+ " Atelier | \n",
+ " pricing_formula | \n",
+ " 2018-12-28 14:47:50+00:00 | \n",
+ " 48187 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 1826667 | \n",
+ " 20662815 | \n",
+ " 405689 | \n",
+ " False | \n",
+ " vente en ligne | \n",
+ " Atelier | \n",
+ " pricing_formula | \n",
+ " 2023-11-08 17:23:54+00:00 | \n",
+ " 1256135 | \n",
+ "
\n",
+ " \n",
+ " 1826668 | \n",
+ " 20662816 | \n",
+ " 403658 | \n",
+ " False | \n",
+ " vente en ligne | \n",
+ " Atelier | \n",
+ " pricing_formula | \n",
+ " 2023-11-08 18:32:18+00:00 | \n",
+ " 1256136 | \n",
+ "
\n",
+ " \n",
+ " 1826669 | \n",
+ " 20662817 | \n",
+ " 403658 | \n",
+ " False | \n",
+ " vente en ligne | \n",
+ " Atelier | \n",
+ " pricing_formula | \n",
+ " 2023-11-08 18:32:18+00:00 | \n",
+ " 1256136 | \n",
+ "
\n",
+ " \n",
+ " 1826670 | \n",
+ " 20662818 | \n",
+ " 403658 | \n",
+ " False | \n",
+ " vente en ligne | \n",
+ " Atelier | \n",
+ " pricing_formula | \n",
+ " 2023-11-08 19:30:28+00:00 | \n",
+ " 1256137 | \n",
+ "
\n",
+ " \n",
+ " 1826671 | \n",
+ " 20662819 | \n",
+ " 403658 | \n",
+ " False | \n",
+ " vente en ligne | \n",
+ " Atelier | \n",
+ " pricing_formula | \n",
+ " 2023-11-08 19:30:28+00:00 | \n",
+ " 1256137 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
1826672 rows × 8 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ticket_id product_id is_from_subscription supplier_name \\\n",
+ "0 13070859 225251 False vente en ligne \n",
+ "1 13070860 224914 False vente en ligne \n",
+ "2 13070861 224914 False vente en ligne \n",
+ "3 13070862 224914 False vente en ligne \n",
+ "4 13070863 224914 False vente en ligne \n",
+ "... ... ... ... ... \n",
+ "1826667 20662815 405689 False vente en ligne \n",
+ "1826668 20662816 403658 False vente en ligne \n",
+ "1826669 20662817 403658 False vente en ligne \n",
+ "1826670 20662818 403658 False vente en ligne \n",
+ "1826671 20662819 403658 False vente en ligne \n",
+ "\n",
+ " type_of_ticket_name children purchase_date \\\n",
+ "0 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n",
+ "1 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n",
+ "2 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n",
+ "3 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n",
+ "4 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n",
+ "... ... ... ... \n",
+ "1826667 Atelier pricing_formula 2023-11-08 17:23:54+00:00 \n",
+ "1826668 Atelier pricing_formula 2023-11-08 18:32:18+00:00 \n",
+ "1826669 Atelier pricing_formula 2023-11-08 18:32:18+00:00 \n",
+ "1826670 Atelier pricing_formula 2023-11-08 19:30:28+00:00 \n",
+ "1826671 Atelier pricing_formula 2023-11-08 19:30:28+00:00 \n",
+ "\n",
+ " customer_id \n",
+ "0 48187 \n",
+ "1 48187 \n",
+ "2 48187 \n",
+ "3 48187 \n",
+ "4 48187 \n",
+ "... ... \n",
+ "1826667 1256135 \n",
+ "1826668 1256136 \n",
+ "1826669 1256136 \n",
+ "1826670 1256137 \n",
+ "1826671 1256137 \n",
+ "\n",
+ "[1826672 rows x 8 columns]"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df1_ticket_information"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ad2d0059-76d3-44b9-b0eb-0b0ca4d4ba75",
+ "metadata": {},
+ "source": [
+ "# Utilisation de fonctions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "id": "d237be96-8c86-4a91-b7a1-487e87a16c3d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def cleaning_date(df, column_name):\n",
+ "    \"\"\"\n",
+ "    Convert one column of a DataFrame to timezone-aware UTC datetimes, parsing values as ISO 8601.\n",
+ "\n",
+ "    Parameters:\n",
+ "    - df: DataFrame\n",
+ "        The DataFrame holding the column to convert; the column is overwritten on this object.\n",
+ "    - column_name: str\n",
+ "        Name of the column to convert.\n",
+ "\n",
+ "    Returns:\n",
+ "    - DataFrame: the same `df` object that was passed in, with the column replaced by its parsed values.\n",
+ "    \"\"\"\n",
+ "    parsed = pd.to_datetime(df[column_name], format='ISO8601', utc=True)\n",
+ "    df[column_name] = parsed\n",
+ "    return df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "id": "c1afe322-ff41-4760-819e-0195fed5b27d",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 20 entries, 0 to 19\n",
+ "Data columns (total 2 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 opened_at 8 non-null object \n",
+ " 1 opened_at_clean 8 non-null datetime64[ns, UTC]\n",
+ "dtypes: datetime64[ns, UTC](1), object(1)\n",
+ "memory usage: 448.0+ bytes\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Build a small example DataFrame (explicit copy, so it is independent from df1_campaign_stats)\n",
+ "df_not_clean = df1_campaign_stats[['opened_at']].head(20).copy()\n",
+ "\n",
+ "# Apply the function to the 'opened_at' column on a second copy: cleaning_date overwrites the column it is given\n",
+ "df_clean = cleaning_date(df_not_clean.copy(), 'opened_at')\n",
+ "df_clean = df_clean.rename(columns={'opened_at': 'opened_at_clean'})\n",
+ "\n",
+ "test = pd.concat([df_not_clean, df_clean], axis=1)\n",
+ "\n",
+ "test.info()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "27ecf058-23eb-4018-abbd-68c4ebe7c786",
+ "metadata": {},
+ "source": [
+ "## Nettoyage, selection et fusion"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "d887898c-6a21-41ed-901d-4d6fdbca5372",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ticket_id | \n",
+ " product_id | \n",
+ " is_from_subscription | \n",
+ " type_of | \n",
+ " supplier_name | \n",
+ " purchase_date | \n",
+ " customer_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 13070859 | \n",
+ " 225251 | \n",
+ " False | \n",
+ " 1 | \n",
+ " vente en ligne | \n",
+ " 2018-12-28 14:47:50+00:00 | \n",
+ " 48187 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 13070860 | \n",
+ " 224914 | \n",
+ " False | \n",
+ " 1 | \n",
+ " vente en ligne | \n",
+ " 2018-12-28 14:47:50+00:00 | \n",
+ " 48187 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 13070861 | \n",
+ " 224914 | \n",
+ " False | \n",
+ " 1 | \n",
+ " vente en ligne | \n",
+ " 2018-12-28 14:47:50+00:00 | \n",
+ " 48187 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 13070862 | \n",
+ " 224914 | \n",
+ " False | \n",
+ " 1 | \n",
+ " vente en ligne | \n",
+ " 2018-12-28 14:47:50+00:00 | \n",
+ " 48187 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 13070863 | \n",
+ " 224914 | \n",
+ " False | \n",
+ " 1 | \n",
+ " vente en ligne | \n",
+ " 2018-12-28 14:47:50+00:00 | \n",
+ " 48187 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 1826667 | \n",
+ " 20662815 | \n",
+ " 405689 | \n",
+ " False | \n",
+ " 1 | \n",
+ " vente en ligne | \n",
+ " 2023-11-08 17:23:54+00:00 | \n",
+ " 1256135 | \n",
+ "
\n",
+ " \n",
+ " 1826668 | \n",
+ " 20662816 | \n",
+ " 403658 | \n",
+ " False | \n",
+ " 1 | \n",
+ " vente en ligne | \n",
+ " 2023-11-08 18:32:18+00:00 | \n",
+ " 1256136 | \n",
+ "
\n",
+ " \n",
+ " 1826669 | \n",
+ " 20662817 | \n",
+ " 403658 | \n",
+ " False | \n",
+ " 1 | \n",
+ " vente en ligne | \n",
+ " 2023-11-08 18:32:18+00:00 | \n",
+ " 1256136 | \n",
+ "
\n",
+ " \n",
+ " 1826670 | \n",
+ " 20662818 | \n",
+ " 403658 | \n",
+ " False | \n",
+ " 1 | \n",
+ " vente en ligne | \n",
+ " 2023-11-08 19:30:28+00:00 | \n",
+ " 1256137 | \n",
+ "
\n",
+ " \n",
+ " 1826671 | \n",
+ " 20662819 | \n",
+ " 403658 | \n",
+ " False | \n",
+ " 1 | \n",
+ " vente en ligne | \n",
+ " 2023-11-08 19:30:28+00:00 | \n",
+ " 1256137 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
1826672 rows × 7 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ticket_id product_id is_from_subscription type_of supplier_name \\\n",
+ "0 13070859 225251 False 1 vente en ligne \n",
+ "1 13070860 224914 False 1 vente en ligne \n",
+ "2 13070861 224914 False 1 vente en ligne \n",
+ "3 13070862 224914 False 1 vente en ligne \n",
+ "4 13070863 224914 False 1 vente en ligne \n",
+ "... ... ... ... ... ... \n",
+ "1826667 20662815 405689 False 1 vente en ligne \n",
+ "1826668 20662816 403658 False 1 vente en ligne \n",
+ "1826669 20662817 403658 False 1 vente en ligne \n",
+ "1826670 20662818 403658 False 1 vente en ligne \n",
+ "1826671 20662819 403658 False 1 vente en ligne \n",
+ "\n",
+ " purchase_date customer_id \n",
+ "0 2018-12-28 14:47:50+00:00 48187 \n",
+ "1 2018-12-28 14:47:50+00:00 48187 \n",
+ "2 2018-12-28 14:47:50+00:00 48187 \n",
+ "3 2018-12-28 14:47:50+00:00 48187 \n",
+ "4 2018-12-28 14:47:50+00:00 48187 \n",
+ "... ... ... \n",
+ "1826667 2023-11-08 17:23:54+00:00 1256135 \n",
+ "1826668 2023-11-08 18:32:18+00:00 1256136 \n",
+ "1826669 2023-11-08 18:32:18+00:00 1256136 \n",
+ "1826670 2023-11-08 19:30:28+00:00 1256137 \n",
+ "1826671 2023-11-08 19:30:28+00:00 1256137 \n",
+ "\n",
+ "[1826672 rows x 7 columns]"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df1_ticket_information"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "ac9a6373-c1c6-46b5-873b-dc22f17bcbdb",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 1826672 entries, 0 to 1826671\n",
+ "Data columns (total 7 columns):\n",
+ " # Column Dtype \n",
+ "--- ------ ----- \n",
+ " 0 ticket_id int64 \n",
+ " 1 product_id int64 \n",
+ " 2 is_from_subscription bool \n",
+ " 3 type_of int64 \n",
+ " 4 supplier_name object \n",
+ " 5 purchase_date datetime64[ns, UTC]\n",
+ " 6 customer_id int64 \n",
+ "dtypes: bool(1), datetime64[ns, UTC](1), int64(4), object(1)\n",
+ "memory usage: 85.4+ MB\n"
+ ]
+ }
+ ],
+ "source": [
+ "df1_ticket_information.info()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b1719943-89eb-4ba0-a107-2f96d5d01ec9",
+ "metadata": {},
+ "source": [
+ "# Customer information"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a2132ee2-3f22-45fd-b65b-72689c8b672c",
+ "metadata": {},
+ "source": [
+ "## Target area"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "id": "da5d4708-7147-4cc8-8686-52d4bcba5a7a",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/tmp/ipykernel_619/2625134041.py:3: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " df1_targets_clean.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Target.csv cleaning: rename on the result of the column selection rather than in place,\n",
+ "# which avoids the SettingWithCopyWarning raised by an in-place rename on a selection\n",
+ "df1_targets_clean = df1_targets[[\"id\", \"target_type_id\", \"name\"]].rename(columns={'id': 'target_id', 'name': 'target_name'})\n",
+ "\n",
+ "# target_type cleaning: every kept column gets the target_type_ prefix\n",
+ "df1_target_types_clean = df1_target_types[[\"id\",\"is_import\",\"name\"]].add_prefix(\"target_type_\")\n",
+ "\n",
+ "# customer_target_mappings cleaning\n",
+ "df1_customer_target_mappings_clean = df1_customer_target_mappings[[\"id\", \"customer_id\", \"target_id\"]]\n",
+ "\n",
+ "# Merge target and target_type, then drop the join key\n",
+ "df1_targets_full = pd.merge(df1_targets_clean, df1_target_types_clean, on='target_type_id', how='inner')\n",
+ "df1_targets_full = df1_targets_full.drop(columns=['target_type_id'])\n",
+ "\n",
+ "# Merge the customer/target mappings with the enriched targets, then drop the join key\n",
+ "df1_targets_full = pd.merge(df1_customer_target_mappings_clean, df1_targets_full, on='target_id', how='inner')\n",
+ "df1_targets_full = df1_targets_full.drop(columns=['target_id'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "id": "b4fa5fe3-ce8e-4b0a-af94-fb468d241bad",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "id 5.080902\n",
+ "dtype: float64"
+ ]
+ },
+ "execution_count": 62,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df1_targets_test = df1_targets_full[['id', 'customer_id']].groupby(['customer_id']).count()\n",
+ "share_multi = (df1_targets_test['id'] > 1).mean()  # share of customers mapped to more than one target\n",
+ "print(f'Customers with more than one target: {share_multi:.1%} of {len(df1_targets_test)}')\n",
+ "# Author's note: 99.6% of the ~151,000 targeted customers are categorised several times, about 5 times on average\n",
+ "df1_targets_test.mean()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "id": "8072bbb7-1360-4882-bb2b-2f43b6beea0d",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " customer_id | \n",
+ " target_name | \n",
+ " target_type_is_import | \n",
+ " target_type_name | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 8793 | \n",
+ " 4584599 | \n",
+ " 1 | \n",
+ " consentement optin jeune public | \n",
+ " False | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 13249 | \n",
+ " 4567465 | \n",
+ " 1 | \n",
+ " DDCP rentrée culturelle 2023 | \n",
+ " False | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 21424 | \n",
+ " 4544805 | \n",
+ " 1 | \n",
+ " spectateurs cine dimanche_cine concert_2122 | \n",
+ " False | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 21665 | \n",
+ " 4544911 | \n",
+ " 1 | \n",
+ " DDCP Cine 2023 | \n",
+ " False | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 22811 | \n",
+ " 4545766 | \n",
+ " 1 | \n",
+ " DDCP OLBJ! 2023 | \n",
+ " False | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 57305 | \n",
+ " 4457909 | \n",
+ " 1 | \n",
+ " ddcp_promo_visiteurs occasionnels_musee_8mois | \n",
+ " False | \n",
+ " manual_dynamic_filter | \n",
+ "
\n",
+ " \n",
+ " 58843 | \n",
+ " 3688872 | \n",
+ " 1 | \n",
+ " DDCP promo livemag | \n",
+ " False | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 66813 | \n",
+ " 4313646 | \n",
+ " 1 | \n",
+ " DDCP spectateurs Classique mais pas que 2022 | \n",
+ " False | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 68367 | \n",
+ " 4547662 | \n",
+ " 1 | \n",
+ " ddcp_promo_musee_au moins 3 achats_dps8mois | \n",
+ " False | \n",
+ " manual_dynamic_filter | \n",
+ "
\n",
+ " \n",
+ " 77320 | \n",
+ " 4285520 | \n",
+ " 1 | \n",
+ " DDCP spectateurs Iminente | \n",
+ " False | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 84350 | \n",
+ " 4037805 | \n",
+ " 1 | \n",
+ " DDCP spectateurs Marseille Jazz 18-19-21 | \n",
+ " False | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 85383 | \n",
+ " 4569504 | \n",
+ " 1 | \n",
+ " DDCP rendez-vous de septembre offre spéciale | \n",
+ " False | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 92868 | \n",
+ " 4433064 | \n",
+ " 1 | \n",
+ " ddcp_promo_plein air_ateliers_jardins | \n",
+ " False | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 99670 | \n",
+ " 3858684 | \n",
+ " 1 | \n",
+ " Acid Arab | \n",
+ " False | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 105477 | \n",
+ " 4321810 | \n",
+ " 1 | \n",
+ " Arenametrix_bascule tel vers sib | \n",
+ " False | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 169513 | \n",
+ " 3697992 | \n",
+ " 1 | \n",
+ " ddcp_achats billets nb dps 19052021 | \n",
+ " False | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 214421 | \n",
+ " 2925324 | \n",
+ " 1 | \n",
+ " consentement optout scolaires | \n",
+ " False | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 234546 | \n",
+ " 4575957 | \n",
+ " 1 | \n",
+ " Portrait de Leila shahid | \n",
+ " False | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 259808 | \n",
+ " 3722259 | \n",
+ " 1 | \n",
+ " consentement optin b2b | \n",
+ " False | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 274380 | \n",
+ " 4510423 | \n",
+ " 1 | \n",
+ " DDCP_marseille_jazz_2023 | \n",
+ " False | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 307511 | \n",
+ " 5174466 | \n",
+ " 1 | \n",
+ " ddcp actoral 21-22 | \n",
+ " False | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 357509 | \n",
+ " 4442526 | \n",
+ " 1 | \n",
+ " ddcp musique barvalo | \n",
+ " False | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 392920 | \n",
+ " 4390642 | \n",
+ " 1 | \n",
+ " ddcp_md_promo_spectateurs theatre contempo | \n",
+ " False | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 449620 | \n",
+ " 4411897 | \n",
+ " 1 | \n",
+ " FORMATION _ acheteurs optin last year | \n",
+ " False | \n",
+ " manual_dynamic_filter | \n",
+ "
\n",
+ " \n",
+ " 503809 | \n",
+ " 4734591 | \n",
+ " 1 | \n",
+ " consentement optin mediation specialisee | \n",
+ " False | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 651222 | \n",
+ " 3554426 | \n",
+ " 1 | \n",
+ " consentement optin b2c | \n",
+ " False | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 654246 | \n",
+ " 5182212 | \n",
+ " 1 | \n",
+ " DDCP spectateurs Festival de Marseille 2023 | \n",
+ " False | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 654395 | \n",
+ " 5182456 | \n",
+ " 1 | \n",
+ " rencontres_echelle_spectateurs_2021_2023 | \n",
+ " False | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id customer_id target_name \\\n",
+ "8793 4584599 1 consentement optin jeune public \n",
+ "13249 4567465 1 DDCP rentrée culturelle 2023 \n",
+ "21424 4544805 1 spectateurs cine dimanche_cine concert_2122 \n",
+ "21665 4544911 1 DDCP Cine 2023 \n",
+ "22811 4545766 1 DDCP OLBJ! 2023 \n",
+ "57305 4457909 1 ddcp_promo_visiteurs occasionnels_musee_8mois \n",
+ "58843 3688872 1 DDCP promo livemag \n",
+ "66813 4313646 1 DDCP spectateurs Classique mais pas que 2022 \n",
+ "68367 4547662 1 ddcp_promo_musee_au moins 3 achats_dps8mois \n",
+ "77320 4285520 1 DDCP spectateurs Iminente \n",
+ "84350 4037805 1 DDCP spectateurs Marseille Jazz 18-19-21 \n",
+ "85383 4569504 1 DDCP rendez-vous de septembre offre spéciale \n",
+ "92868 4433064 1 ddcp_promo_plein air_ateliers_jardins \n",
+ "99670 3858684 1 Acid Arab \n",
+ "105477 4321810 1 Arenametrix_bascule tel vers sib \n",
+ "169513 3697992 1 ddcp_achats billets nb dps 19052021 \n",
+ "214421 2925324 1 consentement optout scolaires \n",
+ "234546 4575957 1 Portrait de Leila shahid \n",
+ "259808 3722259 1 consentement optin b2b \n",
+ "274380 4510423 1 DDCP_marseille_jazz_2023 \n",
+ "307511 5174466 1 ddcp actoral 21-22 \n",
+ "357509 4442526 1 ddcp musique barvalo \n",
+ "392920 4390642 1 ddcp_md_promo_spectateurs theatre contempo \n",
+ "449620 4411897 1 FORMATION _ acheteurs optin last year \n",
+ "503809 4734591 1 consentement optin mediation specialisee \n",
+ "651222 3554426 1 consentement optin b2c \n",
+ "654246 5182212 1 DDCP spectateurs Festival de Marseille 2023 \n",
+ "654395 5182456 1 rencontres_echelle_spectateurs_2021_2023 \n",
+ "\n",
+ " target_type_is_import target_type_name \n",
+ "8793 False manual_static_filter \n",
+ "13249 False manual_static_filter \n",
+ "21424 False manual_static_filter \n",
+ "21665 False manual_static_filter \n",
+ "22811 False manual_static_filter \n",
+ "57305 False manual_dynamic_filter \n",
+ "58843 False manual_static_filter \n",
+ "66813 False manual_static_filter \n",
+ "68367 False manual_dynamic_filter \n",
+ "77320 False manual_static_filter \n",
+ "84350 False manual_static_filter \n",
+ "85383 False manual_static_filter \n",
+ "92868 False manual_static_filter \n",
+ "99670 False manual_static_filter \n",
+ "105477 False manual_static_filter \n",
+ "169513 False manual_static_filter \n",
+ "214421 False manual_static_filter \n",
+ "234546 False manual_static_filter \n",
+ "259808 False manual_static_filter \n",
+ "274380 False manual_static_filter \n",
+ "307511 False manual_static_filter \n",
+ "357509 False manual_static_filter \n",
+ "392920 False manual_static_filter \n",
+ "449620 False manual_dynamic_filter \n",
+ "503809 False manual_static_filter \n",
+ "651222 False manual_static_filter \n",
+ "654246 False manual_static_filter \n",
+ "654395 False manual_static_filter "
+ ]
+ },
+ "execution_count": 57,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df1_targets_full[df1_targets_full['customer_id'] == 1]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2f665824-a026-4acd-8358-b408a61854b4",
+ "metadata": {},
+ "source": [
+ "## Campaign area"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "id": "5d05203c-ea30-4208-a29f-fef7737c672e",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/tmp/ipykernel_9792/1967867975.py:15: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
+ "/tmp/ipykernel_9792/1967867975.py:15: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
+ "/tmp/ipykernel_9792/1967867975.py:15: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n"
+ ]
+ }
+ ],
+ "source": [
+ "# campaign_stats cleaning: take an explicit copy so that cleaning_date's column assignment targets an\n",
+ "# independent DataFrame (assigning into the plain column selection raised SettingWithCopyWarning)\n",
+ "df1_campaign_stats_clean = df1_campaign_stats[[\"id\", \"campaign_id\", \"customer_id\", \"opened_at\", \"sent_at\", \"delivered_at\"]].copy()\n",
+ "for date_column in ('opened_at', 'sent_at', 'delivered_at'):\n",
+ "    cleaning_date(df1_campaign_stats_clean, date_column)\n",
+ "\n",
+ "# campaigns cleaning: the kept columns get the campaign_ prefix, then the send date is parsed\n",
+ "df1_campaigns_clean = df1_campaigns[[\"id\", \"name\", \"service_id\", \"sent_at\"]].add_prefix(\"campaign_\")\n",
+ "cleaning_date(df1_campaigns_clean, 'campaign_sent_at')\n",
+ "\n",
+ "# Left merge on campaign_id (keeps every campaign_stats row), then drop the join key\n",
+ "df1_campaigns_full = pd.merge(df1_campaign_stats_clean, df1_campaigns_clean, on=\"campaign_id\", how=\"left\")\n",
+ "df1_campaigns_full = df1_campaigns_full.drop(columns=['campaign_id'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "id": "8ac634cf-2a30-4ccc-a34d-0fd401a49aaa",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 6214808 entries, 0 to 6214807\n",
+ "Data columns (total 8 columns):\n",
+ " # Column Dtype \n",
+ "--- ------ ----- \n",
+ " 0 id int64 \n",
+ " 1 customer_id int64 \n",
+ " 2 opened_at datetime64[ns, UTC]\n",
+ " 3 sent_at datetime64[ns, UTC]\n",
+ " 4 delivered_at datetime64[ns, UTC]\n",
+ " 5 campaign_name object \n",
+ " 6 campaign_service_id int64 \n",
+ " 7 campaign_sent_at datetime64[ns, UTC]\n",
+ "dtypes: datetime64[ns, UTC](4), int64(3), object(1)\n",
+ "memory usage: 379.3+ MB\n"
+ ]
+ }
+ ],
+ "source": [
+ "df1_campaigns_full.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "id": "7d22cdd5-2060-4922-8e04-27b613d4ee27",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " customer_id | \n",
+ " opened_at | \n",
+ " sent_at | \n",
+ " delivered_at | \n",
+ " campaign_name | \n",
+ " campaign_service_id | \n",
+ " campaign_sent_at | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 19793 | \n",
+ " 112597 | \n",
+ " NaT | \n",
+ " 2021-03-28 16:01:09+00:00 | \n",
+ " 2021-03-28 16:24:18+00:00 | \n",
+ " Le Mucem chez vous, gardons le lien #22 | \n",
+ " 404 | \n",
+ " 2021-03-27 23:00:00+00:00 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 14211 | \n",
+ " 113666 | \n",
+ " NaT | \n",
+ " 2021-03-28 16:01:09+00:00 | \n",
+ " 2021-03-28 16:21:02+00:00 | \n",
+ " Le Mucem chez vous, gardons le lien #22 | \n",
+ " 404 | \n",
+ " 2021-03-27 23:00:00+00:00 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 13150 | \n",
+ " 280561 | \n",
+ " NaT | \n",
+ " 2021-03-28 16:00:59+00:00 | \n",
+ " 2021-03-28 16:08:45+00:00 | \n",
+ " Le Mucem chez vous, gardons le lien #22 | \n",
+ " 404 | \n",
+ " 2021-03-27 23:00:00+00:00 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 7073 | \n",
+ " 101007 | \n",
+ " 2021-03-28 18:11:06+00:00 | \n",
+ " 2021-03-28 16:00:59+00:00 | \n",
+ " 2021-03-28 16:09:47+00:00 | \n",
+ " Le Mucem chez vous, gardons le lien #22 | \n",
+ " 404 | \n",
+ " 2021-03-27 23:00:00+00:00 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5175 | \n",
+ " 103972 | \n",
+ " NaT | \n",
+ " 2021-03-28 16:01:06+00:00 | \n",
+ " 2021-03-28 16:05:03+00:00 | \n",
+ " Le Mucem chez vous, gardons le lien #22 | \n",
+ " 404 | \n",
+ " 2021-03-27 23:00:00+00:00 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 6214803 | \n",
+ " 8302994 | \n",
+ " 266155 | \n",
+ " 2023-10-23 09:43:25+00:00 | \n",
+ " 2023-10-23 09:32:33+00:00 | \n",
+ " 2023-10-23 09:32:34+00:00 | \n",
+ " dre_nov_2023 | \n",
+ " 1318 | \n",
+ " 2023-10-23 09:31:17+00:00 | \n",
+ "
\n",
+ " \n",
+ " 6214804 | \n",
+ " 8303307 | \n",
+ " 21355 | \n",
+ " 2023-10-23 09:44:02+00:00 | \n",
+ " 2023-10-23 09:32:49+00:00 | \n",
+ " 2023-10-23 09:32:49+00:00 | \n",
+ " dre_nov_2023 | \n",
+ " 1318 | \n",
+ " 2023-10-23 09:31:17+00:00 | \n",
+ "
\n",
+ " \n",
+ " 6214805 | \n",
+ " 8304346 | \n",
+ " 21849 | \n",
+ " 2023-10-23 09:45:52+00:00 | \n",
+ " 2023-10-23 09:33:28+00:00 | \n",
+ " 2023-10-23 09:33:29+00:00 | \n",
+ " dre_nov_2023 | \n",
+ " 1318 | \n",
+ " 2023-10-23 09:31:17+00:00 | \n",
+ "
\n",
+ " \n",
+ " 6214806 | \n",
+ " 8302037 | \n",
+ " 667789 | \n",
+ " 2023-10-23 09:47:32+00:00 | \n",
+ " 2023-10-23 09:31:53+00:00 | \n",
+ " 2023-10-23 09:31:54+00:00 | \n",
+ " dre_nov_2023 | \n",
+ " 1318 | \n",
+ " 2023-10-23 09:31:17+00:00 | \n",
+ "
\n",
+ " \n",
+ " 6214807 | \n",
+ " 8304939 | \n",
+ " 294154 | \n",
+ " NaT | \n",
+ " 2023-10-23 09:33:54+00:00 | \n",
+ " 2023-10-23 09:33:55+00:00 | \n",
+ " dre_nov_2023 | \n",
+ " 1318 | \n",
+ " 2023-10-23 09:31:17+00:00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
6214808 rows × 8 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id customer_id opened_at \\\n",
+ "0 19793 112597 NaT \n",
+ "1 14211 113666 NaT \n",
+ "2 13150 280561 NaT \n",
+ "3 7073 101007 2021-03-28 18:11:06+00:00 \n",
+ "4 5175 103972 NaT \n",
+ "... ... ... ... \n",
+ "6214803 8302994 266155 2023-10-23 09:43:25+00:00 \n",
+ "6214804 8303307 21355 2023-10-23 09:44:02+00:00 \n",
+ "6214805 8304346 21849 2023-10-23 09:45:52+00:00 \n",
+ "6214806 8302037 667789 2023-10-23 09:47:32+00:00 \n",
+ "6214807 8304939 294154 NaT \n",
+ "\n",
+ " sent_at delivered_at \\\n",
+ "0 2021-03-28 16:01:09+00:00 2021-03-28 16:24:18+00:00 \n",
+ "1 2021-03-28 16:01:09+00:00 2021-03-28 16:21:02+00:00 \n",
+ "2 2021-03-28 16:00:59+00:00 2021-03-28 16:08:45+00:00 \n",
+ "3 2021-03-28 16:00:59+00:00 2021-03-28 16:09:47+00:00 \n",
+ "4 2021-03-28 16:01:06+00:00 2021-03-28 16:05:03+00:00 \n",
+ "... ... ... \n",
+ "6214803 2023-10-23 09:32:33+00:00 2023-10-23 09:32:34+00:00 \n",
+ "6214804 2023-10-23 09:32:49+00:00 2023-10-23 09:32:49+00:00 \n",
+ "6214805 2023-10-23 09:33:28+00:00 2023-10-23 09:33:29+00:00 \n",
+ "6214806 2023-10-23 09:31:53+00:00 2023-10-23 09:31:54+00:00 \n",
+ "6214807 2023-10-23 09:33:54+00:00 2023-10-23 09:33:55+00:00 \n",
+ "\n",
+ " campaign_name campaign_service_id \\\n",
+ "0 Le Mucem chez vous, gardons le lien #22 404 \n",
+ "1 Le Mucem chez vous, gardons le lien #22 404 \n",
+ "2 Le Mucem chez vous, gardons le lien #22 404 \n",
+ "3 Le Mucem chez vous, gardons le lien #22 404 \n",
+ "4 Le Mucem chez vous, gardons le lien #22 404 \n",
+ "... ... ... \n",
+ "6214803 dre_nov_2023 1318 \n",
+ "6214804 dre_nov_2023 1318 \n",
+ "6214805 dre_nov_2023 1318 \n",
+ "6214806 dre_nov_2023 1318 \n",
+ "6214807 dre_nov_2023 1318 \n",
+ "\n",
+ " campaign_sent_at \n",
+ "0 2021-03-27 23:00:00+00:00 \n",
+ "1 2021-03-27 23:00:00+00:00 \n",
+ "2 2021-03-27 23:00:00+00:00 \n",
+ "3 2021-03-27 23:00:00+00:00 \n",
+ "4 2021-03-27 23:00:00+00:00 \n",
+ "... ... \n",
+ "6214803 2023-10-23 09:31:17+00:00 \n",
+ "6214804 2023-10-23 09:31:17+00:00 \n",
+ "6214805 2023-10-23 09:31:17+00:00 \n",
+ "6214806 2023-10-23 09:31:17+00:00 \n",
+ "6214807 2023-10-23 09:31:17+00:00 \n",
+ "\n",
+ "[6214808 rows x 8 columns]"
+ ]
+ },
+ "execution_count": 56,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df1_campaigns_information"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0a5b24f0-4bca-4cde-a6ba-eb130b38cac4",
+ "metadata": {
+ "jp-MarkdownHeadingCollapsed": true
+ },
+ "source": [
+ "## Link area"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "id": "bc63bc4e-6cc1-4d35-9635-faf55339e186",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " name | \n",
+ " service_id | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " process_id | \n",
+ " report_url | \n",
+ " category | \n",
+ " to_be_synced | \n",
+ " identifier | \n",
+ " sent_at | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1319613 | \n",
+ " newsletter enseignants janvier 2022 | \n",
+ " 721 | \n",
+ " 2022-01-14 16:06:42.586321+01:00 | \n",
+ " 2022-02-03 14:17:27.112963+01:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " False | \n",
+ " aba3b6fd5d186d28e06ff97135cade7f | \n",
+ " 2022-01-14 00:00:00+01:00 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1319586 | \n",
+ " lsf_janvier_2022 | \n",
+ " 717 | \n",
+ " 2022-01-07 11:30:35.315895+01:00 | \n",
+ " 2022-02-03 14:17:27.116171+01:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " False | \n",
+ " 788d986905533aba051261497ecffcbb | \n",
+ " 2022-01-07 00:00:00+01:00 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 1319282 | \n",
+ " Invitation à déjeuner au Mucem | Vernissage « ... | \n",
+ " 591 | \n",
+ " 2021-09-28 12:50:24.448752+02:00 | \n",
+ " 2022-02-03 14:17:27.119582+01:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " False | \n",
+ " 3493894fa4ea036cfc6433c3e2ee63b0 | \n",
+ " 2021-09-28 00:00:00+02:00 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1319283 | \n",
+ " Vacances de la Toussaint - centres des loisirs | \n",
+ " 590 | \n",
+ " 2021-09-28 18:01:04.692073+02:00 | \n",
+ " 2022-02-03 14:17:27.124408+01:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " False | \n",
+ " 08b255a5d42b89b0585260b6f2360bdd | \n",
+ " 2021-09-28 00:00:00+02:00 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1319636 | \n",
+ " ddcp_promo_md_livemag | \n",
+ " 730 | \n",
+ " 2022-01-27 18:00:41.053069+01:00 | \n",
+ " 2022-02-03 14:17:27.127607+01:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " False | \n",
+ " d5cfead94f5350c12c322b5b664544c1 | \n",
+ " 2022-01-27 00:00:00+01:00 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 952 | \n",
+ " 1320072 | \n",
+ " dre_gaza0106 | \n",
+ " 881 | \n",
+ " 2022-05-26 09:01:35.523639+02:00 | \n",
+ " 2022-12-02 17:51:22.614046+01:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " False | \n",
+ " 7504adad8bb96320eb3afdd4df6e1f60 | \n",
+ " 2022-05-26 00:00:00+02:00 | \n",
+ "
\n",
+ " \n",
+ " 953 | \n",
+ " 661398 | \n",
+ " DDCP Plan Bis 4 - Marketing direct - MJ5C | \n",
+ " 183 | \n",
+ " 2021-06-18 10:30:01.259578+02:00 | \n",
+ " 2021-09-24 11:56:09.082785+02:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " False | \n",
+ " cedebb6e872f539bef8c3f919874e9d7 | \n",
+ " 2020-07-27 00:00:00+02:00 | \n",
+ "
\n",
+ " \n",
+ " 954 | \n",
+ " 1320487 | \n",
+ " Invitation portes ouvertes amitiés | \n",
+ " 988 | \n",
+ " 2022-09-29 18:01:33.834090+02:00 | \n",
+ " 2022-12-02 17:51:23.258324+01:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " False | \n",
+ " 9908279ebbf1f9b250ba689db6a0222b | \n",
+ " 2022-09-29 00:00:00+02:00 | \n",
+ "
\n",
+ " \n",
+ " 955 | \n",
+ " 906903 | \n",
+ " DDCP PROMO La méditerranée des philosophes #3 ... | \n",
+ " 310 | \n",
+ " 2021-07-19 14:07:16.177390+02:00 | \n",
+ " 2021-09-24 11:56:09.086101+02:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " False | \n",
+ " 06eb61b839a0cefee4967c67ccb099dc | \n",
+ " 2020-12-23 00:00:00+01:00 | \n",
+ "
\n",
+ " \n",
+ " 956 | \n",
+ " 579313 | \n",
+ " ddcp_promo_automation_manuel_pre_visit | \n",
+ " 481 | \n",
+ " 2021-06-08 17:38:54.041310+02:00 | \n",
+ " 2021-09-24 11:56:09.089394+02:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " False | \n",
+ " 9461cce28ebe3e76fb4b931c35a169b0 | \n",
+ " 2021-06-08 00:00:00+02:00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
957 rows × 11 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id name service_id \\\n",
+ "0 1319613 newsletter enseignants janvier 2022 721 \n",
+ "1 1319586 lsf_janvier_2022 717 \n",
+ "2 1319282 Invitation à déjeuner au Mucem | Vernissage « ... 591 \n",
+ "3 1319283 Vacances de la Toussaint - centres des loisirs 590 \n",
+ "4 1319636 ddcp_promo_md_livemag 730 \n",
+ ".. ... ... ... \n",
+ "952 1320072 dre_gaza0106 881 \n",
+ "953 661398 DDCP Plan Bis 4 - Marketing direct - MJ5C 183 \n",
+ "954 1320487 Invitation portes ouvertes amitiés 988 \n",
+ "955 906903 DDCP PROMO La méditerranée des philosophes #3 ... 310 \n",
+ "956 579313 ddcp_promo_automation_manuel_pre_visit 481 \n",
+ "\n",
+ " created_at updated_at \\\n",
+ "0 2022-01-14 16:06:42.586321+01:00 2022-02-03 14:17:27.112963+01:00 \n",
+ "1 2022-01-07 11:30:35.315895+01:00 2022-02-03 14:17:27.116171+01:00 \n",
+ "2 2021-09-28 12:50:24.448752+02:00 2022-02-03 14:17:27.119582+01:00 \n",
+ "3 2021-09-28 18:01:04.692073+02:00 2022-02-03 14:17:27.124408+01:00 \n",
+ "4 2022-01-27 18:00:41.053069+01:00 2022-02-03 14:17:27.127607+01:00 \n",
+ ".. ... ... \n",
+ "952 2022-05-26 09:01:35.523639+02:00 2022-12-02 17:51:22.614046+01:00 \n",
+ "953 2021-06-18 10:30:01.259578+02:00 2021-09-24 11:56:09.082785+02:00 \n",
+ "954 2022-09-29 18:01:33.834090+02:00 2022-12-02 17:51:23.258324+01:00 \n",
+ "955 2021-07-19 14:07:16.177390+02:00 2021-09-24 11:56:09.086101+02:00 \n",
+ "956 2021-06-08 17:38:54.041310+02:00 2021-09-24 11:56:09.089394+02:00 \n",
+ "\n",
+ " process_id report_url category to_be_synced \\\n",
+ "0 NaN NaN 0.0 False \n",
+ "1 NaN NaN 0.0 False \n",
+ "2 NaN NaN 0.0 False \n",
+ "3 NaN NaN 0.0 False \n",
+ "4 NaN NaN 0.0 False \n",
+ ".. ... ... ... ... \n",
+ "952 NaN NaN 0.0 False \n",
+ "953 NaN NaN 0.0 False \n",
+ "954 NaN NaN 0.0 False \n",
+ "955 NaN NaN 0.0 False \n",
+ "956 NaN NaN 0.0 False \n",
+ "\n",
+ " identifier sent_at \n",
+ "0 aba3b6fd5d186d28e06ff97135cade7f 2022-01-14 00:00:00+01:00 \n",
+ "1 788d986905533aba051261497ecffcbb 2022-01-07 00:00:00+01:00 \n",
+ "2 3493894fa4ea036cfc6433c3e2ee63b0 2021-09-28 00:00:00+02:00 \n",
+ "3 08b255a5d42b89b0585260b6f2360bdd 2021-09-28 00:00:00+02:00 \n",
+ "4 d5cfead94f5350c12c322b5b664544c1 2022-01-27 00:00:00+01:00 \n",
+ ".. ... ... \n",
+ "952 7504adad8bb96320eb3afdd4df6e1f60 2022-05-26 00:00:00+02:00 \n",
+ "953 cedebb6e872f539bef8c3f919874e9d7 2020-07-27 00:00:00+02:00 \n",
+ "954 9908279ebbf1f9b250ba689db6a0222b 2022-09-29 00:00:00+02:00 \n",
+ "955 06eb61b839a0cefee4967c67ccb099dc 2020-12-23 00:00:00+01:00 \n",
+ "956 9461cce28ebe3e76fb4b931c35a169b0 2021-06-08 00:00:00+02:00 \n",
+ "\n",
+ "[957 rows x 11 columns]"
+ ]
+ },
+ "execution_count": 37,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df1_campaigns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "id": "c19b321f-65f9-4d6c-8c1f-edb2eb9d70e7",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " clicked_at | \n",
+ " link_id | \n",
+ " customer_id | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2021-03-26 16:30:36+01:00 | \n",
+ " 1 | \n",
+ " 284033 | \n",
+ " 2021-03-26 15:30:37.050161+01:00 | \n",
+ " 2021-03-26 15:30:37.050161+01:00 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 2021-03-26 17:16:34+01:00 | \n",
+ " 2 | \n",
+ " 119768 | \n",
+ " 2021-03-26 16:16:34.950871+01:00 | \n",
+ " 2021-03-26 16:16:34.950871+01:00 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 272 | \n",
+ " 2021-03-28 20:03:32+02:00 | \n",
+ " 42 | \n",
+ " 113105 | \n",
+ " 2021-03-28 18:03:32.736394+02:00 | \n",
+ " 2021-03-28 18:03:32.736394+02:00 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 2021-03-26 17:43:19+01:00 | \n",
+ " 3 | \n",
+ " 272280 | \n",
+ " 2021-03-26 16:43:19.338321+01:00 | \n",
+ " 2021-03-26 16:43:19.338321+01:00 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 2021-03-26 17:46:00+01:00 | \n",
+ " 3 | \n",
+ " 105095 | \n",
+ " 2021-03-26 16:46:00.502945+01:00 | \n",
+ " 2021-03-26 16:46:00.502945+01:00 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 151046 | \n",
+ " 243553 | \n",
+ " 2023-11-09 16:34:27+01:00 | \n",
+ " 14666 | \n",
+ " 998 | \n",
+ " 2023-11-09 15:34:29.425425+01:00 | \n",
+ " 2023-11-09 15:34:29.425425+01:00 | \n",
+ "
\n",
+ " \n",
+ " 151047 | \n",
+ " 243554 | \n",
+ " 2023-11-09 16:34:35+01:00 | \n",
+ " 14670 | \n",
+ " 998 | \n",
+ " 2023-11-09 15:34:37.505505+01:00 | \n",
+ " 2023-11-09 15:34:37.505505+01:00 | \n",
+ "
\n",
+ " \n",
+ " 151048 | \n",
+ " 243559 | \n",
+ " 2023-11-09 16:51:15+01:00 | \n",
+ " 14686 | \n",
+ " 82923 | \n",
+ " 2023-11-09 15:51:17.439518+01:00 | \n",
+ " 2023-11-09 15:51:17.439518+01:00 | \n",
+ "
\n",
+ " \n",
+ " 151049 | \n",
+ " 243561 | \n",
+ " 2023-11-09 16:59:42+01:00 | \n",
+ " 14677 | \n",
+ " 82923 | \n",
+ " 2023-11-09 15:59:44.030922+01:00 | \n",
+ " 2023-11-09 15:59:44.030922+01:00 | \n",
+ "
\n",
+ " \n",
+ " 151050 | \n",
+ " 243564 | \n",
+ " 2023-11-09 17:16:41+01:00 | \n",
+ " 14691 | \n",
+ " 1254355 | \n",
+ " 2023-11-09 16:16:43.012932+01:00 | \n",
+ " 2023-11-09 16:16:43.012932+01:00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
151051 rows × 6 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id clicked_at link_id customer_id \\\n",
+ "0 1 2021-03-26 16:30:36+01:00 1 284033 \n",
+ "1 2 2021-03-26 17:16:34+01:00 2 119768 \n",
+ "2 272 2021-03-28 20:03:32+02:00 42 113105 \n",
+ "3 4 2021-03-26 17:43:19+01:00 3 272280 \n",
+ "4 5 2021-03-26 17:46:00+01:00 3 105095 \n",
+ "... ... ... ... ... \n",
+ "151046 243553 2023-11-09 16:34:27+01:00 14666 998 \n",
+ "151047 243554 2023-11-09 16:34:35+01:00 14670 998 \n",
+ "151048 243559 2023-11-09 16:51:15+01:00 14686 82923 \n",
+ "151049 243561 2023-11-09 16:59:42+01:00 14677 82923 \n",
+ "151050 243564 2023-11-09 17:16:41+01:00 14691 1254355 \n",
+ "\n",
+ " created_at updated_at \n",
+ "0 2021-03-26 15:30:37.050161+01:00 2021-03-26 15:30:37.050161+01:00 \n",
+ "1 2021-03-26 16:16:34.950871+01:00 2021-03-26 16:16:34.950871+01:00 \n",
+ "2 2021-03-28 18:03:32.736394+02:00 2021-03-28 18:03:32.736394+02:00 \n",
+ "3 2021-03-26 16:43:19.338321+01:00 2021-03-26 16:43:19.338321+01:00 \n",
+ "4 2021-03-26 16:46:00.502945+01:00 2021-03-26 16:46:00.502945+01:00 \n",
+ "... ... ... \n",
+ "151046 2023-11-09 15:34:29.425425+01:00 2023-11-09 15:34:29.425425+01:00 \n",
+ "151047 2023-11-09 15:34:37.505505+01:00 2023-11-09 15:34:37.505505+01:00 \n",
+ "151048 2023-11-09 15:51:17.439518+01:00 2023-11-09 15:51:17.439518+01:00 \n",
+ "151049 2023-11-09 15:59:44.030922+01:00 2023-11-09 15:59:44.030922+01:00 \n",
+ "151050 2023-11-09 16:16:43.012932+01:00 2023-11-09 16:16:43.012932+01:00 \n",
+ "\n",
+ "[151051 rows x 6 columns]"
+ ]
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df1_link_stats"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "96ea2523-38dc-47ef-a49e-2c2d9ad0b1c6",
+ "metadata": {
+ "jp-MarkdownHeadingCollapsed": true
+ },
+ "source": [
+ "## Exploration variables"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "id": "aaa41688-ea7e-4dba-851c-1f0b0ec43c71",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Fonction d'exploration pour suppliers.csv = label itr et commission inconnues\n",
+ "def suppliers_exploration(suppliers = None) : \n",
+ " \n",
+ " # Taux de NaN pour ces colonnes\n",
+ " label_na = suppliers['label'].isna().sum()/len(suppliers)*100\n",
+ " itr_na = suppliers['itr'].isna().sum()/len(suppliers)*100\n",
+ " commission_na = suppliers['commission'].isna().sum()/len(suppliers)*100\n",
+ "\n",
+ " suppliers_desc = pd.DataFrame({'nb_suppliers' : [suppliers['name'].nunique()],\n",
+ " 'label_na' : [label_na],\n",
+ " 'itr_na' : [itr_na],\n",
+ " 'commission_na' : [commission_na]})\n",
+ "\n",
+ " return suppliers_desc"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "id": "2fecc2e1-113f-46ed-9065-0b9ee416166e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df1_suppliers_desc = suppliers_exploration(suppliers = df1_suppliers)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "id": "55f6170a-36fb-4efb-9810-f982883660cf",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " nb_suppliers | \n",
+ " label_na | \n",
+ " itr_na | \n",
+ " commission_na | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 9 | \n",
+ " 100.0 | \n",
+ " 100.0 | \n",
+ " 100.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " nb_suppliers label_na itr_na commission_na\n",
+ "0 9 100.0 100.0 100.0"
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df1_suppliers_desc"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "id": "0030fd02-09e3-42f5-9c83-290458a38c29",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "BUCKET = \"bdc2324-data\"\n",
+ "liste_folders = fs.ls(BUCKET)\n",
+ "\n",
+ "liste_files = []\n",
+ "for company_folder in liste_folders : \n",
+ " liste_files.extend(fs.ls(company_folder))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "id": "6b1736d1-8fd7-4fcc-9431-b8bf0c7b4f2b",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['bdc2324-data/1/1suppliers.csv', 'bdc2324-data/10/10suppliers.csv', 'bdc2324-data/101/101suppliers.csv', 'bdc2324-data/11/11suppliers.csv', 'bdc2324-data/12/12suppliers.csv', 'bdc2324-data/13/13suppliers.csv', 'bdc2324-data/14/14suppliers.csv', 'bdc2324-data/2/2suppliers.csv', 'bdc2324-data/3/3suppliers.csv', 'bdc2324-data/4/4suppliers.csv', 'bdc2324-data/5/5suppliers.csv', 'bdc2324-data/6/6suppliers.csv', 'bdc2324-data/7/7suppliers.csv', 'bdc2324-data/8/8suppliers.csv', 'bdc2324-data/9/9suppliers.csv']\n"
+ ]
+ }
+ ],
+ "source": [
+ "liste_database_select = ['suppliers']\n",
+ "\n",
+ "# Filtrer la liste pour les éléments contenant au moins un élément de la liste à tester\n",
+ "liste_suppliers = [element for element in liste_files if any(element_part in element for element_part in liste_database_select)]\n",
+ "\n",
+ "# Afficher le résultat\n",
+ "print(liste_suppliers)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "id": "226b694b-0b00-4167-b69f-3178902254eb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# loop to create dataframes from file 2\n",
+ "def database_loading(database_name = None):\n",
+ " files_path = database_name\n",
+ " \n",
+ " client_number = files_path[0].split(\"/\")[1]\n",
+ " df_prefix = \"df\" + str(client_number) + \"_\"\n",
+ " \n",
+ " for i in range(len(files_path)) :\n",
+ " current_path = files_path[i]\n",
+ " with fs.open(current_path, mode=\"rb\") as file_in:\n",
+ " df = pd.read_csv(file_in)\n",
+ " # the pattern of the name is df1xxx\n",
+ " nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n",
+ " globals()[nom_dataframe] = df\n",
+ "\n",
+ " "
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/Notebook_AJ.ipynb b/Notebook_AJ.ipynb
deleted file mode 100644
index 19272b5..0000000
--- a/Notebook_AJ.ipynb
+++ /dev/null
@@ -1,823 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "8c8e008c-9b92-41f1-88c1-8ec462e4ecab",
- "metadata": {},
- "source": [
- "# Business Data Challenge - Team 1"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "88af2795-8bf9-4df0-a059-be7c28fb4289",
- "metadata": {},
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "import numpy as np"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "e05cd2c9-3f76-48e3-b4a6-5055445af2e4",
- "metadata": {},
- "source": [
- "Configuration de l'accès aux données"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "3ba1f385-2a2f-4b0c-be79-66f618469a9f",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['bdc2324-data/1',\n",
- " 'bdc2324-data/10',\n",
- " 'bdc2324-data/101',\n",
- " 'bdc2324-data/11',\n",
- " 'bdc2324-data/12',\n",
- " 'bdc2324-data/13',\n",
- " 'bdc2324-data/14',\n",
- " 'bdc2324-data/2',\n",
- " 'bdc2324-data/3',\n",
- " 'bdc2324-data/4',\n",
- " 'bdc2324-data/5',\n",
- " 'bdc2324-data/6',\n",
- " 'bdc2324-data/7',\n",
- " 'bdc2324-data/8',\n",
- " 'bdc2324-data/9']"
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "import os\n",
- "import s3fs\n",
- "# Create filesystem object\n",
- "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
- "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n",
- "\n",
- "BUCKET = \"bdc2324-data\"\n",
- "fs.ls(BUCKET)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "id": "ba9d04ad-6cc1-4bac-b1a0-44bedfb09763",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Chargement des fichiers campaign_stats.csv\n",
- "FILE_PATH_S3 = 'bdc2324-data/1/1campaign_stats.csv'\n",
- "\n",
- "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
- " campaign_stats_1 = pd.read_csv(file_in, sep=\",\")\n",
- "\n",
- "FILE_PATH_S3 = 'bdc2324-data/2/2campaign_stats.csv'\n",
- "\n",
- "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
- " campaign_stats_2 = pd.read_csv(file_in, sep=\",\")\n",
- "\n",
- "FILE_PATH_S3 = 'bdc2324-data/3/3campaign_stats.csv'\n",
- "\n",
- "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
- " campaign_stats_3 = pd.read_csv(file_in, sep=\",\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "id": "cacaecc1-4d8a-4e20-8cd3-b452cf17db56",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Conversion des dates 'sent_at'\n",
- "campaign_stats_1['sent_at'] = pd.to_datetime(campaign_stats_1['sent_at'], format = 'ISO8601', utc = True)\n",
- "campaign_stats_2['sent_at'] = pd.to_datetime(campaign_stats_2['sent_at'], format = 'ISO8601', utc = True)\n",
- "campaign_stats_3['sent_at'] = pd.to_datetime(campaign_stats_3['sent_at'], format = 'ISO8601', utc = True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "id": "2ec4b583-dc64-43e9-b3ae-6bbaee0bc135",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "2023-11-09 18:10:45+00:00\n",
- "2020-06-02 08:24:08+00:00\n",
- "2023-10-12 01:39:48+00:00\n",
- "2023-10-10 17:06:29+00:00\n",
- "2023-11-01 09:20:48+00:00\n",
- "2021-03-31 14:59:02+00:00\n"
- ]
- }
- ],
- "source": [
- "# Chaque unites correspond à une période ? --> Non, les dossiers ont juste pour but de réduire la taille des fichiers\n",
- "print(campaign_stats_1['sent_at'].max())\n",
- "print(campaign_stats_1['sent_at'].min())\n",
- "\n",
- "print(campaign_stats_2['sent_at'].max())\n",
- "print(campaign_stats_2['sent_at'].min())\n",
- "\n",
- "print(campaign_stats_3['sent_at'].max())\n",
- "print(campaign_stats_3['sent_at'].min())"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "id": "77894273-b3e5-4f29-bd63-9f4df8082b9b",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "0 2021-03-28 16:01:09+00:00\n",
- "1 2021-03-28 16:01:09+00:00\n",
- "2 2021-03-28 16:00:59+00:00\n",
- "3 2021-03-28 16:00:59+00:00\n",
- "4 2021-03-28 16:01:06+00:00\n",
- " ... \n",
- "6214803 2023-10-23 09:32:33+00:00\n",
- "6214804 2023-10-23 09:32:49+00:00\n",
- "6214805 2023-10-23 09:33:28+00:00\n",
- "6214806 2023-10-23 09:31:53+00:00\n",
- "6214807 2023-10-23 09:33:54+00:00\n",
- "Name: sent_at, Length: 6214808, dtype: datetime64[ns, UTC]"
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "campaign_stats_1['sent_at']"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "31f2edbf-5661-4516-9835-06d4da615c13",
- "metadata": {},
- "source": [
- "### Customersplus.csv"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "id": "4223c873-cbd3-46d1-ac96-c9a3b9e97092",
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/tmp/ipykernel_1362/4118060109.py:9: DtypeWarning: Columns (20) have mixed types. Specify dtype option on import or set low_memory=False.\n",
- " customers_plus_2 = pd.read_csv(file_in, sep=\",\")\n"
- ]
- }
- ],
- "source": [
- "FILE_PATH_S3 = 'bdc2324-data/1/1customersplus.csv'\n",
- "\n",
- "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
- " customers_plus_1 = pd.read_csv(file_in, sep=\",\")\n",
- "\n",
- "FILE_PATH_S3 = 'bdc2324-data/2/2customersplus.csv'\n",
- "\n",
- "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
- " customers_plus_2 = pd.read_csv(file_in, sep=\",\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "id": "460f853a-68c0-42a7-9877-b83d3aaec813",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['id', 'lastname', 'firstname', 'birthdate', 'email', 'street_id',\n",
- " 'created_at', 'updated_at', 'civility', 'is_partner', 'extra',\n",
- " 'deleted_at', 'reference', 'gender', 'is_email_true', 'extra_field',\n",
- " 'identifier', 'opt_in', 'structure_id', 'note', 'profession',\n",
- " 'language', 'mcp_contact_id', 'need_reload', 'last_buying_date',\n",
- " 'max_price', 'ticket_sum', 'average_price', 'fidelity',\n",
- " 'average_purchase_delay', 'average_price_basket',\n",
- " 'average_ticket_basket', 'total_price', 'preferred_category',\n",
- " 'preferred_supplier', 'preferred_formula', 'purchase_count',\n",
- " 'first_buying_date', 'last_visiting_date', 'zipcode', 'country', 'age',\n",
- " 'tenant_id'],\n",
- " dtype='object')"
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "customers_plus_1.columns"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "d5a9398f-72fc-4548-9f53-b20b372144b2",
- "metadata": {},
- "outputs": [],
- "source": [
- "customers_plus_1.shape"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "7467ddbe-0bd4-44cc-8a16-84aa41853638",
- "metadata": {},
- "outputs": [],
- "source": [
- "customers_plus_1['id'].nunique()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "e15f05f8-3a89-4fc3-84a9-dae70e168440",
- "metadata": {},
- "outputs": [],
- "source": [
- "customers_plus_2['id'].nunique()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "b40a653e-013f-48d0-8b57-0284587b36c5",
- "metadata": {},
- "outputs": [],
- "source": [
- "common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 61,
- "id": "32fa2215-3c79-40b5-8643-755865959fc7",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "1"
- ]
- },
- "execution_count": 61,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])\n",
- "# Exemple id commun = caractéristiques communes\n",
- "print(customers_plus_2[customers_plus_2['id'] == list(common_id)[0]])\n",
- "\n",
- "print(customers_plus_1[customers_plus_1['id'] == list(common_id)[0]])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 49,
- "id": "0eb345e4-69f5-4e16-ac57-e33674c6c43d",
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "id 0.000000\n",
- "lastname 43.461341\n",
- "firstname 44.995588\n",
- "birthdate 96.419870\n",
- "email 8.622075\n",
- "street_id 0.000000\n",
- "created_at 0.000000\n",
- "updated_at 0.000000\n",
- "civility 100.000000\n",
- "is_partner 0.000000\n",
- "extra 100.000000\n",
- "deleted_at 100.000000\n",
- "reference 100.000000\n",
- "gender 0.000000\n",
- "is_email_true 0.000000\n",
- "extra_field 100.000000\n",
- "identifier 0.000000\n",
- "opt_in 0.000000\n",
- "structure_id 88.072380\n",
- "note 99.403421\n",
- "profession 95.913503\n",
- "language 99.280945\n",
- "mcp_contact_id 34.876141\n",
- "need_reload 0.000000\n",
- "last_buying_date 51.653431\n",
- "max_price 51.653431\n",
- "ticket_sum 0.000000\n",
- "average_price 8.639195\n",
- "fidelity 0.000000\n",
- "average_purchase_delay 51.653431\n",
- "average_price_basket 51.653431\n",
- "average_ticket_basket 51.653431\n",
- "total_price 43.014236\n",
- "preferred_category 100.000000\n",
- "preferred_supplier 100.000000\n",
- "preferred_formula 100.000000\n",
- "purchase_count 0.000000\n",
- "first_buying_date 51.653431\n",
- "last_visiting_date 100.000000\n",
- "zipcode 71.176564\n",
- "country 5.459418\n",
- "age 96.419870\n",
- "tenant_id 0.000000\n",
- "dtype: float64\n"
- ]
- }
- ],
- "source": [
- "pd.DataFrame(customers_plus_1.isna().mean()*100)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "id": "6f6ce60d-0912-497d-9108-330acccef394",
- "metadata": {},
- "outputs": [],
- "source": [
- "# Chargement de toutes les données\n",
- "liste_base = ['customer_target_mappings', 'customersplus', 'target_types', 'tags', 'events', 'tickets', 'representations', 'purchases', 'products']\n",
- "\n",
- "for nom_base in liste_base:\n",
- " FILE_PATH_S3 = 'bdc2324-data/11/11' + nom_base + '.csv'\n",
- " with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
- " globals()[nom_base] = pd.read_csv(file_in, sep=\",\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "id": "fa8ee17d-5092-40ac-8a0a-3790b016dd4e",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id | \n",
- " lastname | \n",
- " firstname | \n",
- " birthdate | \n",
- " email | \n",
- " street_id | \n",
- " created_at | \n",
- " updated_at | \n",
- " civility | \n",
- " is_partner | \n",
- " ... | \n",
- " tenant_id | \n",
- " id_x | \n",
- " customer_id | \n",
- " purchase_date | \n",
- " type_of | \n",
- " is_from_subscription | \n",
- " amount | \n",
- " is_full_price | \n",
- " start_date_time | \n",
- " event_name | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 405082 | \n",
- " lastname405082 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 6 | \n",
- " 2023-01-12 06:30:31.197484+01:00 | \n",
- " 2023-01-12 06:30:31.197484+01:00 | \n",
- " NaN | \n",
- " False | \n",
- " ... | \n",
- " 1556 | \n",
- " 992423 | \n",
- " 405082 | \n",
- " 2023-01-11 17:08:41+01:00 | \n",
- " 3 | \n",
- " False | \n",
- " 13.0 | \n",
- " False | \n",
- " 2023-02-06 20:00:00+01:00 | \n",
- " zaide | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 405082 | \n",
- " lastname405082 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 6 | \n",
- " 2023-01-12 06:30:31.197484+01:00 | \n",
- " 2023-01-12 06:30:31.197484+01:00 | \n",
- " NaN | \n",
- " False | \n",
- " ... | \n",
- " 1556 | \n",
- " 992423 | \n",
- " 405082 | \n",
- " 2023-01-11 17:08:41+01:00 | \n",
- " 3 | \n",
- " False | \n",
- " 13.0 | \n",
- " False | \n",
- " 2023-02-06 20:00:00+01:00 | \n",
- " zaide | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 411168 | \n",
- " lastname411168 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 6 | \n",
- " 2023-03-17 06:30:35.431967+01:00 | \n",
- " 2023-03-17 06:30:35.431967+01:00 | \n",
- " NaN | \n",
- " False | \n",
- " ... | \n",
- " 1556 | \n",
- " 1053934 | \n",
- " 411168 | \n",
- " 2023-03-16 16:23:10+01:00 | \n",
- " 3 | \n",
- " False | \n",
- " 62.0 | \n",
- " False | \n",
- " 2023-03-19 16:00:00+01:00 | \n",
- " luisa miller | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 411168 | \n",
- " lastname411168 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 6 | \n",
- " 2023-03-17 06:30:35.431967+01:00 | \n",
- " 2023-03-17 06:30:35.431967+01:00 | \n",
- " NaN | \n",
- " False | \n",
- " ... | \n",
- " 1556 | \n",
- " 1053934 | \n",
- " 411168 | \n",
- " 2023-03-16 16:23:10+01:00 | \n",
- " 3 | \n",
- " False | \n",
- " 62.0 | \n",
- " False | \n",
- " 2023-03-19 16:00:00+01:00 | \n",
- " luisa miller | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 4380 | \n",
- " lastname4380 | \n",
- " firstname4380 | \n",
- " NaN | \n",
- " NaN | \n",
- " 1 | \n",
- " 2021-04-22 14:51:55.432952+02:00 | \n",
- " 2022-04-14 11:41:33.738500+02:00 | \n",
- " NaN | \n",
- " False | \n",
- " ... | \n",
- " 1556 | \n",
- " 1189141 | \n",
- " 4380 | \n",
- " 2020-11-26 13:12:53+01:00 | \n",
- " 3 | \n",
- " False | \n",
- " 51.3 | \n",
- " False | \n",
- " 2020-12-01 20:00:00+01:00 | \n",
- " iphigenie en tauride | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 318964 | \n",
- " 19095 | \n",
- " lastname19095 | \n",
- " firstname19095 | \n",
- " 1979-07-16 | \n",
- " email19095 | \n",
- " 6 | \n",
- " 2021-04-22 15:06:30.120537+02:00 | \n",
- " 2023-09-12 18:27:36.904104+02:00 | \n",
- " NaN | \n",
- " False | \n",
- " ... | \n",
- " 1556 | \n",
- " 1090839 | \n",
- " 19095 | \n",
- " 2019-05-19 21:18:36+02:00 | \n",
- " 1 | \n",
- " False | \n",
- " 4.5 | \n",
- " False | \n",
- " 2019-05-27 20:00:00+02:00 | \n",
- " entre femmes | \n",
- "
\n",
- " \n",
- " 318965 | \n",
- " 19095 | \n",
- " lastname19095 | \n",
- " firstname19095 | \n",
- " 1979-07-16 | \n",
- " email19095 | \n",
- " 6 | \n",
- " 2021-04-22 15:06:30.120537+02:00 | \n",
- " 2023-09-12 18:27:36.904104+02:00 | \n",
- " NaN | \n",
- " False | \n",
- " ... | \n",
- " 1556 | \n",
- " 1090839 | \n",
- " 19095 | \n",
- " 2019-05-19 21:18:36+02:00 | \n",
- " 1 | \n",
- " False | \n",
- " 4.5 | \n",
- " False | \n",
- " 2019-05-27 20:00:00+02:00 | \n",
- " entre femmes | \n",
- "
\n",
- " \n",
- " 318966 | \n",
- " 19095 | \n",
- " lastname19095 | \n",
- " firstname19095 | \n",
- " 1979-07-16 | \n",
- " email19095 | \n",
- " 6 | \n",
- " 2021-04-22 15:06:30.120537+02:00 | \n",
- " 2023-09-12 18:27:36.904104+02:00 | \n",
- " NaN | \n",
- " False | \n",
- " ... | \n",
- " 1556 | \n",
- " 1090839 | \n",
- " 19095 | \n",
- " 2019-05-19 21:18:36+02:00 | \n",
- " 1 | \n",
- " False | \n",
- " 4.5 | \n",
- " False | \n",
- " 2019-05-27 20:00:00+02:00 | \n",
- " entre femmes | \n",
- "
\n",
- " \n",
- " 318967 | \n",
- " 19095 | \n",
- " lastname19095 | \n",
- " firstname19095 | \n",
- " 1979-07-16 | \n",
- " email19095 | \n",
- " 6 | \n",
- " 2021-04-22 15:06:30.120537+02:00 | \n",
- " 2023-09-12 18:27:36.904104+02:00 | \n",
- " NaN | \n",
- " False | \n",
- " ... | \n",
- " 1556 | \n",
- " 1244277 | \n",
- " 19095 | \n",
- " 2019-12-31 11:04:07+01:00 | \n",
- " 1 | \n",
- " False | \n",
- " 5.5 | \n",
- " False | \n",
- " 2020-02-03 20:00:00+01:00 | \n",
- " a boire et a manger | \n",
- "
\n",
- " \n",
- " 318968 | \n",
- " 19095 | \n",
- " lastname19095 | \n",
- " firstname19095 | \n",
- " 1979-07-16 | \n",
- " email19095 | \n",
- " 6 | \n",
- " 2021-04-22 15:06:30.120537+02:00 | \n",
- " 2023-09-12 18:27:36.904104+02:00 | \n",
- " NaN | \n",
- " False | \n",
- " ... | \n",
- " 1556 | \n",
- " 1244277 | \n",
- " 19095 | \n",
- " 2019-12-31 11:04:07+01:00 | \n",
- " 1 | \n",
- " False | \n",
- " 5.5 | \n",
- " False | \n",
- " 2020-02-03 20:00:00+01:00 | \n",
- " a boire et a manger | \n",
- "
\n",
- " \n",
- "
\n",
- "
318969 rows × 52 columns
\n",
- "
"
- ],
- "text/plain": [
- " id lastname firstname birthdate email \\\n",
- "0 405082 lastname405082 NaN NaN NaN \n",
- "1 405082 lastname405082 NaN NaN NaN \n",
- "2 411168 lastname411168 NaN NaN NaN \n",
- "3 411168 lastname411168 NaN NaN NaN \n",
- "4 4380 lastname4380 firstname4380 NaN NaN \n",
- "... ... ... ... ... ... \n",
- "318964 19095 lastname19095 firstname19095 1979-07-16 email19095 \n",
- "318965 19095 lastname19095 firstname19095 1979-07-16 email19095 \n",
- "318966 19095 lastname19095 firstname19095 1979-07-16 email19095 \n",
- "318967 19095 lastname19095 firstname19095 1979-07-16 email19095 \n",
- "318968 19095 lastname19095 firstname19095 1979-07-16 email19095 \n",
- "\n",
- " street_id created_at \\\n",
- "0 6 2023-01-12 06:30:31.197484+01:00 \n",
- "1 6 2023-01-12 06:30:31.197484+01:00 \n",
- "2 6 2023-03-17 06:30:35.431967+01:00 \n",
- "3 6 2023-03-17 06:30:35.431967+01:00 \n",
- "4 1 2021-04-22 14:51:55.432952+02:00 \n",
- "... ... ... \n",
- "318964 6 2021-04-22 15:06:30.120537+02:00 \n",
- "318965 6 2021-04-22 15:06:30.120537+02:00 \n",
- "318966 6 2021-04-22 15:06:30.120537+02:00 \n",
- "318967 6 2021-04-22 15:06:30.120537+02:00 \n",
- "318968 6 2021-04-22 15:06:30.120537+02:00 \n",
- "\n",
- " updated_at civility is_partner ... \\\n",
- "0 2023-01-12 06:30:31.197484+01:00 NaN False ... \n",
- "1 2023-01-12 06:30:31.197484+01:00 NaN False ... \n",
- "2 2023-03-17 06:30:35.431967+01:00 NaN False ... \n",
- "3 2023-03-17 06:30:35.431967+01:00 NaN False ... \n",
- "4 2022-04-14 11:41:33.738500+02:00 NaN False ... \n",
- "... ... ... ... ... \n",
- "318964 2023-09-12 18:27:36.904104+02:00 NaN False ... \n",
- "318965 2023-09-12 18:27:36.904104+02:00 NaN False ... \n",
- "318966 2023-09-12 18:27:36.904104+02:00 NaN False ... \n",
- "318967 2023-09-12 18:27:36.904104+02:00 NaN False ... \n",
- "318968 2023-09-12 18:27:36.904104+02:00 NaN False ... \n",
- "\n",
- " tenant_id id_x customer_id purchase_date type_of \\\n",
- "0 1556 992423 405082 2023-01-11 17:08:41+01:00 3 \n",
- "1 1556 992423 405082 2023-01-11 17:08:41+01:00 3 \n",
- "2 1556 1053934 411168 2023-03-16 16:23:10+01:00 3 \n",
- "3 1556 1053934 411168 2023-03-16 16:23:10+01:00 3 \n",
- "4 1556 1189141 4380 2020-11-26 13:12:53+01:00 3 \n",
- "... ... ... ... ... ... \n",
- "318964 1556 1090839 19095 2019-05-19 21:18:36+02:00 1 \n",
- "318965 1556 1090839 19095 2019-05-19 21:18:36+02:00 1 \n",
- "318966 1556 1090839 19095 2019-05-19 21:18:36+02:00 1 \n",
- "318967 1556 1244277 19095 2019-12-31 11:04:07+01:00 1 \n",
- "318968 1556 1244277 19095 2019-12-31 11:04:07+01:00 1 \n",
- "\n",
- " is_from_subscription amount is_full_price start_date_time \\\n",
- "0 False 13.0 False 2023-02-06 20:00:00+01:00 \n",
- "1 False 13.0 False 2023-02-06 20:00:00+01:00 \n",
- "2 False 62.0 False 2023-03-19 16:00:00+01:00 \n",
- "3 False 62.0 False 2023-03-19 16:00:00+01:00 \n",
- "4 False 51.3 False 2020-12-01 20:00:00+01:00 \n",
- "... ... ... ... ... \n",
- "318964 False 4.5 False 2019-05-27 20:00:00+02:00 \n",
- "318965 False 4.5 False 2019-05-27 20:00:00+02:00 \n",
- "318966 False 4.5 False 2019-05-27 20:00:00+02:00 \n",
- "318967 False 5.5 False 2020-02-03 20:00:00+01:00 \n",
- "318968 False 5.5 False 2020-02-03 20:00:00+01:00 \n",
- "\n",
- " event_name \n",
- "0 zaide \n",
- "1 zaide \n",
- "2 luisa miller \n",
- "3 luisa miller \n",
- "4 iphigenie en tauride \n",
- "... ... \n",
- "318964 entre femmes \n",
- "318965 entre femmes \n",
- "318966 entre femmes \n",
- "318967 a boire et a manger \n",
- "318968 a boire et a manger \n",
- "\n",
- "[318969 rows x 52 columns]"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Jointure\n",
- "merge_1 = pd.merge(purchases, tickets, left_on='id', right_on='purchase_id', how='inner')[['id_x', 'customer_id','product_id', 'purchase_date', 'type_of', 'is_from_subscription']]\n",
- "merge_2 = pd.merge(products, merge_1, left_on='id', right_on='product_id', how='inner')[['id_x', 'customer_id', 'representation_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price']]\n",
- "merge_3 = pd.merge(representations, merge_2, left_on='id', right_on='representation_id', how='inner')[['id_x', 'customer_id', 'event_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time']]\n",
- "merge_4 = pd.merge(events, merge_3, left_on='id', right_on='event_id', how='inner')[['id_x', 'customer_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'name']]\n",
- "merge_4 = merge_4.rename(columns={'name': 'event_name'})\n",
- "df_customer_event = pd.merge(customersplus, merge_4, left_on = 'id', right_on = 'customer_id', how = 'inner')[['id_x', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'event_name']]\n",
- "df_customer_event"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.13"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/Notebook_AR.ipynb b/Notebook_AR.ipynb
index 9107796..18b06d1 100644
--- a/Notebook_AR.ipynb
+++ b/Notebook_AR.ipynb
@@ -6103,6 +6103,403 @@
"representation_theme.head()"
]
},
+ {
+ "cell_type": "markdown",
+ "id": "e274e3cc-1b41-43e0-8412-1563166060cb",
+ "metadata": {},
+ "source": [
+ "## Price Table"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 112,
+ "id": "c52621e7-01de-48dc-b572-2974542a8be5",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "File path : bdc2324-data/1/1product_packs.csv\n",
+ "Shape : (1, 6)\n",
+ "Number of columns : 4\n",
+ "Columns : Index(['id', 'identifier', 'name', 'type_of'], dtype='object')\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " name | \n",
+ " type_of | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id name type_of\n",
+ "0 1 NaN 0"
+ ]
+ },
+ "execution_count": 112,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "product_packs = load_dataset(\"1product_packs.csv\")\n",
+ "product_packs.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 114,
+ "id": "9e4f60ab-9a2c-4090-b0c4-f9a1530b2d39",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "File path : bdc2324-data/1/1pricing_formulas.csv\n",
+ "Shape : (556, 6)\n",
+ "Number of columns : 4\n",
+ "Columns : Index(['id', 'identifier', 'name', 'extra_field'], dtype='object')\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " name | \n",
+ " extra_field | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 41909 | \n",
+ " visite mécènes 1h30 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 502 | \n",
+ " entree mucem tp( expo picasso) | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 504 | \n",
+ " nombre de personnes cinema | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 117 | \n",
+ " spectacle tarif e famille tr | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1496 | \n",
+ " billet nb famille mecene 1a | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id name extra_field\n",
+ "0 41909 visite mécènes 1h30 NaN\n",
+ "1 502 entree mucem tp( expo picasso) NaN\n",
+ "2 504 nombre de personnes cinema NaN\n",
+ "3 117 spectacle tarif e famille tr NaN\n",
+ "4 1496 billet nb famille mecene 1a NaN"
+ ]
+ },
+ "execution_count": 114,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pricing_formula = load_dataset(\"1pricing_formulas.csv\")\n",
+ "pricing_formula.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 115,
+ "id": "247b5c45-a18a-4cfd-86b4-d3453e157bcd",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "File path : bdc2324-data/1/1type_of_pricing_formulas.csv\n",
+ "Shape : (568, 6)\n",
+ "Number of columns : 4\n",
+ "Columns : Index(['id', 'type_of_id', 'pricing_formula_id', 'identifier'], dtype='object')\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " type_of_id | \n",
+ " pricing_formula_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 127 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2425 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 2937 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 48 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 1 | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id type_of_id pricing_formula_id\n",
+ "0 1 1 127\n",
+ "1 2 1 2425\n",
+ "2 3 1 2937\n",
+ "3 4 1 48\n",
+ "4 5 1 7"
+ ]
+ },
+ "execution_count": 115,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "type_pricing_formula = load_dataset(\"1type_of_pricing_formulas.csv\")\n",
+ "type_pricing_formula.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 117,
+ "id": "4b48f7b3-0f06-4ef6-9355-5016af82f49c",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "File path : bdc2324-data/1/1products_groups.csv\n",
+ "Shape : (92973, 9)\n",
+ "Number of columns : 7\n",
+ "Columns : Index(['id', 'category_id', 'pricing_formula_id', 'representation_id',\n",
+ " 'percent_price', 'max_price', 'min_price'],\n",
+ " dtype='object')\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " category_id | \n",
+ " pricing_formula_id | \n",
+ " representation_id | \n",
+ " percent_price | \n",
+ " max_price | \n",
+ " min_price | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 2735 | \n",
+ " 8 | \n",
+ " 97 | \n",
+ " 1534 | \n",
+ " 100.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 156773 | \n",
+ " 5 | \n",
+ " 9 | \n",
+ " 82519 | \n",
+ " 100.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 14387 | \n",
+ " 16 | \n",
+ " 79 | \n",
+ " 8046 | \n",
+ " 100.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 2770 | \n",
+ " 2 | \n",
+ " 37 | \n",
+ " 1563 | \n",
+ " 100.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 27179 | \n",
+ " 13 | \n",
+ " 119 | \n",
+ " 14192 | \n",
+ " 100.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id category_id pricing_formula_id representation_id percent_price \\\n",
+ "0 2735 8 97 1534 100.0 \n",
+ "1 156773 5 9 82519 100.0 \n",
+ "2 14387 16 79 8046 100.0 \n",
+ "3 2770 2 37 1563 100.0 \n",
+ "4 27179 13 119 14192 100.0 \n",
+ "\n",
+ " max_price min_price \n",
+ "0 0.0 0.0 \n",
+ "1 0.0 0.0 \n",
+ "2 0.0 0.0 \n",
+ "3 0.0 0.0 \n",
+ "4 0.0 0.0 "
+ ]
+ },
+ "execution_count": 117,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "product_groups = load_dataset(\"1products_groups.csv\")\n",
+ "product_groups.head()"
+ ]
+ },
{
"cell_type": "markdown",
"id": "71c26a38-6818-42df-8aee-0135681a5563",
@@ -6741,6 +7138,9 @@
"outputs": [],
"source": [
"def uniform_product_df():\n",
+ " \"\"\"\n",
+ " This function returns the uniform product dataset\n",
+ " \"\"\"\n",
" print(\"Products theme columns : \", products_theme.columns)\n",
" print(\"\\n Representation theme columns : \", representation_theme.columns)\n",
" print(\"\\n Events theme columns : \", events_theme.columns)\n",
diff --git a/TP_merge_tables_clean.ipynb b/TP_merge_tables_clean.ipynb
new file mode 100644
index 0000000..66b5228
--- /dev/null
+++ b/TP_merge_tables_clean.ipynb
@@ -0,0 +1,1760 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "8c56d518-3634-4492-b249-0d8ef33dd527",
+ "metadata": {},
+ "source": [
+ "## First steps: package imports, working environment setup and data import"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "dede42d9-1262-45f7-bd7a-586ae800092a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# imports\n",
+ "\n",
+ "import os \n",
+ "import s3fs\n",
+ "import pandas as pd\n",
+ "import re\n",
+ "from datetime import datetime, timezone, timedelta\n",
+ "import math\n",
+ "import numpy as np"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "6ce34b58-b5ba-4b54-ba4d-fc82ef01b09c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['bdc2324-data/1',\n",
+ " 'bdc2324-data/10',\n",
+ " 'bdc2324-data/101',\n",
+ " 'bdc2324-data/11',\n",
+ " 'bdc2324-data/12',\n",
+ " 'bdc2324-data/13',\n",
+ " 'bdc2324-data/14',\n",
+ " 'bdc2324-data/2',\n",
+ " 'bdc2324-data/3',\n",
+ " 'bdc2324-data/4',\n",
+ " 'bdc2324-data/5',\n",
+ " 'bdc2324-data/6',\n",
+ " 'bdc2324-data/7',\n",
+ " 'bdc2324-data/8',\n",
+ " 'bdc2324-data/9']"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# bucket for accessing the data\n",
+ "\n",
+ "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
+ "\n",
+ "fs = s3fs.S3FileSystem(client_kwargs = {\"endpoint_url\" : S3_ENDPOINT_URL})\n",
+ "BUCKET = \"bdc2324-data\"\n",
+ "fs.ls(BUCKET)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "8eb13dd3-53c7-4a70-94a4-846168473aa1",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['bdc2324-data/1/1campaign_stats.csv',\n",
+ " 'bdc2324-data/1/1campaigns.csv',\n",
+ " 'bdc2324-data/1/1categories.csv',\n",
+ " 'bdc2324-data/1/1countries.csv',\n",
+ " 'bdc2324-data/1/1currencies.csv',\n",
+ " 'bdc2324-data/1/1customer_target_mappings.csv',\n",
+ " 'bdc2324-data/1/1customersplus.csv',\n",
+ " 'bdc2324-data/1/1event_types.csv',\n",
+ " 'bdc2324-data/1/1events.csv',\n",
+ " 'bdc2324-data/1/1facilities.csv',\n",
+ " 'bdc2324-data/1/1link_stats.csv',\n",
+ " 'bdc2324-data/1/1pricing_formulas.csv',\n",
+ " 'bdc2324-data/1/1product_packs.csv',\n",
+ " 'bdc2324-data/1/1products.csv',\n",
+ " 'bdc2324-data/1/1products_groups.csv',\n",
+ " 'bdc2324-data/1/1purchases.csv',\n",
+ " 'bdc2324-data/1/1representation_category_capacities.csv',\n",
+ " 'bdc2324-data/1/1representations.csv',\n",
+ " 'bdc2324-data/1/1seasons.csv',\n",
+ " 'bdc2324-data/1/1structure_tag_mappings.csv',\n",
+ " 'bdc2324-data/1/1suppliers.csv',\n",
+ " 'bdc2324-data/1/1tags.csv',\n",
+ " 'bdc2324-data/1/1target_types.csv',\n",
+ " 'bdc2324-data/1/1targets.csv',\n",
+ " 'bdc2324-data/1/1tickets.csv',\n",
+ " 'bdc2324-data/1/1type_of_categories.csv',\n",
+ " 'bdc2324-data/1/1type_of_pricing_formulas.csv',\n",
+ " 'bdc2324-data/1/1type_ofs.csv']"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "FILE_PATH_S3 = fs.ls(BUCKET)[0] # focus on the company number 1\n",
+ "files_path = fs.ls(FILE_PATH_S3)\n",
+ "files_path"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "1ea66c4e-1307-4f19-836e-3104fba2ff41",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "1\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/tmp/ipykernel_487/2894332003.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+ " df = pd.read_csv(file_in)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# loop to create dataframes related to company 1\n",
+ "\n",
+ "client_number = files_path[0].split(\"/\")[1]\n",
+ "print(client_number)\n",
+ "df_prefix = \"df\" + str(client_number) + \"_\"\n",
+ "\n",
+ "for i in range(len(files_path)) :\n",
+ " current_path = files_path[i]\n",
+ " with fs.open(current_path, mode=\"rb\") as file_in:\n",
+ " df = pd.read_csv(file_in)\n",
+ " # the dataframe name follows the pattern df<client number>_<table name>, e.g. df1_targets\n",
+ " nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n",
+ " globals()[nom_dataframe] = df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "13d70b2c-6580-4caf-b839-10f72b2e0b39",
+ "metadata": {},
+ "source": [
+ "## Target, target types and customer target mapping"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "4dbc7fea-ac3b-4348-83fb-dfb1a460f936",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " is_import | \n",
+ " name | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " identifier | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 69 | \n",
+ " False | \n",
+ " manual_dynamic_filter | \n",
+ " 2020-11-30 09:46:18.881030+01:00 | \n",
+ " 2020-11-30 09:46:18.881030+01:00 | \n",
+ " e0f4b8693184850fefd6d2a38f10584e | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 48 | \n",
+ " True | \n",
+ " manual_structure | \n",
+ " 2020-11-04 17:16:19.548275+01:00 | \n",
+ " 2020-11-04 17:16:19.548275+01:00 | \n",
+ " 382bca214204a2d3462f5ec2728d5d1e | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 1 | \n",
+ " True | \n",
+ " manual_import | \n",
+ " 2020-10-14 18:37:40.521623+02:00 | \n",
+ " 2020-10-14 18:37:40.521623+02:00 | \n",
+ " 12213df2ce68a624e4c0070521437bac | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 56 | \n",
+ " False | \n",
+ " manual_static_filter | \n",
+ " 2020-11-04 18:08:37.233486+01:00 | \n",
+ " 2020-11-04 18:08:37.233486+01:00 | \n",
+ " fb27e81baa4debc6a4e1a8639c20e808 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id is_import name created_at \\\n",
+ "0 69 False manual_dynamic_filter 2020-11-30 09:46:18.881030+01:00 \n",
+ "1 48 True manual_structure 2020-11-04 17:16:19.548275+01:00 \n",
+ "2 1 True manual_import 2020-10-14 18:37:40.521623+02:00 \n",
+ "3 56 False manual_static_filter 2020-11-04 18:08:37.233486+01:00 \n",
+ "\n",
+ " updated_at identifier \n",
+ "0 2020-11-30 09:46:18.881030+01:00 e0f4b8693184850fefd6d2a38f10584e \n",
+ "1 2020-11-04 17:16:19.548275+01:00 382bca214204a2d3462f5ec2728d5d1e \n",
+ "2 2020-10-14 18:37:40.521623+02:00 12213df2ce68a624e4c0070521437bac \n",
+ "3 2020-11-04 18:08:37.233486+01:00 fb27e81baa4debc6a4e1a8639c20e808 "
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# 1. target types\n",
+ "df1_target_types.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "0e9f5dcb-0dc3-4052-b866-e5c4cb954a1f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " target_type_id | \n",
+ " name | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 217 | \n",
+ " 56 | \n",
+ " DDCP PROMO Art contemporain - salle de chauffe... | \n",
+ " 2021-01-04 15:00:05.401899+01:00 | \n",
+ " 2021-03-02 18:38:19.025969+01:00 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 701 | \n",
+ " 56 | \n",
+ " consentement optin scolaires | \n",
+ " 2021-12-21 16:03:59.840785+01:00 | \n",
+ " 2022-02-18 17:23:44.761388+01:00 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 134 | \n",
+ " 56 | \n",
+ " DDCP Newsletter jeune public | \n",
+ " 2020-11-10 09:43:19.667471+01:00 | \n",
+ " 2021-03-02 18:38:19.052304+01:00 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 700 | \n",
+ " 56 | \n",
+ " consentement optout scolaires | \n",
+ " 2021-12-21 16:01:57.524946+01:00 | \n",
+ " 2022-02-18 17:23:44.807776+01:00 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 964 | \n",
+ " 56 | \n",
+ " DDCP achat billet nbr dep 19052021 | \n",
+ " 2022-04-14 10:58:17.142834+02:00 | \n",
+ " 2022-04-14 10:58:23.677264+02:00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id target_type_id name \\\n",
+ "0 217 56 DDCP PROMO Art contemporain - salle de chauffe... \n",
+ "1 701 56 consentement optin scolaires \n",
+ "2 134 56 DDCP Newsletter jeune public \n",
+ "3 700 56 consentement optout scolaires \n",
+ "4 964 56 DDCP achat billet nbr dep 19052021 \n",
+ "\n",
+ " created_at updated_at \n",
+ "0 2021-01-04 15:00:05.401899+01:00 2021-03-02 18:38:19.025969+01:00 \n",
+ "1 2021-12-21 16:03:59.840785+01:00 2022-02-18 17:23:44.761388+01:00 \n",
+ "2 2020-11-10 09:43:19.667471+01:00 2021-03-02 18:38:19.052304+01:00 \n",
+ "3 2021-12-21 16:01:57.524946+01:00 2022-02-18 17:23:44.807776+01:00 \n",
+ "4 2022-04-14 10:58:17.142834+02:00 2022-04-14 10:58:23.677264+02:00 "
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# 2. targets\n",
+ "df1_targets.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "c5c62302-370a-462f-bd79-eac31593f65c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " customer_id | \n",
+ " target_id | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " name | \n",
+ " extra_field | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1184824 | \n",
+ " 645400 | \n",
+ " 130 | \n",
+ " 2021-09-23 09:35:47.617275+02:00 | \n",
+ " 2021-09-23 09:35:47.617275+02:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1184825 | \n",
+ " 645400 | \n",
+ " 345 | \n",
+ " 2021-09-23 09:35:47.668846+02:00 | \n",
+ " 2021-09-23 09:35:47.668846+02:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 1184828 | \n",
+ " 645402 | \n",
+ " 126 | \n",
+ " 2021-09-23 12:02:51.253269+02:00 | \n",
+ " 2021-09-23 12:02:51.253269+02:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1184829 | \n",
+ " 645403 | \n",
+ " 126 | \n",
+ " 2021-09-23 12:20:47.394480+02:00 | \n",
+ " 2021-09-23 12:20:47.394480+02:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1295770 | \n",
+ " 647301 | \n",
+ " 346 | \n",
+ " 2021-09-28 16:02:29.372608+02:00 | \n",
+ " 2021-09-28 16:02:29.372608+02:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id customer_id target_id created_at \\\n",
+ "0 1184824 645400 130 2021-09-23 09:35:47.617275+02:00 \n",
+ "1 1184825 645400 345 2021-09-23 09:35:47.668846+02:00 \n",
+ "2 1184828 645402 126 2021-09-23 12:02:51.253269+02:00 \n",
+ "3 1184829 645403 126 2021-09-23 12:20:47.394480+02:00 \n",
+ "4 1295770 647301 346 2021-09-28 16:02:29.372608+02:00 \n",
+ "\n",
+ " updated_at name extra_field \n",
+ "0 2021-09-23 09:35:47.617275+02:00 NaN NaN \n",
+ "1 2021-09-23 09:35:47.668846+02:00 NaN NaN \n",
+ "2 2021-09-23 12:02:51.253269+02:00 NaN NaN \n",
+ "3 2021-09-23 12:20:47.394480+02:00 NaN NaN \n",
+ "4 2021-09-28 16:02:29.372608+02:00 NaN NaN "
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# 3. customer target mapping\n",
+ "\n",
+ "df1_customer_target_mappings.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "1a87cebf-c1dd-408d-a523-26633419da1e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " target_type_id | \n",
+ " name | \n",
+ " target_type_is_import | \n",
+ " target_type_name | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 217 | \n",
+ " 56 | \n",
+ " DDCP PROMO Art contemporain - salle de chauffe... | \n",
+ " False | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 701 | \n",
+ " 56 | \n",
+ " consentement optin scolaires | \n",
+ " False | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 134 | \n",
+ " 56 | \n",
+ " DDCP Newsletter jeune public | \n",
+ " False | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 700 | \n",
+ " 56 | \n",
+ " consentement optout scolaires | \n",
+ " False | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 964 | \n",
+ " 56 | \n",
+ " DDCP achat billet nbr dep 19052021 | \n",
+ " False | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id target_type_id name \\\n",
+ "0 217 56 DDCP PROMO Art contemporain - salle de chauffe... \n",
+ "1 701 56 consentement optin scolaires \n",
+ "2 134 56 DDCP Newsletter jeune public \n",
+ "3 700 56 consentement optout scolaires \n",
+ "4 964 56 DDCP achat billet nbr dep 19052021 \n",
+ "\n",
+ " target_type_is_import target_type_name \n",
+ "0 False manual_static_filter \n",
+ "1 False manual_static_filter \n",
+ "2 False manual_static_filter \n",
+ "3 False manual_static_filter \n",
+ "4 False manual_static_filter "
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# 4.1. merge target with target type\n",
+ "\n",
+ "df1_targets_full = pd.merge(df1_targets[[\"id\", \"target_type_id\", \"name\"]], df1_target_types[[\"id\",\"is_import\",\"name\"]].add_prefix(\"target_type_\"), left_on='target_type_id', right_on='target_type_id', how='left')\n",
+ "df1_targets_full.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "d48c1fff-73c2-4e75-8799-da2b80694be7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# 4.2. merge df1_customer_target_mappings with df1_targets_full\n",
+ "\n",
+ "# change the position of the column target type id\n",
+ "\n",
+ "# Name of the column to move; it is re-inserted just before the reference column (target_type_name)\n",
+ "column_to_move = 'target_type_id'\n",
+ "\n",
+ "# Get the index of the reference column\n",
+ "reference_index = df1_targets_full.columns.get_loc(\"target_type_name\")\n",
+ "\n",
+ "# Make a copy of the column to move\n",
+ "column_copy = df1_targets_full[column_to_move].copy()\n",
+ "\n",
+ "# Drop the original column\n",
+ "df1_targets_full = df1_targets_full.drop(column_to_move, axis=1)\n",
+ "\n",
+ "# Use insert to put the column back at its new position (the drop above shifted the reference column left by one)\n",
+ "df1_targets_full.insert(reference_index - 1, column_to_move, column_copy)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "a874514a-c7dc-42d4-a440-dedd3a270e24",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " target_id | \n",
+ " target_name | \n",
+ " target_type_is_import | \n",
+ " target_type_id | \n",
+ " target_type_name | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 217 | \n",
+ " DDCP PROMO Art contemporain - salle de chauffe... | \n",
+ " False | \n",
+ " 56 | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 701 | \n",
+ " consentement optin scolaires | \n",
+ " False | \n",
+ " 56 | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 134 | \n",
+ " DDCP Newsletter jeune public | \n",
+ " False | \n",
+ " 56 | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 700 | \n",
+ " consentement optout scolaires | \n",
+ " False | \n",
+ " 56 | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 964 | \n",
+ " DDCP achat billet nbr dep 19052021 | \n",
+ " False | \n",
+ " 56 | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " target_id target_name \\\n",
+ "0 217 DDCP PROMO Art contemporain - salle de chauffe... \n",
+ "1 701 consentement optin scolaires \n",
+ "2 134 DDCP Newsletter jeune public \n",
+ "3 700 consentement optout scolaires \n",
+ "4 964 DDCP achat billet nbr dep 19052021 \n",
+ "\n",
+ " target_type_is_import target_type_id target_type_name \n",
+ "0 False 56 manual_static_filter \n",
+ "1 False 56 manual_static_filter \n",
+ "2 False 56 manual_static_filter \n",
+ "3 False 56 manual_static_filter \n",
+ "4 False 56 manual_static_filter "
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df1_targets_full = df1_targets_full.rename(columns=lambda x: 'target_' + x if not x.startswith('target_') else x)\n",
+ "df1_targets_full.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "0db0172a-5119-4b7f-97f8-36fc5c985205",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " customer_id | \n",
+ " target_id | \n",
+ " target_name | \n",
+ " target_type_is_import | \n",
+ " target_type_id | \n",
+ " target_type_name | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1184824 | \n",
+ " 645400 | \n",
+ " 130 | \n",
+ " DDCP PROMO Réseau livres | \n",
+ " False | \n",
+ " 56 | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1184825 | \n",
+ " 645400 | \n",
+ " 345 | \n",
+ " Inscrits NL générale site web | \n",
+ " False | \n",
+ " 56 | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 1184828 | \n",
+ " 645402 | \n",
+ " 126 | \n",
+ " DDCP PROMO Art contemporain | \n",
+ " False | \n",
+ " 56 | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1184829 | \n",
+ " 645403 | \n",
+ " 126 | \n",
+ " DDCP PROMO Art contemporain | \n",
+ " False | \n",
+ " 56 | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1295770 | \n",
+ " 647301 | \n",
+ " 346 | \n",
+ " Votre première liste | \n",
+ " False | \n",
+ " 56 | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 768019 | \n",
+ " 2737545 | \n",
+ " 666983 | \n",
+ " 345 | \n",
+ " Inscrits NL générale site web | \n",
+ " False | \n",
+ " 56 | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 768020 | \n",
+ " 2737546 | \n",
+ " 666983 | \n",
+ " 346 | \n",
+ " Votre première liste | \n",
+ " False | \n",
+ " 56 | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 768021 | \n",
+ " 2737575 | \n",
+ " 666986 | \n",
+ " 346 | \n",
+ " Votre première liste | \n",
+ " False | \n",
+ " 56 | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 768022 | \n",
+ " 2737576 | \n",
+ " 666987 | \n",
+ " 345 | \n",
+ " Inscrits NL générale site web | \n",
+ " False | \n",
+ " 56 | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ " 768023 | \n",
+ " 2737577 | \n",
+ " 666987 | \n",
+ " 346 | \n",
+ " Votre première liste | \n",
+ " False | \n",
+ " 56 | \n",
+ " manual_static_filter | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
768024 rows × 7 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id customer_id target_id target_name \\\n",
+ "0 1184824 645400 130 DDCP PROMO Réseau livres \n",
+ "1 1184825 645400 345 Inscrits NL générale site web \n",
+ "2 1184828 645402 126 DDCP PROMO Art contemporain \n",
+ "3 1184829 645403 126 DDCP PROMO Art contemporain \n",
+ "4 1295770 647301 346 Votre première liste \n",
+ "... ... ... ... ... \n",
+ "768019 2737545 666983 345 Inscrits NL générale site web \n",
+ "768020 2737546 666983 346 Votre première liste \n",
+ "768021 2737575 666986 346 Votre première liste \n",
+ "768022 2737576 666987 345 Inscrits NL générale site web \n",
+ "768023 2737577 666987 346 Votre première liste \n",
+ "\n",
+ " target_type_is_import target_type_id target_type_name \n",
+ "0 False 56 manual_static_filter \n",
+ "1 False 56 manual_static_filter \n",
+ "2 False 56 manual_static_filter \n",
+ "3 False 56 manual_static_filter \n",
+ "4 False 56 manual_static_filter \n",
+ "... ... ... ... \n",
+ "768019 False 56 manual_static_filter \n",
+ "768020 False 56 manual_static_filter \n",
+ "768021 False 56 manual_static_filter \n",
+ "768022 False 56 manual_static_filter \n",
+ "768023 False 56 manual_static_filter \n",
+ "\n",
+ "[768024 rows x 7 columns]"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# finally, merge\n",
+ "\n",
+ "# pour df1_customer_target_mappings on enlève les colonnes name, extra_field, et updated_at (valeur égale à created_at)\n",
+ "# note : by making a left join on df1_customer_target_mappings, we suppress 2 targets that have no customer associated\n",
+ "\n",
+ "df1_customer_targets = pd.merge(df1_customer_target_mappings[[\"id\", \"customer_id\", \"target_id\"]], \n",
+ " df1_targets_full, left_on='target_id', right_on='target_id', how='left')\n",
+ "df1_customer_targets"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "52326267-c5ba-4e21-b8ab-4b4c62de75d1",
+ "metadata": {},
+ "source": [
+ "## Campaign stats, campaigns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "id": "06dca910-5c07-4ee1-bbf2-3b11b48ba1f2",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " name | \n",
+ " service_id | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " process_id | \n",
+ " report_url | \n",
+ " category | \n",
+ " to_be_synced | \n",
+ " identifier | \n",
+ " sent_at | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1319613 | \n",
+ " newsletter enseignants janvier 2022 | \n",
+ " 721 | \n",
+ " 2022-01-14 16:06:42.586321+01:00 | \n",
+ " 2022-02-03 14:17:27.112963+01:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " False | \n",
+ " aba3b6fd5d186d28e06ff97135cade7f | \n",
+ " 2022-01-14 00:00:00+01:00 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1319586 | \n",
+ " lsf_janvier_2022 | \n",
+ " 717 | \n",
+ " 2022-01-07 11:30:35.315895+01:00 | \n",
+ " 2022-02-03 14:17:27.116171+01:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " False | \n",
+ " 788d986905533aba051261497ecffcbb | \n",
+ " 2022-01-07 00:00:00+01:00 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 1319282 | \n",
+ " Invitation à déjeuner au Mucem | Vernissage « ... | \n",
+ " 591 | \n",
+ " 2021-09-28 12:50:24.448752+02:00 | \n",
+ " 2022-02-03 14:17:27.119582+01:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " False | \n",
+ " 3493894fa4ea036cfc6433c3e2ee63b0 | \n",
+ " 2021-09-28 00:00:00+02:00 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1319283 | \n",
+ " Vacances de la Toussaint - centres des loisirs | \n",
+ " 590 | \n",
+ " 2021-09-28 18:01:04.692073+02:00 | \n",
+ " 2022-02-03 14:17:27.124408+01:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " False | \n",
+ " 08b255a5d42b89b0585260b6f2360bdd | \n",
+ " 2021-09-28 00:00:00+02:00 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1319636 | \n",
+ " ddcp_promo_md_livemag | \n",
+ " 730 | \n",
+ " 2022-01-27 18:00:41.053069+01:00 | \n",
+ " 2022-02-03 14:17:27.127607+01:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " False | \n",
+ " d5cfead94f5350c12c322b5b664544c1 | \n",
+ " 2022-01-27 00:00:00+01:00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id name service_id \\\n",
+ "0 1319613 newsletter enseignants janvier 2022 721 \n",
+ "1 1319586 lsf_janvier_2022 717 \n",
+ "2 1319282 Invitation à déjeuner au Mucem | Vernissage « ... 591 \n",
+ "3 1319283 Vacances de la Toussaint - centres des loisirs 590 \n",
+ "4 1319636 ddcp_promo_md_livemag 730 \n",
+ "\n",
+ " created_at updated_at \\\n",
+ "0 2022-01-14 16:06:42.586321+01:00 2022-02-03 14:17:27.112963+01:00 \n",
+ "1 2022-01-07 11:30:35.315895+01:00 2022-02-03 14:17:27.116171+01:00 \n",
+ "2 2021-09-28 12:50:24.448752+02:00 2022-02-03 14:17:27.119582+01:00 \n",
+ "3 2021-09-28 18:01:04.692073+02:00 2022-02-03 14:17:27.124408+01:00 \n",
+ "4 2022-01-27 18:00:41.053069+01:00 2022-02-03 14:17:27.127607+01:00 \n",
+ "\n",
+ " process_id report_url category to_be_synced \\\n",
+ "0 NaN NaN 0.0 False \n",
+ "1 NaN NaN 0.0 False \n",
+ "2 NaN NaN 0.0 False \n",
+ "3 NaN NaN 0.0 False \n",
+ "4 NaN NaN 0.0 False \n",
+ "\n",
+ " identifier sent_at \n",
+ "0 aba3b6fd5d186d28e06ff97135cade7f 2022-01-14 00:00:00+01:00 \n",
+ "1 788d986905533aba051261497ecffcbb 2022-01-07 00:00:00+01:00 \n",
+ "2 3493894fa4ea036cfc6433c3e2ee63b0 2021-09-28 00:00:00+02:00 \n",
+ "3 08b255a5d42b89b0585260b6f2360bdd 2021-09-28 00:00:00+02:00 \n",
+ "4 d5cfead94f5350c12c322b5b664544c1 2022-01-27 00:00:00+01:00 "
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# 1. campaigns\n",
+ "df1_campaigns.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "id": "83eaa447-9144-41ed-9e26-f0f23799a8fd",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " campaign_id | \n",
+ " customer_id | \n",
+ " opened_at | \n",
+ " sent_at | \n",
+ " delivered_at | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 19793 | \n",
+ " 58 | \n",
+ " 112597 | \n",
+ " NaN | \n",
+ " 2021-03-28 18:01:09+02:00 | \n",
+ " 2021-03-28 18:24:18+02:00 | \n",
+ " 2021-03-28 18:34:20.616136+02:00 | \n",
+ " 2022-04-15 22:52:04.397693+02:00 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 14211 | \n",
+ " 58 | \n",
+ " 113666 | \n",
+ " NaN | \n",
+ " 2021-03-28 18:01:09+02:00 | \n",
+ " 2021-03-28 18:21:02+02:00 | \n",
+ " 2021-03-28 18:21:04.297213+02:00 | \n",
+ " 2022-04-15 22:52:04.397693+02:00 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 13150 | \n",
+ " 58 | \n",
+ " 280561 | \n",
+ " NaN | \n",
+ " 2021-03-28 18:00:59+02:00 | \n",
+ " 2021-03-28 18:08:45+02:00 | \n",
+ " 2021-03-28 18:18:49.991042+02:00 | \n",
+ " 2022-04-15 22:52:04.397693+02:00 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 7073 | \n",
+ " 58 | \n",
+ " 101007 | \n",
+ " 2021-03-28 20:11:06+02:00 | \n",
+ " 2021-03-28 18:00:59+02:00 | \n",
+ " 2021-03-28 18:09:47+02:00 | \n",
+ " 2021-03-28 18:09:50.915354+02:00 | \n",
+ " 2022-04-15 22:52:04.397693+02:00 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5175 | \n",
+ " 58 | \n",
+ " 103972 | \n",
+ " NaN | \n",
+ " 2021-03-28 18:01:06+02:00 | \n",
+ " 2021-03-28 18:05:03+02:00 | \n",
+ " 2021-03-28 18:05:08.507398+02:00 | \n",
+ " 2022-04-15 22:52:04.397693+02:00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id campaign_id customer_id opened_at \\\n",
+ "0 19793 58 112597 NaN \n",
+ "1 14211 58 113666 NaN \n",
+ "2 13150 58 280561 NaN \n",
+ "3 7073 58 101007 2021-03-28 20:11:06+02:00 \n",
+ "4 5175 58 103972 NaN \n",
+ "\n",
+ " sent_at delivered_at \\\n",
+ "0 2021-03-28 18:01:09+02:00 2021-03-28 18:24:18+02:00 \n",
+ "1 2021-03-28 18:01:09+02:00 2021-03-28 18:21:02+02:00 \n",
+ "2 2021-03-28 18:00:59+02:00 2021-03-28 18:08:45+02:00 \n",
+ "3 2021-03-28 18:00:59+02:00 2021-03-28 18:09:47+02:00 \n",
+ "4 2021-03-28 18:01:06+02:00 2021-03-28 18:05:03+02:00 \n",
+ "\n",
+ " created_at updated_at \n",
+ "0 2021-03-28 18:34:20.616136+02:00 2022-04-15 22:52:04.397693+02:00 \n",
+ "1 2021-03-28 18:21:04.297213+02:00 2022-04-15 22:52:04.397693+02:00 \n",
+ "2 2021-03-28 18:18:49.991042+02:00 2022-04-15 22:52:04.397693+02:00 \n",
+ "3 2021-03-28 18:09:50.915354+02:00 2022-04-15 22:52:04.397693+02:00 \n",
+ "4 2021-03-28 18:05:08.507398+02:00 2022-04-15 22:52:04.397693+02:00 "
+ ]
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# 2. campaigns stats\n",
+ "df1_campaign_stats.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "id": "7f25eb1b-e7c8-4715-bc30-7ac29a7181ac",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " campaign_id | \n",
+ " customer_id | \n",
+ " opened_at | \n",
+ " sent_at | \n",
+ " delivered_at | \n",
+ " campaign_name | \n",
+ " campaign_service_id | \n",
+ " campaign_sent_at | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 19793 | \n",
+ " 58 | \n",
+ " 112597 | \n",
+ " NaN | \n",
+ " 2021-03-28 18:01:09+02:00 | \n",
+ " 2021-03-28 18:24:18+02:00 | \n",
+ " Le Mucem chez vous, gardons le lien #22 | \n",
+ " 404 | \n",
+ " 2021-03-28 00:00:00+01:00 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 14211 | \n",
+ " 58 | \n",
+ " 113666 | \n",
+ " NaN | \n",
+ " 2021-03-28 18:01:09+02:00 | \n",
+ " 2021-03-28 18:21:02+02:00 | \n",
+ " Le Mucem chez vous, gardons le lien #22 | \n",
+ " 404 | \n",
+ " 2021-03-28 00:00:00+01:00 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 13150 | \n",
+ " 58 | \n",
+ " 280561 | \n",
+ " NaN | \n",
+ " 2021-03-28 18:00:59+02:00 | \n",
+ " 2021-03-28 18:08:45+02:00 | \n",
+ " Le Mucem chez vous, gardons le lien #22 | \n",
+ " 404 | \n",
+ " 2021-03-28 00:00:00+01:00 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 7073 | \n",
+ " 58 | \n",
+ " 101007 | \n",
+ " 2021-03-28 20:11:06+02:00 | \n",
+ " 2021-03-28 18:00:59+02:00 | \n",
+ " 2021-03-28 18:09:47+02:00 | \n",
+ " Le Mucem chez vous, gardons le lien #22 | \n",
+ " 404 | \n",
+ " 2021-03-28 00:00:00+01:00 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5175 | \n",
+ " 58 | \n",
+ " 103972 | \n",
+ " NaN | \n",
+ " 2021-03-28 18:01:06+02:00 | \n",
+ " 2021-03-28 18:05:03+02:00 | \n",
+ " Le Mucem chez vous, gardons le lien #22 | \n",
+ " 404 | \n",
+ " 2021-03-28 00:00:00+01:00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id campaign_id customer_id opened_at \\\n",
+ "0 19793 58 112597 NaN \n",
+ "1 14211 58 113666 NaN \n",
+ "2 13150 58 280561 NaN \n",
+ "3 7073 58 101007 2021-03-28 20:11:06+02:00 \n",
+ "4 5175 58 103972 NaN \n",
+ "\n",
+ " sent_at delivered_at \\\n",
+ "0 2021-03-28 18:01:09+02:00 2021-03-28 18:24:18+02:00 \n",
+ "1 2021-03-28 18:01:09+02:00 2021-03-28 18:21:02+02:00 \n",
+ "2 2021-03-28 18:00:59+02:00 2021-03-28 18:08:45+02:00 \n",
+ "3 2021-03-28 18:00:59+02:00 2021-03-28 18:09:47+02:00 \n",
+ "4 2021-03-28 18:01:06+02:00 2021-03-28 18:05:03+02:00 \n",
+ "\n",
+ " campaign_name campaign_service_id \\\n",
+ "0 Le Mucem chez vous, gardons le lien #22 404 \n",
+ "1 Le Mucem chez vous, gardons le lien #22 404 \n",
+ "2 Le Mucem chez vous, gardons le lien #22 404 \n",
+ "3 Le Mucem chez vous, gardons le lien #22 404 \n",
+ "4 Le Mucem chez vous, gardons le lien #22 404 \n",
+ "\n",
+ " campaign_sent_at \n",
+ "0 2021-03-28 00:00:00+01:00 \n",
+ "1 2021-03-28 00:00:00+01:00 \n",
+ "2 2021-03-28 00:00:00+01:00 \n",
+ "3 2021-03-28 00:00:00+01:00 \n",
+ "4 2021-03-28 00:00:00+01:00 "
+ ]
+ },
+ "execution_count": 34,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# 3. merge campaigns and campaigns stats\n",
+ "\n",
+ "df1_campaigns_full = pd.merge(df1_campaign_stats[[\"id\", \"campaign_id\", \"customer_id\", \"opened_at\", \"sent_at\", \"delivered_at\"]], \n",
+ " df1_campaigns[[\"id\", \"name\", \"service_id\", \"sent_at\"]].add_prefix(\"campaign_\"),\n",
+ " on = \"campaign_id\", how = \"left\")\n",
+ "df1_campaigns_full.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "87fc686a-4a80-40ab-9987-20d2774f3055",
+ "metadata": {},
+ "source": [
+ "## Link stats"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "id": "2f9df2d0-8a23-496b-8e92-617285f64530",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " clicked_at | \n",
+ " link_id | \n",
+ " customer_id | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2021-03-26 16:30:36+01:00 | \n",
+ " 1 | \n",
+ " 284033 | \n",
+ " 2021-03-26 15:30:37.050161+01:00 | \n",
+ " 2021-03-26 15:30:37.050161+01:00 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 2021-03-26 17:16:34+01:00 | \n",
+ " 2 | \n",
+ " 119768 | \n",
+ " 2021-03-26 16:16:34.950871+01:00 | \n",
+ " 2021-03-26 16:16:34.950871+01:00 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 272 | \n",
+ " 2021-03-28 20:03:32+02:00 | \n",
+ " 42 | \n",
+ " 113105 | \n",
+ " 2021-03-28 18:03:32.736394+02:00 | \n",
+ " 2021-03-28 18:03:32.736394+02:00 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 2021-03-26 17:43:19+01:00 | \n",
+ " 3 | \n",
+ " 272280 | \n",
+ " 2021-03-26 16:43:19.338321+01:00 | \n",
+ " 2021-03-26 16:43:19.338321+01:00 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 2021-03-26 17:46:00+01:00 | \n",
+ " 3 | \n",
+ " 105095 | \n",
+ " 2021-03-26 16:46:00.502945+01:00 | \n",
+ " 2021-03-26 16:46:00.502945+01:00 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 151046 | \n",
+ " 243553 | \n",
+ " 2023-11-09 16:34:27+01:00 | \n",
+ " 14666 | \n",
+ " 998 | \n",
+ " 2023-11-09 15:34:29.425425+01:00 | \n",
+ " 2023-11-09 15:34:29.425425+01:00 | \n",
+ "
\n",
+ " \n",
+ " 151047 | \n",
+ " 243554 | \n",
+ " 2023-11-09 16:34:35+01:00 | \n",
+ " 14670 | \n",
+ " 998 | \n",
+ " 2023-11-09 15:34:37.505505+01:00 | \n",
+ " 2023-11-09 15:34:37.505505+01:00 | \n",
+ "
\n",
+ " \n",
+ " 151048 | \n",
+ " 243559 | \n",
+ " 2023-11-09 16:51:15+01:00 | \n",
+ " 14686 | \n",
+ " 82923 | \n",
+ " 2023-11-09 15:51:17.439518+01:00 | \n",
+ " 2023-11-09 15:51:17.439518+01:00 | \n",
+ "
\n",
+ " \n",
+ " 151049 | \n",
+ " 243561 | \n",
+ " 2023-11-09 16:59:42+01:00 | \n",
+ " 14677 | \n",
+ " 82923 | \n",
+ " 2023-11-09 15:59:44.030922+01:00 | \n",
+ " 2023-11-09 15:59:44.030922+01:00 | \n",
+ "
\n",
+ " \n",
+ " 151050 | \n",
+ " 243564 | \n",
+ " 2023-11-09 17:16:41+01:00 | \n",
+ " 14691 | \n",
+ " 1254355 | \n",
+ " 2023-11-09 16:16:43.012932+01:00 | \n",
+ " 2023-11-09 16:16:43.012932+01:00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
151051 rows × 6 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id clicked_at link_id customer_id \\\n",
+ "0 1 2021-03-26 16:30:36+01:00 1 284033 \n",
+ "1 2 2021-03-26 17:16:34+01:00 2 119768 \n",
+ "2 272 2021-03-28 20:03:32+02:00 42 113105 \n",
+ "3 4 2021-03-26 17:43:19+01:00 3 272280 \n",
+ "4 5 2021-03-26 17:46:00+01:00 3 105095 \n",
+ "... ... ... ... ... \n",
+ "151046 243553 2023-11-09 16:34:27+01:00 14666 998 \n",
+ "151047 243554 2023-11-09 16:34:35+01:00 14670 998 \n",
+ "151048 243559 2023-11-09 16:51:15+01:00 14686 82923 \n",
+ "151049 243561 2023-11-09 16:59:42+01:00 14677 82923 \n",
+ "151050 243564 2023-11-09 17:16:41+01:00 14691 1254355 \n",
+ "\n",
+ " created_at updated_at \n",
+ "0 2021-03-26 15:30:37.050161+01:00 2021-03-26 15:30:37.050161+01:00 \n",
+ "1 2021-03-26 16:16:34.950871+01:00 2021-03-26 16:16:34.950871+01:00 \n",
+ "2 2021-03-28 18:03:32.736394+02:00 2021-03-28 18:03:32.736394+02:00 \n",
+ "3 2021-03-26 16:43:19.338321+01:00 2021-03-26 16:43:19.338321+01:00 \n",
+ "4 2021-03-26 16:46:00.502945+01:00 2021-03-26 16:46:00.502945+01:00 \n",
+ "... ... ... \n",
+ "151046 2023-11-09 15:34:29.425425+01:00 2023-11-09 15:34:29.425425+01:00 \n",
+ "151047 2023-11-09 15:34:37.505505+01:00 2023-11-09 15:34:37.505505+01:00 \n",
+ "151048 2023-11-09 15:51:17.439518+01:00 2023-11-09 15:51:17.439518+01:00 \n",
+ "151049 2023-11-09 15:59:44.030922+01:00 2023-11-09 15:59:44.030922+01:00 \n",
+ "151050 2023-11-09 16:16:43.012932+01:00 2023-11-09 16:16:43.012932+01:00 \n",
+ "\n",
+ "[151051 rows x 6 columns]"
+ ]
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df1_link_stats"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "aad6fb14-9694-4c1e-9885-1ebe0f38afe3",
+ "metadata": {},
+ "source": [
+ "## Bonus : peut-on lier link stats et campaign ? Non, les dates auxquelles le client clique sur le lien/ouvre la campagne ne permettent pas de faire coïncider link_id et campaign_id"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 67,
+ "id": "8be7c974-72c9-4e31-a874-d7e5d2719fb3",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " clicked_at | \n",
+ " link_id | \n",
+ " customer_id | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2021-03-26 16:30:36+01:00 | \n",
+ " 1 | \n",
+ " 284033 | \n",
+ " 2021-03-26 15:30:37.050161+01:00 | \n",
+ " 2021-03-26 15:30:37.050161+01:00 | \n",
+ "
\n",
+ " \n",
+ " 7526 | \n",
+ " 14018 | \n",
+ " 2021-05-10 18:07:59+02:00 | \n",
+ " 312 | \n",
+ " 284033 | \n",
+ " 2021-05-10 16:08:00.541322+02:00 | \n",
+ " 2021-05-10 16:08:00.541322+02:00 | \n",
+ "
\n",
+ " \n",
+ " 96848 | \n",
+ " 133449 | \n",
+ " 2021-03-25 08:42:22+01:00 | \n",
+ " 4 | \n",
+ " 284033 | \n",
+ " 2022-04-15 22:51:01.994343+02:00 | \n",
+ " 2022-04-15 22:51:01.994343+02:00 | \n",
+ "
\n",
+ " \n",
+ " 115728 | \n",
+ " 207544 | \n",
+ " 2022-08-23 10:33:04+02:00 | \n",
+ " 12365 | \n",
+ " 284033 | \n",
+ " 2022-08-23 08:33:06.498908+02:00 | \n",
+ " 2022-08-23 08:33:06.498908+02:00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id clicked_at link_id customer_id \\\n",
+ "0 1 2021-03-26 16:30:36+01:00 1 284033 \n",
+ "7526 14018 2021-05-10 18:07:59+02:00 312 284033 \n",
+ "96848 133449 2021-03-25 08:42:22+01:00 4 284033 \n",
+ "115728 207544 2022-08-23 10:33:04+02:00 12365 284033 \n",
+ "\n",
+ " created_at updated_at \n",
+ "0 2021-03-26 15:30:37.050161+01:00 2021-03-26 15:30:37.050161+01:00 \n",
+ "7526 2021-05-10 16:08:00.541322+02:00 2021-05-10 16:08:00.541322+02:00 \n",
+ "96848 2022-04-15 22:51:01.994343+02:00 2022-04-15 22:51:01.994343+02:00 \n",
+ "115728 2022-08-23 08:33:06.498908+02:00 2022-08-23 08:33:06.498908+02:00 "
+ ]
+ },
+ "execution_count": 67,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df1_link_stats[df1_link_stats[\"customer_id\"] == 284033]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 82,
+ "id": "902e9947-58e1-44f4-b634-1239b0e4df02",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " campaign_id | \n",
+ " customer_id | \n",
+ " opened_at | \n",
+ " sent_at | \n",
+ " delivered_at | \n",
+ " campaign_name | \n",
+ " campaign_service_id | \n",
+ " campaign_sent_at | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 4030643 | \n",
+ " 4036376 | \n",
+ " 4 | \n",
+ " 284033 | \n",
+ " NaN | \n",
+ " 2021-03-21 18:01:22+01:00 | \n",
+ " 2021-03-21 18:08:04+01:00 | \n",
+ " Le Mucem chez vous, gardons le lien #21 | \n",
+ " 398 | \n",
+ " 2021-03-21 00:00:00+01:00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id campaign_id customer_id opened_at \\\n",
+ "4030643 4036376 4 284033 NaN \n",
+ "\n",
+ " sent_at delivered_at \\\n",
+ "4030643 2021-03-21 18:01:22+01:00 2021-03-21 18:08:04+01:00 \n",
+ "\n",
+ " campaign_name campaign_service_id \\\n",
+ "4030643 Le Mucem chez vous, gardons le lien #21 398 \n",
+ "\n",
+ " campaign_sent_at \n",
+ "4030643 2021-03-21 00:00:00+01:00 "
+ ]
+ },
+ "execution_count": 82,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df1_campaigns_full[ (df1_campaigns_full[\"customer_id\"] == 284033) & (df1_campaigns_full[\"campaign_id\"] == 4)]"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/Traitement_Fanta.ipynb b/Traitement_Fanta.ipynb
index a456ad0..5e3529d 100644
--- a/Traitement_Fanta.ipynb
+++ b/Traitement_Fanta.ipynb
@@ -29,7 +29,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 2,
"id": "b6035982-9ff4-4013-9792-2d50e10db3d1",
"metadata": {},
"outputs": [
@@ -66,7 +66,7 @@
" 'bdc2324-data/1/1type_ofs.csv']"
]
},
- "execution_count": 3,
+ "execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
@@ -84,7 +84,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 3,
"id": "b86c935d-124f-453f-80dd-83ea6770d09c",
"metadata": {},
"outputs": [],
@@ -94,7 +94,7 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 4,
"id": "f6d0b27c-0ecd-406b-b042-6c3802dd68fd",
"metadata": {},
"outputs": [
@@ -102,7 +102,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "/tmp/ipykernel_1054/1008972637.py:5: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+ "/tmp/ipykernel_447/1008972637.py:5: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" globals()[nom_base] = pd.read_csv(file_in, sep=\",\")\n"
]
}
@@ -117,7 +117,7 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 5,
"id": "2a6b5e22-3370-457f-83b7-dd1e13663229",
"metadata": {},
"outputs": [
@@ -127,7 +127,7 @@
"'bdc2324-data/1/1type_ofs.csv'"
]
},
- "execution_count": 11,
+ "execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@@ -136,6 +136,22 @@
"FILE_PATH_S3_fanta"
]
},
+ {
+ "cell_type": "markdown",
+ "id": "79012186-ea51-4252-843e-36a9bbe3847e",
+ "metadata": {},
+ "source": [
+ "# Analyse exploratoire "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1a365f29-4766-47d8-9796-24a5271867b2",
+ "metadata": {},
+ "source": [
+ "## I. Base type_of_pricing_formulas"
+ ]
+ },
{
"cell_type": "markdown",
"id": "bcc14f93-2289-44eb-816b-a51049b258df",
@@ -145,21 +161,17 @@
]
},
{
- "cell_type": "code",
- "execution_count": 12,
- "id": "7f8083ec-3d08-4c4e-8d26-a5a4948c1c02",
+ "cell_type": "raw",
+ "id": "ab2ec4c4-9d38-4aeb-8202-9116df3cdd66",
"metadata": {},
- "outputs": [],
"source": [
"dic_prod_princing=['type_of_pricing_formulas','products_groups','pricing_formulas','product_packs','products']"
]
},
{
- "cell_type": "code",
- "execution_count": 16,
- "id": "a6de36fa-3d35-4b20-97f2-3e24d54c7f99",
+ "cell_type": "markdown",
+ "id": "88759b4a-2633-478d-abce-29abeac376d1",
"metadata": {},
- "outputs": [],
"source": [
"def verifier_donnees_manquantes(base):\n",
" donnees_manquantes = base.isna().sum()\n",
@@ -168,24 +180,9 @@
]
},
{
- "cell_type": "code",
- "execution_count": 17,
- "id": "1c261736-11fb-44f4-a4b1-830cae755a65",
+ "cell_type": "markdown",
+ "id": "df3075b4-1490-4cf2-a3fe-c6d4e2144ae3",
"metadata": {},
- "outputs": [
- {
- "ename": "AttributeError",
- "evalue": "'str' object has no attribute 'isna'",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
- "Cell \u001b[0;32mIn[17], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m nom_base \u001b[38;5;129;01min\u001b[39;00m dic_prod_princing:\n\u001b[0;32m----> 2\u001b[0m \u001b[43mverifier_donnees_manquantes\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnom_base\u001b[49m\u001b[43m)\u001b[49m\n",
- "Cell \u001b[0;32mIn[16], line 2\u001b[0m, in \u001b[0;36mverifier_donnees_manquantes\u001b[0;34m(base)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mverifier_donnees_manquantes\u001b[39m(base):\n\u001b[0;32m----> 2\u001b[0m donnees_manquantes \u001b[38;5;241m=\u001b[39m \u001b[43mbase\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43misna\u001b[49m()\u001b[38;5;241m.\u001b[39msum()\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDonnées manquantes pour la base :\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28mprint\u001b[39m(donnees_manquantes)\n",
- "\u001b[0;31mAttributeError\u001b[0m: 'str' object has no attribute 'isna'"
- ]
- }
- ],
"source": [
"for nom_base in dic_prod_princing:\n",
" verifier_donnees_manquantes(nom_base)"
@@ -193,7 +190,7 @@
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 6,
"id": "e0c67c01-e837-4772-b070-d1be0d895a36",
"metadata": {},
"outputs": [
@@ -209,20 +206,1492 @@
"dtype: int64"
]
},
- "execution_count": 14,
+ "execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
+ "#detection des NaN\n",
+ "\n",
"type_of_pricing_formulas.isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": null,
+ "id": "83a6a48d-effe-4537-b4bb-d5a540b610f1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#variables retenues : [[\"id\",\"type_of_id\",\"pricing_formula_id\"]]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "3eaffaa6-1164-4ee9-a671-8b5eb3df797d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " type_of_id | \n",
+ " pricing_formula_id | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " identifier | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 127 | \n",
+ " 2021-01-05 11:55:51.226960+01:00 | \n",
+ " 2021-01-05 11:55:51.226960+01:00 | \n",
+ " cf2918b25e6dcf8c30798ca05c8ec8ed | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2425 | \n",
+ " 2021-01-05 11:55:51.235606+01:00 | \n",
+ " 2021-01-05 11:55:51.235606+01:00 | \n",
+ " 2c8ee3f7c1487d792b6c946314e681f2 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 2937 | \n",
+ " 2021-01-05 11:55:51.240114+01:00 | \n",
+ " 2021-01-05 11:55:51.240114+01:00 | \n",
+ " 44e55c85e4eb59b3c3c01c137a6b25fc | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 48 | \n",
+ " 2021-01-05 11:55:51.244638+01:00 | \n",
+ " 2021-01-05 11:55:51.244638+01:00 | \n",
+ " ee3bb93b7e2217cd86a49d547fedf6c6 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 1 | \n",
+ " 7 | \n",
+ " 2021-01-05 11:55:51.249409+01:00 | \n",
+ " 2021-01-05 11:55:51.249409+01:00 | \n",
+ " ae701668574f1a653d2b21ddfd250620 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 563 | \n",
+ " 564 | \n",
+ " 4 | \n",
+ " 6656 | \n",
+ " 2022-02-18 16:15:58.872249+01:00 | \n",
+ " 2022-02-18 16:15:58.872249+01:00 | \n",
+ " f669824cdca9de9697f07ff3ba365a8d | \n",
+ "
\n",
+ " \n",
+ " 564 | \n",
+ " 565 | \n",
+ " 4 | \n",
+ " 6607 | \n",
+ " 2022-02-18 16:15:59.231018+01:00 | \n",
+ " 2022-02-18 16:15:59.231018+01:00 | \n",
+ " 6421c8146a598758139153b0e7b921ea | \n",
+ "
\n",
+ " \n",
+ " 565 | \n",
+ " 566 | \n",
+ " 4 | \n",
+ " 6700 | \n",
+ " 2022-02-18 16:15:59.724812+01:00 | \n",
+ " 2022-02-18 16:15:59.724812+01:00 | \n",
+ " 6823f6d4d80b322fbfb8b83545a9f96d | \n",
+ "
\n",
+ " \n",
+ " 566 | \n",
+ " 567 | \n",
+ " 4 | \n",
+ " 8118 | \n",
+ " 2022-02-18 16:16:00.163381+01:00 | \n",
+ " 2022-02-18 16:16:00.163381+01:00 | \n",
+ " 35cfc12584b4d1b94795d97fd0aa56e8 | \n",
+ "
\n",
+ " \n",
+ " 567 | \n",
+ " 569 | \n",
+ " 7 | \n",
+ " 48157 | \n",
+ " 2023-03-13 11:30:29.480161+01:00 | \n",
+ " 2023-03-13 11:30:29.480161+01:00 | \n",
+ " 55863541f33fd229ac9b54d9ec1f4874 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
568 rows × 6 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id type_of_id pricing_formula_id created_at \\\n",
+ "0 1 1 127 2021-01-05 11:55:51.226960+01:00 \n",
+ "1 2 1 2425 2021-01-05 11:55:51.235606+01:00 \n",
+ "2 3 1 2937 2021-01-05 11:55:51.240114+01:00 \n",
+ "3 4 1 48 2021-01-05 11:55:51.244638+01:00 \n",
+ "4 5 1 7 2021-01-05 11:55:51.249409+01:00 \n",
+ ".. ... ... ... ... \n",
+ "563 564 4 6656 2022-02-18 16:15:58.872249+01:00 \n",
+ "564 565 4 6607 2022-02-18 16:15:59.231018+01:00 \n",
+ "565 566 4 6700 2022-02-18 16:15:59.724812+01:00 \n",
+ "566 567 4 8118 2022-02-18 16:16:00.163381+01:00 \n",
+ "567 569 7 48157 2023-03-13 11:30:29.480161+01:00 \n",
+ "\n",
+ " updated_at identifier \n",
+ "0 2021-01-05 11:55:51.226960+01:00 cf2918b25e6dcf8c30798ca05c8ec8ed \n",
+ "1 2021-01-05 11:55:51.235606+01:00 2c8ee3f7c1487d792b6c946314e681f2 \n",
+ "2 2021-01-05 11:55:51.240114+01:00 44e55c85e4eb59b3c3c01c137a6b25fc \n",
+ "3 2021-01-05 11:55:51.244638+01:00 ee3bb93b7e2217cd86a49d547fedf6c6 \n",
+ "4 2021-01-05 11:55:51.249409+01:00 ae701668574f1a653d2b21ddfd250620 \n",
+ ".. ... ... \n",
+ "563 2022-02-18 16:15:58.872249+01:00 f669824cdca9de9697f07ff3ba365a8d \n",
+ "564 2022-02-18 16:15:59.231018+01:00 6421c8146a598758139153b0e7b921ea \n",
+ "565 2022-02-18 16:15:59.724812+01:00 6823f6d4d80b322fbfb8b83545a9f96d \n",
+ "566 2022-02-18 16:16:00.163381+01:00 35cfc12584b4d1b94795d97fd0aa56e8 \n",
+ "567 2023-03-13 11:30:29.480161+01:00 55863541f33fd229ac9b54d9ec1f4874 \n",
+ "\n",
+ "[568 rows x 6 columns]"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "type_of_pricing_formulas"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
"id": "57298669-8d55-40d5-a5aa-4c5df984eec7",
"metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "id int64\n",
+ "type_of_id int64\n",
+ "pricing_formula_id int64\n",
+ "created_at object\n",
+ "updated_at object\n",
+ "identifier object\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#type des variables\n",
+ "\n",
+ "type_of_pricing_formulas.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "c11850cb-8833-44c0-a11d-9695d620a42b",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " type_of_id | \n",
+ " pricing_formula_id | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " identifier | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "Empty DataFrame\n",
+ "Columns: [id, type_of_id, pricing_formula_id, created_at, updated_at, identifier]\n",
+ "Index: []"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#Identification des doublons\n",
+ "type_of_pricing_formulas.loc[type_of_pricing_formulas['id'].duplicated(keep=False),:]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7a40de03-5e18-4d3d-a0f8-da960c29fad8",
+ "metadata": {},
+ "source": [
+ "## II.products_groups"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "89909175-6734-4e8e-8632-d6f8ca812388",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "id 0\n",
+ "percent_price 0\n",
+ "max_price 0\n",
+ "min_price 0\n",
+ "category_id 0\n",
+ "pricing_formula_id 0\n",
+ "representation_id 0\n",
+ "created_at 0\n",
+ "updated_at 0\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#detection des Nan \n",
+ "\n",
+ "products_groups.isna().sum()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e0518684-c83c-4f0a-89ea-d7dcfd60051d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#variables retenues : [[\"id\",\"percent_price\",\"max_price\",\"min_price\",\"category_id\",\"pricing_formula_id\",\"representation_id\"]]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "6a187170-96c4-48d2-9568-b270f67e2c27",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "id int64\n",
+ "percent_price float64\n",
+ "max_price float64\n",
+ "min_price float64\n",
+ "category_id int64\n",
+ "pricing_formula_id int64\n",
+ "representation_id int64\n",
+ "created_at object\n",
+ "updated_at object\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#type des variables\n",
+ "\n",
+ "products_groups.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "2fba2cb0-a6a4-43b2-a854-3be07939c28b",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " percent_price | \n",
+ " max_price | \n",
+ " min_price | \n",
+ " category_id | \n",
+ " pricing_formula_id | \n",
+ " representation_id | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "Empty DataFrame\n",
+ "Columns: [id, percent_price, max_price, min_price, category_id, pricing_formula_id, representation_id, created_at, updated_at]\n",
+ "Index: []"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#Identification des doublons\n",
+ "products_groups.loc[products_groups[['id','pricing_formula_id','representation_id']].duplicated(keep=False),:]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5312ac13-8fbd-4c3f-a98a-8c28f079a599",
+ "metadata": {},
+ "source": [
+ "## III.pricing_formulas"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "3383a773-0817-4b23-84e7-8d5d0c74b179",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " name | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " extra_field | \n",
+ " identifier | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 41909 | \n",
+ " visite mécènes 1h30 | \n",
+ " 2022-07-08 07:08:26.802266+02:00 | \n",
+ " 2022-07-08 07:08:26.802266+02:00 | \n",
+ " NaN | \n",
+ " 21d4b0043c12b21952b0797d140991a1 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 502 | \n",
+ " entree mucem tp( expo picasso) | \n",
+ " 2020-09-03 13:43:59.816765+02:00 | \n",
+ " 2022-02-18 15:57:55.792581+01:00 | \n",
+ " NaN | \n",
+ " 223b09e6c3f1f75dbf8df019af97a555 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 504 | \n",
+ " nombre de personnes cinema | \n",
+ " 2020-09-03 13:43:59.818198+02:00 | \n",
+ " 2021-01-25 19:16:05.187114+01:00 | \n",
+ " NaN | \n",
+ " ba33b7b6d225a75d713a356b49c4d915 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 117 | \n",
+ " spectacle tarif e famille tr | \n",
+ " 2020-09-03 13:21:21.400249+02:00 | \n",
+ " 2023-03-13 11:30:29.525335+01:00 | \n",
+ " NaN | \n",
+ " a00b61ad933518856f86e63ca91a5750 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1496 | \n",
+ " billet nb famille mecene 1a | \n",
+ " 2020-09-03 14:29:33.320952+02:00 | \n",
+ " 2021-01-25 19:23:06.816402+01:00 | \n",
+ " NaN | \n",
+ " 7f6013803c242253a5ccde80f780984f | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 551 | \n",
+ " 529 | \n",
+ " billet nb expo gr | \n",
+ " 2020-09-03 13:43:59.835944+02:00 | \n",
+ " 2022-02-18 15:57:55.792581+01:00 | \n",
+ " NaN | \n",
+ " 7d888e42abe101fc8b21dc88948c8b74 | \n",
+ "
\n",
+ " \n",
+ " 552 | \n",
+ " 3153 | \n",
+ " nb pers visite scolaire rep | \n",
+ " 2020-09-03 16:32:37.068864+02:00 | \n",
+ " 2022-02-18 15:57:55.792581+01:00 | \n",
+ " NaN | \n",
+ " 3cf21731c25eee650d5b232ee4780563 | \n",
+ "
\n",
+ " \n",
+ " 553 | \n",
+ " 5847 | \n",
+ " visite scolaire rep1h00 | \n",
+ " 2021-06-09 18:10:49.742531+02:00 | \n",
+ " 2022-02-18 15:55:03.576236+01:00 | \n",
+ " NaN | \n",
+ " a7bb5a6892d55f0d5ee4ce5786ae5fc6 | \n",
+ "
\n",
+ " \n",
+ " 554 | \n",
+ " 5840 | \n",
+ " france billet - entree ts | \n",
+ " 2021-06-09 18:10:49.737576+02:00 | \n",
+ " 2022-02-18 16:16:00.199543+01:00 | \n",
+ " NaN | \n",
+ " 4c53016fc65847646f600eff853593e5 | \n",
+ "
\n",
+ " \n",
+ " 555 | \n",
+ " 5863 | \n",
+ " france billet - entree tp | \n",
+ " 2021-06-09 18:12:49.269924+02:00 | \n",
+ " 2022-02-18 16:16:00.199543+01:00 | \n",
+ " NaN | \n",
+ " 90e642c0e1ef6bc9f2bc43089798de00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
556 rows × 6 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id name created_at \\\n",
+ "0 41909 visite mécènes 1h30 2022-07-08 07:08:26.802266+02:00 \n",
+ "1 502 entree mucem tp( expo picasso) 2020-09-03 13:43:59.816765+02:00 \n",
+ "2 504 nombre de personnes cinema 2020-09-03 13:43:59.818198+02:00 \n",
+ "3 117 spectacle tarif e famille tr 2020-09-03 13:21:21.400249+02:00 \n",
+ "4 1496 billet nb famille mecene 1a 2020-09-03 14:29:33.320952+02:00 \n",
+ ".. ... ... ... \n",
+ "551 529 billet nb expo gr 2020-09-03 13:43:59.835944+02:00 \n",
+ "552 3153 nb pers visite scolaire rep 2020-09-03 16:32:37.068864+02:00 \n",
+ "553 5847 visite scolaire rep1h00 2021-06-09 18:10:49.742531+02:00 \n",
+ "554 5840 france billet - entree ts 2021-06-09 18:10:49.737576+02:00 \n",
+ "555 5863 france billet - entree tp 2021-06-09 18:12:49.269924+02:00 \n",
+ "\n",
+ " updated_at extra_field \\\n",
+ "0 2022-07-08 07:08:26.802266+02:00 NaN \n",
+ "1 2022-02-18 15:57:55.792581+01:00 NaN \n",
+ "2 2021-01-25 19:16:05.187114+01:00 NaN \n",
+ "3 2023-03-13 11:30:29.525335+01:00 NaN \n",
+ "4 2021-01-25 19:23:06.816402+01:00 NaN \n",
+ ".. ... ... \n",
+ "551 2022-02-18 15:57:55.792581+01:00 NaN \n",
+ "552 2022-02-18 15:57:55.792581+01:00 NaN \n",
+ "553 2022-02-18 15:55:03.576236+01:00 NaN \n",
+ "554 2022-02-18 16:16:00.199543+01:00 NaN \n",
+ "555 2022-02-18 16:16:00.199543+01:00 NaN \n",
+ "\n",
+ " identifier \n",
+ "0 21d4b0043c12b21952b0797d140991a1 \n",
+ "1 223b09e6c3f1f75dbf8df019af97a555 \n",
+ "2 ba33b7b6d225a75d713a356b49c4d915 \n",
+ "3 a00b61ad933518856f86e63ca91a5750 \n",
+ "4 7f6013803c242253a5ccde80f780984f \n",
+ ".. ... \n",
+ "551 7d888e42abe101fc8b21dc88948c8b74 \n",
+ "552 3cf21731c25eee650d5b232ee4780563 \n",
+ "553 a7bb5a6892d55f0d5ee4ce5786ae5fc6 \n",
+ "554 4c53016fc65847646f600eff853593e5 \n",
+ "555 90e642c0e1ef6bc9f2bc43089798de00 \n",
+ "\n",
+ "[556 rows x 6 columns]"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pricing_formulas"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "d8130c73-6c5f-45b1-93ae-db7679c8ca56",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "id 0.0\n",
+ "name 0.0\n",
+ "created_at 0.0\n",
+ "updated_at 0.0\n",
+ "extra_field 1.0\n",
+ "identifier 0.0\n",
+ "dtype: float64"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#detection des Nan \n",
+ "\n",
+ "pricing_formulas.isna().sum()/pricing_formulas.shape[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9f2909c1-bc6a-443f-a077-84f6ce6b7ab5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#variables retenues : [[\"id\",\"name\"]]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "44f1dbfd-c3cf-464b-9877-f37fcc61da92",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "id int64\n",
+ "name object\n",
+ "created_at object\n",
+ "updated_at object\n",
+ "extra_field float64\n",
+ "identifier object\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#type des variables\n",
+ "\n",
+ "pricing_formulas.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "6784b41b-da74-4fae-832e-16641ae710c1",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " name | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " extra_field | \n",
+ " identifier | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "Empty DataFrame\n",
+ "Columns: [id, name, created_at, updated_at, extra_field, identifier]\n",
+ "Index: []"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#Identification des doublons\n",
+ "pricing_formulas.loc[pricing_formulas[['id']].duplicated(keep=False),:]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2145b0a4-b73d-4530-8c12-a78b1cf86eae",
+ "metadata": {},
+ "source": [
+ "## IV. product_packs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "e36b07a7-4f0b-4711-86a0-12a1d8158eef",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "id 0.0\n",
+ "name 1.0\n",
+ "type_of 0.0\n",
+ "created_at 0.0\n",
+ "updated_at 0.0\n",
+ "identifier 0.0\n",
+ "dtype: float64"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#detection des Nan \n",
+ "\n",
+ "product_packs.isna().sum()/product_packs.shape[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e0887a01-51ea-4034-84fe-dc4dbf2ad949",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#variables retenues : [[\"id\",\"name\"]]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "8707396a-f86b-476d-a9f9-c39f8de1d02e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "id int64\n",
+ "name float64\n",
+ "type_of int64\n",
+ "created_at object\n",
+ "updated_at object\n",
+ "identifier object\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#type des variables\n",
+ "\n",
+ "product_packs.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "4b102bd3-924b-43da-8915-be7664c23f97",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " name | \n",
+ " type_of | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " identifier | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "Empty DataFrame\n",
+ "Columns: [id, name, type_of, created_at, updated_at, identifier]\n",
+ "Index: []"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#Identification des doublons\n",
+ "product_packs.loc[product_packs[['id']].duplicated(keep=False),:]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cfe0c525-896b-4731-b38e-306ff6ea0c65",
+ "metadata": {},
+ "source": [
+ "## V.products"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "968beb24-f70c-4eb6-8b1e-4b04bc7fe9c9",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "id 0.0\n",
+ "amount 0.0\n",
+ "is_full_price 0.0\n",
+ "representation_id 0.0\n",
+ "pricing_formula_id 0.0\n",
+ "created_at 0.0\n",
+ "updated_at 0.0\n",
+ "category_id 0.0\n",
+ "apply_price 0.0\n",
+ "products_group_id 0.0\n",
+ "product_pack_id 0.0\n",
+ "extra_field 1.0\n",
+ "amount_consumption 1.0\n",
+ "identifier 0.0\n",
+ "dtype: float64"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#detection des Nan \n",
+ "\n",
+ "products.isna().sum()/products.shape[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "15bc6ac6-67e8-4e2c-9641-7ee8bb2581a3",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "id int64\n",
+ "amount float64\n",
+ "is_full_price bool\n",
+ "representation_id int64\n",
+ "pricing_formula_id int64\n",
+ "created_at object\n",
+ "updated_at object\n",
+ "category_id int64\n",
+ "apply_price float64\n",
+ "products_group_id int64\n",
+ "product_pack_id int64\n",
+ "extra_field float64\n",
+ "amount_consumption float64\n",
+ "identifier object\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#type des variables\n",
+ "\n",
+ "products.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "7daa4f1a-e429-4daf-a2e1-1e311b487e09",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#dic_prod_princing=['type_of_pricing_formulas','products_groups','pricing_formulas','product_packs','products']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "dc12b746-6708-4708-826a-acb5a8e665a1",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " name | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " extra_field | \n",
+ " identifier | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 41909 | \n",
+ " visite mécènes 1h30 | \n",
+ " 2022-07-08 07:08:26.802266+02:00 | \n",
+ " 2022-07-08 07:08:26.802266+02:00 | \n",
+ " NaN | \n",
+ " 21d4b0043c12b21952b0797d140991a1 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 502 | \n",
+ " entree mucem tp( expo picasso) | \n",
+ " 2020-09-03 13:43:59.816765+02:00 | \n",
+ " 2022-02-18 15:57:55.792581+01:00 | \n",
+ " NaN | \n",
+ " 223b09e6c3f1f75dbf8df019af97a555 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 504 | \n",
+ " nombre de personnes cinema | \n",
+ " 2020-09-03 13:43:59.818198+02:00 | \n",
+ " 2021-01-25 19:16:05.187114+01:00 | \n",
+ " NaN | \n",
+ " ba33b7b6d225a75d713a356b49c4d915 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 117 | \n",
+ " spectacle tarif e famille tr | \n",
+ " 2020-09-03 13:21:21.400249+02:00 | \n",
+ " 2023-03-13 11:30:29.525335+01:00 | \n",
+ " NaN | \n",
+ " a00b61ad933518856f86e63ca91a5750 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1496 | \n",
+ " billet nb famille mecene 1a | \n",
+ " 2020-09-03 14:29:33.320952+02:00 | \n",
+ " 2021-01-25 19:23:06.816402+01:00 | \n",
+ " NaN | \n",
+ " 7f6013803c242253a5ccde80f780984f | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 551 | \n",
+ " 529 | \n",
+ " billet nb expo gr | \n",
+ " 2020-09-03 13:43:59.835944+02:00 | \n",
+ " 2022-02-18 15:57:55.792581+01:00 | \n",
+ " NaN | \n",
+ " 7d888e42abe101fc8b21dc88948c8b74 | \n",
+ "
\n",
+ " \n",
+ " 552 | \n",
+ " 3153 | \n",
+ " nb pers visite scolaire rep | \n",
+ " 2020-09-03 16:32:37.068864+02:00 | \n",
+ " 2022-02-18 15:57:55.792581+01:00 | \n",
+ " NaN | \n",
+ " 3cf21731c25eee650d5b232ee4780563 | \n",
+ "
\n",
+ " \n",
+ " 553 | \n",
+ " 5847 | \n",
+ " visite scolaire rep1h00 | \n",
+ " 2021-06-09 18:10:49.742531+02:00 | \n",
+ " 2022-02-18 15:55:03.576236+01:00 | \n",
+ " NaN | \n",
+ " a7bb5a6892d55f0d5ee4ce5786ae5fc6 | \n",
+ "
\n",
+ " \n",
+ " 554 | \n",
+ " 5840 | \n",
+ " france billet - entree ts | \n",
+ " 2021-06-09 18:10:49.737576+02:00 | \n",
+ " 2022-02-18 16:16:00.199543+01:00 | \n",
+ " NaN | \n",
+ " 4c53016fc65847646f600eff853593e5 | \n",
+ "
\n",
+ " \n",
+ " 555 | \n",
+ " 5863 | \n",
+ " france billet - entree tp | \n",
+ " 2021-06-09 18:12:49.269924+02:00 | \n",
+ " 2022-02-18 16:16:00.199543+01:00 | \n",
+ " NaN | \n",
+ " 90e642c0e1ef6bc9f2bc43089798de00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
556 rows × 6 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id name created_at \\\n",
+ "0 41909 visite mécènes 1h30 2022-07-08 07:08:26.802266+02:00 \n",
+ "1 502 entree mucem tp( expo picasso) 2020-09-03 13:43:59.816765+02:00 \n",
+ "2 504 nombre de personnes cinema 2020-09-03 13:43:59.818198+02:00 \n",
+ "3 117 spectacle tarif e famille tr 2020-09-03 13:21:21.400249+02:00 \n",
+ "4 1496 billet nb famille mecene 1a 2020-09-03 14:29:33.320952+02:00 \n",
+ ".. ... ... ... \n",
+ "551 529 billet nb expo gr 2020-09-03 13:43:59.835944+02:00 \n",
+ "552 3153 nb pers visite scolaire rep 2020-09-03 16:32:37.068864+02:00 \n",
+ "553 5847 visite scolaire rep1h00 2021-06-09 18:10:49.742531+02:00 \n",
+ "554 5840 france billet - entree ts 2021-06-09 18:10:49.737576+02:00 \n",
+ "555 5863 france billet - entree tp 2021-06-09 18:12:49.269924+02:00 \n",
+ "\n",
+ " updated_at extra_field \\\n",
+ "0 2022-07-08 07:08:26.802266+02:00 NaN \n",
+ "1 2022-02-18 15:57:55.792581+01:00 NaN \n",
+ "2 2021-01-25 19:16:05.187114+01:00 NaN \n",
+ "3 2023-03-13 11:30:29.525335+01:00 NaN \n",
+ "4 2021-01-25 19:23:06.816402+01:00 NaN \n",
+ ".. ... ... \n",
+ "551 2022-02-18 15:57:55.792581+01:00 NaN \n",
+ "552 2022-02-18 15:57:55.792581+01:00 NaN \n",
+ "553 2022-02-18 15:55:03.576236+01:00 NaN \n",
+ "554 2022-02-18 16:16:00.199543+01:00 NaN \n",
+ "555 2022-02-18 16:16:00.199543+01:00 NaN \n",
+ "\n",
+ " identifier \n",
+ "0 21d4b0043c12b21952b0797d140991a1 \n",
+ "1 223b09e6c3f1f75dbf8df019af97a555 \n",
+ "2 ba33b7b6d225a75d713a356b49c4d915 \n",
+ "3 a00b61ad933518856f86e63ca91a5750 \n",
+ "4 7f6013803c242253a5ccde80f780984f \n",
+ ".. ... \n",
+ "551 7d888e42abe101fc8b21dc88948c8b74 \n",
+ "552 3cf21731c25eee650d5b232ee4780563 \n",
+ "553 a7bb5a6892d55f0d5ee4ce5786ae5fc6 \n",
+ "554 4c53016fc65847646f600eff853593e5 \n",
+ "555 90e642c0e1ef6bc9f2bc43089798de00 \n",
+ "\n",
+ "[556 rows x 6 columns]"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pricing_formulas"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "46aad10f-8530-410e-872b-bb253c553a46",
+ "metadata": {},
+ "source": [
+ "# jointure entre les bases"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a4c3edd1-6d58-4c57-b3e4-0ef3529f6b8c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#dic_prod_princing=['type_of_pricing_formulas','products_groups','pricing_formulas','product_packs','products']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 74,
+ "id": "eac537e1-bbad-45bc-a85c-12b675da1088",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#Merge1 entre products et pricing_formulas\n",
+ "base1=products.merge(pricing_formulas, how='left', left_on= 'pricing_formula_id', right_on= 'id', suffixes = (\"_products\", \"_pricing_formula\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 78,
+ "id": "75be3a30-3114-432d-87d6-697533c3c871",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#Merge2 entre base1 et products_groups\n",
+ "base2=base1.merge(products_groups, how='left', left_on= 'id_pricing_formula', right_on= 'id', suffixes = (\"_merge2\", \"_product_group\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 81,
+ "id": "34a169c6-07a8-4ac3-a9e1-d7e7461f7310",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#Merge3 entre base2 et type_of_pricing_formulas\n",
+ "base3=base2.merge(type_of_pricing_formulas, how='left', left_on= 'id_pricing_formula', right_on= 'pricing_formula_id', suffixes = (\"_merge3\", \"_type_of_pricing_f\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 89,
+ "id": "f44f40d2-5304-4931-b7e6-fcc06b2657b6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#Merge4 entre base3 et product_packs\n",
+ "df_product_pricing=base3.merge(product_packs, how='left', left_on= 'product_pack_id', right_on= 'id', suffixes = (\"_merge4\", \"_product_pack\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 90,
+ "id": "a28772c3-7bc1-46b4-acc8-1388dc60ec98",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id_products | \n",
+ " amount | \n",
+ " is_full_price | \n",
+ " representation_id_merge2 | \n",
+ " pricing_formula_id_merge2 | \n",
+ " created_at_products | \n",
+ " updated_at_products | \n",
+ " category_id_merge2 | \n",
+ " apply_price | \n",
+ " products_group_id | \n",
+ " ... | \n",
+ " pricing_formula_id | \n",
+ " created_at_type_of_pricing_f | \n",
+ " updated_at_type_of_pricing_f | \n",
+ " identifier_merge4 | \n",
+ " id | \n",
+ " name_product_pack | \n",
+ " type_of | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " identifier_product_pack | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 10682 | \n",
+ " 9.0 | \n",
+ " False | \n",
+ " 914 | \n",
+ " 114 | \n",
+ " 2020-09-03 14:09:43.119798+02:00 | \n",
+ " 2020-09-03 14:09:43.119798+02:00 | \n",
+ " 41 | \n",
+ " 0.0 | \n",
+ " 10655 | \n",
+ " ... | \n",
+ " 114.0 | \n",
+ " 2021-02-15 17:02:27.395376+01:00 | \n",
+ " 2021-02-15 17:02:27.395376+01:00 | \n",
+ " 3706121eb9f43b635bef1433c06f679c | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 2020-09-03 13:11:24.501197+02:00 | \n",
+ " 2020-09-03 13:11:24.501197+02:00 | \n",
+ " a764b4bf13a360c7ac2a35ec4ca96c95 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 478 | \n",
+ " 9.5 | \n",
+ " False | \n",
+ " 273 | \n",
+ " 131 | \n",
+ " 2020-09-03 13:21:22.711773+02:00 | \n",
+ " 2020-09-03 13:21:22.711773+02:00 | \n",
+ " 1 | \n",
+ " 0.0 | \n",
+ " 471 | \n",
+ " ... | \n",
+ " 131.0 | \n",
+ " 2021-02-05 11:52:05.923905+01:00 | \n",
+ " 2021-02-05 11:52:05.923905+01:00 | \n",
+ " 0aceb248607671792298436004b95275 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 2020-09-03 13:11:24.501197+02:00 | \n",
+ " 2020-09-03 13:11:24.501197+02:00 | \n",
+ " a764b4bf13a360c7ac2a35ec4ca96c95 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 20873 | \n",
+ " 11.5 | \n",
+ " False | \n",
+ " 275 | \n",
+ " 137 | \n",
+ " 2020-09-03 14:46:33.589030+02:00 | \n",
+ " 2020-09-03 14:46:33.589030+02:00 | \n",
+ " 1 | \n",
+ " 0.0 | \n",
+ " 20825 | \n",
+ " ... | \n",
+ " 137.0 | \n",
+ " 2021-02-05 11:52:05.939898+01:00 | \n",
+ " 2021-02-05 11:52:05.939898+01:00 | \n",
+ " 93002d4637331edd81ffc28b6e8e89c0 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 2020-09-03 13:11:24.501197+02:00 | \n",
+ " 2020-09-03 13:11:24.501197+02:00 | \n",
+ " a764b4bf13a360c7ac2a35ec4ca96c95 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 157142 | \n",
+ " 8.0 | \n",
+ " False | \n",
+ " 82519 | \n",
+ " 9 | \n",
+ " 2022-01-28 19:29:23.525722+01:00 | \n",
+ " 2022-01-28 19:29:23.525722+01:00 | \n",
+ " 5 | \n",
+ " 0.0 | \n",
+ " 156773 | \n",
+ " ... | \n",
+ " 9.0 | \n",
+ " 2021-02-05 11:52:06.107939+01:00 | \n",
+ " 2021-02-05 11:52:06.107939+01:00 | \n",
+ " 7d0b25bdfff9f366da8be820608c8191 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 2020-09-03 13:11:24.501197+02:00 | \n",
+ " 2020-09-03 13:11:24.501197+02:00 | \n",
+ " a764b4bf13a360c7ac2a35ec4ca96c95 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1341 | \n",
+ " 8.5 | \n",
+ " False | \n",
+ " 9 | \n",
+ " 93 | \n",
+ " 2020-09-03 13:29:30.773089+02:00 | \n",
+ " 2020-09-03 13:29:30.773089+02:00 | \n",
+ " 1 | \n",
+ " 0.0 | \n",
+ " 1175 | \n",
+ " ... | \n",
+ " 93.0 | \n",
+ " 2021-02-05 11:52:06.004162+01:00 | \n",
+ " 2021-02-05 11:52:06.004162+01:00 | \n",
+ " 1dbb0795e8f47cb75ba7cdb08c06be5f | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " 2020-09-03 13:11:24.501197+02:00 | \n",
+ " 2020-09-03 13:11:24.501197+02:00 | \n",
+ " a764b4bf13a360c7ac2a35ec4ca96c95 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 41 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id_products amount is_full_price representation_id_merge2 \\\n",
+ "0 10682 9.0 False 914 \n",
+ "1 478 9.5 False 273 \n",
+ "2 20873 11.5 False 275 \n",
+ "3 157142 8.0 False 82519 \n",
+ "4 1341 8.5 False 9 \n",
+ "\n",
+ " pricing_formula_id_merge2 created_at_products \\\n",
+ "0 114 2020-09-03 14:09:43.119798+02:00 \n",
+ "1 131 2020-09-03 13:21:22.711773+02:00 \n",
+ "2 137 2020-09-03 14:46:33.589030+02:00 \n",
+ "3 9 2022-01-28 19:29:23.525722+01:00 \n",
+ "4 93 2020-09-03 13:29:30.773089+02:00 \n",
+ "\n",
+ " updated_at_products category_id_merge2 apply_price \\\n",
+ "0 2020-09-03 14:09:43.119798+02:00 41 0.0 \n",
+ "1 2020-09-03 13:21:22.711773+02:00 1 0.0 \n",
+ "2 2020-09-03 14:46:33.589030+02:00 1 0.0 \n",
+ "3 2022-01-28 19:29:23.525722+01:00 5 0.0 \n",
+ "4 2020-09-03 13:29:30.773089+02:00 1 0.0 \n",
+ "\n",
+ " products_group_id ... pricing_formula_id \\\n",
+ "0 10655 ... 114.0 \n",
+ "1 471 ... 131.0 \n",
+ "2 20825 ... 137.0 \n",
+ "3 156773 ... 9.0 \n",
+ "4 1175 ... 93.0 \n",
+ "\n",
+ " created_at_type_of_pricing_f updated_at_type_of_pricing_f \\\n",
+ "0 2021-02-15 17:02:27.395376+01:00 2021-02-15 17:02:27.395376+01:00 \n",
+ "1 2021-02-05 11:52:05.923905+01:00 2021-02-05 11:52:05.923905+01:00 \n",
+ "2 2021-02-05 11:52:05.939898+01:00 2021-02-05 11:52:05.939898+01:00 \n",
+ "3 2021-02-05 11:52:06.107939+01:00 2021-02-05 11:52:06.107939+01:00 \n",
+ "4 2021-02-05 11:52:06.004162+01:00 2021-02-05 11:52:06.004162+01:00 \n",
+ "\n",
+ " identifier_merge4 id name_product_pack type_of \\\n",
+ "0 3706121eb9f43b635bef1433c06f679c 1 NaN 0 \n",
+ "1 0aceb248607671792298436004b95275 1 NaN 0 \n",
+ "2 93002d4637331edd81ffc28b6e8e89c0 1 NaN 0 \n",
+ "3 7d0b25bdfff9f366da8be820608c8191 1 NaN 0 \n",
+ "4 1dbb0795e8f47cb75ba7cdb08c06be5f 1 NaN 0 \n",
+ "\n",
+ " created_at updated_at \\\n",
+ "0 2020-09-03 13:11:24.501197+02:00 2020-09-03 13:11:24.501197+02:00 \n",
+ "1 2020-09-03 13:11:24.501197+02:00 2020-09-03 13:11:24.501197+02:00 \n",
+ "2 2020-09-03 13:11:24.501197+02:00 2020-09-03 13:11:24.501197+02:00 \n",
+ "3 2020-09-03 13:11:24.501197+02:00 2020-09-03 13:11:24.501197+02:00 \n",
+ "4 2020-09-03 13:11:24.501197+02:00 2020-09-03 13:11:24.501197+02:00 \n",
+ "\n",
+ " identifier_product_pack \n",
+ "0 a764b4bf13a360c7ac2a35ec4ca96c95 \n",
+ "1 a764b4bf13a360c7ac2a35ec4ca96c95 \n",
+ "2 a764b4bf13a360c7ac2a35ec4ca96c95 \n",
+ "3 a764b4bf13a360c7ac2a35ec4ca96c95 \n",
+ "4 a764b4bf13a360c7ac2a35ec4ca96c95 \n",
+ "\n",
+ "[5 rows x 41 columns]"
+ ]
+ },
+ "execution_count": 90,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_product_pricing.head(5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "03442997-806f-4285-a139-3bad46bb4522",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d22a0d75-53c5-4b54-9060-c9e7c307fb13",
+ "metadata": {},
"outputs": [],
"source": []
}