diff --git a/.gitignore b/.gitignore index f540ee9..3993ec5 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,187 @@ -.ipynb_checkpoints/Clean-Notebook-checkpoint.ipynb +# Created by https://www.toptal.com/developers/gitignore/api/jupyternotebooks,python +# Edit at https://www.toptal.com/developers/gitignore?templates=jupyternotebooks,python + +### JupyterNotebooks ### +# gitignore template for Jupyter Notebooks +# website: http://jupyter.org/ + +.ipynb_checkpoints +*/.ipynb_checkpoints/* + +# IPython +profile_default/ +ipython_config.py + +# Remove previous ipynb_checkpoints +# git rm -r .ipynb_checkpoints/ + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook + +# IPython + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
+# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/ + +### Python Patch ### +# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration +poetry.toml + +# ruff +.ruff_cache/ + +# LSP config files +pyrightconfig.json + +# End of https://www.toptal.com/developers/gitignore/api/jupyternotebooks,python \ No newline at end of file diff --git a/.gitignore.txt b/.gitignore.txt deleted file mode 100644 index ff6eda2..0000000 --- a/.gitignore.txt +++ /dev/null @@ -1,187 +0,0 @@ -# Created by https://www.toptal.com/developers/gitignore/api/jupyternotebooks,python -# Edit at https://www.toptal.com/developers/gitignore?templates=jupyternotebooks,python - -### JupyterNotebooks ### -# gitignore template for Jupyter Notebooks -# website: http://jupyter.org/ - -.ipynb_checkpoints -*/.ipynb_checkpoints/* - -# IPython -profile_default/ -ipython_config.py - -# Remove previous ipynb_checkpoints -# git rm -r .ipynb_checkpoints/ - -### Python ### -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. 
-*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook - -# IPython - -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# poetry -# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -#poetry.lock - -# pdm -# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. -#pdm.lock -# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it -# in version control. -# https://pdm.fming.dev/#use-with-ide -.pdm.toml - -# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ - -# Cython debug symbols -cython_debug/ - -# PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ - -### Python Patch ### -# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration -poetry.toml - -# ruff -.ruff_cache/ - -# LSP config files -pyrightconfig.json - -# End of https://www.toptal.com/developers/gitignore/api/jupyternotebooks,python \ No newline at end of file diff --git a/.ipynb_checkpoints/Notebook_AJ-checkpoint.ipynb b/.ipynb_checkpoints/Notebook_AJ-checkpoint.ipynb deleted file mode 100644 index 2cdf609..0000000 --- a/.ipynb_checkpoints/Notebook_AJ-checkpoint.ipynb +++ /dev/null @@ -1,76 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "3ba1f385-2a2f-4b0c-be79-66f618469a9f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['bdc2324-data/1',\n", - " 'bdc2324-data/10',\n", - " 'bdc2324-data/101',\n", - " 'bdc2324-data/11',\n", - " 'bdc2324-data/12',\n", - " 'bdc2324-data/13',\n", - " 'bdc2324-data/14',\n", - " 'bdc2324-data/2',\n", - " 'bdc2324-data/3',\n", - " 'bdc2324-data/4',\n", - " 'bdc2324-data/5',\n", - " 'bdc2324-data/6',\n", - " 
'bdc2324-data/7',\n", - " 'bdc2324-data/8',\n", - " 'bdc2324-data/9']" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import os\n", - "import s3fs\n", - "\n", - "# Create filesystem object\n", - "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", - "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n", - "\n", - "BUCKET = \"bdc2324-data\"\n", - "fs.ls(BUCKET)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "023bfa2b-97c2-4d53-80fb-e2290c73b92f", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/0_Cleaning_and_merge.ipynb b/0_Cleaning_and_merge.ipynb new file mode 100644 index 0000000..3f3b639 --- /dev/null +++ b/0_Cleaning_and_merge.ipynb @@ -0,0 +1,1465 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ad414c84-be46-4d2c-be8b-9fc4d24cc672", + "metadata": {}, + "source": [ + "# Business Data Challenge - Team 1" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "15103481-8d74-404c-aa09-7601fe7730da", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import os\n", + "import s3fs\n", + "import re" + ] + }, + { + "cell_type": "markdown", + "id": "ee97665c-39af-4c1c-a62b-c9c79feae18f", + "metadata": {}, + "source": [ + "Configuration de l'accès aux données" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "5d83bb1a-d341-446e-91f6-1c428607f6d4", + "metadata": {}, + "outputs": [], + "source": [ + "# Create 
filesystem object\n", + "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", + "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})" + ] + }, + { + "cell_type": "markdown", + "id": "9cbd72c5-6f8e-4366-ab66-96c32c6e963a", + "metadata": {}, + "source": [ + "# Exemple sur Company 1" + ] + }, + { + "cell_type": "markdown", + "id": "db26e59a-927c-407e-b54b-1815473b0b34", + "metadata": {}, + "source": [ + "## Chargement données" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "699664b9-eee4-4f8d-a207-e524526560c5", + "metadata": {}, + "outputs": [], + "source": [ + "BUCKET = \"bdc2324-data/1\"\n", + "liste_database = fs.ls(BUCKET)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "aaf64d60-bf92-470c-8210-d09abd6a653e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['bdc2324-data/1/1campaign_stats.csv',\n", + " 'bdc2324-data/1/1campaigns.csv',\n", + " 'bdc2324-data/1/1categories.csv',\n", + " 'bdc2324-data/1/1countries.csv',\n", + " 'bdc2324-data/1/1currencies.csv',\n", + " 'bdc2324-data/1/1customer_target_mappings.csv',\n", + " 'bdc2324-data/1/1customersplus.csv',\n", + " 'bdc2324-data/1/1event_types.csv',\n", + " 'bdc2324-data/1/1events.csv',\n", + " 'bdc2324-data/1/1facilities.csv',\n", + " 'bdc2324-data/1/1link_stats.csv',\n", + " 'bdc2324-data/1/1pricing_formulas.csv',\n", + " 'bdc2324-data/1/1product_packs.csv',\n", + " 'bdc2324-data/1/1products.csv',\n", + " 'bdc2324-data/1/1products_groups.csv',\n", + " 'bdc2324-data/1/1purchases.csv',\n", + " 'bdc2324-data/1/1representation_category_capacities.csv',\n", + " 'bdc2324-data/1/1representations.csv',\n", + " 'bdc2324-data/1/1seasons.csv',\n", + " 'bdc2324-data/1/1structure_tag_mappings.csv',\n", + " 'bdc2324-data/1/1suppliers.csv',\n", + " 'bdc2324-data/1/1tags.csv',\n", + " 'bdc2324-data/1/1target_types.csv',\n", + " 'bdc2324-data/1/1targets.csv',\n", + " 'bdc2324-data/1/1tickets.csv',\n", + " 
def cleaning_date(df, column_name):
    """Convert one column of ``df`` to timezone-aware UTC datetimes, in place.

    The column is parsed with ``pd.to_datetime(..., utc=True, format='ISO8601')``
    and assigned back onto ``df``. The same ``df`` object is returned, so the
    function can be used for its side effect or inside an assignment.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to convert. It is modified in place: pass an
        independent frame (e.g. ``frame[cols].copy()``) rather than a column
        slice, otherwise pandas reports ``SettingWithCopyWarning``.
    column_name : str
        Name of the column to convert.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with ``column_name`` converted.
    """
    df[column_name] = pd.to_datetime(df[column_name], utc=True, format='ISO8601')
    return df


def preprocessing_tickets_area(tickets=None, purchases=None, suppliers=None, type_ofs=None):
    """Build one row per ticket, enriched with supplier name and purchase data.

    Parameters
    ----------
    tickets : pandas.DataFrame
        Must contain ``id``, ``purchase_id``, ``product_id``,
        ``is_from_subscription``, ``type_of`` and ``supplier_id``.
    purchases : pandas.DataFrame
        Must contain ``id``, ``purchase_date`` and ``customer_id``.
        NOTE: ``purchase_date`` is converted to UTC datetimes directly on the
        frame passed in (historical behaviour, kept for callers that rely on it).
    suppliers : pandas.DataFrame
        Must contain ``id`` and ``name``.
    type_ofs : pandas.DataFrame, optional
        Accepted for interface compatibility; currently unused because the
        merge on ticket types is disabled.

    Returns
    -------
    pandas.DataFrame
        Columns ``ticket_id``, ``product_id``, ``is_from_subscription``,
        ``type_of``, ``supplier_name``, ``purchase_date``, ``customer_id``.
        Both joins are inner joins, so tickets without a matching supplier or
        purchase are dropped.
    """
    # Select-then-rename returns brand-new frames: renaming a column slice with
    # inplace=True is what raised SettingWithCopyWarning before.
    tickets = tickets[
        ['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']
    ].rename(columns={'id': 'ticket_id'})
    suppliers = suppliers[['id', 'name']].rename(columns={'name': 'supplier_name'})

    # Normalise the purchase date before selecting the purchase columns.
    cleaning_date(purchases, 'purchase_date')
    purchases = purchases[['id', 'purchase_date', 'customer_id']]

    # Tickets <- suppliers; the join keys are dropped once used.
    ticket_information = pd.merge(
        tickets, suppliers, left_on='supplier_id', right_on='id', how='inner'
    )
    ticket_information = ticket_information.drop(columns=['supplier_id', 'id'])

    # Tickets <- purchases.
    ticket_information = pd.merge(
        ticket_information, purchases, left_on='purchase_id', right_on='id', how='inner'
    )
    ticket_information = ticket_information.drop(columns=['purchase_id', 'id'])

    return ticket_information
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ticket_idproduct_idis_from_subscriptiontype_ofsupplier_namepurchase_datecustomer_id
013070859225251False1vente en ligne2018-12-28 14:47:50+00:0048187
113070860224914False1vente en ligne2018-12-28 14:47:50+00:0048187
213070861224914False1vente en ligne2018-12-28 14:47:50+00:0048187
313070862224914False1vente en ligne2018-12-28 14:47:50+00:0048187
413070863224914False1vente en ligne2018-12-28 14:47:50+00:0048187
........................
182666720662815405689False1vente en ligne2023-11-08 17:23:54+00:001256135
182666820662816403658False1vente en ligne2023-11-08 18:32:18+00:001256136
182666920662817403658False1vente en ligne2023-11-08 18:32:18+00:001256136
182667020662818403658False1vente en ligne2023-11-08 19:30:28+00:001256137
182667120662819403658False1vente en ligne2023-11-08 19:30:28+00:001256137
\n", + "

1826672 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " ticket_id product_id is_from_subscription type_of supplier_name \\\n", + "0 13070859 225251 False 1 vente en ligne \n", + "1 13070860 224914 False 1 vente en ligne \n", + "2 13070861 224914 False 1 vente en ligne \n", + "3 13070862 224914 False 1 vente en ligne \n", + "4 13070863 224914 False 1 vente en ligne \n", + "... ... ... ... ... ... \n", + "1826667 20662815 405689 False 1 vente en ligne \n", + "1826668 20662816 403658 False 1 vente en ligne \n", + "1826669 20662817 403658 False 1 vente en ligne \n", + "1826670 20662818 403658 False 1 vente en ligne \n", + "1826671 20662819 403658 False 1 vente en ligne \n", + "\n", + " purchase_date customer_id \n", + "0 2018-12-28 14:47:50+00:00 48187 \n", + "1 2018-12-28 14:47:50+00:00 48187 \n", + "2 2018-12-28 14:47:50+00:00 48187 \n", + "3 2018-12-28 14:47:50+00:00 48187 \n", + "4 2018-12-28 14:47:50+00:00 48187 \n", + "... ... ... \n", + "1826667 2023-11-08 17:23:54+00:00 1256135 \n", + "1826668 2023-11-08 18:32:18+00:00 1256136 \n", + "1826669 2023-11-08 18:32:18+00:00 1256136 \n", + "1826670 2023-11-08 19:30:28+00:00 1256137 \n", + "1826671 2023-11-08 19:30:28+00:00 1256137 \n", + "\n", + "[1826672 rows x 7 columns]" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_ticket_information" + ] + }, + { + "cell_type": "markdown", + "id": "096e47f4-1d65-4575-989d-83227eedad2b", + "metadata": {}, + "source": [ + "## Target area" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "baed146a-9d3a-4397-a812-3d50c9a2f038", + "metadata": {}, + "outputs": [], + "source": [ + "def preprocessing_target_area(targets = None, target_types = None, customer_target_mappings = None):\n", + " # Target.csv cleaning\n", + " targets = targets[[\"id\", \"target_type_id\", \"name\"]]\n", + " targets.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n", + " \n", + " # target_type cleaning\n", + " 
target_types = target_types[[\"id\",\"is_import\",\"name\"]].add_prefix(\"target_type_\")\n", + " \n", + " #customer_target_mappings cleaning\n", + " customer_target_mappings = customer_target_mappings[[\"id\", \"customer_id\", \"target_id\"]]\n", + " \n", + " # Merge target et target_type\n", + " targets_full = pd.merge(targets, target_types, left_on='target_type_id', right_on='target_type_id', how='inner')\n", + " targets_full.drop(['target_type_id'], axis = 1, inplace=True)\n", + " \n", + " # Merge\n", + " targets_full = pd.merge(customer_target_mappings, targets_full, left_on='target_id', right_on='target_id', how='inner')\n", + " targets_full.drop(['target_id'], axis = 1, inplace=True)\n", + "\n", + " return targets_full" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "5fbfd88b-b94c-489c-9201-670e96e453e7", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_50143/3848597476.py:4: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " targets.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n" + ] + } + ], + "source": [ + "df1_target_information = preprocessing_target_area(targets = df1_targets, target_types = df1_target_types, customer_target_mappings = df1_customer_target_mappings)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "b4f05142-2a22-42ef-a60d-f23cc4b5cb09", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_id
target_name
consentement optin mediation specialisee150000
consentement optin jeune public149979
consentement optin b2c108909
Arenametrix_bascule tel vers sib35216
consentement optout b2c34523
......
Automation_parrainage_newsletter_handicap_visuel1
consentement optout mediation specialisee1
Inscrits NL LSF formulaire1
Market auto - contacts inactifs post-scénario1
Inactifs - fin du scénario1
\n", + "

283 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " customer_id\n", + "target_name \n", + "consentement optin mediation specialisee 150000\n", + "consentement optin jeune public 149979\n", + "consentement optin b2c 108909\n", + "Arenametrix_bascule tel vers sib 35216\n", + "consentement optout b2c 34523\n", + "... ...\n", + "Automation_parrainage_newsletter_handicap_visuel 1\n", + "consentement optout mediation specialisee 1\n", + "Inscrits NL LSF formulaire 1\n", + "Market auto - contacts inactifs post-scénario 1\n", + "Inactifs - fin du scénario 1\n", + "\n", + "[283 rows x 1 columns]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_target_information[['target_name', 'customer_id']].groupby('target_name').count().sort_values(by='customer_id', ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "4417ff51-f501-4ab9-a192-4ab75764a8ed", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_id
target_name
Arenametrix_bascule tel vers sib35216
Autres_interet_exposition1021
COM Inscrits NL générale (historique)23005
Contacts_prenomsdoubles11643
DDCP MD Procès du Siècle1684
DDCP Newsletter centres de loisirs1032
DDCP Newsletter enseignants4510
DDCP Newsletter jeune public3862
DDCP Newsletter relais champ social2270
DDCP PROMO Participants ateliers (adultes et enfants)1954
DDCP billets famille3609
DDCP promo MD pass musées dps oct 20181785
DDCP promo Plan B 2019 (concerts)1948
DDCP promo spectateurs prog 21-22 (spectacles, ciné, ateliers)1293
DDCP rentrée culturelle 20231757
DDCP_marseille_jazz_20231043
DRE Festival Jean Rouch1502
DRE MucemLab2302
DRE chercheurs1557
DRE institutionnels2229
FORMATION _ acheteurs optin last year10485
Inscrits NL générale (export_291019 + operation_videomaton)14086
Inscrits NL générale site web3732
Inscrits NL jeune public site web1249
Votre première liste3715
consentement optin b2b12735
consentement optin b2c108909
consentement optin dre4527
consentement optin jeune public149979
consentement optin mediation specialisee150000
consentement optin newsletter generale22095
consentement optin scolaires4849
consentement optout b2b14219
consentement optout b2c34523
consentement optout dre14328
consentement optout newsletter generale18855
consentement optout scolaires15744
ddcp_md_scene_ouverte_au_talent1577
ddcp_promo_MD_billet_musée_oct_2019_agarder25482
ddcp_promo_md_musée_dps 0110196010
ddcp_promo_visiteurs occasionnels_musee_8mois6640
ddcp_visiteurs dps 01062212355
festival_jean_rouch1502
rappel po barvalo1248
structures_etiquette champ social1488
\n", + "
" + ], + "text/plain": [ + " customer_id\n", + "target_name \n", + "Arenametrix_bascule tel vers sib 35216\n", + "Autres_interet_exposition 1021\n", + "COM Inscrits NL générale (historique) 23005\n", + "Contacts_prenomsdoubles 11643\n", + "DDCP MD Procès du Siècle 1684\n", + "DDCP Newsletter centres de loisirs 1032\n", + "DDCP Newsletter enseignants 4510\n", + "DDCP Newsletter jeune public 3862\n", + "DDCP Newsletter relais champ social 2270\n", + "DDCP PROMO Participants ateliers (adultes et en... 1954\n", + "DDCP billets famille 3609\n", + "DDCP promo MD pass musées dps oct 2018 1785\n", + "DDCP promo Plan B 2019 (concerts) 1948\n", + "DDCP promo spectateurs prog 21-22 (spectacles, ... 1293\n", + "DDCP rentrée culturelle 2023 1757\n", + "DDCP_marseille_jazz_2023 1043\n", + "DRE Festival Jean Rouch 1502\n", + "DRE MucemLab 2302\n", + "DRE chercheurs 1557\n", + "DRE institutionnels 2229\n", + "FORMATION _ acheteurs optin last year 10485\n", + "Inscrits NL générale (export_291019 + operation... 
def preprocessing_campaigns_area(campaign_stats=None, campaigns=None):
    """Join per-customer campaign statistics with the campaign they belong to.

    Parameters
    ----------
    campaign_stats : pandas.DataFrame
        Must contain ``id``, ``campaign_id``, ``customer_id``, ``opened_at``,
        ``sent_at`` and ``delivered_at``.
    campaigns : pandas.DataFrame
        Must contain ``id``, ``name``, ``service_id`` and ``sent_at``.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``customer_id``, ``opened_at``, ``sent_at``,
        ``delivered_at``, ``campaign_name``, ``campaign_service_id`` and
        ``campaign_sent_at``. The join is a left join, so every statistics row
        is kept even when its campaign is unknown.
    """
    # Work on an explicit copy: cleaning_date assigns into the frame it gets,
    # and doing that on a bare column slice raised SettingWithCopyWarning.
    campaign_stats = campaign_stats[
        ["id", "campaign_id", "customer_id", "opened_at", "sent_at", "delivered_at"]
    ].copy()
    for date_column in ('opened_at', 'sent_at', 'delivered_at'):
        cleaning_date(campaign_stats, date_column)

    # Prefixing turns id / name / service_id / sent_at into campaign_id /
    # campaign_name / campaign_service_id / campaign_sent_at; add_prefix
    # already returns a new frame.
    campaigns = campaigns[["id", "name", "service_id", "sent_at"]].add_prefix("campaign_")
    cleaning_date(campaigns, 'campaign_sent_at')

    # Statistics <- campaigns; the join key is dropped once used.
    campaigns_full = pd.merge(campaign_stats, campaigns, on="campaign_id", how="left")
    campaigns_full = campaigns_full.drop(columns=['campaign_id'])

    return campaigns_full
'ISO8601')\n" + ] + } + ], + "source": [ + "df1_campaigns_information = preprocessing_campaigns_area(campaign_stats = df1_campaign_stats, campaigns = df1_campaigns)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "c24457e7-3cad-451a-a65b-7373b656bd6e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idcustomer_idopened_atsent_atdelivered_atcampaign_namecampaign_service_idcampaign_sent_at
019793112597NaT2021-03-28 16:01:09+00:002021-03-28 16:24:18+00:00Le Mucem chez vous, gardons le lien #224042021-03-27 23:00:00+00:00
114211113666NaT2021-03-28 16:01:09+00:002021-03-28 16:21:02+00:00Le Mucem chez vous, gardons le lien #224042021-03-27 23:00:00+00:00
213150280561NaT2021-03-28 16:00:59+00:002021-03-28 16:08:45+00:00Le Mucem chez vous, gardons le lien #224042021-03-27 23:00:00+00:00
370731010072021-03-28 18:11:06+00:002021-03-28 16:00:59+00:002021-03-28 16:09:47+00:00Le Mucem chez vous, gardons le lien #224042021-03-27 23:00:00+00:00
45175103972NaT2021-03-28 16:01:06+00:002021-03-28 16:05:03+00:00Le Mucem chez vous, gardons le lien #224042021-03-27 23:00:00+00:00
...........................
621480383029942661552023-10-23 09:43:25+00:002023-10-23 09:32:33+00:002023-10-23 09:32:34+00:00dre_nov_202313182023-10-23 09:31:17+00:00
62148048303307213552023-10-23 09:44:02+00:002023-10-23 09:32:49+00:002023-10-23 09:32:49+00:00dre_nov_202313182023-10-23 09:31:17+00:00
62148058304346218492023-10-23 09:45:52+00:002023-10-23 09:33:28+00:002023-10-23 09:33:29+00:00dre_nov_202313182023-10-23 09:31:17+00:00
621480683020376677892023-10-23 09:47:32+00:002023-10-23 09:31:53+00:002023-10-23 09:31:54+00:00dre_nov_202313182023-10-23 09:31:17+00:00
62148078304939294154NaT2023-10-23 09:33:54+00:002023-10-23 09:33:55+00:00dre_nov_202313182023-10-23 09:31:17+00:00
\n", + "

6214808 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " id customer_id opened_at \\\n", + "0 19793 112597 NaT \n", + "1 14211 113666 NaT \n", + "2 13150 280561 NaT \n", + "3 7073 101007 2021-03-28 18:11:06+00:00 \n", + "4 5175 103972 NaT \n", + "... ... ... ... \n", + "6214803 8302994 266155 2023-10-23 09:43:25+00:00 \n", + "6214804 8303307 21355 2023-10-23 09:44:02+00:00 \n", + "6214805 8304346 21849 2023-10-23 09:45:52+00:00 \n", + "6214806 8302037 667789 2023-10-23 09:47:32+00:00 \n", + "6214807 8304939 294154 NaT \n", + "\n", + " sent_at delivered_at \\\n", + "0 2021-03-28 16:01:09+00:00 2021-03-28 16:24:18+00:00 \n", + "1 2021-03-28 16:01:09+00:00 2021-03-28 16:21:02+00:00 \n", + "2 2021-03-28 16:00:59+00:00 2021-03-28 16:08:45+00:00 \n", + "3 2021-03-28 16:00:59+00:00 2021-03-28 16:09:47+00:00 \n", + "4 2021-03-28 16:01:06+00:00 2021-03-28 16:05:03+00:00 \n", + "... ... ... \n", + "6214803 2023-10-23 09:32:33+00:00 2023-10-23 09:32:34+00:00 \n", + "6214804 2023-10-23 09:32:49+00:00 2023-10-23 09:32:49+00:00 \n", + "6214805 2023-10-23 09:33:28+00:00 2023-10-23 09:33:29+00:00 \n", + "6214806 2023-10-23 09:31:53+00:00 2023-10-23 09:31:54+00:00 \n", + "6214807 2023-10-23 09:33:54+00:00 2023-10-23 09:33:55+00:00 \n", + "\n", + " campaign_name campaign_service_id \\\n", + "0 Le Mucem chez vous, gardons le lien #22 404 \n", + "1 Le Mucem chez vous, gardons le lien #22 404 \n", + "2 Le Mucem chez vous, gardons le lien #22 404 \n", + "3 Le Mucem chez vous, gardons le lien #22 404 \n", + "4 Le Mucem chez vous, gardons le lien #22 404 \n", + "... ... ... \n", + "6214803 dre_nov_2023 1318 \n", + "6214804 dre_nov_2023 1318 \n", + "6214805 dre_nov_2023 1318 \n", + "6214806 dre_nov_2023 1318 \n", + "6214807 dre_nov_2023 1318 \n", + "\n", + " campaign_sent_at \n", + "0 2021-03-27 23:00:00+00:00 \n", + "1 2021-03-27 23:00:00+00:00 \n", + "2 2021-03-27 23:00:00+00:00 \n", + "3 2021-03-27 23:00:00+00:00 \n", + "4 2021-03-27 23:00:00+00:00 \n", + "... ... 
\n", + "6214803 2023-10-23 09:31:17+00:00 \n", + "6214804 2023-10-23 09:31:17+00:00 \n", + "6214805 2023-10-23 09:31:17+00:00 \n", + "6214806 2023-10-23 09:31:17+00:00 \n", + "6214807 2023-10-23 09:31:17+00:00 \n", + "\n", + "[6214808 rows x 8 columns]" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_campaigns_information" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "e2c88552-b863-47a2-be23-8d2898fb28bc", + "metadata": {}, + "outputs": [], + "source": [ + "def campaigns_kpi(campaigns_information = None):\n", + " # Nombre de campagnes de mails\n", + " nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()\n", + " nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)\n", + " # Temps d'ouverture en min moyen \n", + " campaigns_information['time_to_open'] = campaigns_information['opened_at'] - campaigns_information['delivered_at']\n", + " time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()\n", + "\n", + " # Nombre de mail ouvert \n", + " opened_campaign = campaigns_information[['customer_id', 'campaign_name', 'opened_at']]\n", + " opened_campaign.dropna(subset=['opened_at'], inplace=True)\n", + " opened_campaign = opened_campaign[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()\n", + " opened_campaign.rename(columns = {'campaign_name' : 'nb_campaigns_opened' }, inplace = True)\n", + "\n", + " # Fusion des indicateurs\n", + " campaigns_reduced = pd.merge(nb_campaigns, opened_campaign, on = 'customer_id', how = 'left')\n", + " campaigns_reduced = pd.merge(campaigns_reduced, time_to_open, on = 'customer_id', how = 'left')\n", + "\n", + " # Remplir les NaN : nb_campaigns_opened\n", + " campaigns_reduced['nb_campaigns_opened'].fillna(0, inplace=True)\n", + "\n", + " # Remplir les NaT : time_to_open 
(??)\n", + "\n", + " return campaigns_reduced\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "24537647-bc29-4777-9848-ac4120a4aa60", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_50143/2679359833.py:11: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " opened_campaign.dropna(subset=['opened_at'], inplace=True)\n", + "/tmp/ipykernel_50143/2679359833.py:20: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.\n", + "The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.\n", + "\n", + "For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.\n", + "\n", + "\n", + " campaigns_reduced['nb_campaigns_opened'].fillna(0, inplace=True)\n" + ] + } + ], + "source": [ + "df1_campaigns_kpi = campaigns_kpi(campaigns_information = df1_campaigns_information) " + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "6be2a9a6-056b-4e19-8c26-a18ba3df36b3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idnb_campaignsnb_campaigns_openedtime_to_open
0240.0NaT
13222124.01 days 00:28:30.169354838
2477.01 days 04:31:01.428571428
3540.0NaT
46200.0NaT
...............
130467125609711.00 days 02:11:15
130468125609810.0NaT
130469125609910.0NaT
130470125610010.0NaT
130471125610110.0NaT
\n", + "

130472 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " customer_id nb_campaigns nb_campaigns_opened \\\n", + "0 2 4 0.0 \n", + "1 3 222 124.0 \n", + "2 4 7 7.0 \n", + "3 5 4 0.0 \n", + "4 6 20 0.0 \n", + "... ... ... ... \n", + "130467 1256097 1 1.0 \n", + "130468 1256098 1 0.0 \n", + "130469 1256099 1 0.0 \n", + "130470 1256100 1 0.0 \n", + "130471 1256101 1 0.0 \n", + "\n", + " time_to_open \n", + "0 NaT \n", + "1 1 days 00:28:30.169354838 \n", + "2 1 days 04:31:01.428571428 \n", + "3 NaT \n", + "4 NaT \n", + "... ... \n", + "130467 0 days 02:11:15 \n", + "130468 NaT \n", + "130469 NaT \n", + "130470 NaT \n", + "130471 NaT \n", + "\n", + "[130472 rows x 4 columns]" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_campaigns_kpi" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Brouillon_AJ.ipynb b/Brouillon_AJ.ipynb new file mode 100644 index 0000000..8f5529a --- /dev/null +++ b/Brouillon_AJ.ipynb @@ -0,0 +1,695 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8c8e008c-9b92-41f1-88c1-8ec462e4ecab", + "metadata": {}, + "source": [ + "# Business Data Challenge - Team 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88af2795-8bf9-4df0-a059-be7c28fb4289", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "id": "e05cd2c9-3f76-48e3-b4a6-5055445af2e4", + "metadata": {}, + "source": [ + "Configuration de l'accès aux données" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"3ba1f385-2a2f-4b0c-be79-66f618469a9f", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import s3fs\n", + "# Create filesystem object\n", + "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", + "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n", + "\n", + "BUCKET = \"bdc2324-data\"\n", + "fs.ls(BUCKET)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ba9d04ad-6cc1-4bac-b1a0-44bedfb09763", + "metadata": {}, + "outputs": [], + "source": [ + "# Chargement des fichiers campaign_stats.csv\n", + "FILE_PATH_S3 = 'bdc2324-data/1/1campaign_stats.csv'\n", + "\n", + "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", + " campaign_stats_1 = pd.read_csv(file_in, sep=\",\")\n", + "\n", + "FILE_PATH_S3 = 'bdc2324-data/2/2campaign_stats.csv'\n", + "\n", + "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", + " campaign_stats_2 = pd.read_csv(file_in, sep=\",\")\n", + "\n", + "FILE_PATH_S3 = 'bdc2324-data/3/3campaign_stats.csv'\n", + "\n", + "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", + " campaign_stats_3 = pd.read_csv(file_in, sep=\",\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cacaecc1-4d8a-4e20-8cd3-b452cf17db56", + "metadata": {}, + "outputs": [], + "source": [ + "# Conversion des dates 'sent_at'\n", + "campaign_stats_1['sent_at'] = pd.to_datetime(campaign_stats_1['sent_at'], format = 'ISO8601', utc = True)\n", + "campaign_stats_2['sent_at'] = pd.to_datetime(campaign_stats_2['sent_at'], format = 'ISO8601', utc = True)\n", + "campaign_stats_3['sent_at'] = pd.to_datetime(campaign_stats_3['sent_at'], format = 'ISO8601', utc = True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ec4b583-dc64-43e9-b3ae-6bbaee0bc135", + "metadata": {}, + "outputs": [], + "source": [ + "# Chaque unites correspond à une période ? 
--> Non, les dossiers ont juste pour but de réduire la taille des fichiers\n", + "print(campaign_stats_1['sent_at'].max())\n", + "print(campaign_stats_1['sent_at'].min())\n", + "\n", + "print(campaign_stats_2['sent_at'].max())\n", + "print(campaign_stats_2['sent_at'].min())\n", + "\n", + "print(campaign_stats_3['sent_at'].max())\n", + "print(campaign_stats_3['sent_at'].min())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "77894273-b3e5-4f29-bd63-9f4df8082b9b", + "metadata": {}, + "outputs": [], + "source": [ + "campaign_stats_1['sent_at']" + ] + }, + { + "cell_type": "markdown", + "id": "31f2edbf-5661-4516-9835-06d4da615c13", + "metadata": {}, + "source": [ + "### Customersplus.csv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4223c873-cbd3-46d1-ac96-c9a3b9e97092", + "metadata": {}, + "outputs": [], + "source": [ + "FILE_PATH_S3 = 'bdc2324-data/1/1customersplus.csv'\n", + "\n", + "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", + " customers_plus_1 = pd.read_csv(file_in, sep=\",\")\n", + "\n", + "FILE_PATH_S3 = 'bdc2324-data/2/2customersplus.csv'\n", + "\n", + "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", + " customers_plus_2 = pd.read_csv(file_in, sep=\",\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "460f853a-68c0-42a7-9877-b83d3aaec813", + "metadata": {}, + "outputs": [], + "source": [ + "customers_plus_1.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5a9398f-72fc-4548-9f53-b20b372144b2", + "metadata": {}, + "outputs": [], + "source": [ + "customers_plus_1.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7467ddbe-0bd4-44cc-8a16-84aa41853638", + "metadata": {}, + "outputs": [], + "source": [ + "customers_plus_1['id'].nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e15f05f8-3a89-4fc3-84a9-dae70e168440", + "metadata": {}, + "outputs": [], + "source": [ + 
"customers_plus_2['id'].nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b40a653e-013f-48d0-8b57-0284587b36c5", + "metadata": {}, + "outputs": [], + "source": [ + "common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "32fa2215-3c79-40b5-8643-755865959fc7", + "metadata": {}, + "outputs": [], + "source": [ + "common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])\n", + "# Exemple id commun = caractéristiques communes\n", + "print(customers_plus_2[customers_plus_2['id'] == list(common_id)[0]])\n", + "\n", + "print(customers_plus_1[customers_plus_1['id'] == list(common_id)[0]])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0eb345e4-69f5-4e16-ac57-e33674c6c43d", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "customers_plus_1.isna().mean()*100" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f6ce60d-0912-497d-9108-330acccef394", + "metadata": {}, + "outputs": [], + "source": [ + "# Chargement de toutes les données\n", + "liste_base = ['customer_target_mappings', 'customersplus', 'target_types', 'tags', 'events', 'tickets', 'representations', 'purchases', 'products']\n", + "\n", + "for nom_base in liste_base:\n", + " FILE_PATH_S3 = 'bdc2324-data/11/11' + nom_base + '.csv'\n", + " with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", + " globals()[nom_base] = pd.read_csv(file_in, sep=\",\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa8ee17d-5092-40ac-8a0a-3790b016dd4e", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Jointure\n", + "merge_1 = pd.merge(purchases, tickets, left_on='id', right_on='purchase_id', how='inner')[['id_x', 'customer_id','product_id', 'purchase_date', 'type_of', 'is_from_subscription']]\n", + "merge_2 = pd.merge(products, merge_1, left_on='id', 
right_on='product_id', how='inner')[['id_x', 'customer_id', 'representation_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price']]\n", + "merge_3 = pd.merge(representations, merge_2, left_on='id', right_on='representation_id', how='inner')[['id_x', 'customer_id', 'event_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time']]\n", + "merge_4 = pd.merge(events, merge_3, left_on='id', right_on='event_id', how='inner')[['id_x', 'customer_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'name']]\n", + "merge_4 = merge_4.rename(columns={'name': 'event_name'})\n", + "df_customer_event = pd.merge(customersplus, merge_4, left_on = 'id', right_on = 'customer_id', how = 'inner')[['id_x', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'event_name']]\n", + "df_customer_event" + ] + }, + { + "cell_type": "markdown", + "id": "f1d4aeb8-ec74-4d49-989a-9116e01afe2f", + "metadata": {}, + "source": [ + "# Fusion et exploration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22bfad2b-d52a-4077-9b39-bee35004e01c", + "metadata": {}, + "outputs": [], + "source": [ + "# Jointure\n", + "var_choosed = ['id_x', 'customer_id','product_id', 'purchase_date', 'type_of', 'is_from_subscription']\n", + "merge_1 = pd.merge(purchases, tickets, left_on='id', right_on='purchase_id', how='inner')[var_choosed]\n", + "\n", + "var_choosed.extend(['amount', 'is_full_price', 'representation_id'])\n", + "merge_2 = pd.merge(products, merge_1, left_on='id', right_on='product_id', how='inner')[var_choosed]\n", + "\n", + "var_choosed.remove('representation_id')\n", + "var_choosed.extend(['start_date_time', 'event_id'])\n", + "merge_3 = pd.merge(representations, merge_2, left_on='id', right_on='representation_id', how='inner')[var_choosed]\n", + "\n", + "var_choosed.remove('event_id')\n", + 
"var_choosed.extend(['name', 'customer_id'])\n", + "merge_4 = pd.merge(events, merge_3, left_on='id', right_on='event_id', how='inner')[var_choosed]\n", + "\n", + "# Changement de nom\n", + "merge_4 = merge_4.rename(columns={'name': 'event_name'})\n", + "var_choosed[var_choosed.index('name')] = \"event_name\"\n", + "\n", + "# Base finale\n", + "var_choosed.extend(['age', 'gender', 'country', 'fidelity', 'profession'])\n", + "df_customer_event = pd.merge(customersplus, merge_4, left_on = 'id', right_on = 'customer_id', how = 'inner')[var_choosed]\n", + "df_customer_event" + ] + }, + { + "cell_type": "markdown", + "id": "4cb08d7a-ff04-4951-863d-20aaf33f0b31", + "metadata": {}, + "source": [ + "## Type de client au globale" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f47ba14a-8601-4b91-9712-223a5ed8a1d1", + "metadata": {}, + "outputs": [], + "source": [ + "# Client\n", + "print(customer_target_mappings.columns)\n", + "print(customer_target_mappings.shape)\n", + "customer_target_mappings.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f11f829e-66b1-4fd0-a46f-5ae7cb78073f", + "metadata": {}, + "outputs": [], + "source": [ + "customer_target_mappings['extra_field'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c240ab80-c746-4a64-ac6a-be8382c4f0ec", + "metadata": {}, + "outputs": [], + "source": [ + "customer_target_mappings['name'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c03c0597-3f21-4673-8a0f-24d7d9bc5ce4", + "metadata": {}, + "outputs": [], + "source": [ + "# Segmentation existante\n", + "print(target_types.columns)\n", + "print(target_types.shape)\n", + "target_types.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5adb1773-648d-4683-bc08-d1f2298c1283", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "target_types" + ] + }, + { + "cell_type": "code", + "execution_count": null, 
+ "id": "3d65f74e-47fc-4296-b493-a1ebefb91cde", + "metadata": {}, + "outputs": [], + "source": [ + "# Tags = clients\n", + "FILE_PATH_S3 = 'bdc2324-data/11/11tags.csv'\n", + "\n", + "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", + " tags = pd.read_csv(file_in, sep=\",\")\n", + "\n", + "print(tags.columns)\n", + "print(tags.shape)\n", + "tags.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a689a63-165b-4c4e-bbb0-695b661048d9", + "metadata": {}, + "outputs": [], + "source": [ + "tags" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "69e38c52-0570-4531-aebb-9deb6db8c40b", + "metadata": {}, + "outputs": [], + "source": [ + "# Structure = clients\n", + "FILE_PATH_S3 = 'bdc2324-data/11/11structure_tag_mappings.csv'\n", + "\n", + "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", + " structure_tag_mappings = pd.read_csv(file_in, sep=\",\")\n", + "\n", + "print(structure_tag_mappings.columns)\n", + "print(structure_tag_mappings.shape)\n", + "structure_tag_mappings.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "74dc34ad-375b-48df-a900-40d92c5fff13", + "metadata": {}, + "outputs": [], + "source": [ + "structure_tag_mappings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a479ceeb-0135-4899-9cbc-90ed7bf941fe", + "metadata": {}, + "outputs": [], + "source": [ + "# Tags = clients\n", + "FILE_PATH_S3 = 'bdc2324-data/11/11customersplus.csv'\n", + "\n", + "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", + " customersplus = pd.read_csv(file_in, sep=\",\")\n", + "\n", + "print(customersplus.columns)\n", + "print(customersplus.shape)\n", + "customersplus.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "383e892c-606a-45ce-bdd6-b503b3e0be33", + "metadata": {}, + "outputs": [], + "source": [ + "customersplus" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70324d06-b855-4386-a7de-eef1eb13dfdf", + 
"metadata": {}, + "outputs": [], + "source": [ + "# But : lier les caractéristiques socio-demo et les comportements d'achat\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4bbd743d-51fe-4786-8ad3-5a4a4d09439c", + "metadata": {}, + "outputs": [], + "source": [ + "# tickets\n", + "FILE_PATH_S3 = 'bdc2324-data/11/11tickets.csv'\n", + "\n", + "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", + " tickets = pd.read_csv(file_in, sep=\",\")\n", + "\n", + "print(tickets.columns)\n", + "print(tickets.shape)\n", + "tickets.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea83ea5c-3d47-4a66-a523-04b69b149a20", + "metadata": {}, + "outputs": [], + "source": [ + "tickets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ba15708e-eb84-4b5d-a86c-05ebed188cf6", + "metadata": {}, + "outputs": [], + "source": [ + "tickets['type_of'].unique()" + ] + }, + { + "cell_type": "markdown", + "id": "bc192b08-30a5-486a-8bea-93e765dbfce6", + "metadata": {}, + "source": [ + "## Types d'évenement et client" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e14dcf62-2def-4ed5-834b-cf21abbc2894", + "metadata": {}, + "outputs": [], + "source": [ + "# Evenement = events.csv\n", + "FILE_PATH_S3 = 'bdc2324-data/11/11events.csv'\n", + "\n", + "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", + " events = pd.read_csv(file_in, sep=\",\")\n", + "\n", + "print(events.columns)\n", + "print(events.shape)\n", + "events.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d1a1d63c-d7de-4b63-93a8-1c734eb5b316", + "metadata": {}, + "outputs": [], + "source": [ + "events" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "af80eee8-f717-4159-a0fd-09d47ec96621", + "metadata": {}, + "outputs": [], + "source": [ + "events['name'].nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6afc6f3d-4292-4a92-a4d6-14f1edc25df2", + 
"metadata": {}, + "outputs": [], + "source": [ + "# Représentation des évenements = representations.csv\n", + "FILE_PATH_S3 = 'bdc2324-data/11/11representations.csv'\n", + "\n", + "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", + " representations = pd.read_csv(file_in, sep=\",\")\n", + "\n", + "print(representations.columns)\n", + "print(representations.shape)\n", + "representations.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1487402a-a49b-4737-b7d7-40c764d2f0b4", + "metadata": {}, + "outputs": [], + "source": [ + "representations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "99b27418-2c15-4a6e-bcf5-d329ca492085", + "metadata": {}, + "outputs": [], + "source": [ + "# Produits vendues = products.csv\n", + "FILE_PATH_S3 = 'bdc2324-data/11/11products.csv'\n", + "\n", + "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", + " products = pd.read_csv(file_in, sep=\",\")\n", + "\n", + "print(products.columns)\n", + "print(products.shape)\n", + "products.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c49bcd47-672f-4e0f-aee9-a7475151b97f", + "metadata": {}, + "outputs": [], + "source": [ + "products" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a4aec5ce-d0c9-4625-bb29-9ac154818621", + "metadata": {}, + "outputs": [], + "source": [ + "# Lieu = facilities.csv\n", + "FILE_PATH_S3 = 'bdc2324-data/11/11facilities.csv'\n", + "\n", + "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", + " facilities = pd.read_csv(file_in, sep=\",\")\n", + "\n", + "print(facilities.columns)\n", + "print(facilities.shape)\n", + "facilities.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b3642483-2879-442a-ad69-efcd2331a200", + "metadata": {}, + "outputs": [], + "source": [ + "facilities" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "da1e9807-2a8d-4be7-a785-55cffd734f36", + "metadata": {}, + "outputs": 
[], + "source": [ + "# Saisons = seasons.csv période sur deux années consécutives\n", + "FILE_PATH_S3 = 'bdc2324-data/11/11seasons.csv'\n", + "\n", + "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", + " seasons = pd.read_csv(file_in, sep=\",\")\n", + "\n", + "print(seasons.columns)\n", + "print(seasons.shape)\n", + "seasons.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ec8a37b5-2d78-4b1c-aa47-bd923fdc2ba9", + "metadata": {}, + "outputs": [], + "source": [ + "seasons['name'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "abb3aa20-774b-4761-983a-df5eb2bc51c6", + "metadata": {}, + "outputs": [], + "source": [ + "# Achats = purchases.csv \n", + "FILE_PATH_S3 = 'bdc2324-data/11/11purchases.csv'\n", + "\n", + "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", + " purchases = pd.read_csv(file_in, sep=\",\")\n", + "\n", + "print(purchases.columns)\n", + "print(purchases.shape)\n", + "purchases.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30e204ab-4f63-430c-a818-5c8035b6e17b", + "metadata": {}, + "outputs": [], + "source": [ + "purchases" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Clean-Notebook.ipynb b/Clean-Notebook.ipynb deleted file mode 100644 index 1f70494..0000000 --- a/Clean-Notebook.ipynb +++ /dev/null @@ -1,3921 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "56b3d44e-1e3f-4726-9916-0f9af107860e", - "metadata": {}, - "source": [ - "# Business Data Challenge - Team 1" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - 
"id": "15103481-8d74-404c-aa09-7601fe7730da", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "import os\n", - "import s3fs\n", - "import re" - ] - }, - { - "cell_type": "markdown", - "id": "c3bb0d13-34b2-4e1c-9985-468cd87c5a0e", - "metadata": {}, - "source": [ - "Configuration de l'accès aux données" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "5d83bb1a-d341-446e-91f6-1c428607f6d4", - "metadata": {}, - "outputs": [], - "source": [ - "# Create filesystem object\n", - "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", - "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})" - ] - }, - { - "cell_type": "markdown", - "id": "f99da24f-0d93-4618-92bc-3ba81dc0445c", - "metadata": {}, - "source": [ - "# Exemple sur Company 1" - ] - }, - { - "cell_type": "markdown", - "id": "9d74b68f-ba07-4a15-9a27-dae931762d70", - "metadata": {}, - "source": [ - "## Chargement données" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "699664b9-eee4-4f8d-a207-e524526560c5", - "metadata": {}, - "outputs": [], - "source": [ - "BUCKET = \"bdc2324-data/1\"\n", - "liste_database = fs.ls(BUCKET)" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "0cb92854-903b-4efd-ac1b-197e29f044b4", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['bdc2324-data/1/1campaign_stats.csv', 'bdc2324-data/1/1campaigns.csv', 'bdc2324-data/1/1customer_target_mappings.csv', 'bdc2324-data/1/1customersplus.csv', 'bdc2324-data/1/1event_types.csv', 'bdc2324-data/1/1events.csv', 'bdc2324-data/1/1product_packs.csv', 'bdc2324-data/1/1products.csv', 'bdc2324-data/1/1products_groups.csv', 'bdc2324-data/1/1purchases.csv', 'bdc2324-data/1/1suppliers.csv', 'bdc2324-data/1/1target_types.csv', 'bdc2324-data/1/1targets.csv', 'bdc2324-data/1/1tickets.csv']\n" - ] - } - ], - "source": [ - "liste_database_select = ['suppliers', 
'ticket', 'purchase', 'consumption', 'customer', 'event', 'target', 'prod', 'campa']\n", - "\n", - "# Filtrer la liste pour les éléments contenant au moins un élément de la liste à tester\n", - "liste_database_filtered = [element for element in liste_database if any(element_part in element for element_part in liste_database_select)]\n", - "\n", - "# Afficher le résultat\n", - "print(liste_database_filtered)" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "dd6a3518-b752-4a1e-b77b-9e03e853c3ed", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_4561/4135596479.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n", - " df = pd.read_csv(file_in)\n" - ] - } - ], - "source": [ - "# loop to create dataframes from file 2\n", - "files_path = liste_database_filtered\n", - "\n", - "client_number = files_path[0].split(\"/\")[1]\n", - "df_prefix = \"df\" + str(client_number) + \"_\"\n", - "\n", - "for i in range(len(files_path)) :\n", - " current_path = files_path[i]\n", - " with fs.open(current_path, mode=\"rb\") as file_in:\n", - " df = pd.read_csv(file_in)\n", - " # the pattern of the name is df1xxx\n", - " nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n", - " globals()[nom_dataframe] = df" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "7d1da9df-f423-4a9f-a2a6-6d8ceeab1c34", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "_\n", - "__\n", - "___\n", - "df\n", - "df1_purchases\n", - "df1_suppliers\n", - "df1_tickets\n", - "dataframe\n", - "_7\n", - "_10\n", - "_11\n", - "_18\n", - "_20\n", - "df1_customer_target_mappings\n", - "df1_customersplus\n", - "df1_event_types\n", - "df1_events\n", - "df1_target_types\n", - "df1_targets\n" - ] - } - ], - "source": [ - "# Obtenir toutes les variables globales\n", - 
"variables_globales = globals()\n", - "\n", - "# Filtrer les variables pour obtenir uniquement les DataFrames\n", - "dataframes = {nom: variable for nom, variable in variables_globales.items() if isinstance(variable, pd.DataFrame)}\n", - "\n", - "# Afficher les noms et les DataFrames\n", - "for nom, dataframe in dataframes.items():\n", - " print(f\"{nom}\")" - ] - }, - { - "cell_type": "markdown", - "id": "78453f3c-4f89-44ed-a6c6-2a7443b72b52", - "metadata": {}, - "source": [ - "## suppliers.csv" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "2e0dada0-9457-484c-aa55-77e44613ecca", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idnamemanually_addedlabelitrupdated_atcreated_atcommissionidentifier
01617j4 administrationFalseNaNNaN2021-07-29 09:21:37.325772+02:002021-07-29 09:21:37.325772+02:00NaN5958b2a060ac3e31678b438892a1bd2e
18non définiFalseNaNNaN2020-09-03 13:16:35.329062+02:002020-09-03 13:16:35.329062+02:00NaN52ff3466787b4d538407372e5f7afe0f
24vadFalseNaNNaN2020-09-03 13:11:23.896992+02:002020-09-03 13:11:23.896992+02:00NaN1225483c97b36018cab2bea14ab78ea6
31fort saint jeanFalseNaNNaN2020-09-03 13:11:23.833073+02:002020-09-03 13:11:23.833073+02:00NaN001b9b4a524fe407150b8235b304d4ec
42j4FalseNaNNaN2020-09-03 13:11:23.888993+02:002020-09-03 13:11:23.888993+02:00NaN6a0cf6edf20060344b465706b61719aa
55revendeurFalseNaNNaN2020-09-03 13:11:23.900987+02:002020-09-03 13:11:23.900987+02:00NaN931239d4acb6214d7e5c98edecfb4916
63vente en ligneFalseNaNNaN2020-09-03 13:11:23.893097+02:002020-09-03 13:11:23.893097+02:00NaNbde8f2ccff510df8572d3214d86b837d
76ccrFalseNaNNaN2020-09-03 13:11:23.904974+02:002020-09-03 13:11:23.904974+02:00NaNb48ec279411f7dbbb68393c61a9724d9
87dabFalseNaNNaN2020-09-03 13:11:23.908970+02:002020-09-03 13:11:23.908970+02:00NaN11c6d471fa4e354e62e684d293694202
\n", - "
" - ], - "text/plain": [ - " id name manually_added label itr \\\n", - "0 1617 j4 administration False NaN NaN \n", - "1 8 non défini False NaN NaN \n", - "2 4 vad False NaN NaN \n", - "3 1 fort saint jean False NaN NaN \n", - "4 2 j4 False NaN NaN \n", - "5 5 revendeur False NaN NaN \n", - "6 3 vente en ligne False NaN NaN \n", - "7 6 ccr False NaN NaN \n", - "8 7 dab False NaN NaN \n", - "\n", - " updated_at created_at \\\n", - "0 2021-07-29 09:21:37.325772+02:00 2021-07-29 09:21:37.325772+02:00 \n", - "1 2020-09-03 13:16:35.329062+02:00 2020-09-03 13:16:35.329062+02:00 \n", - "2 2020-09-03 13:11:23.896992+02:00 2020-09-03 13:11:23.896992+02:00 \n", - "3 2020-09-03 13:11:23.833073+02:00 2020-09-03 13:11:23.833073+02:00 \n", - "4 2020-09-03 13:11:23.888993+02:00 2020-09-03 13:11:23.888993+02:00 \n", - "5 2020-09-03 13:11:23.900987+02:00 2020-09-03 13:11:23.900987+02:00 \n", - "6 2020-09-03 13:11:23.893097+02:00 2020-09-03 13:11:23.893097+02:00 \n", - "7 2020-09-03 13:11:23.904974+02:00 2020-09-03 13:11:23.904974+02:00 \n", - "8 2020-09-03 13:11:23.908970+02:00 2020-09-03 13:11:23.908970+02:00 \n", - "\n", - " commission identifier \n", - "0 NaN 5958b2a060ac3e31678b438892a1bd2e \n", - "1 NaN 52ff3466787b4d538407372e5f7afe0f \n", - "2 NaN 1225483c97b36018cab2bea14ab78ea6 \n", - "3 NaN 001b9b4a524fe407150b8235b304d4ec \n", - "4 NaN 6a0cf6edf20060344b465706b61719aa \n", - "5 NaN 931239d4acb6214d7e5c98edecfb4916 \n", - "6 NaN bde8f2ccff510df8572d3214d86b837d \n", - "7 NaN b48ec279411f7dbbb68393c61a9724d9 \n", - "8 NaN 11c6d471fa4e354e62e684d293694202 " - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Restriction aux DataFrame : ticket, purchase, consumption, suppliers\n", - "df1_suppliers" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "b583be02-ab60-4e14-9325-0204f203a1af", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - 
"RangeIndex: 9 entries, 0 to 8\n", - "Data columns (total 9 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 9 non-null int64 \n", - " 1 name 9 non-null object \n", - " 2 manually_added 9 non-null bool \n", - " 3 label 0 non-null float64\n", - " 4 itr 0 non-null float64\n", - " 5 updated_at 9 non-null object \n", - " 6 created_at 9 non-null object \n", - " 7 commission 0 non-null float64\n", - " 8 identifier 9 non-null object \n", - "dtypes: bool(1), float64(3), int64(1), object(4)\n", - "memory usage: 713.0+ bytes\n" - ] - } - ], - "source": [ - "df1_suppliers.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "6d7f338e-e4d3-422b-9cdc-dec967c0b28e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
0
id0.0
name0.0
manually_added0.0
label100.0
itr100.0
updated_at0.0
created_at0.0
commission100.0
identifier0.0
\n", - "
" - ], - "text/plain": [ - " 0\n", - "id 0.0\n", - "name 0.0\n", - "manually_added 0.0\n", - "label 100.0\n", - "itr 100.0\n", - "updated_at 0.0\n", - "created_at 0.0\n", - "commission 100.0\n", - "identifier 0.0" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pd.DataFrame(df1_suppliers.isna().mean()*100)" - ] - }, - { - "cell_type": "markdown", - "id": "676a9869-9a8b-4cd2-8b1c-0644b5229c72", - "metadata": {}, - "source": [ - "## purchases.csv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "14f4158e-c9c0-4beb-826a-5e0f949434a4", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "79c9eb43-002e-460d-acb2-206ebb2ab6dd", - "metadata": {}, - "source": [ - "## tickets.csv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f3c35394-b586-4ae4-b5ab-b03bb01bb618", - "metadata": {}, - "outputs": [], - "source": [ - "df1_purchases\n", - "df1_tickets" - ] - }, - { - "cell_type": "markdown", - "id": "355f5489-7904-4161-a85b-6eb70b3a4c89", - "metadata": { - "jp-MarkdownHeadingCollapsed": true - }, - "source": [ - "# Fusion et exploration" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "afe548fe-d93c-4634-9f53-881404ec4c6c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
id_xpurchase_datetype_ofis_from_subscriptionamountis_full_pricestart_date_timeevent_name
09924232023-01-11 17:08:41+01:003False13.0False2023-02-06 20:00:00+01:00zaide
19924232023-01-11 17:08:41+01:003False13.0False2023-02-06 20:00:00+01:00zaide
210539342023-03-16 16:23:10+01:003False62.0False2023-03-19 16:00:00+01:00luisa miller
310539342023-03-16 16:23:10+01:003False62.0False2023-03-19 16:00:00+01:00luisa miller
411891412020-11-26 13:12:53+01:003False51.3False2020-12-01 20:00:00+01:00iphigenie en tauride
...........................
31896410908392019-05-19 21:18:36+02:001False4.5False2019-05-27 20:00:00+02:00entre femmes
31896510908392019-05-19 21:18:36+02:001False4.5False2019-05-27 20:00:00+02:00entre femmes
31896610908392019-05-19 21:18:36+02:001False4.5False2019-05-27 20:00:00+02:00entre femmes
31896712442772019-12-31 11:04:07+01:001False5.5False2020-02-03 20:00:00+01:00a boire et a manger
31896812442772019-12-31 11:04:07+01:001False5.5False2020-02-03 20:00:00+01:00a boire et a manger
\n", - "

318969 rows × 8 columns

\n", - "
" - ], - "text/plain": [ - " id_x purchase_date type_of is_from_subscription \\\n", - "0 992423 2023-01-11 17:08:41+01:00 3 False \n", - "1 992423 2023-01-11 17:08:41+01:00 3 False \n", - "2 1053934 2023-03-16 16:23:10+01:00 3 False \n", - "3 1053934 2023-03-16 16:23:10+01:00 3 False \n", - "4 1189141 2020-11-26 13:12:53+01:00 3 False \n", - "... ... ... ... ... \n", - "318964 1090839 2019-05-19 21:18:36+02:00 1 False \n", - "318965 1090839 2019-05-19 21:18:36+02:00 1 False \n", - "318966 1090839 2019-05-19 21:18:36+02:00 1 False \n", - "318967 1244277 2019-12-31 11:04:07+01:00 1 False \n", - "318968 1244277 2019-12-31 11:04:07+01:00 1 False \n", - "\n", - " amount is_full_price start_date_time event_name \n", - "0 13.0 False 2023-02-06 20:00:00+01:00 zaide \n", - "1 13.0 False 2023-02-06 20:00:00+01:00 zaide \n", - "2 62.0 False 2023-03-19 16:00:00+01:00 luisa miller \n", - "3 62.0 False 2023-03-19 16:00:00+01:00 luisa miller \n", - "4 51.3 False 2020-12-01 20:00:00+01:00 iphigenie en tauride \n", - "... ... ... ... ... 
\n", - "318964 4.5 False 2019-05-27 20:00:00+02:00 entre femmes \n", - "318965 4.5 False 2019-05-27 20:00:00+02:00 entre femmes \n", - "318966 4.5 False 2019-05-27 20:00:00+02:00 entre femmes \n", - "318967 5.5 False 2020-02-03 20:00:00+01:00 a boire et a manger \n", - "318968 5.5 False 2020-02-03 20:00:00+01:00 a boire et a manger \n", - "\n", - "[318969 rows x 8 columns]" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Jointure\n", - "var_choosed = ['id_x', 'customer_id','product_id', 'purchase_date', 'type_of', 'is_from_subscription']\n", - "merge_1 = pd.merge(purchases, tickets, left_on='id', right_on='purchase_id', how='inner')[var_choosed]\n", - "\n", - "var_choosed.extend(['amount', 'is_full_price', 'representation_id'])\n", - "merge_2 = pd.merge(products, merge_1, left_on='id', right_on='product_id', how='inner')[var_choosed]\n", - "\n", - "var_choosed.remove('representation_id')\n", - "var_choosed.extend(['start_date_time', 'event_id'])\n", - "merge_3 = pd.merge(representations, merge_2, left_on='id', right_on='representation_id', how='inner')[var_choosed]\n", - "\n", - "var_choosed.remove('event_id')\n", - "var_choosed.extend(['name', 'customer_id'])\n", - "merge_4 = pd.merge(events, merge_3, left_on='id', right_on='event_id', how='inner')[var_choosed]\n", - "\n", - "# Changement de nom\n", - "merge_4 = merge_4.rename(columns={'name': 'event_name'})\n", - "var_choosed[var_choosed.index('name')] = \"event_name\"\n", - "\n", - "# Base finale\n", - "var_choosed.extend(['age', 'gender', 'country', 'fidelity', 'profession'])\n", - "df_customer_event = pd.merge(customersplus, merge_4, left_on = 'id', right_on = 'customer_id', how = 'inner')[var_choosed]\n", - "df_customer_event" - ] - }, - { - "cell_type": "markdown", - "id": "779da86b-ac61-4c61-88d2-fa1c0c19efce", - "metadata": {}, - "source": [ - "## Type de client au globale" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": 
"7c89d25f-ee42-4478-9ff0-ee64b781d5c8", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['id', 'customer_id', 'target_id', 'created_at', 'updated_at', 'name',\n", - " 'extra_field'],\n", - " dtype='object')\n", - "(124302, 7)\n", - "\n", - "RangeIndex: 124302 entries, 0 to 124301\n", - "Data columns (total 7 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 124302 non-null int64 \n", - " 1 customer_id 124302 non-null int64 \n", - " 2 target_id 124302 non-null int64 \n", - " 3 created_at 124296 non-null object \n", - " 4 updated_at 124296 non-null object \n", - " 5 name 0 non-null float64\n", - " 6 extra_field 0 non-null float64\n", - "dtypes: float64(2), int64(3), object(2)\n", - "memory usage: 6.6+ MB\n" - ] - } - ], - "source": [ - "# Client\n", - "print(customer_target_mappings.columns)\n", - "print(customer_target_mappings.shape)\n", - "customer_target_mappings.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "c4b6bdcc-9f13-449b-9a8b-c5ca794637be", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([nan])" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "customer_target_mappings['extra_field'].unique()" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "47bc8453-0693-4838-8bd8-4d800a82c496", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([nan])" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "customer_target_mappings['name'].unique()" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "ab3f937b-ef62-499a-8ee2-d47d1d988ace", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['id', 'is_import', 'name', 'created_at', 'updated_at', 'identifier'], 
dtype='object')\n", - "(4, 6)\n", - "\n", - "RangeIndex: 4 entries, 0 to 3\n", - "Data columns (total 6 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 4 non-null int64 \n", - " 1 is_import 4 non-null bool \n", - " 2 name 4 non-null object\n", - " 3 created_at 4 non-null object\n", - " 4 updated_at 4 non-null object\n", - " 5 identifier 4 non-null object\n", - "dtypes: bool(1), int64(1), object(4)\n", - "memory usage: 292.0+ bytes\n" - ] - } - ], - "source": [ - "# Segmentation existante\n", - "print(target_types.columns)\n", - "print(target_types.shape)\n", - "target_types.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "b70488b9-38fc-40a8-9e2f-3330b3f9eef5", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idis_importnamecreated_atupdated_atidentifier
01Falsemanual_static_filter2021-04-29 13:42:14.111085+02:002021-04-29 13:42:14.111085+02:00fb27e81baa4debc6a4e1a8639c20e808
13Truemanual_structure2021-05-07 15:20:00.626650+02:002021-05-07 15:20:00.626650+02:00382bca214204a2d3462f5ec2728d5d1e
26Falsemanual_dynamic_filter2021-09-09 14:27:47.641302+02:002021-09-09 14:27:47.641302+02:00e0f4b8693184850fefd6d2a38f10584e
32Truemanual_import2021-04-29 13:49:30.107110+02:002021-04-29 13:49:30.107110+02:0012213df2ce68a624e4c0070521437bac
\n", - "
" - ], - "text/plain": [ - " id is_import name created_at \\\n", - "0 1 False manual_static_filter 2021-04-29 13:42:14.111085+02:00 \n", - "1 3 True manual_structure 2021-05-07 15:20:00.626650+02:00 \n", - "2 6 False manual_dynamic_filter 2021-09-09 14:27:47.641302+02:00 \n", - "3 2 True manual_import 2021-04-29 13:49:30.107110+02:00 \n", - "\n", - " updated_at identifier \n", - "0 2021-04-29 13:42:14.111085+02:00 fb27e81baa4debc6a4e1a8639c20e808 \n", - "1 2021-05-07 15:20:00.626650+02:00 382bca214204a2d3462f5ec2728d5d1e \n", - "2 2021-09-09 14:27:47.641302+02:00 e0f4b8693184850fefd6d2a38f10584e \n", - "3 2021-04-29 13:49:30.107110+02:00 12213df2ce68a624e4c0070521437bac " - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "target_types" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "8dd74e87-97c2-493d-b19f-971b684078d3", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['id', 'name', 'created_at', 'updated_at', 'identifier'], dtype='object')\n", - "(20, 5)\n", - "\n", - "RangeIndex: 20 entries, 0 to 19\n", - "Data columns (total 5 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 20 non-null int64 \n", - " 1 name 19 non-null object\n", - " 2 created_at 20 non-null object\n", - " 3 updated_at 20 non-null object\n", - " 4 identifier 20 non-null object\n", - "dtypes: int64(1), object(4)\n", - "memory usage: 928.0+ bytes\n" - ] - } - ], - "source": [ - "# Tags = clients\n", - "FILE_PATH_S3 = 'bdc2324-data/11/11tags.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " tags = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(tags.columns)\n", - "print(tags.shape)\n", - "tags.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "91d54732-666c-4250-ba91-5c9b83d4712a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ 
- "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idnamecreated_atupdated_atidentifier
02ens-écoles2021-05-07 15:24:19.808501+02:002021-05-07 15:24:19.808501+02:00b6a360c5f84595940c5774f13fd39cc3
11NaN2021-05-07 15:24:19.805589+02:002021-05-07 15:24:19.805589+02:00d41d8cd98f00b204e9800998ecf8427e
24ecoles primaires rennes2021-05-07 15:29:06.388415+02:002021-05-07 15:29:06.388415+02:00ca8649dd64c240d118f60b07d11a7053
35Angers Nantes Opéra2023-01-27 15:59:58.187557+01:002023-01-27 15:59:58.187557+01:00f8f500f937fe312542399299cdc13f7e
46Opéras2023-01-27 16:03:59.654938+01:002023-01-27 16:03:59.654938+01:0022eb2c616983ec7b54a093f84b230505
57Ministère de la Culture2023-01-30 11:22:29.636813+01:002023-01-30 11:22:29.636813+01:001b8c5c08fde000d90905a3d14af7763d
68Orchestres2023-01-30 11:33:56.392799+01:002023-01-30 11:33:56.392799+01:007c2aee0c80642d7e325a450f2dec45e5
79Cooperative2023-01-31 14:44:38.471146+01:002023-01-31 14:44:38.471146+01:006c88c36ffaab88d255865aa3111d7686
810Théâtres2023-01-31 14:45:17.804428+01:002023-01-31 14:45:17.804428+01:00b2c19672df82021702b79482c8cda85a
911La co[opera]tive2023-02-16 17:11:35.004478+01:002023-02-16 17:11:35.004478+01:005dbaa3a1f278c0fcf981d447ad20957a
1012Ville de Rennes2023-02-16 17:37:13.816196+01:002023-02-16 17:37:13.816196+01:00bc483d04d9c3a08f167a3ce64366ca72
1113Ensembles en résidence2023-02-16 17:55:54.877374+01:002023-02-16 17:55:54.877374+01:00e70635e771de13268dccf02bb2abfaf9
1214Ministère2023-02-17 11:17:54.429462+01:002023-02-17 11:17:54.429462+01:00a3f0582853fd19f5b57e3651f8a20e7a
1315Rennes métropole2023-02-17 11:53:24.490786+01:002023-02-17 11:53:24.490786+01:00e98b8db5941b96c29c353b6f2f502055
1416Ville de Rennes - équipements culturels2023-02-17 12:00:10.649104+01:002023-02-17 12:00:10.649104+01:00a44edffc7edb852982efa7f4aa6d0e25
1517Structures culturelles rennaises2023-02-17 12:05:55.583016+01:002023-02-17 12:05:55.583016+01:00241550517e4e3b1c926e9aeab0f621cd
1618Université Rennes 22023-02-17 14:23:44.832959+01:002023-02-17 14:23:44.832959+01:004057c5cee51c4e10aa819f0cf48adc3f
1719Centres chorégraphiques nationaux2023-02-17 15:29:41.827321+01:002023-02-17 15:29:41.827321+01:0041e75941dfb766365498d917abe0102f
1820Télévision2023-02-17 15:46:13.746092+01:002023-02-17 15:46:13.746092+01:0036d6409c539dd79c1f3af8c5948603eb
1921structures culturelles nationales2023-02-17 15:56:00.555722+01:002023-02-17 15:56:00.555722+01:005311cf7e42aac53289e1c4a338d5cfa4
\n", - "
" - ], - "text/plain": [ - " id name \\\n", - "0 2 ens-écoles \n", - "1 1 NaN \n", - "2 4 ecoles primaires rennes \n", - "3 5 Angers Nantes Opéra \n", - "4 6 Opéras \n", - "5 7 Ministère de la Culture \n", - "6 8 Orchestres \n", - "7 9 Cooperative \n", - "8 10 Théâtres \n", - "9 11 La co[opera]tive \n", - "10 12 Ville de Rennes \n", - "11 13 Ensembles en résidence \n", - "12 14 Ministère \n", - "13 15 Rennes métropole \n", - "14 16 Ville de Rennes - équipements culturels \n", - "15 17 Structures culturelles rennaises \n", - "16 18 Université Rennes 2 \n", - "17 19 Centres chorégraphiques nationaux \n", - "18 20 Télévision \n", - "19 21 structures culturelles nationales \n", - "\n", - " created_at updated_at \\\n", - "0 2021-05-07 15:24:19.808501+02:00 2021-05-07 15:24:19.808501+02:00 \n", - "1 2021-05-07 15:24:19.805589+02:00 2021-05-07 15:24:19.805589+02:00 \n", - "2 2021-05-07 15:29:06.388415+02:00 2021-05-07 15:29:06.388415+02:00 \n", - "3 2023-01-27 15:59:58.187557+01:00 2023-01-27 15:59:58.187557+01:00 \n", - "4 2023-01-27 16:03:59.654938+01:00 2023-01-27 16:03:59.654938+01:00 \n", - "5 2023-01-30 11:22:29.636813+01:00 2023-01-30 11:22:29.636813+01:00 \n", - "6 2023-01-30 11:33:56.392799+01:00 2023-01-30 11:33:56.392799+01:00 \n", - "7 2023-01-31 14:44:38.471146+01:00 2023-01-31 14:44:38.471146+01:00 \n", - "8 2023-01-31 14:45:17.804428+01:00 2023-01-31 14:45:17.804428+01:00 \n", - "9 2023-02-16 17:11:35.004478+01:00 2023-02-16 17:11:35.004478+01:00 \n", - "10 2023-02-16 17:37:13.816196+01:00 2023-02-16 17:37:13.816196+01:00 \n", - "11 2023-02-16 17:55:54.877374+01:00 2023-02-16 17:55:54.877374+01:00 \n", - "12 2023-02-17 11:17:54.429462+01:00 2023-02-17 11:17:54.429462+01:00 \n", - "13 2023-02-17 11:53:24.490786+01:00 2023-02-17 11:53:24.490786+01:00 \n", - "14 2023-02-17 12:00:10.649104+01:00 2023-02-17 12:00:10.649104+01:00 \n", - "15 2023-02-17 12:05:55.583016+01:00 2023-02-17 12:05:55.583016+01:00 \n", - "16 2023-02-17 14:23:44.832959+01:00 2023-02-17 
14:23:44.832959+01:00 \n", - "17 2023-02-17 15:29:41.827321+01:00 2023-02-17 15:29:41.827321+01:00 \n", - "18 2023-02-17 15:46:13.746092+01:00 2023-02-17 15:46:13.746092+01:00 \n", - "19 2023-02-17 15:56:00.555722+01:00 2023-02-17 15:56:00.555722+01:00 \n", - "\n", - " identifier \n", - "0 b6a360c5f84595940c5774f13fd39cc3 \n", - "1 d41d8cd98f00b204e9800998ecf8427e \n", - "2 ca8649dd64c240d118f60b07d11a7053 \n", - "3 f8f500f937fe312542399299cdc13f7e \n", - "4 22eb2c616983ec7b54a093f84b230505 \n", - "5 1b8c5c08fde000d90905a3d14af7763d \n", - "6 7c2aee0c80642d7e325a450f2dec45e5 \n", - "7 6c88c36ffaab88d255865aa3111d7686 \n", - "8 b2c19672df82021702b79482c8cda85a \n", - "9 5dbaa3a1f278c0fcf981d447ad20957a \n", - "10 bc483d04d9c3a08f167a3ce64366ca72 \n", - "11 e70635e771de13268dccf02bb2abfaf9 \n", - "12 a3f0582853fd19f5b57e3651f8a20e7a \n", - "13 e98b8db5941b96c29c353b6f2f502055 \n", - "14 a44edffc7edb852982efa7f4aa6d0e25 \n", - "15 241550517e4e3b1c926e9aeab0f621cd \n", - "16 4057c5cee51c4e10aa819f0cf48adc3f \n", - "17 41e75941dfb766365498d917abe0102f \n", - "18 36d6409c539dd79c1f3af8c5948603eb \n", - "19 5311cf7e42aac53289e1c4a338d5cfa4 " - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tags" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "4cc9f444-b7e6-4ee5-8ce8-64c63ab7825a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['id', 'structure_id', 'tag_id', 'created_at', 'updated_at'], dtype='object')\n", - "(179, 5)\n", - "\n", - "RangeIndex: 179 entries, 0 to 178\n", - "Data columns (total 5 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 179 non-null int64 \n", - " 1 structure_id 179 non-null int64 \n", - " 2 tag_id 179 non-null int64 \n", - " 3 created_at 179 non-null object\n", - " 4 updated_at 179 non-null object\n", - "dtypes: int64(3), object(2)\n", - "memory usage: 
7.1+ KB\n" - ] - } - ], - "source": [ - "# Structure = clients\n", - "FILE_PATH_S3 = 'bdc2324-data/11/11structure_tag_mappings.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " structure_tag_mappings = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(structure_tag_mappings.columns)\n", - "print(structure_tag_mappings.shape)\n", - "structure_tag_mappings.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "dcf776df-5c8e-4972-b2c1-b41291ba7e66", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idstructure_idtag_idcreated_atupdated_at
012318762023-01-27 16:03:59.680222+01:002023-01-27 16:03:59.680222+01:00
12222021-05-07 15:24:19.872895+02:002021-05-07 15:24:19.872895+02:00
23322021-05-07 15:24:19.873830+02:002021-05-07 15:24:19.873830+02:00
34422021-05-07 15:24:19.874628+02:002021-05-07 15:24:19.874628+02:00
45522021-05-07 15:24:19.875421+02:002021-05-07 15:24:19.875421+02:00
..................
174184236102023-02-17 16:35:25.041114+01:002023-02-17 16:35:25.041114+01:00
175185237172023-02-17 16:39:10.799478+01:002023-02-17 16:39:10.799478+01:00
176186238192023-02-17 16:53:21.098690+01:002023-02-17 16:53:21.098690+01:00
177187239102023-02-17 16:57:42.623481+01:002023-02-17 16:57:42.623481+01:00
178188240102023-02-17 16:59:22.067723+01:002023-02-17 16:59:22.067723+01:00
\n", - "

179 rows × 5 columns

\n", - "
" - ], - "text/plain": [ - " id structure_id tag_id created_at \\\n", - "0 123 187 6 2023-01-27 16:03:59.680222+01:00 \n", - "1 2 2 2 2021-05-07 15:24:19.872895+02:00 \n", - "2 3 3 2 2021-05-07 15:24:19.873830+02:00 \n", - "3 4 4 2 2021-05-07 15:24:19.874628+02:00 \n", - "4 5 5 2 2021-05-07 15:24:19.875421+02:00 \n", - ".. ... ... ... ... \n", - "174 184 236 10 2023-02-17 16:35:25.041114+01:00 \n", - "175 185 237 17 2023-02-17 16:39:10.799478+01:00 \n", - "176 186 238 19 2023-02-17 16:53:21.098690+01:00 \n", - "177 187 239 10 2023-02-17 16:57:42.623481+01:00 \n", - "178 188 240 10 2023-02-17 16:59:22.067723+01:00 \n", - "\n", - " updated_at \n", - "0 2023-01-27 16:03:59.680222+01:00 \n", - "1 2021-05-07 15:24:19.872895+02:00 \n", - "2 2021-05-07 15:24:19.873830+02:00 \n", - "3 2021-05-07 15:24:19.874628+02:00 \n", - "4 2021-05-07 15:24:19.875421+02:00 \n", - ".. ... \n", - "174 2023-02-17 16:35:25.041114+01:00 \n", - "175 2023-02-17 16:39:10.799478+01:00 \n", - "176 2023-02-17 16:53:21.098690+01:00 \n", - "177 2023-02-17 16:57:42.623481+01:00 \n", - "178 2023-02-17 16:59:22.067723+01:00 \n", - "\n", - "[179 rows x 5 columns]" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "structure_tag_mappings" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "41bf1529-5a7c-409e-9791-2024c08c11f0", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['id', 'lastname', 'firstname', 'birthdate', 'email', 'street_id',\n", - " 'created_at', 'updated_at', 'civility', 'is_partner', 'extra',\n", - " 'deleted_at', 'reference', 'gender', 'is_email_true', 'extra_field',\n", - " 'identifier', 'opt_in', 'structure_id', 'note', 'profession',\n", - " 'language', 'mcp_contact_id', 'need_reload', 'last_buying_date',\n", - " 'max_price', 'ticket_sum', 'average_price', 'fidelity',\n", - " 'average_purchase_delay', 'average_price_basket',\n", - " 
'average_ticket_basket', 'total_price', 'preferred_category',\n", - " 'preferred_supplier', 'preferred_formula', 'purchase_count',\n", - " 'first_buying_date', 'last_visiting_date', 'zipcode', 'country', 'age',\n", - " 'tenant_id'],\n", - " dtype='object')\n", - "(71307, 43)\n", - "\n", - "RangeIndex: 71307 entries, 0 to 71306\n", - "Data columns (total 43 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 71307 non-null int64 \n", - " 1 lastname 41045 non-null object \n", - " 2 firstname 39140 non-null object \n", - " 3 birthdate 18174 non-null object \n", - " 4 email 58203 non-null object \n", - " 5 street_id 71307 non-null int64 \n", - " 6 created_at 71307 non-null object \n", - " 7 updated_at 71307 non-null object \n", - " 8 civility 0 non-null float64\n", - " 9 is_partner 71307 non-null bool \n", - " 10 extra 0 non-null float64\n", - " 11 deleted_at 0 non-null float64\n", - " 12 reference 0 non-null float64\n", - " 13 gender 71307 non-null int64 \n", - " 14 is_email_true 71307 non-null bool \n", - " 15 extra_field 0 non-null float64\n", - " 16 identifier 71307 non-null object \n", - " 17 opt_in 71307 non-null bool \n", - " 18 structure_id 616 non-null float64\n", - " 19 note 451 non-null object \n", - " 20 profession 812 non-null object \n", - " 21 language 0 non-null float64\n", - " 22 mcp_contact_id 22417 non-null float64\n", - " 23 need_reload 71307 non-null bool \n", - " 24 last_buying_date 34040 non-null object \n", - " 25 max_price 34040 non-null float64\n", - " 26 ticket_sum 71307 non-null int64 \n", - " 27 average_price 68694 non-null float64\n", - " 28 fidelity 71307 non-null int64 \n", - " 29 average_purchase_delay 34040 non-null float64\n", - " 30 average_price_basket 34040 non-null float64\n", - " 31 average_ticket_basket 34040 non-null float64\n", - " 32 total_price 36653 non-null float64\n", - " 33 preferred_category 0 non-null float64\n", - " 34 preferred_supplier 0 non-null float64\n", - " 35 
preferred_formula 0 non-null float64\n", - " 36 purchase_count 71307 non-null int64 \n", - " 37 first_buying_date 34040 non-null object \n", - " 38 last_visiting_date 0 non-null float64\n", - " 39 zipcode 33756 non-null object \n", - " 40 country 39910 non-null object \n", - " 41 age 18174 non-null float64\n", - " 42 tenant_id 71307 non-null int64 \n", - "dtypes: bool(4), float64(19), int64(7), object(13)\n", - "memory usage: 21.5+ MB\n" - ] - } - ], - "source": [ - "# Tags = clients\n", - "FILE_PATH_S3 = 'bdc2324-data/11/11customersplus.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " customersplus = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(customersplus.columns)\n", - "print(customersplus.shape)\n", - "customersplus.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "948a0b2b-8d1c-4afb-802e-670d67dd8c20", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idlastnamefirstnamebirthdateemailstreet_idcreated_atupdated_atcivilityis_partner...preferred_categorypreferred_supplierpreferred_formulapurchase_countfirst_buying_datelast_visiting_datezipcodecountryagetenant_id
0286834lastname286834firstname286834NaNemail28683462022-05-19 10:09:09.361137+02:002022-05-19 10:09:09.361137+02:00NaNFalse...NaNNaNNaN0NaNNaNNaNfrNaN1556
1330695NaNNaNNaNemail33069512022-07-16 04:10:34.135134+02:002022-07-16 04:10:34.156704+02:00NaNFalse...NaNNaNNaN0NaNNaNNaNNaNNaN1556
2330978NaNNaNNaNemail33097812022-07-21 22:14:09.811721+02:002022-07-21 22:14:09.836051+02:00NaNFalse...NaNNaNNaN0NaNNaNNaNNaNNaN1556
3338697NaNNaNNaNemail33869712022-09-15 19:02:03.950536+02:002022-09-15 19:02:03.985642+02:00NaNFalse...NaNNaNNaN0NaNNaNNaNNaNNaN1556
4338726NaNNaNNaNemail33872612022-09-16 01:24:40.719882+02:002022-09-16 01:24:40.742753+02:00NaNFalse...NaNNaNNaN0NaNNaNNaNNaNNaN1556
..................................................................
7130227105lastname27105firstname271051957-01-26email271052050242021-04-22 15:12:59.986534+02:002023-09-12 18:59:31.613235+02:00NaNFalse...NaNNaNNaN22018-12-31 18:56:57+01:00NaN35700fr66.01556
7130327108lastname27108firstname27108NaNNaN2050242021-04-22 15:12:59.989197+02:002023-09-12 18:27:34.380843+02:00NaNFalse...NaNNaNNaN62015-12-29 14:51:46+01:00NaN35700frNaN1556
7130427110lastname27110firstname27110NaNNaN62021-04-22 15:12:59.991029+02:002022-04-14 11:41:33.738500+02:00NaNFalse...NaNNaNNaN12018-12-31 19:12:59+01:00NaNNaNfrNaN1556
7130510607lastname10607firstname106071963-01-04email106073133322021-04-22 14:56:45.742226+02:002023-09-12 17:55:17.723195+02:00NaNFalse...NaNNaNNaN262015-10-10 14:11:21+02:00NaN35850fr60.01556
7130619095lastname19095firstname190951979-07-16email1909562021-04-22 15:06:30.120537+02:002023-09-12 18:27:36.904104+02:00NaNFalse...NaNNaNNaN22019-05-19 21:18:36+02:00NaNNaNfr44.01556
\n", - "

71307 rows × 43 columns

\n", - "
" - ], - "text/plain": [ - " id lastname firstname birthdate email \\\n", - "0 286834 lastname286834 firstname286834 NaN email286834 \n", - "1 330695 NaN NaN NaN email330695 \n", - "2 330978 NaN NaN NaN email330978 \n", - "3 338697 NaN NaN NaN email338697 \n", - "4 338726 NaN NaN NaN email338726 \n", - "... ... ... ... ... ... \n", - "71302 27105 lastname27105 firstname27105 1957-01-26 email27105 \n", - "71303 27108 lastname27108 firstname27108 NaN NaN \n", - "71304 27110 lastname27110 firstname27110 NaN NaN \n", - "71305 10607 lastname10607 firstname10607 1963-01-04 email10607 \n", - "71306 19095 lastname19095 firstname19095 1979-07-16 email19095 \n", - "\n", - " street_id created_at \\\n", - "0 6 2022-05-19 10:09:09.361137+02:00 \n", - "1 1 2022-07-16 04:10:34.135134+02:00 \n", - "2 1 2022-07-21 22:14:09.811721+02:00 \n", - "3 1 2022-09-15 19:02:03.950536+02:00 \n", - "4 1 2022-09-16 01:24:40.719882+02:00 \n", - "... ... ... \n", - "71302 205024 2021-04-22 15:12:59.986534+02:00 \n", - "71303 205024 2021-04-22 15:12:59.989197+02:00 \n", - "71304 6 2021-04-22 15:12:59.991029+02:00 \n", - "71305 313332 2021-04-22 14:56:45.742226+02:00 \n", - "71306 6 2021-04-22 15:06:30.120537+02:00 \n", - "\n", - " updated_at civility is_partner ... \\\n", - "0 2022-05-19 10:09:09.361137+02:00 NaN False ... \n", - "1 2022-07-16 04:10:34.156704+02:00 NaN False ... \n", - "2 2022-07-21 22:14:09.836051+02:00 NaN False ... \n", - "3 2022-09-15 19:02:03.985642+02:00 NaN False ... \n", - "4 2022-09-16 01:24:40.742753+02:00 NaN False ... \n", - "... ... ... ... ... \n", - "71302 2023-09-12 18:59:31.613235+02:00 NaN False ... \n", - "71303 2023-09-12 18:27:34.380843+02:00 NaN False ... \n", - "71304 2022-04-14 11:41:33.738500+02:00 NaN False ... \n", - "71305 2023-09-12 17:55:17.723195+02:00 NaN False ... \n", - "71306 2023-09-12 18:27:36.904104+02:00 NaN False ... 
\n", - "\n", - " preferred_category preferred_supplier preferred_formula \\\n", - "0 NaN NaN NaN \n", - "1 NaN NaN NaN \n", - "2 NaN NaN NaN \n", - "3 NaN NaN NaN \n", - "4 NaN NaN NaN \n", - "... ... ... ... \n", - "71302 NaN NaN NaN \n", - "71303 NaN NaN NaN \n", - "71304 NaN NaN NaN \n", - "71305 NaN NaN NaN \n", - "71306 NaN NaN NaN \n", - "\n", - " purchase_count first_buying_date last_visiting_date zipcode \\\n", - "0 0 NaN NaN NaN \n", - "1 0 NaN NaN NaN \n", - "2 0 NaN NaN NaN \n", - "3 0 NaN NaN NaN \n", - "4 0 NaN NaN NaN \n", - "... ... ... ... ... \n", - "71302 2 2018-12-31 18:56:57+01:00 NaN 35700 \n", - "71303 6 2015-12-29 14:51:46+01:00 NaN 35700 \n", - "71304 1 2018-12-31 19:12:59+01:00 NaN NaN \n", - "71305 26 2015-10-10 14:11:21+02:00 NaN 35850 \n", - "71306 2 2019-05-19 21:18:36+02:00 NaN NaN \n", - "\n", - " country age tenant_id \n", - "0 fr NaN 1556 \n", - "1 NaN NaN 1556 \n", - "2 NaN NaN 1556 \n", - "3 NaN NaN 1556 \n", - "4 NaN NaN 1556 \n", - "... ... ... ... \n", - "71302 fr 66.0 1556 \n", - "71303 fr NaN 1556 \n", - "71304 fr NaN 1556 \n", - "71305 fr 60.0 1556 \n", - "71306 fr 44.0 1556 \n", - "\n", - "[71307 rows x 43 columns]" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "customersplus" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "c40c44a0-e7c2-4ad1-b700-0d6ea05d62b2", - "metadata": {}, - "outputs": [], - "source": [ - "# But : lier les caractéristiques socio-demo et les comportements d'achat\n" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "8259ae6c-353f-43a6-add3-f974fac6e5d4", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['id', 'number', 'created_at', 'updated_at', 'purchase_id', 'product_id',\n", - " 'is_from_subscription', 'type_of', 'supplier_id', 'barcode',\n", - " 'identifier'],\n", - " dtype='object')\n", - "(318969, 11)\n", - "\n", - "RangeIndex: 318969 
entries, 0 to 318968\n", - "Data columns (total 11 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 318969 non-null int64 \n", - " 1 number 318969 non-null object \n", - " 2 created_at 318969 non-null object \n", - " 3 updated_at 318969 non-null object \n", - " 4 purchase_id 318969 non-null int64 \n", - " 5 product_id 318969 non-null int64 \n", - " 6 is_from_subscription 318969 non-null bool \n", - " 7 type_of 318969 non-null int64 \n", - " 8 supplier_id 318969 non-null int64 \n", - " 9 barcode 0 non-null float64\n", - " 10 identifier 318969 non-null object \n", - "dtypes: bool(1), float64(1), int64(5), object(4)\n", - "memory usage: 24.6+ MB\n" - ] - } - ], - "source": [ - "# tickets\n", - "FILE_PATH_S3 = 'bdc2324-data/11/11tickets.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " tickets = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(tickets.columns)\n", - "print(tickets.shape)\n", - "tickets.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "f54830cb-1f95-4f71-9b04-358c745fb454", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idnumbercreated_atupdated_atpurchase_idproduct_idis_from_subscriptiontype_ofsupplier_idbarcodeidentifier
021190811433_136_212_683562023-09-12 17:42:45.396336+02:002023-09-12 17:42:45.396336+02:00861764209879False11702NaNf694c255855ce5643c6fcc7fed5e9237
121190821433_136_194_683562023-09-12 17:42:45.409056+02:002023-09-12 17:42:45.409056+02:00861763209879False11702NaN838d6101db2fc8bc80536d8b91b49859
2211908333158_158_343_683572023-09-12 17:42:45.409824+02:002023-09-12 17:42:45.409824+02:00861769209880False11702NaN8a8d938d66a4dc57bcb44c2773c6fdfa
3211908433158_158_297_683572023-09-12 17:42:45.410447+02:002023-09-12 17:42:45.410447+02:00861767209880False11702NaNb7a3dd0794c0957c942d45b8913e5b96
4211908533158_158_318_683572023-09-12 17:42:45.411059+02:002023-09-12 17:42:45.411059+02:00861768209880False11702NaNd7ea7e443581ebe520dd13f6cad31af7
....................................
318964256402144247_204_239_892782023-09-12 18:59:48.750953+02:002023-09-12 18:59:48.750953+02:001244281210158False11702NaN82c9af8b2167f7ac34a5e834242b0239
318965256402244247_204_299_892782023-09-12 18:59:48.751441+02:002023-09-12 18:59:48.751441+02:001244284210158False11702NaN235e8e608f066cb72949bbd397d0a76f
318966256402344247_204_259_892782023-09-12 18:59:48.751924+02:002023-09-12 18:59:48.751924+02:001244282210158False11702NaNec22fa828931f030f7e79a4cc5478c4b
318967256402444247_204_279_892782023-09-12 18:59:48.752425+02:002023-09-12 18:59:48.752425+02:001244283210158False11702NaN31ec4deaf718e04caf193e1ff8d621ef
31896825131564854_178_2847_891702023-09-12 18:52:20.331807+02:002023-09-12 18:59:48.752904+02:001244285261922False31702NaN48aef9efab29bfb1537656908863bcc1
\n", - "

318969 rows × 11 columns

\n", - "
" - ], - "text/plain": [ - " id number created_at \\\n", - "0 2119081 1433_136_212_68356 2023-09-12 17:42:45.396336+02:00 \n", - "1 2119082 1433_136_194_68356 2023-09-12 17:42:45.409056+02:00 \n", - "2 2119083 33158_158_343_68357 2023-09-12 17:42:45.409824+02:00 \n", - "3 2119084 33158_158_297_68357 2023-09-12 17:42:45.410447+02:00 \n", - "4 2119085 33158_158_318_68357 2023-09-12 17:42:45.411059+02:00 \n", - "... ... ... ... \n", - "318964 2564021 44247_204_239_89278 2023-09-12 18:59:48.750953+02:00 \n", - "318965 2564022 44247_204_299_89278 2023-09-12 18:59:48.751441+02:00 \n", - "318966 2564023 44247_204_259_89278 2023-09-12 18:59:48.751924+02:00 \n", - "318967 2564024 44247_204_279_89278 2023-09-12 18:59:48.752425+02:00 \n", - "318968 2513156 4854_178_2847_89170 2023-09-12 18:52:20.331807+02:00 \n", - "\n", - " updated_at purchase_id product_id \\\n", - "0 2023-09-12 17:42:45.396336+02:00 861764 209879 \n", - "1 2023-09-12 17:42:45.409056+02:00 861763 209879 \n", - "2 2023-09-12 17:42:45.409824+02:00 861769 209880 \n", - "3 2023-09-12 17:42:45.410447+02:00 861767 209880 \n", - "4 2023-09-12 17:42:45.411059+02:00 861768 209880 \n", - "... ... ... ... \n", - "318964 2023-09-12 18:59:48.750953+02:00 1244281 210158 \n", - "318965 2023-09-12 18:59:48.751441+02:00 1244284 210158 \n", - "318966 2023-09-12 18:59:48.751924+02:00 1244282 210158 \n", - "318967 2023-09-12 18:59:48.752425+02:00 1244283 210158 \n", - "318968 2023-09-12 18:59:48.752904+02:00 1244285 261922 \n", - "\n", - " is_from_subscription type_of supplier_id barcode \\\n", - "0 False 1 1702 NaN \n", - "1 False 1 1702 NaN \n", - "2 False 1 1702 NaN \n", - "3 False 1 1702 NaN \n", - "4 False 1 1702 NaN \n", - "... ... ... ... ... 
\n", - "318964 False 1 1702 NaN \n", - "318965 False 1 1702 NaN \n", - "318966 False 1 1702 NaN \n", - "318967 False 1 1702 NaN \n", - "318968 False 3 1702 NaN \n", - "\n", - " identifier \n", - "0 f694c255855ce5643c6fcc7fed5e9237 \n", - "1 838d6101db2fc8bc80536d8b91b49859 \n", - "2 8a8d938d66a4dc57bcb44c2773c6fdfa \n", - "3 b7a3dd0794c0957c942d45b8913e5b96 \n", - "4 d7ea7e443581ebe520dd13f6cad31af7 \n", - "... ... \n", - "318964 82c9af8b2167f7ac34a5e834242b0239 \n", - "318965 235e8e608f066cb72949bbd397d0a76f \n", - "318966 ec22fa828931f030f7e79a4cc5478c4b \n", - "318967 31ec4deaf718e04caf193e1ff8d621ef \n", - "318968 48aef9efab29bfb1537656908863bcc1 \n", - "\n", - "[318969 rows x 11 columns]" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tickets" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "ad743347-33d1-41f0-852d-f9e6354f82ed", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([1, 3, 0])" - ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tickets['type_of'].unique()" - ] - }, - { - "cell_type": "markdown", - "id": "b88808fe-3b4e-49ed-9885-d52910b6f211", - "metadata": {}, - "source": [ - "## Types d'évenement et client" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "ecb03a47-1418-4fb1-8c78-cd222d38b7fd", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['id', 'created_at', 'updated_at', 'season_id', 'facility_id', 'name',\n", - " 'event_type_id', 'manual_added', 'is_display', 'event_type_key_id',\n", - " 'facility_key_id', 'identifier'],\n", - " dtype='object')\n", - "(403, 12)\n", - "\n", - "RangeIndex: 403 entries, 0 to 402\n", - "Data columns (total 12 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 403 non-null int64 \n", - " 1 created_at 403 non-null 
object\n", - " 2 updated_at 403 non-null object\n", - " 3 season_id 403 non-null int64 \n", - " 4 facility_id 403 non-null int64 \n", - " 5 name 403 non-null object\n", - " 6 event_type_id 403 non-null int64 \n", - " 7 manual_added 403 non-null bool \n", - " 8 is_display 403 non-null bool \n", - " 9 event_type_key_id 403 non-null int64 \n", - " 10 facility_key_id 403 non-null int64 \n", - " 11 identifier 403 non-null object\n", - "dtypes: bool(2), int64(6), object(4)\n", - "memory usage: 32.4+ KB\n" - ] - } - ], - "source": [ - "# Evenement = events.csv\n", - "FILE_PATH_S3 = 'bdc2324-data/11/11events.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " events = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(events.columns)\n", - "print(events.shape)\n", - "events.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "19706610-9e90-4e6f-8bd0-da124b87cff7", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idcreated_atupdated_atseason_idfacility_idnameevent_type_idmanual_addedis_displayevent_type_key_idfacility_key_ididentifier
0203672023-09-13 03:42:45.214293+02:002023-09-13 03:54:30.086969+02:0018651054marelle1055FalseTrue1055105426d1e9a4acad18b9cf79244334c86c93
1203712023-09-13 03:42:45.218728+02:002023-09-13 03:54:30.103943+02:0018651054dialogues1055FalseTrue1055105460356fc5e8ed6c9c1be9c5ec67e77766
2205702023-10-05 04:48:29.374504+02:002023-10-05 04:48:36.562528+02:0018651054les grandes epopees1055FalseTrue10551054f8ab088e06252bf34e1b12ad2ce1a403
3207572023-11-01 03:55:20.846196+01:002023-11-01 03:55:28.412457+01:0018651054scolaire marelle1055FalseTrue10551054447fa80f9a793b7587bb85ebbda6442c
4203642023-09-13 03:42:45.196791+02:002023-09-13 03:54:30.075456+02:0018651054le couronnement de poppee1055FalseTrue105510543b37f5d2cd354cbc422868621ac7ebc2
.......................................
398156032023-09-12 17:42:25.327618+02:002023-09-12 19:00:00.893400+02:0017061054marelle1055FalseTrue10551054fde88b72fb82b1fe42fbbfbfc3d6b4d3
399156212023-09-12 17:42:25.335792+02:002023-09-12 19:00:00.899622+02:0017081054cartes d'adhesion1055FalseTrue10551054051b96aad2b720bad4450a59ed7dfbf6
400157402023-09-12 17:47:05.112101+02:002023-09-12 19:00:00.906123+02:0017111054repetition le medecin malgre lui1055FalseTrue10551054addd6885bea5ddf60ec3539dfc3e79e8
401155202023-09-12 17:42:25.290280+02:002023-09-12 19:00:00.835625+02:0017081054opera au village1055FalseTrue1055105494f250d10d4a56358ceab23b384439ff
402154392023-09-12 17:42:25.252747+02:002023-09-12 19:00:00.735990+02:0017081054florilege1055FalseTrue105510544f015946bcbd856aa573cadb7ac42b9f
\n", - "

403 rows × 12 columns

\n", - "
" - ], - "text/plain": [ - " id created_at \\\n", - "0 20367 2023-09-13 03:42:45.214293+02:00 \n", - "1 20371 2023-09-13 03:42:45.218728+02:00 \n", - "2 20570 2023-10-05 04:48:29.374504+02:00 \n", - "3 20757 2023-11-01 03:55:20.846196+01:00 \n", - "4 20364 2023-09-13 03:42:45.196791+02:00 \n", - ".. ... ... \n", - "398 15603 2023-09-12 17:42:25.327618+02:00 \n", - "399 15621 2023-09-12 17:42:25.335792+02:00 \n", - "400 15740 2023-09-12 17:47:05.112101+02:00 \n", - "401 15520 2023-09-12 17:42:25.290280+02:00 \n", - "402 15439 2023-09-12 17:42:25.252747+02:00 \n", - "\n", - " updated_at season_id facility_id \\\n", - "0 2023-09-13 03:54:30.086969+02:00 1865 1054 \n", - "1 2023-09-13 03:54:30.103943+02:00 1865 1054 \n", - "2 2023-10-05 04:48:36.562528+02:00 1865 1054 \n", - "3 2023-11-01 03:55:28.412457+01:00 1865 1054 \n", - "4 2023-09-13 03:54:30.075456+02:00 1865 1054 \n", - ".. ... ... ... \n", - "398 2023-09-12 19:00:00.893400+02:00 1706 1054 \n", - "399 2023-09-12 19:00:00.899622+02:00 1708 1054 \n", - "400 2023-09-12 19:00:00.906123+02:00 1711 1054 \n", - "401 2023-09-12 19:00:00.835625+02:00 1708 1054 \n", - "402 2023-09-12 19:00:00.735990+02:00 1708 1054 \n", - "\n", - " name event_type_id manual_added \\\n", - "0 marelle 1055 False \n", - "1 dialogues 1055 False \n", - "2 les grandes epopees 1055 False \n", - "3 scolaire marelle 1055 False \n", - "4 le couronnement de poppee 1055 False \n", - ".. ... ... ... \n", - "398 marelle 1055 False \n", - "399 cartes d'adhesion 1055 False \n", - "400 repetition le medecin malgre lui 1055 False \n", - "401 opera au village 1055 False \n", - "402 florilege 1055 False \n", - "\n", - " is_display event_type_key_id facility_key_id \\\n", - "0 True 1055 1054 \n", - "1 True 1055 1054 \n", - "2 True 1055 1054 \n", - "3 True 1055 1054 \n", - "4 True 1055 1054 \n", - ".. ... ... ... 
\n", - "398 True 1055 1054 \n", - "399 True 1055 1054 \n", - "400 True 1055 1054 \n", - "401 True 1055 1054 \n", - "402 True 1055 1054 \n", - "\n", - " identifier \n", - "0 26d1e9a4acad18b9cf79244334c86c93 \n", - "1 60356fc5e8ed6c9c1be9c5ec67e77766 \n", - "2 f8ab088e06252bf34e1b12ad2ce1a403 \n", - "3 447fa80f9a793b7587bb85ebbda6442c \n", - "4 3b37f5d2cd354cbc422868621ac7ebc2 \n", - ".. ... \n", - "398 fde88b72fb82b1fe42fbbfbfc3d6b4d3 \n", - "399 051b96aad2b720bad4450a59ed7dfbf6 \n", - "400 addd6885bea5ddf60ec3539dfc3e79e8 \n", - "401 94f250d10d4a56358ceab23b384439ff \n", - "402 4f015946bcbd856aa573cadb7ac42b9f \n", - "\n", - "[403 rows x 12 columns]" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "events" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "6cb04679-26e7-4ed8-bfc1-42285da96374", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "357" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "events['name'].nunique()" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "c10297e8-a8f9-45f9-8553-17e3fdb6f8c1", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['id', 'serial', 'event_id', 'created_at', 'updated_at',\n", - " 'start_date_time', 'open', 'satisfaction', 'end_date_time', 'name',\n", - " 'is_display', 'representation_type_id', 'expected_filling',\n", - " 'max_filling', 'extra_field', 'identifier'],\n", - " dtype='object')\n", - "(996, 16)\n", - "\n", - "RangeIndex: 996 entries, 0 to 995\n", - "Data columns (total 16 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 996 non-null int64 \n", - " 1 serial 0 non-null float64\n", - " 2 event_id 996 non-null int64 \n", - " 3 created_at 996 non-null object \n", - " 4 updated_at 996 non-null object \n", - " 5 start_date_time 996 
non-null object \n", - " 6 open 996 non-null bool \n", - " 7 satisfaction 0 non-null float64\n", - " 8 end_date_time 996 non-null object \n", - " 9 name 0 non-null float64\n", - " 10 is_display 996 non-null bool \n", - " 11 representation_type_id 0 non-null float64\n", - " 12 expected_filling 24 non-null float64\n", - " 13 max_filling 24 non-null float64\n", - " 14 extra_field 0 non-null float64\n", - " 15 identifier 996 non-null object \n", - "dtypes: bool(2), float64(7), int64(2), object(5)\n", - "memory usage: 111.0+ KB\n" - ] - } - ], - "source": [ - "# Représentation des évenements = representations.csv\n", - "FILE_PATH_S3 = 'bdc2324-data/11/11representations.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " representations = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(representations.columns)\n", - "print(representations.shape)\n", - "representations.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "41ef6a1b-e99e-4c73-a2ae-ba7d438d90c2", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idserialevent_idcreated_atupdated_atstart_date_timeopensatisfactionend_date_timenameis_displayrepresentation_type_idexpected_fillingmax_fillingextra_fieldidentifier
044351NaN203712023-09-13 03:42:45.245879+02:002023-09-13 03:42:45.245879+02:002023-12-21 20:00:00+01:00TrueNaN1901-01-01 00:09:21+00:09NaNTrueNaN550.0550.0NaN33520762e8cc28982e3841cbc2be8ce2
145497NaN207572023-11-01 03:55:20.875712+01:002023-11-01 03:55:20.875712+01:002023-11-28 10:00:00+01:00TrueNaN1901-01-01 00:09:21+00:09NaNTrueNaNNaNNaNNaN5c34b84e3d11276e0995d984c94cd28d
244383NaN203832023-09-13 10:41:08.964302+02:002023-09-13 10:41:08.964302+02:002023-06-04 17:00:00+02:00TrueNaN1901-01-01 00:09:21+00:09NaNTrueNaNNaNNaNNaNbf3c65a1dfefbd747dcc2360e6887eac
344384NaN203832023-09-13 10:41:08.972401+02:002023-09-13 10:41:08.972401+02:002023-06-03 17:30:00+02:00TrueNaN1901-01-01 00:09:21+00:09NaNTrueNaNNaNNaNNaNb0e69ae8b78ebab3066aac83de22d239
444385NaN203842023-09-13 10:41:08.973290+02:002023-09-13 10:41:08.973290+02:002023-06-03 16:15:00+02:00TrueNaN1901-01-01 00:09:21+00:09NaNTrueNaNNaNNaNNaN9fb91c8b1cf9e444111c511e212ac5c1
...................................................
99133894NaN156472023-09-12 17:42:25.564297+02:002023-09-12 17:42:25.564297+02:002022-11-08 20:00:00+01:00TrueNaN1901-01-01 00:09:21+00:09NaNTrueNaNNaNNaNNaN44bbcecfd007ceaad05805391beccabb
99233873NaN156402023-09-12 17:42:25.554863+02:002023-09-12 17:42:25.554863+02:002022-11-14 20:00:00+01:00TrueNaN1901-01-01 00:09:21+00:09NaNTrueNaNNaNNaNNaN151edbec8e0a3cd80071038e857f3493
99333610NaN155202023-09-12 17:42:25.442979+02:002023-09-12 17:42:25.442979+02:002023-06-19 18:00:00+02:00TrueNaN1901-01-01 00:09:21+00:09NaNTrueNaNNaNNaNNaN9e9e38d527427e1b6f67e0c3f12b82fc
99433953NaN155202023-09-12 17:42:25.590746+02:002023-09-12 17:42:25.590746+02:002023-06-19 20:00:00+02:00TrueNaN1901-01-01 00:09:21+00:09NaNTrueNaNNaNNaNNaN7bf0978aabb6cac1bb4cd2784afb2b6b
99533639NaN155332023-09-12 17:42:25.455708+02:002023-09-12 17:42:25.455708+02:002023-04-15 17:30:00+02:00TrueNaN1901-01-01 00:09:21+00:09NaNTrueNaNNaNNaNNaNfae68f1e09710ec8747957af6e22f61d
\n", - "

996 rows × 16 columns

\n", - "
" - ], - "text/plain": [ - " id serial event_id created_at \\\n", - "0 44351 NaN 20371 2023-09-13 03:42:45.245879+02:00 \n", - "1 45497 NaN 20757 2023-11-01 03:55:20.875712+01:00 \n", - "2 44383 NaN 20383 2023-09-13 10:41:08.964302+02:00 \n", - "3 44384 NaN 20383 2023-09-13 10:41:08.972401+02:00 \n", - "4 44385 NaN 20384 2023-09-13 10:41:08.973290+02:00 \n", - ".. ... ... ... ... \n", - "991 33894 NaN 15647 2023-09-12 17:42:25.564297+02:00 \n", - "992 33873 NaN 15640 2023-09-12 17:42:25.554863+02:00 \n", - "993 33610 NaN 15520 2023-09-12 17:42:25.442979+02:00 \n", - "994 33953 NaN 15520 2023-09-12 17:42:25.590746+02:00 \n", - "995 33639 NaN 15533 2023-09-12 17:42:25.455708+02:00 \n", - "\n", - " updated_at start_date_time open \\\n", - "0 2023-09-13 03:42:45.245879+02:00 2023-12-21 20:00:00+01:00 True \n", - "1 2023-11-01 03:55:20.875712+01:00 2023-11-28 10:00:00+01:00 True \n", - "2 2023-09-13 10:41:08.964302+02:00 2023-06-04 17:00:00+02:00 True \n", - "3 2023-09-13 10:41:08.972401+02:00 2023-06-03 17:30:00+02:00 True \n", - "4 2023-09-13 10:41:08.973290+02:00 2023-06-03 16:15:00+02:00 True \n", - ".. ... ... ... \n", - "991 2023-09-12 17:42:25.564297+02:00 2022-11-08 20:00:00+01:00 True \n", - "992 2023-09-12 17:42:25.554863+02:00 2022-11-14 20:00:00+01:00 True \n", - "993 2023-09-12 17:42:25.442979+02:00 2023-06-19 18:00:00+02:00 True \n", - "994 2023-09-12 17:42:25.590746+02:00 2023-06-19 20:00:00+02:00 True \n", - "995 2023-09-12 17:42:25.455708+02:00 2023-04-15 17:30:00+02:00 True \n", - "\n", - " satisfaction end_date_time name is_display \\\n", - "0 NaN 1901-01-01 00:09:21+00:09 NaN True \n", - "1 NaN 1901-01-01 00:09:21+00:09 NaN True \n", - "2 NaN 1901-01-01 00:09:21+00:09 NaN True \n", - "3 NaN 1901-01-01 00:09:21+00:09 NaN True \n", - "4 NaN 1901-01-01 00:09:21+00:09 NaN True \n", - ".. ... ... ... ... 
\n", - "991 NaN 1901-01-01 00:09:21+00:09 NaN True \n", - "992 NaN 1901-01-01 00:09:21+00:09 NaN True \n", - "993 NaN 1901-01-01 00:09:21+00:09 NaN True \n", - "994 NaN 1901-01-01 00:09:21+00:09 NaN True \n", - "995 NaN 1901-01-01 00:09:21+00:09 NaN True \n", - "\n", - " representation_type_id expected_filling max_filling extra_field \\\n", - "0 NaN 550.0 550.0 NaN \n", - "1 NaN NaN NaN NaN \n", - "2 NaN NaN NaN NaN \n", - "3 NaN NaN NaN NaN \n", - "4 NaN NaN NaN NaN \n", - ".. ... ... ... ... \n", - "991 NaN NaN NaN NaN \n", - "992 NaN NaN NaN NaN \n", - "993 NaN NaN NaN NaN \n", - "994 NaN NaN NaN NaN \n", - "995 NaN NaN NaN NaN \n", - "\n", - " identifier \n", - "0 33520762e8cc28982e3841cbc2be8ce2 \n", - "1 5c34b84e3d11276e0995d984c94cd28d \n", - "2 bf3c65a1dfefbd747dcc2360e6887eac \n", - "3 b0e69ae8b78ebab3066aac83de22d239 \n", - "4 9fb91c8b1cf9e444111c511e212ac5c1 \n", - ".. ... \n", - "991 44bbcecfd007ceaad05805391beccabb \n", - "992 151edbec8e0a3cd80071038e857f3493 \n", - "993 9e9e38d527427e1b6f67e0c3f12b82fc \n", - "994 7bf0978aabb6cac1bb4cd2784afb2b6b \n", - "995 fae68f1e09710ec8747957af6e22f61d \n", - "\n", - "[996 rows x 16 columns]" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "representations" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "ae6cdad3-2184-4ae7-928c-2f8bd7769a5b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['id', 'amount', 'is_full_price', 'representation_id',\n", - " 'pricing_formula_id', 'created_at', 'updated_at', 'category_id',\n", - " 'apply_price', 'products_group_id', 'product_pack_id', 'extra_field',\n", - " 'amount_consumption', 'identifier'],\n", - " dtype='object')\n", - "(14648, 14)\n", - "\n", - "RangeIndex: 14648 entries, 0 to 14647\n", - "Data columns (total 14 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 14648 non-null 
int64 \n", - " 1 amount 14648 non-null float64\n", - " 2 is_full_price 14648 non-null bool \n", - " 3 representation_id 14648 non-null int64 \n", - " 4 pricing_formula_id 14648 non-null int64 \n", - " 5 created_at 14648 non-null object \n", - " 6 updated_at 14648 non-null object \n", - " 7 category_id 14648 non-null int64 \n", - " 8 apply_price 14648 non-null float64\n", - " 9 products_group_id 14648 non-null int64 \n", - " 10 product_pack_id 14648 non-null int64 \n", - " 11 extra_field 0 non-null float64\n", - " 12 amount_consumption 0 non-null float64\n", - " 13 identifier 14648 non-null object \n", - "dtypes: bool(1), float64(4), int64(6), object(3)\n", - "memory usage: 1.5+ MB\n" - ] - } - ], - "source": [ - "# Produits vendues = products.csv\n", - "FILE_PATH_S3 = 'bdc2324-data/11/11products.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " products = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(products.columns)\n", - "print(products.shape)\n", - "products.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "34f1825d-148a-4a6e-88d6-61449fee3ee4", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idamountis_full_pricerepresentation_idpricing_formula_idcreated_atupdated_atcategory_idapply_priceproducts_group_idproduct_pack_idextra_fieldamount_consumptionidentifier
026832518.0False44332204772023-09-13 03:42:45.415594+02:002023-09-13 03:42:45.415594+02:0049720.02681081NaNNaNb823bbea3ba837da2ef8efaf1287272d
127411836.8False44340205022023-10-25 03:26:57.430694+02:002023-10-25 03:26:57.430694+02:0049690.02739011NaNNaN81e8b7991f6948e3ef7cfe5011d13532
226833839.1False44340204972023-09-13 03:42:45.430942+02:002023-09-13 03:42:45.430942+02:0049690.02681211NaNNaNbe8bc0399db4d04aefa9f44afd4d5efa
32098830.0False33443204752023-09-12 17:42:27.595998+02:002023-09-12 17:42:27.595998+02:0049700.02097061NaNNaN01a9eea5f8ad53491faa864bfac44183
426832663.0False44333204772023-09-13 03:42:45.417283+02:002023-09-13 03:42:45.417283+02:0049690.02681091NaNNaN781a917ecfdabb14169701d7b143bbe4
.............................................
1464321787833.6False33919204892023-09-12 17:51:11.572882+02:002023-09-12 17:51:11.572882+02:0049710.02176951NaNNaN82bba69321466069411b3023343b44a4
1464426831510.0False33919205042023-09-12 18:59:29.995176+02:002023-09-12 18:59:29.995176+02:0049690.02680981NaNNaNeae56a8eb0a4315c5713b2053103d595
146452101485.0False33531204732023-09-12 17:42:27.733260+02:002023-09-12 17:42:27.733260+02:0049750.02099711NaNNaN449f86c1ef2b478d3389f7d0e27d0e6b
1464621205430.0False33810204732023-09-12 17:42:28.724681+02:002023-09-12 17:42:28.724681+02:0049720.02118761NaNNaN2090203e2c0b58ea8f505089faee6d62
1464726192221.0False33766204882023-09-12 18:52:00.519838+02:002023-09-12 18:52:00.519838+02:0049720.02617091NaNNaN9139ee36a92bed766ae95372cca77336
\n", - "

14648 rows × 14 columns

\n", - "
" - ], - "text/plain": [ - " id amount is_full_price representation_id pricing_formula_id \\\n", - "0 268325 18.0 False 44332 20477 \n", - "1 274118 36.8 False 44340 20502 \n", - "2 268338 39.1 False 44340 20497 \n", - "3 209883 0.0 False 33443 20475 \n", - "4 268326 63.0 False 44333 20477 \n", - "... ... ... ... ... ... \n", - "14643 217878 33.6 False 33919 20489 \n", - "14644 268315 10.0 False 33919 20504 \n", - "14645 210148 5.0 False 33531 20473 \n", - "14646 212054 30.0 False 33810 20473 \n", - "14647 261922 21.0 False 33766 20488 \n", - "\n", - " created_at updated_at \\\n", - "0 2023-09-13 03:42:45.415594+02:00 2023-09-13 03:42:45.415594+02:00 \n", - "1 2023-10-25 03:26:57.430694+02:00 2023-10-25 03:26:57.430694+02:00 \n", - "2 2023-09-13 03:42:45.430942+02:00 2023-09-13 03:42:45.430942+02:00 \n", - "3 2023-09-12 17:42:27.595998+02:00 2023-09-12 17:42:27.595998+02:00 \n", - "4 2023-09-13 03:42:45.417283+02:00 2023-09-13 03:42:45.417283+02:00 \n", - "... ... ... \n", - "14643 2023-09-12 17:51:11.572882+02:00 2023-09-12 17:51:11.572882+02:00 \n", - "14644 2023-09-12 18:59:29.995176+02:00 2023-09-12 18:59:29.995176+02:00 \n", - "14645 2023-09-12 17:42:27.733260+02:00 2023-09-12 17:42:27.733260+02:00 \n", - "14646 2023-09-12 17:42:28.724681+02:00 2023-09-12 17:42:28.724681+02:00 \n", - "14647 2023-09-12 18:52:00.519838+02:00 2023-09-12 18:52:00.519838+02:00 \n", - "\n", - " category_id apply_price products_group_id product_pack_id \\\n", - "0 4972 0.0 268108 1 \n", - "1 4969 0.0 273901 1 \n", - "2 4969 0.0 268121 1 \n", - "3 4970 0.0 209706 1 \n", - "4 4969 0.0 268109 1 \n", - "... ... ... ... ... 
\n", - "14643 4971 0.0 217695 1 \n", - "14644 4969 0.0 268098 1 \n", - "14645 4975 0.0 209971 1 \n", - "14646 4972 0.0 211876 1 \n", - "14647 4972 0.0 261709 1 \n", - "\n", - " extra_field amount_consumption identifier \n", - "0 NaN NaN b823bbea3ba837da2ef8efaf1287272d \n", - "1 NaN NaN 81e8b7991f6948e3ef7cfe5011d13532 \n", - "2 NaN NaN be8bc0399db4d04aefa9f44afd4d5efa \n", - "3 NaN NaN 01a9eea5f8ad53491faa864bfac44183 \n", - "4 NaN NaN 781a917ecfdabb14169701d7b143bbe4 \n", - "... ... ... ... \n", - "14643 NaN NaN 82bba69321466069411b3023343b44a4 \n", - "14644 NaN NaN eae56a8eb0a4315c5713b2053103d595 \n", - "14645 NaN NaN 449f86c1ef2b478d3389f7d0e27d0e6b \n", - "14646 NaN NaN 2090203e2c0b58ea8f505089faee6d62 \n", - "14647 NaN NaN 9139ee36a92bed766ae95372cca77336 \n", - "\n", - "[14648 rows x 14 columns]" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "products" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "6735b338-26b5-479d-825d-677ea533dad5", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['id', 'name', 'created_at', 'updated_at', 'street_id', 'fixed_capacity',\n", - " 'identifier'],\n", - " dtype='object')\n", - "(1, 7)\n", - "\n", - "RangeIndex: 1 entries, 0 to 0\n", - "Data columns (total 7 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 1 non-null int64 \n", - " 1 name 0 non-null float64\n", - " 2 created_at 1 non-null object \n", - " 3 updated_at 1 non-null object \n", - " 4 street_id 1 non-null int64 \n", - " 5 fixed_capacity 0 non-null float64\n", - " 6 identifier 1 non-null object \n", - "dtypes: float64(2), int64(2), object(3)\n", - "memory usage: 184.0+ bytes\n" - ] - } - ], - "source": [ - "# Lieu = facilities.csv\n", - "FILE_PATH_S3 = 'bdc2324-data/11/11facilities.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " facilities = 
pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(facilities.columns)\n", - "print(facilities.shape)\n", - "facilities.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "428b86c2-50f4-42a5-9bbb-a17ffe820bf9", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idnamecreated_atupdated_atstreet_idfixed_capacityidentifier
01054NaN2023-09-12 17:42:25.223064+02:002023-09-12 17:42:25.223064+02:001NaNd41d8cd98f00b204e9800998ecf8427e
\n", - "
" - ], - "text/plain": [ - " id name created_at \\\n", - "0 1054 NaN 2023-09-12 17:42:25.223064+02:00 \n", - "\n", - " updated_at street_id fixed_capacity \\\n", - "0 2023-09-12 17:42:25.223064+02:00 1 NaN \n", - "\n", - " identifier \n", - "0 d41d8cd98f00b204e9800998ecf8427e " - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "facilities" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "f6b26ad5-a4cc-4219-a0b0-406d9b025458", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['id', 'name', 'created_at', 'updated_at', 'start_date_time',\n", - " 'identifier'],\n", - " dtype='object')\n", - "(9, 6)\n", - "\n", - "RangeIndex: 9 entries, 0 to 8\n", - "Data columns (total 6 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 9 non-null int64 \n", - " 1 name 9 non-null object \n", - " 2 created_at 9 non-null object \n", - " 3 updated_at 9 non-null object \n", - " 4 start_date_time 0 non-null float64\n", - " 5 identifier 9 non-null object \n", - "dtypes: float64(1), int64(1), object(4)\n", - "memory usage: 560.0+ bytes\n" - ] - } - ], - "source": [ - "# Saisons = seasons.csv période sur deux années consécutives\n", - "FILE_PATH_S3 = 'bdc2324-data/11/11seasons.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " seasons = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(seasons.columns)\n", - "print(seasons.shape)\n", - "seasons.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "75c8c0ef-4ff5-45b1-a791-8ba2e9a4437e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array(['saison 2023-2024', 'saison 2021-2022', 'saison 2015-2016',\n", - " 'saison 2016-2017', 'saison 2017-2018', 'saison 2018-2019',\n", - " 'saison 2020-2021', 'saison 2019-2020', 'saison 2022-2023'],\n", - " dtype=object)" - ] - }, - 
"execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "seasons['name'].unique()" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "cd0d10df-10cc-4f75-8b88-35f676c91f5b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Index(['id', 'purchase_date', 'customer_id', 'created_at', 'updated_at',\n", - " 'number', 'identifier'],\n", - " dtype='object')\n", - "(410695, 7)\n", - "\n", - "RangeIndex: 410695 entries, 0 to 410694\n", - "Data columns (total 7 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 410695 non-null int64 \n", - " 1 purchase_date 410695 non-null object \n", - " 2 customer_id 410695 non-null int64 \n", - " 3 created_at 410695 non-null object \n", - " 4 updated_at 410695 non-null object \n", - " 5 number 0 non-null float64\n", - " 6 identifier 410695 non-null object \n", - "dtypes: float64(1), int64(2), object(4)\n", - "memory usage: 21.9+ MB\n" - ] - } - ], - "source": [ - "# Achats = purchases.csv \n", - "FILE_PATH_S3 = 'bdc2324-data/11/11purchases.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " purchases = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(purchases.columns)\n", - "print(purchases.shape)\n", - "purchases.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "8f986fdb-ca37-4cbb-b526-2a6d0ce7ca2c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idpurchase_datecustomer_idcreated_atupdated_atnumberidentifier
08617612019-03-01 16:28:49+01:0049662023-09-12 17:42:37.564150+02:002023-09-12 17:42:37.564150+02:00NaNd20eb0c3a7efec0bbe338dee40dc3378
18617622019-03-01 16:29:11+01:0049662023-09-12 17:42:37.571159+02:002023-09-12 17:42:37.571159+02:00NaNcff3abfc018517bce5ccfc58f5cacf40
28617632019-03-01 16:29:17+01:0049662023-09-12 17:42:37.571646+02:002023-09-12 17:42:37.571646+02:00NaNe1155cf26b34f792bdb23e49244d7264
38617642019-03-01 16:29:19+01:0049662023-09-12 17:42:37.572063+02:002023-09-12 17:42:37.572063+02:00NaNe8b95cc6a1a8b103ffa39755ce3bfc4d
48617652019-03-01 16:32:08+01:004059942023-09-12 17:42:37.572470+02:002023-09-12 17:42:37.572470+02:00NaN1b763278914f1309e357abe5033a3f0f
........................
41069012859642023-10-21 21:46:41+02:005173092023-10-23 03:43:16.457501+02:002023-10-23 03:43:16.457501+02:00NaN72c4e90c2b151dcffc87b19ea8a0c4f1
41069112859652023-10-21 21:47:07+02:005173092023-10-23 03:43:16.458458+02:002023-10-23 03:43:16.458458+02:00NaNee65532087132145daa6154fbae050ea
41069212859662023-10-21 21:47:20+02:005173092023-10-23 03:43:16.458811+02:002023-10-23 03:43:16.458811+02:00NaN7e825dd352bc6a11ab81cb8068e325e6
41069312859672023-10-21 23:07:06+02:003999692023-10-23 03:43:16.459738+02:002023-10-23 03:43:16.459738+02:00NaNfdb92627a48d6ba8fa817d60a83dbea8
41069412859682023-10-21 23:07:39+02:003999692023-10-23 03:43:16.462409+02:002023-10-23 03:43:16.462409+02:00NaNe9dbaff4f7037a5b0efa11263584dfad
\n", - "

410695 rows × 7 columns

\n", - "
" - ], - "text/plain": [ - " id purchase_date customer_id \\\n", - "0 861761 2019-03-01 16:28:49+01:00 4966 \n", - "1 861762 2019-03-01 16:29:11+01:00 4966 \n", - "2 861763 2019-03-01 16:29:17+01:00 4966 \n", - "3 861764 2019-03-01 16:29:19+01:00 4966 \n", - "4 861765 2019-03-01 16:32:08+01:00 405994 \n", - "... ... ... ... \n", - "410690 1285964 2023-10-21 21:46:41+02:00 517309 \n", - "410691 1285965 2023-10-21 21:47:07+02:00 517309 \n", - "410692 1285966 2023-10-21 21:47:20+02:00 517309 \n", - "410693 1285967 2023-10-21 23:07:06+02:00 399969 \n", - "410694 1285968 2023-10-21 23:07:39+02:00 399969 \n", - "\n", - " created_at updated_at \\\n", - "0 2023-09-12 17:42:37.564150+02:00 2023-09-12 17:42:37.564150+02:00 \n", - "1 2023-09-12 17:42:37.571159+02:00 2023-09-12 17:42:37.571159+02:00 \n", - "2 2023-09-12 17:42:37.571646+02:00 2023-09-12 17:42:37.571646+02:00 \n", - "3 2023-09-12 17:42:37.572063+02:00 2023-09-12 17:42:37.572063+02:00 \n", - "4 2023-09-12 17:42:37.572470+02:00 2023-09-12 17:42:37.572470+02:00 \n", - "... ... ... \n", - "410690 2023-10-23 03:43:16.457501+02:00 2023-10-23 03:43:16.457501+02:00 \n", - "410691 2023-10-23 03:43:16.458458+02:00 2023-10-23 03:43:16.458458+02:00 \n", - "410692 2023-10-23 03:43:16.458811+02:00 2023-10-23 03:43:16.458811+02:00 \n", - "410693 2023-10-23 03:43:16.459738+02:00 2023-10-23 03:43:16.459738+02:00 \n", - "410694 2023-10-23 03:43:16.462409+02:00 2023-10-23 03:43:16.462409+02:00 \n", - "\n", - " number identifier \n", - "0 NaN d20eb0c3a7efec0bbe338dee40dc3378 \n", - "1 NaN cff3abfc018517bce5ccfc58f5cacf40 \n", - "2 NaN e1155cf26b34f792bdb23e49244d7264 \n", - "3 NaN e8b95cc6a1a8b103ffa39755ce3bfc4d \n", - "4 NaN 1b763278914f1309e357abe5033a3f0f \n", - "... ... ... 
\n", - "410690 NaN 72c4e90c2b151dcffc87b19ea8a0c4f1 \n", - "410691 NaN ee65532087132145daa6154fbae050ea \n", - "410692 NaN 7e825dd352bc6a11ab81cb8068e325e6 \n", - "410693 NaN fdb92627a48d6ba8fa817d60a83dbea8 \n", - "410694 NaN e9dbaff4f7037a5b0efa11263584dfad \n", - "\n", - "[410695 rows x 7 columns]" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "purchases" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/Exploration_billet_AJ.ipynb b/Exploration_billet_AJ.ipynb new file mode 100644 index 0000000..e1802cd --- /dev/null +++ b/Exploration_billet_AJ.ipynb @@ -0,0 +1,3406 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "56b3d44e-1e3f-4726-9916-0f9af107860e", + "metadata": {}, + "source": [ + "# Business Data Challenge - Team 1" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "15103481-8d74-404c-aa09-7601fe7730da", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import os\n", + "import s3fs\n", + "import re" + ] + }, + { + "cell_type": "markdown", + "id": "c3bb0d13-34b2-4e1c-9985-468cd87c5a0e", + "metadata": {}, + "source": [ + "Configuration de l'accès aux données" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "5d83bb1a-d341-446e-91f6-1c428607f6d4", + "metadata": {}, + "outputs": [], + "source": [ + "# Create filesystem object\n", + "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", + "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})" + ] + }, + { + 
"cell_type": "markdown", + "id": "f99da24f-0d93-4618-92bc-3ba81dc0445c", + "metadata": {}, + "source": [ + "# Exemple sur Company 1" + ] + }, + { + "cell_type": "markdown", + "id": "9d74b68f-ba07-4a15-9a27-dae931762d70", + "metadata": {}, + "source": [ + "## Chargement données" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "699664b9-eee4-4f8d-a207-e524526560c5", + "metadata": {}, + "outputs": [], + "source": [ + "BUCKET = \"bdc2324-data/1\"\n", + "liste_database = fs.ls(BUCKET)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "aaf64d60-bf92-470c-8210-d09abd6a653e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['bdc2324-data/1/1campaign_stats.csv',\n", + " 'bdc2324-data/1/1campaigns.csv',\n", + " 'bdc2324-data/1/1categories.csv',\n", + " 'bdc2324-data/1/1countries.csv',\n", + " 'bdc2324-data/1/1currencies.csv',\n", + " 'bdc2324-data/1/1customer_target_mappings.csv',\n", + " 'bdc2324-data/1/1customersplus.csv',\n", + " 'bdc2324-data/1/1event_types.csv',\n", + " 'bdc2324-data/1/1events.csv',\n", + " 'bdc2324-data/1/1facilities.csv',\n", + " 'bdc2324-data/1/1link_stats.csv',\n", + " 'bdc2324-data/1/1pricing_formulas.csv',\n", + " 'bdc2324-data/1/1product_packs.csv',\n", + " 'bdc2324-data/1/1products.csv',\n", + " 'bdc2324-data/1/1products_groups.csv',\n", + " 'bdc2324-data/1/1purchases.csv',\n", + " 'bdc2324-data/1/1representation_category_capacities.csv',\n", + " 'bdc2324-data/1/1representations.csv',\n", + " 'bdc2324-data/1/1seasons.csv',\n", + " 'bdc2324-data/1/1structure_tag_mappings.csv',\n", + " 'bdc2324-data/1/1suppliers.csv',\n", + " 'bdc2324-data/1/1tags.csv',\n", + " 'bdc2324-data/1/1target_types.csv',\n", + " 'bdc2324-data/1/1targets.csv',\n", + " 'bdc2324-data/1/1tickets.csv',\n", + " 'bdc2324-data/1/1type_of_categories.csv',\n", + " 'bdc2324-data/1/1type_of_pricing_formulas.csv',\n", + " 'bdc2324-data/1/1type_ofs.csv']" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": 
"execute_result" + } + ], + "source": [ + "liste_database" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0cb92854-903b-4efd-ac1b-197e29f044b4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['bdc2324-data/1/1purchases.csv', 'bdc2324-data/1/1suppliers.csv', 'bdc2324-data/1/1tickets.csv', 'bdc2324-data/1/1type_ofs.csv']\n" + ] + } + ], + "source": [ + "liste_database_select = ['suppliers', 'ticket', 'purchase', 'consumption', 'type_ofs']\n", + "\n", + "# Filtrer la liste pour les éléments contenant au moins un élément de la liste à tester\n", + "liste_database_filtered = [element for element in liste_database if any(element_part in element for element_part in liste_database_select)]\n", + "\n", + "# Afficher le résultat\n", + "print(liste_database_filtered)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "dd6a3518-b752-4a1e-b77b-9e03e853c3ed", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_9792/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", + " df = pd.read_csv(file_in)\n" + ] + } + ], + "source": [ + "# loop to create dataframes from liste\n", + "files_path = liste_database\n", + "\n", + "client_number = files_path[0].split(\"/\")[1]\n", + "df_prefix = \"df\" + str(client_number) + \"_\"\n", + "\n", + "for i in range(len(files_path)) :\n", + " current_path = files_path[i]\n", + " with fs.open(current_path, mode=\"rb\") as file_in:\n", + " df = pd.read_csv(file_in)\n", + " # the pattern of the name is df1xxx\n", + " nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n", + " globals()[nom_dataframe] = df" + ] + }, + { + "cell_type": "markdown", + "id": "e908f516-2a74-45d6-8492-7dcdc3afbe1f", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "## tickets.csv" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "14f4158e-c9c0-4beb-826a-5e0f949434a4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnumbercreated_atupdated_atpurchase_idproduct_idis_from_subscriptiontype_ofsupplier_idbarcodeidentifier
013070859135930026612882021-12-28 20:47:10.320641+01:002022-02-14 18:46:53.614229+01:005107462225251False13NaNb6ad7fc36f33b5e05f58c7fca06688a6
113070860135930026613992021-12-28 20:47:10.321037+01:002022-02-14 18:46:53.614761+01:005107462224914False13NaNb0903af480266f27802fe5c38c277c9e
213070861135930026614192021-12-28 20:47:10.321629+01:002022-02-14 18:46:53.615521+01:005107462224914False13NaN64ca12b7e26a65b90335c0702ea0faba
313070862135930026615082021-12-28 20:47:10.322029+01:002022-02-14 18:46:53.616000+01:005107462224914False13NaN5ac2f8150aa9f3a6b1599df08cc2f0c7
413070863135930026616892021-12-28 20:47:10.322449+01:002022-02-14 18:46:53.616447+01:005107462224914False13NaNdfe30081bae020d12094279926136b9c
....................................
182666720662815135930161543902023-11-09 07:51:34.935983+01:002023-11-09 07:51:34.935983+01:008007697405689False13NaNdba9aa428f843b79ae69dfacfe8fc579
182666820662816135930161545012023-11-09 07:51:34.937038+01:002023-11-09 07:51:34.937038+01:008007698403658False13NaN93f1fcfc6ba4fa68f92eb4b4a619fcf0
182666920662817135930161546802023-11-09 07:51:34.938224+01:002023-11-09 07:51:34.938224+01:008007698403658False13NaNc8bbbd25df2c158767ceef42c3237f23
182667020662818135930161548992023-11-09 07:51:34.939328+01:002023-11-09 07:51:34.939328+01:008007699403658False13NaN738f0a8b5088b5056bc3b32eff2dca1f
182667120662819135930161549882023-11-09 07:51:34.940680+01:002023-11-09 07:51:34.940680+01:008007699403658False13NaN4c5a6195434377380b4e6ae63b2e9cf6
\n", + "

1826672 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " id number created_at \\\n", + "0 13070859 13593002661288 2021-12-28 20:47:10.320641+01:00 \n", + "1 13070860 13593002661399 2021-12-28 20:47:10.321037+01:00 \n", + "2 13070861 13593002661419 2021-12-28 20:47:10.321629+01:00 \n", + "3 13070862 13593002661508 2021-12-28 20:47:10.322029+01:00 \n", + "4 13070863 13593002661689 2021-12-28 20:47:10.322449+01:00 \n", + "... ... ... ... \n", + "1826667 20662815 13593016154390 2023-11-09 07:51:34.935983+01:00 \n", + "1826668 20662816 13593016154501 2023-11-09 07:51:34.937038+01:00 \n", + "1826669 20662817 13593016154680 2023-11-09 07:51:34.938224+01:00 \n", + "1826670 20662818 13593016154899 2023-11-09 07:51:34.939328+01:00 \n", + "1826671 20662819 13593016154988 2023-11-09 07:51:34.940680+01:00 \n", + "\n", + " updated_at purchase_id product_id \\\n", + "0 2022-02-14 18:46:53.614229+01:00 5107462 225251 \n", + "1 2022-02-14 18:46:53.614761+01:00 5107462 224914 \n", + "2 2022-02-14 18:46:53.615521+01:00 5107462 224914 \n", + "3 2022-02-14 18:46:53.616000+01:00 5107462 224914 \n", + "4 2022-02-14 18:46:53.616447+01:00 5107462 224914 \n", + "... ... ... ... \n", + "1826667 2023-11-09 07:51:34.935983+01:00 8007697 405689 \n", + "1826668 2023-11-09 07:51:34.937038+01:00 8007698 403658 \n", + "1826669 2023-11-09 07:51:34.938224+01:00 8007698 403658 \n", + "1826670 2023-11-09 07:51:34.939328+01:00 8007699 403658 \n", + "1826671 2023-11-09 07:51:34.940680+01:00 8007699 403658 \n", + "\n", + " is_from_subscription type_of supplier_id barcode \\\n", + "0 False 1 3 NaN \n", + "1 False 1 3 NaN \n", + "2 False 1 3 NaN \n", + "3 False 1 3 NaN \n", + "4 False 1 3 NaN \n", + "... ... ... ... ... 
\n", + "1826667 False 1 3 NaN \n", + "1826668 False 1 3 NaN \n", + "1826669 False 1 3 NaN \n", + "1826670 False 1 3 NaN \n", + "1826671 False 1 3 NaN \n", + "\n", + " identifier \n", + "0 b6ad7fc36f33b5e05f58c7fca06688a6 \n", + "1 b0903af480266f27802fe5c38c277c9e \n", + "2 64ca12b7e26a65b90335c0702ea0faba \n", + "3 5ac2f8150aa9f3a6b1599df08cc2f0c7 \n", + "4 dfe30081bae020d12094279926136b9c \n", + "... ... \n", + "1826667 dba9aa428f843b79ae69dfacfe8fc579 \n", + "1826668 93f1fcfc6ba4fa68f92eb4b4a619fcf0 \n", + "1826669 c8bbbd25df2c158767ceef42c3237f23 \n", + "1826670 738f0a8b5088b5056bc3b32eff2dca1f \n", + "1826671 4c5a6195434377380b4e6ae63b2e9cf6 \n", + "\n", + "[1826672 rows x 11 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_tickets" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f3c35394-b586-4ae4-b5ab-b03bb01bb618", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 1826672 entries, 0 to 1826671\n", + "Data columns (total 11 columns):\n", + " # Column Dtype \n", + "--- ------ ----- \n", + " 0 id int64 \n", + " 1 number object \n", + " 2 created_at object \n", + " 3 updated_at object \n", + " 4 purchase_id int64 \n", + " 5 product_id int64 \n", + " 6 is_from_subscription bool \n", + " 7 type_of int64 \n", + " 8 supplier_id int64 \n", + " 9 barcode float64\n", + " 10 identifier object \n", + "dtypes: bool(1), float64(1), int64(5), object(4)\n", + "memory usage: 141.1+ MB\n" + ] + } + ], + "source": [ + "df1_tickets.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "c1b42769-03c7-4785-92ce-5e1e6b41908d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id 0.0\n", + "number 0.0\n", + "created_at 0.0\n", + "updated_at 0.0\n", + "purchase_id 0.0\n", + "product_id 0.0\n", + "is_from_subscription 0.0\n", + "type_of 0.0\n", + "supplier_id 0.0\n", + "barcode 
100.0\n", + "identifier 0.0\n", + "dtype: float64" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_tickets.isna().sum()/len(df1_tickets)*100" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "42896791-2d93-4725-a50b-6c7cbe535ec7", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_619/232847087.py:3: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df1_tickets_clean.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n" + ] + } + ], + "source": [ + "# Selection des variables\n", + "df1_tickets_clean = df1_tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]\n", + "df1_tickets_clean.rename(columns = {'id' : 'ticket_id'}, inplace = True)" + ] + }, + { + "cell_type": "markdown", + "id": "78453f3c-4f89-44ed-a6c6-2a7443b72b52", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "## suppliers.csv" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "2e0dada0-9457-484c-aa55-77e44613ecca", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamemanually_addedlabelitrupdated_atcreated_atcommissionidentifier
01617j4 administrationFalseNaNNaN2021-07-29 09:21:37.325772+02:002021-07-29 09:21:37.325772+02:00NaN5958b2a060ac3e31678b438892a1bd2e
18non définiFalseNaNNaN2020-09-03 13:16:35.329062+02:002020-09-03 13:16:35.329062+02:00NaN52ff3466787b4d538407372e5f7afe0f
24vadFalseNaNNaN2020-09-03 13:11:23.896992+02:002020-09-03 13:11:23.896992+02:00NaN1225483c97b36018cab2bea14ab78ea6
31fort saint jeanFalseNaNNaN2020-09-03 13:11:23.833073+02:002020-09-03 13:11:23.833073+02:00NaN001b9b4a524fe407150b8235b304d4ec
42j4FalseNaNNaN2020-09-03 13:11:23.888993+02:002020-09-03 13:11:23.888993+02:00NaN6a0cf6edf20060344b465706b61719aa
55revendeurFalseNaNNaN2020-09-03 13:11:23.900987+02:002020-09-03 13:11:23.900987+02:00NaN931239d4acb6214d7e5c98edecfb4916
63vente en ligneFalseNaNNaN2020-09-03 13:11:23.893097+02:002020-09-03 13:11:23.893097+02:00NaNbde8f2ccff510df8572d3214d86b837d
76ccrFalseNaNNaN2020-09-03 13:11:23.904974+02:002020-09-03 13:11:23.904974+02:00NaNb48ec279411f7dbbb68393c61a9724d9
87dabFalseNaNNaN2020-09-03 13:11:23.908970+02:002020-09-03 13:11:23.908970+02:00NaN11c6d471fa4e354e62e684d293694202
\n", + "
" + ], + "text/plain": [ + " id name manually_added label itr \\\n", + "0 1617 j4 administration False NaN NaN \n", + "1 8 non défini False NaN NaN \n", + "2 4 vad False NaN NaN \n", + "3 1 fort saint jean False NaN NaN \n", + "4 2 j4 False NaN NaN \n", + "5 5 revendeur False NaN NaN \n", + "6 3 vente en ligne False NaN NaN \n", + "7 6 ccr False NaN NaN \n", + "8 7 dab False NaN NaN \n", + "\n", + " updated_at created_at \\\n", + "0 2021-07-29 09:21:37.325772+02:00 2021-07-29 09:21:37.325772+02:00 \n", + "1 2020-09-03 13:16:35.329062+02:00 2020-09-03 13:16:35.329062+02:00 \n", + "2 2020-09-03 13:11:23.896992+02:00 2020-09-03 13:11:23.896992+02:00 \n", + "3 2020-09-03 13:11:23.833073+02:00 2020-09-03 13:11:23.833073+02:00 \n", + "4 2020-09-03 13:11:23.888993+02:00 2020-09-03 13:11:23.888993+02:00 \n", + "5 2020-09-03 13:11:23.900987+02:00 2020-09-03 13:11:23.900987+02:00 \n", + "6 2020-09-03 13:11:23.893097+02:00 2020-09-03 13:11:23.893097+02:00 \n", + "7 2020-09-03 13:11:23.904974+02:00 2020-09-03 13:11:23.904974+02:00 \n", + "8 2020-09-03 13:11:23.908970+02:00 2020-09-03 13:11:23.908970+02:00 \n", + "\n", + " commission identifier \n", + "0 NaN 5958b2a060ac3e31678b438892a1bd2e \n", + "1 NaN 52ff3466787b4d538407372e5f7afe0f \n", + "2 NaN 1225483c97b36018cab2bea14ab78ea6 \n", + "3 NaN 001b9b4a524fe407150b8235b304d4ec \n", + "4 NaN 6a0cf6edf20060344b465706b61719aa \n", + "5 NaN 931239d4acb6214d7e5c98edecfb4916 \n", + "6 NaN bde8f2ccff510df8572d3214d86b837d \n", + "7 NaN b48ec279411f7dbbb68393c61a9724d9 \n", + "8 NaN 11c6d471fa4e354e62e684d293694202 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_suppliers" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "b583be02-ab60-4e14-9325-0204f203a1af", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 9 entries, 0 to 8\n", + "Data columns (total 9 columns):\n", + " # 
Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 9 non-null int64 \n", + " 1 name 9 non-null object \n", + " 2 manually_added 9 non-null bool \n", + " 3 label 0 non-null float64\n", + " 4 itr 0 non-null float64\n", + " 5 updated_at 9 non-null object \n", + " 6 created_at 9 non-null object \n", + " 7 commission 0 non-null float64\n", + " 8 identifier 9 non-null object \n", + "dtypes: bool(1), float64(3), int64(1), object(4)\n", + "memory usage: 713.0+ bytes\n" + ] + } + ], + "source": [ + "df1_suppliers.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "6d7f338e-e4d3-422b-9cdc-dec967c0b28e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id 0.0\n", + "name 0.0\n", + "manually_added 0.0\n", + "label 100.0\n", + "itr 100.0\n", + "updated_at 0.0\n", + "created_at 0.0\n", + "commission 100.0\n", + "identifier 0.0\n", + "dtype: float64" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_suppliers.isna().sum()/len(df1_suppliers)*100" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "3c645ab7-16bf-4054-9ae2-15a8c32e29c6", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_619/302783287.py:3: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df1_suppliers_clean.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n" + ] + } + ], + "source": [ + "# Selection des variables\n", + "df1_suppliers_clean = df1_suppliers[['id', 'name']]\n", + "df1_suppliers_clean.rename(columns = {'name' : 'supplier_name'}, inplace = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "4de7e2e2-6da4-4618-8444-b524399c5493", + 
"metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idsupplier_name
01617j4 administration
18non défini
24vad
31fort saint jean
42j4
55revendeur
63vente en ligne
76ccr
87dab
\n", + "
" + ], + "text/plain": [ + " id supplier_name\n", + "0 1617 j4 administration\n", + "1 8 non défini\n", + "2 4 vad\n", + "3 1 fort saint jean\n", + "4 2 j4\n", + "5 5 revendeur\n", + "6 3 vente en ligne\n", + "7 6 ccr\n", + "8 7 dab" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_suppliers_clean" + ] + }, + { + "cell_type": "markdown", + "id": "0a6df975-c7fc-45bc-92af-a0bdab17d795", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "## type_ofs.csv" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "a02f6594-3e91-4e87-bbb6-649c28d4f7e9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamechildrencreated_atupdated_atidentifier
01Atelierpricing_formula2021-01-05 11:55:51.188106+01:002021-01-05 11:55:51.188106+01:00623ec4067827558b28972cf39fe81ee7
12Billet en nombrepricing_formula2021-01-11 12:13:19.286301+01:002021-01-11 12:13:19.286301+01:00a53d313a97296ee37caa066dbfe7a45c
23Groupepricing_formula2021-01-11 12:19:22.842917+01:002021-01-11 12:19:22.842917+01:001ab143efc3b85acbbc752fe8eb2b0b86
34Revendeurpricing_formula2021-01-12 12:34:20.481236+01:002021-01-12 12:34:20.481236+01:008b332723366a07e1eef5f1c92f9ae067
45Cinéma scolairepricing_formula2021-01-25 19:16:05.141719+01:002021-01-25 19:16:05.141719+01:00a12e62cb4c4f47e7406bd8fbff2bfe30
56Musée famillepricing_formula2021-01-25 19:23:06.692627+01:002021-01-25 19:23:06.692627+01:001ec6c19283111ccb3ed67f52d414470e
67Spectacle famillepricing_formula2021-01-25 19:28:21.390016+01:002021-01-25 19:28:21.390016+01:0005e2104f1b74ced229c06847d6e91938
78Masterclasspricing_formula2021-01-25 19:31:05.076904+01:002021-01-25 19:31:05.076904+01:009cc946edfb25e11b4282f58db16e6ae9
89Spectaclepricing_formula2021-01-25 19:38:41.260535+01:002021-01-25 19:38:41.260535+01:00d88321c347f0e0ab101184cdf25c94bf
910Cinemapricing_formula2021-02-05 11:12:31.932576+01:002021-02-05 11:12:31.932576+01:000870fef2bfcd5b30a12e4f5c7f4aaba7
1011Museepricing_formula2021-02-05 11:52:05.468207+01:002021-02-05 11:52:05.468207+01:008ba8934454cc62c7cdb3eb6e1b39df0c
1112Tarifs pleincategory2023-03-13 11:31:50.528331+01:002023-03-13 11:31:50.528331+01:00a6969df76efc15d157be48e87a7bcf9a
\n", + "
" + ], + "text/plain": [ + " id name children created_at \\\n", + "0 1 Atelier pricing_formula 2021-01-05 11:55:51.188106+01:00 \n", + "1 2 Billet en nombre pricing_formula 2021-01-11 12:13:19.286301+01:00 \n", + "2 3 Groupe pricing_formula 2021-01-11 12:19:22.842917+01:00 \n", + "3 4 Revendeur pricing_formula 2021-01-12 12:34:20.481236+01:00 \n", + "4 5 Cinéma scolaire pricing_formula 2021-01-25 19:16:05.141719+01:00 \n", + "5 6 Musée famille pricing_formula 2021-01-25 19:23:06.692627+01:00 \n", + "6 7 Spectacle famille pricing_formula 2021-01-25 19:28:21.390016+01:00 \n", + "7 8 Masterclass pricing_formula 2021-01-25 19:31:05.076904+01:00 \n", + "8 9 Spectacle pricing_formula 2021-01-25 19:38:41.260535+01:00 \n", + "9 10 Cinema pricing_formula 2021-02-05 11:12:31.932576+01:00 \n", + "10 11 Musee pricing_formula 2021-02-05 11:52:05.468207+01:00 \n", + "11 12 Tarifs plein category 2023-03-13 11:31:50.528331+01:00 \n", + "\n", + " updated_at identifier \n", + "0 2021-01-05 11:55:51.188106+01:00 623ec4067827558b28972cf39fe81ee7 \n", + "1 2021-01-11 12:13:19.286301+01:00 a53d313a97296ee37caa066dbfe7a45c \n", + "2 2021-01-11 12:19:22.842917+01:00 1ab143efc3b85acbbc752fe8eb2b0b86 \n", + "3 2021-01-12 12:34:20.481236+01:00 8b332723366a07e1eef5f1c92f9ae067 \n", + "4 2021-01-25 19:16:05.141719+01:00 a12e62cb4c4f47e7406bd8fbff2bfe30 \n", + "5 2021-01-25 19:23:06.692627+01:00 1ec6c19283111ccb3ed67f52d414470e \n", + "6 2021-01-25 19:28:21.390016+01:00 05e2104f1b74ced229c06847d6e91938 \n", + "7 2021-01-25 19:31:05.076904+01:00 9cc946edfb25e11b4282f58db16e6ae9 \n", + "8 2021-01-25 19:38:41.260535+01:00 d88321c347f0e0ab101184cdf25c94bf \n", + "9 2021-02-05 11:12:31.932576+01:00 0870fef2bfcd5b30a12e4f5c7f4aaba7 \n", + "10 2021-02-05 11:52:05.468207+01:00 8ba8934454cc62c7cdb3eb6e1b39df0c \n", + "11 2023-03-13 11:31:50.528331+01:00 a6969df76efc15d157be48e87a7bcf9a " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"df1_type_ofs" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "e9c8d32b-22f4-4581-8af7-31cc1c31fa0e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 12 entries, 0 to 11\n", + "Data columns (total 6 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 12 non-null int64 \n", + " 1 name 12 non-null object\n", + " 2 children 12 non-null object\n", + " 3 created_at 12 non-null object\n", + " 4 updated_at 12 non-null object\n", + " 5 identifier 12 non-null object\n", + "dtypes: int64(1), object(5)\n", + "memory usage: 704.0+ bytes\n" + ] + } + ], + "source": [ + "df1_type_ofs.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "cbb5e614-1fe5-4da0-bca0-8a242e0885da", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_619/81842251.py:3: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df1_type_ofs_clean.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)\n" + ] + } + ], + "source": [ + "# Selection des variables\n", + "df1_type_ofs_clean = df1_type_ofs[['id', 'name', 'children']]\n", + "df1_type_ofs_clean.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)" + ] + }, + { + "cell_type": "markdown", + "id": "676a9869-9a8b-4cd2-8b1c-0644b5229c72", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "## purchases.csv" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "f8d36b72-f8e7-45e5-b4fa-e0803493fd3c", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idpurchase_datecustomer_idcreated_atupdated_atnumberidentifier
051456622019-07-17 11:17:53+02:0066322021-12-28 20:48:51.569237+01:002021-12-28 20:48:51.569237+01:00fa80c83b29a268b45728c910a8afcf7982877c41df26f832eb823a83acd1a172
149416422018-10-31 11:59:00+01:0012021-12-28 20:31:48.196681+01:002022-03-03 17:52:21.958861+01:00597b6c06adfe6acc539b29b657b80da0e7102ebe65526c427245533ebabe66e5
250888602018-10-31 12:45:12+01:0012021-12-28 20:46:34.703542+01:002021-12-28 20:46:34.703542+01:004a7f6baaf9be6a99e3fead7f7e981fa8af75c4ae53d1b6957875538355b162e1
350888622018-10-31 13:07:12+01:0012021-12-28 20:46:34.704773+01:002021-12-28 20:46:34.704773+01:001d83dfad44b73070d1c6d5875d0edd2d4b2fe34659b177209b07270ae1043b40
450888632018-10-31 13:08:50+01:0012021-12-28 20:46:34.705453+01:002021-12-28 20:46:34.705453+01:007bfe2bc9c1670c973d0960e3fd408cf8b115f04a99b94df9e4a32185844f0998
........................
74224580076952023-11-08 17:51:19+01:0012561332023-11-09 07:51:33.920187+01:002023-11-09 07:51:33.920187+01:0099ad774dedbad43feb73514765d2f0bad68558180b4bf2e8a945724843655775
74224680076962023-11-08 18:17:51+01:0012561342023-11-09 07:51:33.921967+01:002023-11-09 07:51:33.921967+01:00c1511614c511c5f95980172690179102f5102d910a7731091f239ad7b0df35b4
74224780076972023-11-08 18:23:54+01:0012561352023-11-09 07:51:33.923034+01:002023-11-09 07:51:33.923034+01:0033b64b39cc53428b4f17d65ff5b93104e2b917626be60cc2c3207cc037fe69e4
74224880076982023-11-08 19:32:18+01:0012561362023-11-09 07:51:33.924135+01:002023-11-09 07:51:33.924135+01:009ae0b129e704b3d9c093ce9c7c4e50395bfa23236c31f8562c3a0233c1b53b31
74224980076992023-11-08 20:30:28+01:0012561372023-11-09 07:51:33.925382+01:002023-11-09 07:51:33.925382+01:00d31ced089c2b1f90479257a4686f9306d86b1e0de3ff01eaf04fbcd031ac5fef
\n", + "

742250 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " id purchase_date customer_id \\\n", + "0 5145662 2019-07-17 11:17:53+02:00 6632 \n", + "1 4941642 2018-10-31 11:59:00+01:00 1 \n", + "2 5088860 2018-10-31 12:45:12+01:00 1 \n", + "3 5088862 2018-10-31 13:07:12+01:00 1 \n", + "4 5088863 2018-10-31 13:08:50+01:00 1 \n", + "... ... ... ... \n", + "742245 8007695 2023-11-08 17:51:19+01:00 1256133 \n", + "742246 8007696 2023-11-08 18:17:51+01:00 1256134 \n", + "742247 8007697 2023-11-08 18:23:54+01:00 1256135 \n", + "742248 8007698 2023-11-08 19:32:18+01:00 1256136 \n", + "742249 8007699 2023-11-08 20:30:28+01:00 1256137 \n", + "\n", + " created_at updated_at \\\n", + "0 2021-12-28 20:48:51.569237+01:00 2021-12-28 20:48:51.569237+01:00 \n", + "1 2021-12-28 20:31:48.196681+01:00 2022-03-03 17:52:21.958861+01:00 \n", + "2 2021-12-28 20:46:34.703542+01:00 2021-12-28 20:46:34.703542+01:00 \n", + "3 2021-12-28 20:46:34.704773+01:00 2021-12-28 20:46:34.704773+01:00 \n", + "4 2021-12-28 20:46:34.705453+01:00 2021-12-28 20:46:34.705453+01:00 \n", + "... ... ... \n", + "742245 2023-11-09 07:51:33.920187+01:00 2023-11-09 07:51:33.920187+01:00 \n", + "742246 2023-11-09 07:51:33.921967+01:00 2023-11-09 07:51:33.921967+01:00 \n", + "742247 2023-11-09 07:51:33.923034+01:00 2023-11-09 07:51:33.923034+01:00 \n", + "742248 2023-11-09 07:51:33.924135+01:00 2023-11-09 07:51:33.924135+01:00 \n", + "742249 2023-11-09 07:51:33.925382+01:00 2023-11-09 07:51:33.925382+01:00 \n", + "\n", + " number identifier \n", + "0 fa80c83b29a268b45728c910a8afcf79 82877c41df26f832eb823a83acd1a172 \n", + "1 597b6c06adfe6acc539b29b657b80da0 e7102ebe65526c427245533ebabe66e5 \n", + "2 4a7f6baaf9be6a99e3fead7f7e981fa8 af75c4ae53d1b6957875538355b162e1 \n", + "3 1d83dfad44b73070d1c6d5875d0edd2d 4b2fe34659b177209b07270ae1043b40 \n", + "4 7bfe2bc9c1670c973d0960e3fd408cf8 b115f04a99b94df9e4a32185844f0998 \n", + "... ... ... 
\n", + "742245 99ad774dedbad43feb73514765d2f0ba d68558180b4bf2e8a945724843655775 \n", + "742246 c1511614c511c5f95980172690179102 f5102d910a7731091f239ad7b0df35b4 \n", + "742247 33b64b39cc53428b4f17d65ff5b93104 e2b917626be60cc2c3207cc037fe69e4 \n", + "742248 9ae0b129e704b3d9c093ce9c7c4e5039 5bfa23236c31f8562c3a0233c1b53b31 \n", + "742249 d31ced089c2b1f90479257a4686f9306 d86b1e0de3ff01eaf04fbcd031ac5fef \n", + "\n", + "[742250 rows x 7 columns]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_purchases" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "3f266a9d-6eee-4b27-b6cc-d401bc2fa0b8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 742250 entries, 0 to 742249\n", + "Data columns (total 7 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 742250 non-null int64 \n", + " 1 purchase_date 742250 non-null object\n", + " 2 customer_id 742250 non-null int64 \n", + " 3 created_at 742250 non-null object\n", + " 4 updated_at 742250 non-null object\n", + " 5 number 742250 non-null object\n", + " 6 identifier 742250 non-null object\n", + "dtypes: int64(2), object(5)\n", + "memory usage: 39.6+ MB\n" + ] + } + ], + "source": [ + "df1_purchases.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "8b24ccbc-ccf0-4722-8cd9-8ee8aa90d1fd", + "metadata": {}, + "outputs": [], + "source": [ + "# Nettoyage purchase_date\n", + "df1_purchases['purchase_date'] = pd.to_datetime(df1_purchases['purchase_date'], utc = True)\n", + "df1_purchases['purchase_date'] = pd.to_datetime(df1_purchases['purchase_date'], format = 'ISO8601')" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "27d18584-228f-4698-85d6-4d23151ea5ed", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 742250 
entries, 0 to 742249\n", + "Data columns (total 7 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 742250 non-null int64 \n", + " 1 purchase_date 742250 non-null datetime64[ns, UTC]\n", + " 2 customer_id 742250 non-null int64 \n", + " 3 created_at 742250 non-null object \n", + " 4 updated_at 742250 non-null object \n", + " 5 number 742250 non-null object \n", + " 6 identifier 742250 non-null object \n", + "dtypes: datetime64[ns, UTC](1), int64(2), object(4)\n", + "memory usage: 39.6+ MB\n" + ] + } + ], + "source": [ + "df1_purchases.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "ea22e3a2-2b25-481d-8ebc-194e11a06cd9", + "metadata": {}, + "outputs": [], + "source": [ + "# Selection des variables\n", + "df1_purchases_clean = df1_purchases[['id', 'purchase_date', 'customer_id']]" + ] + }, + { + "cell_type": "markdown", + "id": "53227600-c1c5-48aa-9f5d-db5a23a8a22a", + "metadata": {}, + "source": [ + "## Fusion de l'ensemble des données billétiques" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "e0b8b47a-b321-4a79-823c-36a131a78ac7", + "metadata": {}, + "outputs": [], + "source": [ + "# Fusion avec fournisseurs\n", + "df1_ticket_information = pd.merge(df1_tickets_clean, df1_suppliers_clean, left_on = 'supplier_id', right_on = 'id', how = 'inner')\n", + "df1_ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)\n", + "\n", + "# Fusion avec type de tickets\n", + "df1_ticket_information = pd.merge(df1_ticket_information, df1_type_ofs_clean, left_on = 'type_of', right_on = 'id', how = 'inner')\n", + "df1_ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)\n", + "\n", + "# Fusion avec achats\n", + "df1_ticket_information = pd.merge(df1_ticket_information, df1_purchases_clean, left_on = 'purchase_id', right_on = 'id', how = 'inner')\n", + "df1_ticket_information.drop(['purchase_id', 'id'], axis = 1, inplace=True)" + ] + }, + { + 
"cell_type": "code", + "execution_count": 24, + "id": "7572e6e7-f28d-43ba-b045-b9fa09e68e1d", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ticket_idproduct_idis_from_subscriptionsupplier_nametype_of_ticket_namechildrenpurchase_datecustomer_id
013070859225251Falsevente en ligneAtelierpricing_formula2018-12-28 14:47:50+00:0048187
113070860224914Falsevente en ligneAtelierpricing_formula2018-12-28 14:47:50+00:0048187
213070861224914Falsevente en ligneAtelierpricing_formula2018-12-28 14:47:50+00:0048187
313070862224914Falsevente en ligneAtelierpricing_formula2018-12-28 14:47:50+00:0048187
413070863224914Falsevente en ligneAtelierpricing_formula2018-12-28 14:47:50+00:0048187
...........................
182666720662815405689Falsevente en ligneAtelierpricing_formula2023-11-08 17:23:54+00:001256135
182666820662816403658Falsevente en ligneAtelierpricing_formula2023-11-08 18:32:18+00:001256136
182666920662817403658Falsevente en ligneAtelierpricing_formula2023-11-08 18:32:18+00:001256136
182667020662818403658Falsevente en ligneAtelierpricing_formula2023-11-08 19:30:28+00:001256137
182667120662819403658Falsevente en ligneAtelierpricing_formula2023-11-08 19:30:28+00:001256137
\n", + "

1826672 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " ticket_id product_id is_from_subscription supplier_name \\\n", + "0 13070859 225251 False vente en ligne \n", + "1 13070860 224914 False vente en ligne \n", + "2 13070861 224914 False vente en ligne \n", + "3 13070862 224914 False vente en ligne \n", + "4 13070863 224914 False vente en ligne \n", + "... ... ... ... ... \n", + "1826667 20662815 405689 False vente en ligne \n", + "1826668 20662816 403658 False vente en ligne \n", + "1826669 20662817 403658 False vente en ligne \n", + "1826670 20662818 403658 False vente en ligne \n", + "1826671 20662819 403658 False vente en ligne \n", + "\n", + " type_of_ticket_name children purchase_date \\\n", + "0 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n", + "1 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n", + "2 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n", + "3 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n", + "4 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n", + "... ... ... ... \n", + "1826667 Atelier pricing_formula 2023-11-08 17:23:54+00:00 \n", + "1826668 Atelier pricing_formula 2023-11-08 18:32:18+00:00 \n", + "1826669 Atelier pricing_formula 2023-11-08 18:32:18+00:00 \n", + "1826670 Atelier pricing_formula 2023-11-08 19:30:28+00:00 \n", + "1826671 Atelier pricing_formula 2023-11-08 19:30:28+00:00 \n", + "\n", + " customer_id \n", + "0 48187 \n", + "1 48187 \n", + "2 48187 \n", + "3 48187 \n", + "4 48187 \n", + "... ... 
\n", + "1826667 1256135 \n", + "1826668 1256136 \n", + "1826669 1256136 \n", + "1826670 1256137 \n", + "1826671 1256137 \n", + "\n", + "[1826672 rows x 8 columns]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_ticket_information" + ] + }, + { + "cell_type": "markdown", + "id": "ad2d0059-76d3-44b9-b0eb-0b0ca4d4ba75", + "metadata": {}, + "source": [ + "# Utilisation de fonctions" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "d237be96-8c86-4a91-b7a1-487e87a16c3d", + "metadata": {}, + "outputs": [], + "source": [ + "def cleaning_date(df, column_name):\n", + " \"\"\"\n", + " Nettoie la colonne spécifiée du DataFrame en convertissant les valeurs en datetime avec le format ISO8601.\n", + "\n", + " Parameters:\n", + " - df: DataFrame\n", + " Le DataFrame contenant la colonne à nettoyer.\n", + " - column_name: str\n", + " Le nom de la colonne à nettoyer.\n", + "\n", + " Returns:\n", + " - DataFrame\n", + " Le DataFrame modifié avec la colonne nettoyée.\n", + " \"\"\"\n", + " df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "c1afe322-ff41-4760-819e-0195fed5b27d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 20 entries, 0 to 19\n", + "Data columns (total 2 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 opened_at 8 non-null object \n", + " 1 opened_at_clean 8 non-null datetime64[ns, UTC]\n", + "dtypes: datetime64[ns, UTC](1), object(1)\n", + "memory usage: 448.0+ bytes\n" + ] + } + ], + "source": [ + "# Créer un DataFrame exemple\n", + "df_not_clean = df1_campaign_stats[['opened_at']].head(20)\n", + "\n", + "# Appliquer la fonction pour nettoyer la colonne 'purchase_date' de manière vectorisée\n", + "df_clean = cleaning_date(df_not_clean, 
'opened_at')\n", + "df_clean.rename(columns = {'opened_at' : 'opened_at_clean'}, inplace = True)\n", + "\n", + "test = pd.concat([df1_campaign_stats[['opened_at']].head(20), df_clean], axis=1)\n", + "\n", + "test.info()" + ] + }, + { + "cell_type": "markdown", + "id": "27ecf058-23eb-4018-abbd-68c4ebe7c786", + "metadata": {}, + "source": [ + "## Nettoyage, selection et fusion" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "d887898c-6a21-41ed-901d-4d6fdbca5372", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ticket_idproduct_idis_from_subscriptiontype_ofsupplier_namepurchase_datecustomer_id
013070859225251False1vente en ligne2018-12-28 14:47:50+00:0048187
113070860224914False1vente en ligne2018-12-28 14:47:50+00:0048187
213070861224914False1vente en ligne2018-12-28 14:47:50+00:0048187
313070862224914False1vente en ligne2018-12-28 14:47:50+00:0048187
413070863224914False1vente en ligne2018-12-28 14:47:50+00:0048187
........................
182666720662815405689False1vente en ligne2023-11-08 17:23:54+00:001256135
182666820662816403658False1vente en ligne2023-11-08 18:32:18+00:001256136
182666920662817403658False1vente en ligne2023-11-08 18:32:18+00:001256136
182667020662818403658False1vente en ligne2023-11-08 19:30:28+00:001256137
182667120662819403658False1vente en ligne2023-11-08 19:30:28+00:001256137
\n", + "

1826672 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " ticket_id product_id is_from_subscription type_of supplier_name \\\n", + "0 13070859 225251 False 1 vente en ligne \n", + "1 13070860 224914 False 1 vente en ligne \n", + "2 13070861 224914 False 1 vente en ligne \n", + "3 13070862 224914 False 1 vente en ligne \n", + "4 13070863 224914 False 1 vente en ligne \n", + "... ... ... ... ... ... \n", + "1826667 20662815 405689 False 1 vente en ligne \n", + "1826668 20662816 403658 False 1 vente en ligne \n", + "1826669 20662817 403658 False 1 vente en ligne \n", + "1826670 20662818 403658 False 1 vente en ligne \n", + "1826671 20662819 403658 False 1 vente en ligne \n", + "\n", + " purchase_date customer_id \n", + "0 2018-12-28 14:47:50+00:00 48187 \n", + "1 2018-12-28 14:47:50+00:00 48187 \n", + "2 2018-12-28 14:47:50+00:00 48187 \n", + "3 2018-12-28 14:47:50+00:00 48187 \n", + "4 2018-12-28 14:47:50+00:00 48187 \n", + "... ... ... \n", + "1826667 2023-11-08 17:23:54+00:00 1256135 \n", + "1826668 2023-11-08 18:32:18+00:00 1256136 \n", + "1826669 2023-11-08 18:32:18+00:00 1256136 \n", + "1826670 2023-11-08 19:30:28+00:00 1256137 \n", + "1826671 2023-11-08 19:30:28+00:00 1256137 \n", + "\n", + "[1826672 rows x 7 columns]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_ticket_information" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "ac9a6373-c1c6-46b5-873b-dc22f17bcbdb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 1826672 entries, 0 to 1826671\n", + "Data columns (total 7 columns):\n", + " # Column Dtype \n", + "--- ------ ----- \n", + " 0 ticket_id int64 \n", + " 1 product_id int64 \n", + " 2 is_from_subscription bool \n", + " 3 type_of int64 \n", + " 4 supplier_name object \n", + " 5 purchase_date datetime64[ns, UTC]\n", + " 6 customer_id int64 \n", + "dtypes: bool(1), datetime64[ns, UTC](1), int64(4), object(1)\n", + "memory 
usage: 85.4+ MB\n" + ] + } + ], + "source": [ + "df1_ticket_information.info()" + ] + }, + { + "cell_type": "markdown", + "id": "b1719943-89eb-4ba0-a107-2f96d5d01ec9", + "metadata": {}, + "source": [ + "# Customer information" + ] + }, + { + "cell_type": "markdown", + "id": "a2132ee2-3f22-45fd-b65b-72689c8b672c", + "metadata": {}, + "source": [ + "## Target area" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "da5d4708-7147-4cc8-8686-52d4bcba5a7a", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_619/2625134041.py:3: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df1_targets_clean.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n" + ] + } + ], + "source": [ + "# Target.csv cleaning\n", + "df1_targets_clean = df1_targets[[\"id\", \"target_type_id\", \"name\"]]\n", + "df1_targets_clean.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n", + "\n", + "# target_type cleaning\n", + "df1_target_types_clean = df1_target_types[[\"id\",\"is_import\",\"name\"]].add_prefix(\"target_type_\")\n", + "\n", + "#customer_target_mappings cleaning\n", + "df1_customer_target_mappings_clean = df1_customer_target_mappings[[\"id\", \"customer_id\", \"target_id\"]]\n", + "\n", + "# Merge target et target_type\n", + "df1_targets_full = pd.merge(df1_targets_clean, df1_target_types_clean, left_on='target_type_id', right_on='target_type_id', how='inner')\n", + "df1_targets_full.drop(['target_type_id'], axis = 1, inplace=True)\n", + "\n", + "# Merge\n", + "df1_targets_full = pd.merge(df1_customer_target_mappings_clean, df1_targets_full, left_on='target_id', right_on='target_id', how='inner')\n", + "df1_targets_full.drop(['target_id'], axis 
= 1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "b4fa5fe3-ce8e-4b0a-af94-fb468d241bad", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id 5.080902\n", + "dtype: float64" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_targets_test = df1_targets_full[['id', 'customer_id']].groupby(['customer_id']).count()\n", + "len(df1_targets_test[df1_targets_test['id'] > 1]) / len(df1_targets_test)\n", + "\n", + "# 99,6% des 151 000 client visés sont catégorisés plusieurs fois et en moyenne 5 fois... \n", + "df1_targets_test.mean()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "8072bbb7-1360-4882-bb2b-2f43b6beea0d", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idcustomer_idtarget_nametarget_type_is_importtarget_type_name
879345845991consentement optin jeune publicFalsemanual_static_filter
1324945674651DDCP rentrée culturelle 2023Falsemanual_static_filter
2142445448051spectateurs cine dimanche_cine concert_2122Falsemanual_static_filter
2166545449111DDCP Cine 2023Falsemanual_static_filter
2281145457661DDCP OLBJ! 2023Falsemanual_static_filter
5730544579091ddcp_promo_visiteurs occasionnels_musee_8moisFalsemanual_dynamic_filter
5884336888721DDCP promo livemagFalsemanual_static_filter
6681343136461DDCP spectateurs Classique mais pas que 2022Falsemanual_static_filter
6836745476621ddcp_promo_musee_au moins 3 achats_dps8moisFalsemanual_dynamic_filter
7732042855201DDCP spectateurs IminenteFalsemanual_static_filter
8435040378051DDCP spectateurs Marseille Jazz 18-19-21Falsemanual_static_filter
8538345695041DDCP rendez-vous de septembre offre spécialeFalsemanual_static_filter
9286844330641ddcp_promo_plein air_ateliers_jardinsFalsemanual_static_filter
9967038586841Acid ArabFalsemanual_static_filter
10547743218101Arenametrix_bascule tel vers sibFalsemanual_static_filter
16951336979921ddcp_achats billets nb dps 19052021Falsemanual_static_filter
21442129253241consentement optout scolairesFalsemanual_static_filter
23454645759571Portrait de Leila shahidFalsemanual_static_filter
25980837222591consentement optin b2bFalsemanual_static_filter
27438045104231DDCP_marseille_jazz_2023Falsemanual_static_filter
30751151744661ddcp actoral 21-22Falsemanual_static_filter
35750944425261ddcp musique barvaloFalsemanual_static_filter
39292043906421ddcp_md_promo_spectateurs theatre contempoFalsemanual_static_filter
44962044118971FORMATION _ acheteurs optin last yearFalsemanual_dynamic_filter
50380947345911consentement optin mediation specialiseeFalsemanual_static_filter
65122235544261consentement optin b2cFalsemanual_static_filter
65424651822121DDCP spectateurs Festival de Marseille 2023Falsemanual_static_filter
65439551824561rencontres_echelle_spectateurs_2021_2023Falsemanual_static_filter
\n", + "
" + ], + "text/plain": [ + " id customer_id target_name \\\n", + "8793 4584599 1 consentement optin jeune public \n", + "13249 4567465 1 DDCP rentrée culturelle 2023 \n", + "21424 4544805 1 spectateurs cine dimanche_cine concert_2122 \n", + "21665 4544911 1 DDCP Cine 2023 \n", + "22811 4545766 1 DDCP OLBJ! 2023 \n", + "57305 4457909 1 ddcp_promo_visiteurs occasionnels_musee_8mois \n", + "58843 3688872 1 DDCP promo livemag \n", + "66813 4313646 1 DDCP spectateurs Classique mais pas que 2022 \n", + "68367 4547662 1 ddcp_promo_musee_au moins 3 achats_dps8mois \n", + "77320 4285520 1 DDCP spectateurs Iminente \n", + "84350 4037805 1 DDCP spectateurs Marseille Jazz 18-19-21 \n", + "85383 4569504 1 DDCP rendez-vous de septembre offre spéciale \n", + "92868 4433064 1 ddcp_promo_plein air_ateliers_jardins \n", + "99670 3858684 1 Acid Arab \n", + "105477 4321810 1 Arenametrix_bascule tel vers sib \n", + "169513 3697992 1 ddcp_achats billets nb dps 19052021 \n", + "214421 2925324 1 consentement optout scolaires \n", + "234546 4575957 1 Portrait de Leila shahid \n", + "259808 3722259 1 consentement optin b2b \n", + "274380 4510423 1 DDCP_marseille_jazz_2023 \n", + "307511 5174466 1 ddcp actoral 21-22 \n", + "357509 4442526 1 ddcp musique barvalo \n", + "392920 4390642 1 ddcp_md_promo_spectateurs theatre contempo \n", + "449620 4411897 1 FORMATION _ acheteurs optin last year \n", + "503809 4734591 1 consentement optin mediation specialisee \n", + "651222 3554426 1 consentement optin b2c \n", + "654246 5182212 1 DDCP spectateurs Festival de Marseille 2023 \n", + "654395 5182456 1 rencontres_echelle_spectateurs_2021_2023 \n", + "\n", + " target_type_is_import target_type_name \n", + "8793 False manual_static_filter \n", + "13249 False manual_static_filter \n", + "21424 False manual_static_filter \n", + "21665 False manual_static_filter \n", + "22811 False manual_static_filter \n", + "57305 False manual_dynamic_filter \n", + "58843 False manual_static_filter \n", + "66813 False 
manual_static_filter \n", + "68367 False manual_dynamic_filter \n", + "77320 False manual_static_filter \n", + "84350 False manual_static_filter \n", + "85383 False manual_static_filter \n", + "92868 False manual_static_filter \n", + "99670 False manual_static_filter \n", + "105477 False manual_static_filter \n", + "169513 False manual_static_filter \n", + "214421 False manual_static_filter \n", + "234546 False manual_static_filter \n", + "259808 False manual_static_filter \n", + "274380 False manual_static_filter \n", + "307511 False manual_static_filter \n", + "357509 False manual_static_filter \n", + "392920 False manual_static_filter \n", + "449620 False manual_dynamic_filter \n", + "503809 False manual_static_filter \n", + "651222 False manual_static_filter \n", + "654246 False manual_static_filter \n", + "654395 False manual_static_filter " + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_targets_full[df1_targets_full['customer_id'] == 1]" + ] + }, + { + "cell_type": "markdown", + "id": "2f665824-a026-4acd-8358-b408a61854b4", + "metadata": {}, + "source": [ + "## Campaign area" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "5d05203c-ea30-4208-a29f-fef7737c672e", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_9792/1967867975.py:15: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n", + "/tmp/ipykernel_9792/1967867975.py:15: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using 
.loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n", + "/tmp/ipykernel_9792/1967867975.py:15: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n" + ] + } + ], + "source": [ + "# campaign_stats cleaning \n", + "df1_campaign_stats_clean = df1_campaign_stats[[\"id\", \"campaign_id\", \"customer_id\", \"opened_at\", \"sent_at\", \"delivered_at\"]]\n", + "cleaning_date(df1_campaign_stats_clean, 'opened_at')\n", + "cleaning_date(df1_campaign_stats_clean, 'sent_at')\n", + "cleaning_date(df1_campaign_stats_clean, 'delivered_at')\n", + "\n", + "# campaigns cleaning\n", + "df1_campaigns_clean = df1_campaigns[[\"id\", \"name\", \"service_id\", \"sent_at\"]].add_prefix(\"campaign_\")\n", + "cleaning_date(df1_campaigns_clean, 'campaign_sent_at')\n", + "\n", + "# Merge \n", + "df1_campaigns_full = pd.merge(df1_campaign_stats_clean, df1_campaigns_clean, on = \"campaign_id\", how = \"left\")\n", + "df1_campaigns_full.drop(['campaign_id'], axis = 1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "8ac634cf-2a30-4ccc-a34d-0fd401a49aaa", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 6214808 entries, 0 to 6214807\n", + "Data columns (total 8 columns):\n", + " # Column Dtype \n", + "--- ------ ----- \n", + " 0 id int64 \n", + " 1 customer_id int64 \n", + " 2 opened_at datetime64[ns, UTC]\n", 
+ " 3 sent_at datetime64[ns, UTC]\n", + " 4 delivered_at datetime64[ns, UTC]\n", + " 5 campaign_name object \n", + " 6 campaign_service_id int64 \n", + " 7 campaign_sent_at datetime64[ns, UTC]\n", + "dtypes: datetime64[ns, UTC](4), int64(3), object(1)\n", + "memory usage: 379.3+ MB\n" + ] + } + ], + "source": [ + "df1_campaigns_full.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "7d22cdd5-2060-4922-8e04-27b613d4ee27", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idcustomer_idopened_atsent_atdelivered_atcampaign_namecampaign_service_idcampaign_sent_at
019793112597NaT2021-03-28 16:01:09+00:002021-03-28 16:24:18+00:00Le Mucem chez vous, gardons le lien #224042021-03-27 23:00:00+00:00
114211113666NaT2021-03-28 16:01:09+00:002021-03-28 16:21:02+00:00Le Mucem chez vous, gardons le lien #224042021-03-27 23:00:00+00:00
213150280561NaT2021-03-28 16:00:59+00:002021-03-28 16:08:45+00:00Le Mucem chez vous, gardons le lien #224042021-03-27 23:00:00+00:00
370731010072021-03-28 18:11:06+00:002021-03-28 16:00:59+00:002021-03-28 16:09:47+00:00Le Mucem chez vous, gardons le lien #224042021-03-27 23:00:00+00:00
45175103972NaT2021-03-28 16:01:06+00:002021-03-28 16:05:03+00:00Le Mucem chez vous, gardons le lien #224042021-03-27 23:00:00+00:00
...........................
621480383029942661552023-10-23 09:43:25+00:002023-10-23 09:32:33+00:002023-10-23 09:32:34+00:00dre_nov_202313182023-10-23 09:31:17+00:00
62148048303307213552023-10-23 09:44:02+00:002023-10-23 09:32:49+00:002023-10-23 09:32:49+00:00dre_nov_202313182023-10-23 09:31:17+00:00
62148058304346218492023-10-23 09:45:52+00:002023-10-23 09:33:28+00:002023-10-23 09:33:29+00:00dre_nov_202313182023-10-23 09:31:17+00:00
621480683020376677892023-10-23 09:47:32+00:002023-10-23 09:31:53+00:002023-10-23 09:31:54+00:00dre_nov_202313182023-10-23 09:31:17+00:00
62148078304939294154NaT2023-10-23 09:33:54+00:002023-10-23 09:33:55+00:00dre_nov_202313182023-10-23 09:31:17+00:00
\n", + "

6214808 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " id customer_id opened_at \\\n", + "0 19793 112597 NaT \n", + "1 14211 113666 NaT \n", + "2 13150 280561 NaT \n", + "3 7073 101007 2021-03-28 18:11:06+00:00 \n", + "4 5175 103972 NaT \n", + "... ... ... ... \n", + "6214803 8302994 266155 2023-10-23 09:43:25+00:00 \n", + "6214804 8303307 21355 2023-10-23 09:44:02+00:00 \n", + "6214805 8304346 21849 2023-10-23 09:45:52+00:00 \n", + "6214806 8302037 667789 2023-10-23 09:47:32+00:00 \n", + "6214807 8304939 294154 NaT \n", + "\n", + " sent_at delivered_at \\\n", + "0 2021-03-28 16:01:09+00:00 2021-03-28 16:24:18+00:00 \n", + "1 2021-03-28 16:01:09+00:00 2021-03-28 16:21:02+00:00 \n", + "2 2021-03-28 16:00:59+00:00 2021-03-28 16:08:45+00:00 \n", + "3 2021-03-28 16:00:59+00:00 2021-03-28 16:09:47+00:00 \n", + "4 2021-03-28 16:01:06+00:00 2021-03-28 16:05:03+00:00 \n", + "... ... ... \n", + "6214803 2023-10-23 09:32:33+00:00 2023-10-23 09:32:34+00:00 \n", + "6214804 2023-10-23 09:32:49+00:00 2023-10-23 09:32:49+00:00 \n", + "6214805 2023-10-23 09:33:28+00:00 2023-10-23 09:33:29+00:00 \n", + "6214806 2023-10-23 09:31:53+00:00 2023-10-23 09:31:54+00:00 \n", + "6214807 2023-10-23 09:33:54+00:00 2023-10-23 09:33:55+00:00 \n", + "\n", + " campaign_name campaign_service_id \\\n", + "0 Le Mucem chez vous, gardons le lien #22 404 \n", + "1 Le Mucem chez vous, gardons le lien #22 404 \n", + "2 Le Mucem chez vous, gardons le lien #22 404 \n", + "3 Le Mucem chez vous, gardons le lien #22 404 \n", + "4 Le Mucem chez vous, gardons le lien #22 404 \n", + "... ... ... \n", + "6214803 dre_nov_2023 1318 \n", + "6214804 dre_nov_2023 1318 \n", + "6214805 dre_nov_2023 1318 \n", + "6214806 dre_nov_2023 1318 \n", + "6214807 dre_nov_2023 1318 \n", + "\n", + " campaign_sent_at \n", + "0 2021-03-27 23:00:00+00:00 \n", + "1 2021-03-27 23:00:00+00:00 \n", + "2 2021-03-27 23:00:00+00:00 \n", + "3 2021-03-27 23:00:00+00:00 \n", + "4 2021-03-27 23:00:00+00:00 \n", + "... ... 
\n", + "6214803 2023-10-23 09:31:17+00:00 \n", + "6214804 2023-10-23 09:31:17+00:00 \n", + "6214805 2023-10-23 09:31:17+00:00 \n", + "6214806 2023-10-23 09:31:17+00:00 \n", + "6214807 2023-10-23 09:31:17+00:00 \n", + "\n", + "[6214808 rows x 8 columns]" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_campaigns_information" + ] + }, + { + "cell_type": "markdown", + "id": "0a5b24f0-4bca-4cde-a6ba-eb130b38cac4", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "## Link area" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "bc63bc4e-6cc1-4d35-9635-faf55339e186", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnameservice_idcreated_atupdated_atprocess_idreport_urlcategoryto_be_syncedidentifiersent_at
01319613newsletter enseignants janvier 20227212022-01-14 16:06:42.586321+01:002022-02-03 14:17:27.112963+01:00NaNNaN0.0Falseaba3b6fd5d186d28e06ff97135cade7f2022-01-14 00:00:00+01:00
11319586lsf_janvier_20227172022-01-07 11:30:35.315895+01:002022-02-03 14:17:27.116171+01:00NaNNaN0.0False788d986905533aba051261497ecffcbb2022-01-07 00:00:00+01:00
21319282Invitation à déjeuner au Mucem | Vernissage « ...5912021-09-28 12:50:24.448752+02:002022-02-03 14:17:27.119582+01:00NaNNaN0.0False3493894fa4ea036cfc6433c3e2ee63b02021-09-28 00:00:00+02:00
31319283Vacances de la Toussaint - centres des loisirs5902021-09-28 18:01:04.692073+02:002022-02-03 14:17:27.124408+01:00NaNNaN0.0False08b255a5d42b89b0585260b6f2360bdd2021-09-28 00:00:00+02:00
41319636ddcp_promo_md_livemag7302022-01-27 18:00:41.053069+01:002022-02-03 14:17:27.127607+01:00NaNNaN0.0Falsed5cfead94f5350c12c322b5b664544c12022-01-27 00:00:00+01:00
....................................
9521320072dre_gaza01068812022-05-26 09:01:35.523639+02:002022-12-02 17:51:22.614046+01:00NaNNaN0.0False7504adad8bb96320eb3afdd4df6e1f602022-05-26 00:00:00+02:00
953661398DDCP Plan Bis 4 - Marketing direct - MJ5C1832021-06-18 10:30:01.259578+02:002021-09-24 11:56:09.082785+02:00NaNNaN0.0Falsecedebb6e872f539bef8c3f919874e9d72020-07-27 00:00:00+02:00
9541320487Invitation portes ouvertes amitiés9882022-09-29 18:01:33.834090+02:002022-12-02 17:51:23.258324+01:00NaNNaN0.0False9908279ebbf1f9b250ba689db6a0222b2022-09-29 00:00:00+02:00
955906903DDCP PROMO La méditerranée des philosophes #3 ...3102021-07-19 14:07:16.177390+02:002021-09-24 11:56:09.086101+02:00NaNNaN0.0False06eb61b839a0cefee4967c67ccb099dc2020-12-23 00:00:00+01:00
956579313ddcp_promo_automation_manuel_pre_visit4812021-06-08 17:38:54.041310+02:002021-09-24 11:56:09.089394+02:00NaNNaN0.0False9461cce28ebe3e76fb4b931c35a169b02021-06-08 00:00:00+02:00
\n", + "

957 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " id name service_id \\\n", + "0 1319613 newsletter enseignants janvier 2022 721 \n", + "1 1319586 lsf_janvier_2022 717 \n", + "2 1319282 Invitation à déjeuner au Mucem | Vernissage « ... 591 \n", + "3 1319283 Vacances de la Toussaint - centres des loisirs 590 \n", + "4 1319636 ddcp_promo_md_livemag 730 \n", + ".. ... ... ... \n", + "952 1320072 dre_gaza0106 881 \n", + "953 661398 DDCP Plan Bis 4 - Marketing direct - MJ5C 183 \n", + "954 1320487 Invitation portes ouvertes amitiés 988 \n", + "955 906903 DDCP PROMO La méditerranée des philosophes #3 ... 310 \n", + "956 579313 ddcp_promo_automation_manuel_pre_visit 481 \n", + "\n", + " created_at updated_at \\\n", + "0 2022-01-14 16:06:42.586321+01:00 2022-02-03 14:17:27.112963+01:00 \n", + "1 2022-01-07 11:30:35.315895+01:00 2022-02-03 14:17:27.116171+01:00 \n", + "2 2021-09-28 12:50:24.448752+02:00 2022-02-03 14:17:27.119582+01:00 \n", + "3 2021-09-28 18:01:04.692073+02:00 2022-02-03 14:17:27.124408+01:00 \n", + "4 2022-01-27 18:00:41.053069+01:00 2022-02-03 14:17:27.127607+01:00 \n", + ".. ... ... \n", + "952 2022-05-26 09:01:35.523639+02:00 2022-12-02 17:51:22.614046+01:00 \n", + "953 2021-06-18 10:30:01.259578+02:00 2021-09-24 11:56:09.082785+02:00 \n", + "954 2022-09-29 18:01:33.834090+02:00 2022-12-02 17:51:23.258324+01:00 \n", + "955 2021-07-19 14:07:16.177390+02:00 2021-09-24 11:56:09.086101+02:00 \n", + "956 2021-06-08 17:38:54.041310+02:00 2021-09-24 11:56:09.089394+02:00 \n", + "\n", + " process_id report_url category to_be_synced \\\n", + "0 NaN NaN 0.0 False \n", + "1 NaN NaN 0.0 False \n", + "2 NaN NaN 0.0 False \n", + "3 NaN NaN 0.0 False \n", + "4 NaN NaN 0.0 False \n", + ".. ... ... ... ... 
\n", + "952 NaN NaN 0.0 False \n", + "953 NaN NaN 0.0 False \n", + "954 NaN NaN 0.0 False \n", + "955 NaN NaN 0.0 False \n", + "956 NaN NaN 0.0 False \n", + "\n", + " identifier sent_at \n", + "0 aba3b6fd5d186d28e06ff97135cade7f 2022-01-14 00:00:00+01:00 \n", + "1 788d986905533aba051261497ecffcbb 2022-01-07 00:00:00+01:00 \n", + "2 3493894fa4ea036cfc6433c3e2ee63b0 2021-09-28 00:00:00+02:00 \n", + "3 08b255a5d42b89b0585260b6f2360bdd 2021-09-28 00:00:00+02:00 \n", + "4 d5cfead94f5350c12c322b5b664544c1 2022-01-27 00:00:00+01:00 \n", + ".. ... ... \n", + "952 7504adad8bb96320eb3afdd4df6e1f60 2022-05-26 00:00:00+02:00 \n", + "953 cedebb6e872f539bef8c3f919874e9d7 2020-07-27 00:00:00+02:00 \n", + "954 9908279ebbf1f9b250ba689db6a0222b 2022-09-29 00:00:00+02:00 \n", + "955 06eb61b839a0cefee4967c67ccb099dc 2020-12-23 00:00:00+01:00 \n", + "956 9461cce28ebe3e76fb4b931c35a169b0 2021-06-08 00:00:00+02:00 \n", + "\n", + "[957 rows x 11 columns]" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_campaigns" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "c19b321f-65f9-4d6c-8c1f-edb2eb9d70e7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idclicked_atlink_idcustomer_idcreated_atupdated_at
012021-03-26 16:30:36+01:0012840332021-03-26 15:30:37.050161+01:002021-03-26 15:30:37.050161+01:00
122021-03-26 17:16:34+01:0021197682021-03-26 16:16:34.950871+01:002021-03-26 16:16:34.950871+01:00
22722021-03-28 20:03:32+02:00421131052021-03-28 18:03:32.736394+02:002021-03-28 18:03:32.736394+02:00
342021-03-26 17:43:19+01:0032722802021-03-26 16:43:19.338321+01:002021-03-26 16:43:19.338321+01:00
452021-03-26 17:46:00+01:0031050952021-03-26 16:46:00.502945+01:002021-03-26 16:46:00.502945+01:00
.....................
1510462435532023-11-09 16:34:27+01:00146669982023-11-09 15:34:29.425425+01:002023-11-09 15:34:29.425425+01:00
1510472435542023-11-09 16:34:35+01:00146709982023-11-09 15:34:37.505505+01:002023-11-09 15:34:37.505505+01:00
1510482435592023-11-09 16:51:15+01:0014686829232023-11-09 15:51:17.439518+01:002023-11-09 15:51:17.439518+01:00
1510492435612023-11-09 16:59:42+01:0014677829232023-11-09 15:59:44.030922+01:002023-11-09 15:59:44.030922+01:00
1510502435642023-11-09 17:16:41+01:001469112543552023-11-09 16:16:43.012932+01:002023-11-09 16:16:43.012932+01:00
\n", + "

151051 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " id clicked_at link_id customer_id \\\n", + "0 1 2021-03-26 16:30:36+01:00 1 284033 \n", + "1 2 2021-03-26 17:16:34+01:00 2 119768 \n", + "2 272 2021-03-28 20:03:32+02:00 42 113105 \n", + "3 4 2021-03-26 17:43:19+01:00 3 272280 \n", + "4 5 2021-03-26 17:46:00+01:00 3 105095 \n", + "... ... ... ... ... \n", + "151046 243553 2023-11-09 16:34:27+01:00 14666 998 \n", + "151047 243554 2023-11-09 16:34:35+01:00 14670 998 \n", + "151048 243559 2023-11-09 16:51:15+01:00 14686 82923 \n", + "151049 243561 2023-11-09 16:59:42+01:00 14677 82923 \n", + "151050 243564 2023-11-09 17:16:41+01:00 14691 1254355 \n", + "\n", + " created_at updated_at \n", + "0 2021-03-26 15:30:37.050161+01:00 2021-03-26 15:30:37.050161+01:00 \n", + "1 2021-03-26 16:16:34.950871+01:00 2021-03-26 16:16:34.950871+01:00 \n", + "2 2021-03-28 18:03:32.736394+02:00 2021-03-28 18:03:32.736394+02:00 \n", + "3 2021-03-26 16:43:19.338321+01:00 2021-03-26 16:43:19.338321+01:00 \n", + "4 2021-03-26 16:46:00.502945+01:00 2021-03-26 16:46:00.502945+01:00 \n", + "... ... ... 
\n", + "151046 2023-11-09 15:34:29.425425+01:00 2023-11-09 15:34:29.425425+01:00 \n", + "151047 2023-11-09 15:34:37.505505+01:00 2023-11-09 15:34:37.505505+01:00 \n", + "151048 2023-11-09 15:51:17.439518+01:00 2023-11-09 15:51:17.439518+01:00 \n", + "151049 2023-11-09 15:59:44.030922+01:00 2023-11-09 15:59:44.030922+01:00 \n", + "151050 2023-11-09 16:16:43.012932+01:00 2023-11-09 16:16:43.012932+01:00 \n", + "\n", + "[151051 rows x 6 columns]" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_link_stats" + ] + }, + { + "cell_type": "markdown", + "id": "96ea2523-38dc-47ef-a49e-2c2d9ad0b1c6", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "## Exploration variables" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "aaa41688-ea7e-4dba-851c-1f0b0ec43c71", + "metadata": {}, + "outputs": [], + "source": [ + "# Fonction d'exploration pour suppliers.csv = label itr et commission inconnues\n", + "def suppliers_exploration(suppliers = None) : \n", + " \n", + " # Taux de NaN pour ces colonnes\n", + " label_na = suppliers['label'].isna().sum()/len(suppliers)*100\n", + " itr_na = suppliers['itr'].isna().sum()/len(suppliers)*100\n", + " commission_na = suppliers['commission'].isna().sum()/len(suppliers)*100\n", + "\n", + " suppliers_desc = pd.DataFrame({'nb_suppliers' : [suppliers['name'].nunique()],\n", + " 'label_na' : [label_na],\n", + " 'itr_na' : [itr_na],\n", + " 'commission_na' : [commission_na]})\n", + "\n", + " return suppliers_desc" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "2fecc2e1-113f-46ed-9065-0b9ee416166e", + "metadata": {}, + "outputs": [], + "source": [ + "df1_suppliers_desc = suppliers_exploration(suppliers = df1_suppliers)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "55f6170a-36fb-4efb-9810-f982883660cf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nb_supplierslabel_naitr_nacommission_na
09100.0100.0100.0
\n", + "
" + ], + "text/plain": [ + " nb_suppliers label_na itr_na commission_na\n", + "0 9 100.0 100.0 100.0" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_suppliers_desc" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "0030fd02-09e3-42f5-9c83-290458a38c29", + "metadata": {}, + "outputs": [], + "source": [ + "BUCKET = \"bdc2324-data\"\n", + "liste_folders = fs.ls(BUCKET)\n", + "\n", + "liste_files = []\n", + "for company_folder in liste_folders : \n", + " liste_files.extend(fs.ls(company_folder))" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "6b1736d1-8fd7-4fcc-9431-b8bf0c7b4f2b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['bdc2324-data/1/1suppliers.csv', 'bdc2324-data/10/10suppliers.csv', 'bdc2324-data/101/101suppliers.csv', 'bdc2324-data/11/11suppliers.csv', 'bdc2324-data/12/12suppliers.csv', 'bdc2324-data/13/13suppliers.csv', 'bdc2324-data/14/14suppliers.csv', 'bdc2324-data/2/2suppliers.csv', 'bdc2324-data/3/3suppliers.csv', 'bdc2324-data/4/4suppliers.csv', 'bdc2324-data/5/5suppliers.csv', 'bdc2324-data/6/6suppliers.csv', 'bdc2324-data/7/7suppliers.csv', 'bdc2324-data/8/8suppliers.csv', 'bdc2324-data/9/9suppliers.csv']\n" + ] + } + ], + "source": [ + "liste_database_select = ['suppliers']\n", + "\n", + "# Filtrer la liste pour les éléments contenant au moins un élément de la liste à tester\n", + "liste_suppliers = [element for element in liste_files if any(element_part in element for element_part in liste_database_select)]\n", + "\n", + "# Afficher le résultat\n", + "print(liste_suppliers)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "226b694b-0b00-4167-b69f-3178902254eb", + "metadata": {}, + "outputs": [], + "source": [ + "# loop to create dataframes from file 2\n", + "def database_loading(database_name = None):\n", + " files_path = database_name\n", + " \n", + " 
client_number = files_path[0].split(\"/\")[1]\n", + " df_prefix = \"df\" + str(client_number) + \"_\"\n", + " \n", + " for i in range(len(files_path)) :\n", + " current_path = files_path[i]\n", + " with fs.open(current_path, mode=\"rb\") as file_in:\n", + " df = pd.read_csv(file_in)\n", + " # the pattern of the name is df1xxx\n", + " nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n", + " globals()[nom_dataframe] = df\n", + "\n", + " " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Notebook_AJ.ipynb b/Notebook_AJ.ipynb deleted file mode 100644 index 19272b5..0000000 --- a/Notebook_AJ.ipynb +++ /dev/null @@ -1,823 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "8c8e008c-9b92-41f1-88c1-8ec462e4ecab", - "metadata": {}, - "source": [ - "# Business Data Challenge - Team 1" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "88af2795-8bf9-4df0-a059-be7c28fb4289", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np" - ] - }, - { - "cell_type": "markdown", - "id": "e05cd2c9-3f76-48e3-b4a6-5055445af2e4", - "metadata": {}, - "source": [ - "Configuration de l'accès aux données" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "3ba1f385-2a2f-4b0c-be79-66f618469a9f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['bdc2324-data/1',\n", - " 'bdc2324-data/10',\n", - " 'bdc2324-data/101',\n", - " 'bdc2324-data/11',\n", - " 'bdc2324-data/12',\n", - " 'bdc2324-data/13',\n", - " 'bdc2324-data/14',\n", - " 
'bdc2324-data/2',\n", - " 'bdc2324-data/3',\n", - " 'bdc2324-data/4',\n", - " 'bdc2324-data/5',\n", - " 'bdc2324-data/6',\n", - " 'bdc2324-data/7',\n", - " 'bdc2324-data/8',\n", - " 'bdc2324-data/9']" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import os\n", - "import s3fs\n", - "# Create filesystem object\n", - "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", - "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n", - "\n", - "BUCKET = \"bdc2324-data\"\n", - "fs.ls(BUCKET)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "ba9d04ad-6cc1-4bac-b1a0-44bedfb09763", - "metadata": {}, - "outputs": [], - "source": [ - "# Chargement des fichiers campaign_stats.csv\n", - "FILE_PATH_S3 = 'bdc2324-data/1/1campaign_stats.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " campaign_stats_1 = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "FILE_PATH_S3 = 'bdc2324-data/2/2campaign_stats.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " campaign_stats_2 = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "FILE_PATH_S3 = 'bdc2324-data/3/3campaign_stats.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " campaign_stats_3 = pd.read_csv(file_in, sep=\",\")" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "cacaecc1-4d8a-4e20-8cd3-b452cf17db56", - "metadata": {}, - "outputs": [], - "source": [ - "# Conversion des dates 'sent_at'\n", - "campaign_stats_1['sent_at'] = pd.to_datetime(campaign_stats_1['sent_at'], format = 'ISO8601', utc = True)\n", - "campaign_stats_2['sent_at'] = pd.to_datetime(campaign_stats_2['sent_at'], format = 'ISO8601', utc = True)\n", - "campaign_stats_3['sent_at'] = pd.to_datetime(campaign_stats_3['sent_at'], format = 'ISO8601', utc = True)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": 
"2ec4b583-dc64-43e9-b3ae-6bbaee0bc135", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2023-11-09 18:10:45+00:00\n", - "2020-06-02 08:24:08+00:00\n", - "2023-10-12 01:39:48+00:00\n", - "2023-10-10 17:06:29+00:00\n", - "2023-11-01 09:20:48+00:00\n", - "2021-03-31 14:59:02+00:00\n" - ] - } - ], - "source": [ - "# Chaque unites correspond à une période ? --> Non, les dossiers ont juste pour but de réduire la taille des fichiers\n", - "print(campaign_stats_1['sent_at'].max())\n", - "print(campaign_stats_1['sent_at'].min())\n", - "\n", - "print(campaign_stats_2['sent_at'].max())\n", - "print(campaign_stats_2['sent_at'].min())\n", - "\n", - "print(campaign_stats_3['sent_at'].max())\n", - "print(campaign_stats_3['sent_at'].min())" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "77894273-b3e5-4f29-bd63-9f4df8082b9b", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 2021-03-28 16:01:09+00:00\n", - "1 2021-03-28 16:01:09+00:00\n", - "2 2021-03-28 16:00:59+00:00\n", - "3 2021-03-28 16:00:59+00:00\n", - "4 2021-03-28 16:01:06+00:00\n", - " ... \n", - "6214803 2023-10-23 09:32:33+00:00\n", - "6214804 2023-10-23 09:32:49+00:00\n", - "6214805 2023-10-23 09:33:28+00:00\n", - "6214806 2023-10-23 09:31:53+00:00\n", - "6214807 2023-10-23 09:33:54+00:00\n", - "Name: sent_at, Length: 6214808, dtype: datetime64[ns, UTC]" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "campaign_stats_1['sent_at']" - ] - }, - { - "cell_type": "markdown", - "id": "31f2edbf-5661-4516-9835-06d4da615c13", - "metadata": {}, - "source": [ - "### Customersplus.csv" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "4223c873-cbd3-46d1-ac96-c9a3b9e97092", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_1362/4118060109.py:9: DtypeWarning: Columns (20) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", - " customers_plus_2 = pd.read_csv(file_in, sep=\",\")\n" - ] - } - ], - "source": [ - "FILE_PATH_S3 = 'bdc2324-data/1/1customersplus.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " customers_plus_1 = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "FILE_PATH_S3 = 'bdc2324-data/2/2customersplus.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " customers_plus_2 = pd.read_csv(file_in, sep=\",\")" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "460f853a-68c0-42a7-9877-b83d3aaec813", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['id', 'lastname', 'firstname', 'birthdate', 'email', 'street_id',\n", - " 'created_at', 'updated_at', 'civility', 'is_partner', 'extra',\n", - " 'deleted_at', 'reference', 'gender', 'is_email_true', 'extra_field',\n", - " 'identifier', 'opt_in', 'structure_id', 'note', 'profession',\n", - " 'language', 'mcp_contact_id', 'need_reload', 'last_buying_date',\n", - " 'max_price', 'ticket_sum', 'average_price', 'fidelity',\n", - " 'average_purchase_delay', 'average_price_basket',\n", - " 'average_ticket_basket', 'total_price', 'preferred_category',\n", - " 'preferred_supplier', 'preferred_formula', 'purchase_count',\n", - " 'first_buying_date', 'last_visiting_date', 'zipcode', 'country', 'age',\n", - " 'tenant_id'],\n", - " dtype='object')" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "customers_plus_1.columns" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d5a9398f-72fc-4548-9f53-b20b372144b2", - "metadata": {}, - "outputs": [], - "source": [ - "customers_plus_1.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7467ddbe-0bd4-44cc-8a16-84aa41853638", - "metadata": {}, - "outputs": [], - "source": [ - "customers_plus_1['id'].nunique()" - ] - }, - { - "cell_type": 
"code", - "execution_count": null, - "id": "e15f05f8-3a89-4fc3-84a9-dae70e168440", - "metadata": {}, - "outputs": [], - "source": [ - "customers_plus_2['id'].nunique()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b40a653e-013f-48d0-8b57-0284587b36c5", - "metadata": {}, - "outputs": [], - "source": [ - "common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "id": "32fa2215-3c79-40b5-8643-755865959fc7", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1" - ] - }, - "execution_count": 61, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])\n", - "# Exemple id commun = caractéristiques communes\n", - "print(customers_plus_2[customers_plus_2['id'] == list(common_id)[0]])\n", - "\n", - "print(customers_plus_1[customers_plus_1['id'] == list(common_id)[0]])" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "id": "0eb345e4-69f5-4e16-ac57-e33674c6c43d", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "id 0.000000\n", - "lastname 43.461341\n", - "firstname 44.995588\n", - "birthdate 96.419870\n", - "email 8.622075\n", - "street_id 0.000000\n", - "created_at 0.000000\n", - "updated_at 0.000000\n", - "civility 100.000000\n", - "is_partner 0.000000\n", - "extra 100.000000\n", - "deleted_at 100.000000\n", - "reference 100.000000\n", - "gender 0.000000\n", - "is_email_true 0.000000\n", - "extra_field 100.000000\n", - "identifier 0.000000\n", - "opt_in 0.000000\n", - "structure_id 88.072380\n", - "note 99.403421\n", - "profession 95.913503\n", - "language 99.280945\n", - "mcp_contact_id 34.876141\n", - "need_reload 0.000000\n", - "last_buying_date 51.653431\n", - "max_price 51.653431\n", - "ticket_sum 0.000000\n", - "average_price 8.639195\n", - "fidelity 
0.000000\n", - "average_purchase_delay 51.653431\n", - "average_price_basket 51.653431\n", - "average_ticket_basket 51.653431\n", - "total_price 43.014236\n", - "preferred_category 100.000000\n", - "preferred_supplier 100.000000\n", - "preferred_formula 100.000000\n", - "purchase_count 0.000000\n", - "first_buying_date 51.653431\n", - "last_visiting_date 100.000000\n", - "zipcode 71.176564\n", - "country 5.459418\n", - "age 96.419870\n", - "tenant_id 0.000000\n", - "dtype: float64\n" - ] - } - ], - "source": [ - "pd.DataFrame(customers_plus_1.isna().mean()*100)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "6f6ce60d-0912-497d-9108-330acccef394", - "metadata": {}, - "outputs": [], - "source": [ - "# Chargement de toutes les données\n", - "liste_base = ['customer_target_mappings', 'customersplus', 'target_types', 'tags', 'events', 'tickets', 'representations', 'purchases', 'products']\n", - "\n", - "for nom_base in liste_base:\n", - " FILE_PATH_S3 = 'bdc2324-data/11/11' + nom_base + '.csv'\n", - " with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " globals()[nom_base] = pd.read_csv(file_in, sep=\",\")" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "fa8ee17d-5092-40ac-8a0a-3790b016dd4e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idlastnamefirstnamebirthdateemailstreet_idcreated_atupdated_atcivilityis_partner...tenant_idid_xcustomer_idpurchase_datetype_ofis_from_subscriptionamountis_full_pricestart_date_timeevent_name
0405082lastname405082NaNNaNNaN62023-01-12 06:30:31.197484+01:002023-01-12 06:30:31.197484+01:00NaNFalse...15569924234050822023-01-11 17:08:41+01:003False13.0False2023-02-06 20:00:00+01:00zaide
1405082lastname405082NaNNaNNaN62023-01-12 06:30:31.197484+01:002023-01-12 06:30:31.197484+01:00NaNFalse...15569924234050822023-01-11 17:08:41+01:003False13.0False2023-02-06 20:00:00+01:00zaide
2411168lastname411168NaNNaNNaN62023-03-17 06:30:35.431967+01:002023-03-17 06:30:35.431967+01:00NaNFalse...155610539344111682023-03-16 16:23:10+01:003False62.0False2023-03-19 16:00:00+01:00luisa miller
3411168lastname411168NaNNaNNaN62023-03-17 06:30:35.431967+01:002023-03-17 06:30:35.431967+01:00NaNFalse...155610539344111682023-03-16 16:23:10+01:003False62.0False2023-03-19 16:00:00+01:00luisa miller
44380lastname4380firstname4380NaNNaN12021-04-22 14:51:55.432952+02:002022-04-14 11:41:33.738500+02:00NaNFalse...1556118914143802020-11-26 13:12:53+01:003False51.3False2020-12-01 20:00:00+01:00iphigenie en tauride
..................................................................
31896419095lastname19095firstname190951979-07-16email1909562021-04-22 15:06:30.120537+02:002023-09-12 18:27:36.904104+02:00NaNFalse...15561090839190952019-05-19 21:18:36+02:001False4.5False2019-05-27 20:00:00+02:00entre femmes
31896519095lastname19095firstname190951979-07-16email1909562021-04-22 15:06:30.120537+02:002023-09-12 18:27:36.904104+02:00NaNFalse...15561090839190952019-05-19 21:18:36+02:001False4.5False2019-05-27 20:00:00+02:00entre femmes
31896619095lastname19095firstname190951979-07-16email1909562021-04-22 15:06:30.120537+02:002023-09-12 18:27:36.904104+02:00NaNFalse...15561090839190952019-05-19 21:18:36+02:001False4.5False2019-05-27 20:00:00+02:00entre femmes
31896719095lastname19095firstname190951979-07-16email1909562021-04-22 15:06:30.120537+02:002023-09-12 18:27:36.904104+02:00NaNFalse...15561244277190952019-12-31 11:04:07+01:001False5.5False2020-02-03 20:00:00+01:00a boire et a manger
31896819095lastname19095firstname190951979-07-16email1909562021-04-22 15:06:30.120537+02:002023-09-12 18:27:36.904104+02:00NaNFalse...15561244277190952019-12-31 11:04:07+01:001False5.5False2020-02-03 20:00:00+01:00a boire et a manger
\n", - "

318969 rows × 52 columns

\n", - "
" - ], - "text/plain": [ - " id lastname firstname birthdate email \\\n", - "0 405082 lastname405082 NaN NaN NaN \n", - "1 405082 lastname405082 NaN NaN NaN \n", - "2 411168 lastname411168 NaN NaN NaN \n", - "3 411168 lastname411168 NaN NaN NaN \n", - "4 4380 lastname4380 firstname4380 NaN NaN \n", - "... ... ... ... ... ... \n", - "318964 19095 lastname19095 firstname19095 1979-07-16 email19095 \n", - "318965 19095 lastname19095 firstname19095 1979-07-16 email19095 \n", - "318966 19095 lastname19095 firstname19095 1979-07-16 email19095 \n", - "318967 19095 lastname19095 firstname19095 1979-07-16 email19095 \n", - "318968 19095 lastname19095 firstname19095 1979-07-16 email19095 \n", - "\n", - " street_id created_at \\\n", - "0 6 2023-01-12 06:30:31.197484+01:00 \n", - "1 6 2023-01-12 06:30:31.197484+01:00 \n", - "2 6 2023-03-17 06:30:35.431967+01:00 \n", - "3 6 2023-03-17 06:30:35.431967+01:00 \n", - "4 1 2021-04-22 14:51:55.432952+02:00 \n", - "... ... ... \n", - "318964 6 2021-04-22 15:06:30.120537+02:00 \n", - "318965 6 2021-04-22 15:06:30.120537+02:00 \n", - "318966 6 2021-04-22 15:06:30.120537+02:00 \n", - "318967 6 2021-04-22 15:06:30.120537+02:00 \n", - "318968 6 2021-04-22 15:06:30.120537+02:00 \n", - "\n", - " updated_at civility is_partner ... \\\n", - "0 2023-01-12 06:30:31.197484+01:00 NaN False ... \n", - "1 2023-01-12 06:30:31.197484+01:00 NaN False ... \n", - "2 2023-03-17 06:30:35.431967+01:00 NaN False ... \n", - "3 2023-03-17 06:30:35.431967+01:00 NaN False ... \n", - "4 2022-04-14 11:41:33.738500+02:00 NaN False ... \n", - "... ... ... ... ... \n", - "318964 2023-09-12 18:27:36.904104+02:00 NaN False ... \n", - "318965 2023-09-12 18:27:36.904104+02:00 NaN False ... \n", - "318966 2023-09-12 18:27:36.904104+02:00 NaN False ... \n", - "318967 2023-09-12 18:27:36.904104+02:00 NaN False ... \n", - "318968 2023-09-12 18:27:36.904104+02:00 NaN False ... 
\n", - "\n", - " tenant_id id_x customer_id purchase_date type_of \\\n", - "0 1556 992423 405082 2023-01-11 17:08:41+01:00 3 \n", - "1 1556 992423 405082 2023-01-11 17:08:41+01:00 3 \n", - "2 1556 1053934 411168 2023-03-16 16:23:10+01:00 3 \n", - "3 1556 1053934 411168 2023-03-16 16:23:10+01:00 3 \n", - "4 1556 1189141 4380 2020-11-26 13:12:53+01:00 3 \n", - "... ... ... ... ... ... \n", - "318964 1556 1090839 19095 2019-05-19 21:18:36+02:00 1 \n", - "318965 1556 1090839 19095 2019-05-19 21:18:36+02:00 1 \n", - "318966 1556 1090839 19095 2019-05-19 21:18:36+02:00 1 \n", - "318967 1556 1244277 19095 2019-12-31 11:04:07+01:00 1 \n", - "318968 1556 1244277 19095 2019-12-31 11:04:07+01:00 1 \n", - "\n", - " is_from_subscription amount is_full_price start_date_time \\\n", - "0 False 13.0 False 2023-02-06 20:00:00+01:00 \n", - "1 False 13.0 False 2023-02-06 20:00:00+01:00 \n", - "2 False 62.0 False 2023-03-19 16:00:00+01:00 \n", - "3 False 62.0 False 2023-03-19 16:00:00+01:00 \n", - "4 False 51.3 False 2020-12-01 20:00:00+01:00 \n", - "... ... ... ... ... \n", - "318964 False 4.5 False 2019-05-27 20:00:00+02:00 \n", - "318965 False 4.5 False 2019-05-27 20:00:00+02:00 \n", - "318966 False 4.5 False 2019-05-27 20:00:00+02:00 \n", - "318967 False 5.5 False 2020-02-03 20:00:00+01:00 \n", - "318968 False 5.5 False 2020-02-03 20:00:00+01:00 \n", - "\n", - " event_name \n", - "0 zaide \n", - "1 zaide \n", - "2 luisa miller \n", - "3 luisa miller \n", - "4 iphigenie en tauride \n", - "... ... 
\n", - "318964 entre femmes \n", - "318965 entre femmes \n", - "318966 entre femmes \n", - "318967 a boire et a manger \n", - "318968 a boire et a manger \n", - "\n", - "[318969 rows x 52 columns]" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Jointure\n", - "merge_1 = pd.merge(purchases, tickets, left_on='id', right_on='purchase_id', how='inner')[['id_x', 'customer_id','product_id', 'purchase_date', 'type_of', 'is_from_subscription']]\n", - "merge_2 = pd.merge(products, merge_1, left_on='id', right_on='product_id', how='inner')[['id_x', 'customer_id', 'representation_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price']]\n", - "merge_3 = pd.merge(representations, merge_2, left_on='id', right_on='representation_id', how='inner')[['id_x', 'customer_id', 'event_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time']]\n", - "merge_4 = pd.merge(events, merge_3, left_on='id', right_on='event_id', how='inner')[['id_x', 'customer_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'name']]\n", - "merge_4 = merge_4.rename(columns={'name': 'event_name'})\n", - "df_customer_event = pd.merge(customersplus, merge_4, left_on = 'id', right_on = 'customer_id', how = 'inner')[['id_x', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'event_name']]\n", - "df_customer_event" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/Notebook_AR.ipynb 
b/Notebook_AR.ipynb index 9107796..18b06d1 100644 --- a/Notebook_AR.ipynb +++ b/Notebook_AR.ipynb @@ -6103,6 +6103,403 @@ "representation_theme.head()" ] }, + { + "cell_type": "markdown", + "id": "e274e3cc-1b41-43e0-8412-1563166060cb", + "metadata": {}, + "source": [ + "## Price Table" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "id": "c52621e7-01de-48dc-b572-2974542a8be5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File path : bdc2324-data/1/1product_packs.csv\n", + "Shape : (1, 6)\n", + "Number of columns : 4\n", + "Columns : Index(['id', 'identifier', 'name', 'type_of'], dtype='object')\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnametype_of
01NaN0
\n", + "
" + ], + "text/plain": [ + " id name type_of\n", + "0 1 NaN 0" + ] + }, + "execution_count": 112, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "product_packs = load_dataset(\"1product_packs.csv\")\n", + "product_packs.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "id": "9e4f60ab-9a2c-4090-b0c4-f9a1530b2d39", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File path : bdc2324-data/1/1pricing_formulas.csv\n", + "Shape : (556, 6)\n", + "Number of columns : 4\n", + "Columns : Index(['id', 'identifier', 'name', 'extra_field'], dtype='object')\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnameextra_field
041909visite mécènes 1h30NaN
1502entree mucem tp( expo picasso)NaN
2504nombre de personnes cinemaNaN
3117spectacle tarif e famille trNaN
41496billet nb famille mecene 1aNaN
\n", + "
" + ], + "text/plain": [ + " id name extra_field\n", + "0 41909 visite mécènes 1h30 NaN\n", + "1 502 entree mucem tp( expo picasso) NaN\n", + "2 504 nombre de personnes cinema NaN\n", + "3 117 spectacle tarif e famille tr NaN\n", + "4 1496 billet nb famille mecene 1a NaN" + ] + }, + "execution_count": 114, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pricing_formula = load_dataset(\"1pricing_formulas.csv\")\n", + "pricing_formula.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 115, + "id": "247b5c45-a18a-4cfd-86b4-d3453e157bcd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File path : bdc2324-data/1/1type_of_pricing_formulas.csv\n", + "Shape : (568, 6)\n", + "Number of columns : 4\n", + "Columns : Index(['id', 'type_of_id', 'pricing_formula_id', 'identifier'], dtype='object')\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtype_of_idpricing_formula_id
011127
1212425
2312937
34148
4517
\n", + "
" + ], + "text/plain": [ + " id type_of_id pricing_formula_id\n", + "0 1 1 127\n", + "1 2 1 2425\n", + "2 3 1 2937\n", + "3 4 1 48\n", + "4 5 1 7" + ] + }, + "execution_count": 115, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type_pricing_formula = load_dataset(\"1type_of_pricing_formulas.csv\")\n", + "type_pricing_formula.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "id": "4b48f7b3-0f06-4ef6-9355-5016af82f49c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File path : bdc2324-data/1/1products_groups.csv\n", + "Shape : (92973, 9)\n", + "Number of columns : 7\n", + "Columns : Index(['id', 'category_id', 'pricing_formula_id', 'representation_id',\n", + " 'percent_price', 'max_price', 'min_price'],\n", + " dtype='object')\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idcategory_idpricing_formula_idrepresentation_idpercent_pricemax_pricemin_price
027358971534100.00.00.0
11567735982519100.00.00.0
21438716798046100.00.00.0
327702371563100.00.00.0
4271791311914192100.00.00.0
\n", + "
" + ], + "text/plain": [ + " id category_id pricing_formula_id representation_id percent_price \\\n", + "0 2735 8 97 1534 100.0 \n", + "1 156773 5 9 82519 100.0 \n", + "2 14387 16 79 8046 100.0 \n", + "3 2770 2 37 1563 100.0 \n", + "4 27179 13 119 14192 100.0 \n", + "\n", + " max_price min_price \n", + "0 0.0 0.0 \n", + "1 0.0 0.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 0.0 0.0 " + ] + }, + "execution_count": 117, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "product_groups = load_dataset(\"1products_groups.csv\")\n", + "product_groups.head()" + ] + }, { "cell_type": "markdown", "id": "71c26a38-6818-42df-8aee-0135681a5563", @@ -6741,6 +7138,9 @@ "outputs": [], "source": [ "def uniform_product_df():\n", + " \"\"\"\n", + " This function returns the uniform product dataset\n", + " \"\"\"\n", " print(\"Products theme columns : \", products_theme.columns)\n", " print(\"\\n Representation theme columns : \", representation_theme.columns)\n", " print(\"\\n Events theme columns : \", events_theme.columns)\n", diff --git a/TP_merge_tables_clean.ipynb b/TP_merge_tables_clean.ipynb new file mode 100644 index 0000000..66b5228 --- /dev/null +++ b/TP_merge_tables_clean.ipynb @@ -0,0 +1,1760 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8c56d518-3634-4492-b249-0d8ef33dd527", + "metadata": {}, + "source": [ + "## First steps : package importations, set up working environment and import data" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "dede42d9-1262-45f7-bd7a-586ae800092a", + "metadata": {}, + "outputs": [], + "source": [ + "# importations\n", + "\n", + "import os \n", + "import s3fs\n", + "import pandas as pd\n", + "import re\n", + "from datetime import datetime, timezone, timedelta\n", + "import math\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "6ce34b58-b5ba-4b54-ba4d-fc82ef01b09c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + 
"['bdc2324-data/1',\n", + " 'bdc2324-data/10',\n", + " 'bdc2324-data/101',\n", + " 'bdc2324-data/11',\n", + " 'bdc2324-data/12',\n", + " 'bdc2324-data/13',\n", + " 'bdc2324-data/14',\n", + " 'bdc2324-data/2',\n", + " 'bdc2324-data/3',\n", + " 'bdc2324-data/4',\n", + " 'bdc2324-data/5',\n", + " 'bdc2324-data/6',\n", + " 'bdc2324-data/7',\n", + " 'bdc2324-data/8',\n", + " 'bdc2324-data/9']" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# bucket for accessing the data\n", + "\n", + "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", + "\n", + "fs = s3fs.S3FileSystem(client_kwargs = {\"endpoint_url\" : S3_ENDPOINT_URL})\n", + "BUCKET = \"bdc2324-data\"\n", + "fs.ls(BUCKET)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "8eb13dd3-53c7-4a70-94a4-846168473aa1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['bdc2324-data/1/1campaign_stats.csv',\n", + " 'bdc2324-data/1/1campaigns.csv',\n", + " 'bdc2324-data/1/1categories.csv',\n", + " 'bdc2324-data/1/1countries.csv',\n", + " 'bdc2324-data/1/1currencies.csv',\n", + " 'bdc2324-data/1/1customer_target_mappings.csv',\n", + " 'bdc2324-data/1/1customersplus.csv',\n", + " 'bdc2324-data/1/1event_types.csv',\n", + " 'bdc2324-data/1/1events.csv',\n", + " 'bdc2324-data/1/1facilities.csv',\n", + " 'bdc2324-data/1/1link_stats.csv',\n", + " 'bdc2324-data/1/1pricing_formulas.csv',\n", + " 'bdc2324-data/1/1product_packs.csv',\n", + " 'bdc2324-data/1/1products.csv',\n", + " 'bdc2324-data/1/1products_groups.csv',\n", + " 'bdc2324-data/1/1purchases.csv',\n", + " 'bdc2324-data/1/1representation_category_capacities.csv',\n", + " 'bdc2324-data/1/1representations.csv',\n", + " 'bdc2324-data/1/1seasons.csv',\n", + " 'bdc2324-data/1/1structure_tag_mappings.csv',\n", + " 'bdc2324-data/1/1suppliers.csv',\n", + " 'bdc2324-data/1/1tags.csv',\n", + " 'bdc2324-data/1/1target_types.csv',\n", + " 
'bdc2324-data/1/1targets.csv',\n", + " 'bdc2324-data/1/1tickets.csv',\n", + " 'bdc2324-data/1/1type_of_categories.csv',\n", + " 'bdc2324-data/1/1type_of_pricing_formulas.csv',\n", + " 'bdc2324-data/1/1type_ofs.csv']" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "FILE_PATH_S3 = fs.ls(BUCKET)[0] # focus on the company number 1\n", + "files_path = fs.ls(FILE_PATH_S3)\n", + "files_path" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "1ea66c4e-1307-4f19-836e-3104fba2ff41", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_487/2894332003.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " df = pd.read_csv(file_in)\n" + ] + } + ], + "source": [ + "# loop to create dataframes related to company 1\n", + "\n", + "client_number = files_path[0].split(\"/\")[1]\n", + "print(client_number)\n", + "df_prefix = \"df\" + str(client_number) + \"_\"\n", + "\n", + "for i in range(len(files_path)) :\n", + " current_path = files_path[i]\n", + " with fs.open(current_path, mode=\"rb\") as file_in:\n", + " df = pd.read_csv(file_in)\n", + " # the pattern of the name is df1xxx\n", + " nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n", + " globals()[nom_dataframe] = df" + ] + }, + { + "cell_type": "markdown", + "id": "13d70b2c-6580-4caf-b839-10f72b2e0b39", + "metadata": {}, + "source": [ + "## Target, target types and customer target mapping" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "4dbc7fea-ac3b-4348-83fb-dfb1a460f936", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idis_importnamecreated_atupdated_atidentifier
069Falsemanual_dynamic_filter2020-11-30 09:46:18.881030+01:002020-11-30 09:46:18.881030+01:00e0f4b8693184850fefd6d2a38f10584e
148Truemanual_structure2020-11-04 17:16:19.548275+01:002020-11-04 17:16:19.548275+01:00382bca214204a2d3462f5ec2728d5d1e
21Truemanual_import2020-10-14 18:37:40.521623+02:002020-10-14 18:37:40.521623+02:0012213df2ce68a624e4c0070521437bac
356Falsemanual_static_filter2020-11-04 18:08:37.233486+01:002020-11-04 18:08:37.233486+01:00fb27e81baa4debc6a4e1a8639c20e808
\n", + "
" + ], + "text/plain": [ + " id is_import name created_at \\\n", + "0 69 False manual_dynamic_filter 2020-11-30 09:46:18.881030+01:00 \n", + "1 48 True manual_structure 2020-11-04 17:16:19.548275+01:00 \n", + "2 1 True manual_import 2020-10-14 18:37:40.521623+02:00 \n", + "3 56 False manual_static_filter 2020-11-04 18:08:37.233486+01:00 \n", + "\n", + " updated_at identifier \n", + "0 2020-11-30 09:46:18.881030+01:00 e0f4b8693184850fefd6d2a38f10584e \n", + "1 2020-11-04 17:16:19.548275+01:00 382bca214204a2d3462f5ec2728d5d1e \n", + "2 2020-10-14 18:37:40.521623+02:00 12213df2ce68a624e4c0070521437bac \n", + "3 2020-11-04 18:08:37.233486+01:00 fb27e81baa4debc6a4e1a8639c20e808 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 1. target types\n", + "df1_target_types.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "0e9f5dcb-0dc3-4052-b866-e5c4cb954a1f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtarget_type_idnamecreated_atupdated_at
021756DDCP PROMO Art contemporain - salle de chauffe...2021-01-04 15:00:05.401899+01:002021-03-02 18:38:19.025969+01:00
170156consentement optin scolaires2021-12-21 16:03:59.840785+01:002022-02-18 17:23:44.761388+01:00
213456DDCP Newsletter jeune public2020-11-10 09:43:19.667471+01:002021-03-02 18:38:19.052304+01:00
370056consentement optout scolaires2021-12-21 16:01:57.524946+01:002022-02-18 17:23:44.807776+01:00
496456DDCP achat billet nbr dep 190520212022-04-14 10:58:17.142834+02:002022-04-14 10:58:23.677264+02:00
\n", + "
" + ], + "text/plain": [ + " id target_type_id name \\\n", + "0 217 56 DDCP PROMO Art contemporain - salle de chauffe... \n", + "1 701 56 consentement optin scolaires \n", + "2 134 56 DDCP Newsletter jeune public \n", + "3 700 56 consentement optout scolaires \n", + "4 964 56 DDCP achat billet nbr dep 19052021 \n", + "\n", + " created_at updated_at \n", + "0 2021-01-04 15:00:05.401899+01:00 2021-03-02 18:38:19.025969+01:00 \n", + "1 2021-12-21 16:03:59.840785+01:00 2022-02-18 17:23:44.761388+01:00 \n", + "2 2020-11-10 09:43:19.667471+01:00 2021-03-02 18:38:19.052304+01:00 \n", + "3 2021-12-21 16:01:57.524946+01:00 2022-02-18 17:23:44.807776+01:00 \n", + "4 2022-04-14 10:58:17.142834+02:00 2022-04-14 10:58:23.677264+02:00 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 2. targets\n", + "df1_targets.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "c5c62302-370a-462f-bd79-eac31593f65c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idcustomer_idtarget_idcreated_atupdated_atnameextra_field
011848246454001302021-09-23 09:35:47.617275+02:002021-09-23 09:35:47.617275+02:00NaNNaN
111848256454003452021-09-23 09:35:47.668846+02:002021-09-23 09:35:47.668846+02:00NaNNaN
211848286454021262021-09-23 12:02:51.253269+02:002021-09-23 12:02:51.253269+02:00NaNNaN
311848296454031262021-09-23 12:20:47.394480+02:002021-09-23 12:20:47.394480+02:00NaNNaN
412957706473013462021-09-28 16:02:29.372608+02:002021-09-28 16:02:29.372608+02:00NaNNaN
\n", + "
" + ], + "text/plain": [ + " id customer_id target_id created_at \\\n", + "0 1184824 645400 130 2021-09-23 09:35:47.617275+02:00 \n", + "1 1184825 645400 345 2021-09-23 09:35:47.668846+02:00 \n", + "2 1184828 645402 126 2021-09-23 12:02:51.253269+02:00 \n", + "3 1184829 645403 126 2021-09-23 12:20:47.394480+02:00 \n", + "4 1295770 647301 346 2021-09-28 16:02:29.372608+02:00 \n", + "\n", + " updated_at name extra_field \n", + "0 2021-09-23 09:35:47.617275+02:00 NaN NaN \n", + "1 2021-09-23 09:35:47.668846+02:00 NaN NaN \n", + "2 2021-09-23 12:02:51.253269+02:00 NaN NaN \n", + "3 2021-09-23 12:20:47.394480+02:00 NaN NaN \n", + "4 2021-09-28 16:02:29.372608+02:00 NaN NaN " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 3. customer target mapping\n", + "\n", + "df1_customer_target_mappings.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "1a87cebf-c1dd-408d-a523-26633419da1e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtarget_type_idnametarget_type_is_importtarget_type_name
021756DDCP PROMO Art contemporain - salle de chauffe...Falsemanual_static_filter
170156consentement optin scolairesFalsemanual_static_filter
213456DDCP Newsletter jeune publicFalsemanual_static_filter
370056consentement optout scolairesFalsemanual_static_filter
496456DDCP achat billet nbr dep 19052021Falsemanual_static_filter
\n", + "
" + ], + "text/plain": [ + " id target_type_id name \\\n", + "0 217 56 DDCP PROMO Art contemporain - salle de chauffe... \n", + "1 701 56 consentement optin scolaires \n", + "2 134 56 DDCP Newsletter jeune public \n", + "3 700 56 consentement optout scolaires \n", + "4 964 56 DDCP achat billet nbr dep 19052021 \n", + "\n", + " target_type_is_import target_type_name \n", + "0 False manual_static_filter \n", + "1 False manual_static_filter \n", + "2 False manual_static_filter \n", + "3 False manual_static_filter \n", + "4 False manual_static_filter " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 4.1. merge target with target type\n", + "\n", + "df1_targets_full = pd.merge(df1_targets[[\"id\", \"target_type_id\", \"name\"]], df1_target_types[[\"id\",\"is_import\",\"name\"]].add_prefix(\"target_type_\"), left_on='target_type_id', right_on='target_type_id', how='left')\n", + "df1_targets_full.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "d48c1fff-73c2-4e75-8799-da2b80694be7", + "metadata": {}, + "outputs": [], + "source": [ + "# 4.2. 
merge df1_customer_target_mappings with df1_targets_full\n", + "\n", + "# change the position of the column target type id\n", + "\n", + "# Spécifiez le nom de la colonne à déplacer et la colonne après laquelle vous souhaitez la placer\n", + "column_to_move = 'target_type_id'\n", + "\n", + "# Récupérez l'index de la colonne de référence\n", + "reference_index = df1_targets_full.columns.get_loc(\"target_type_name\")\n", + "\n", + "# Créez une copie de la colonne que vous voulez déplacer\n", + "column_copy = df1_targets_full[column_to_move].copy()\n", + "\n", + "# Supprimez la colonne d'origine\n", + "df1_targets_full = df1_targets_full.drop(column_to_move, axis=1)\n", + "\n", + "# Utilisez la méthode insert pour déplacer la colonne à la nouvelle position\n", + "df1_targets_full.insert(reference_index - 1, column_to_move, column_copy)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "a874514a-c7dc-42d4-a440-dedd3a270e24", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
target_idtarget_nametarget_type_is_importtarget_type_idtarget_type_name
0217DDCP PROMO Art contemporain - salle de chauffe...False56manual_static_filter
1701consentement optin scolairesFalse56manual_static_filter
2134DDCP Newsletter jeune publicFalse56manual_static_filter
3700consentement optout scolairesFalse56manual_static_filter
4964DDCP achat billet nbr dep 19052021False56manual_static_filter
\n", + "
" + ], + "text/plain": [ + " target_id target_name \\\n", + "0 217 DDCP PROMO Art contemporain - salle de chauffe... \n", + "1 701 consentement optin scolaires \n", + "2 134 DDCP Newsletter jeune public \n", + "3 700 consentement optout scolaires \n", + "4 964 DDCP achat billet nbr dep 19052021 \n", + "\n", + " target_type_is_import target_type_id target_type_name \n", + "0 False 56 manual_static_filter \n", + "1 False 56 manual_static_filter \n", + "2 False 56 manual_static_filter \n", + "3 False 56 manual_static_filter \n", + "4 False 56 manual_static_filter " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_targets_full = df1_targets_full.rename(columns=lambda x: 'target_' + x if not x.startswith('target_') else x)\n", + "df1_targets_full.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "0db0172a-5119-4b7f-97f8-36fc5c985205", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idcustomer_idtarget_idtarget_nametarget_type_is_importtarget_type_idtarget_type_name
01184824645400130DDCP PROMO Réseau livresFalse56manual_static_filter
11184825645400345Inscrits NL générale site webFalse56manual_static_filter
21184828645402126DDCP PROMO Art contemporainFalse56manual_static_filter
31184829645403126DDCP PROMO Art contemporainFalse56manual_static_filter
41295770647301346Votre première listeFalse56manual_static_filter
........................
7680192737545666983345Inscrits NL générale site webFalse56manual_static_filter
7680202737546666983346Votre première listeFalse56manual_static_filter
7680212737575666986346Votre première listeFalse56manual_static_filter
7680222737576666987345Inscrits NL générale site webFalse56manual_static_filter
7680232737577666987346Votre première listeFalse56manual_static_filter
\n", + "

768024 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " id customer_id target_id target_name \\\n", + "0 1184824 645400 130 DDCP PROMO Réseau livres \n", + "1 1184825 645400 345 Inscrits NL générale site web \n", + "2 1184828 645402 126 DDCP PROMO Art contemporain \n", + "3 1184829 645403 126 DDCP PROMO Art contemporain \n", + "4 1295770 647301 346 Votre première liste \n", + "... ... ... ... ... \n", + "768019 2737545 666983 345 Inscrits NL générale site web \n", + "768020 2737546 666983 346 Votre première liste \n", + "768021 2737575 666986 346 Votre première liste \n", + "768022 2737576 666987 345 Inscrits NL générale site web \n", + "768023 2737577 666987 346 Votre première liste \n", + "\n", + " target_type_is_import target_type_id target_type_name \n", + "0 False 56 manual_static_filter \n", + "1 False 56 manual_static_filter \n", + "2 False 56 manual_static_filter \n", + "3 False 56 manual_static_filter \n", + "4 False 56 manual_static_filter \n", + "... ... ... ... \n", + "768019 False 56 manual_static_filter \n", + "768020 False 56 manual_static_filter \n", + "768021 False 56 manual_static_filter \n", + "768022 False 56 manual_static_filter \n", + "768023 False 56 manual_static_filter \n", + "\n", + "[768024 rows x 7 columns]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# finally, merge\n", + "\n", + "# pour df1_customer_target_mappings on enlève les colonnes name, extra_field, et updated_at (valeur égale à created_at)\n", + "# note : by making a left join on df1_customer_target_mappings, we suppress 2 targets that have no customer associated\n", + "\n", + "df1_customer_targets = pd.merge(df1_customer_target_mappings[[\"id\", \"customer_id\", \"target_id\"]], \n", + " df1_targets_full, left_on='target_id', right_on='target_id', how='left')\n", + "df1_customer_targets" + ] + }, + { + "cell_type": "markdown", + "id": "52326267-c5ba-4e21-b8ab-4b4c62de75d1", + "metadata": {}, + "source": [ + "## Campaign stats, 
campaigns" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "06dca910-5c07-4ee1-bbf2-3b11b48ba1f2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnameservice_idcreated_atupdated_atprocess_idreport_urlcategoryto_be_syncedidentifiersent_at
01319613newsletter enseignants janvier 20227212022-01-14 16:06:42.586321+01:002022-02-03 14:17:27.112963+01:00NaNNaN0.0Falseaba3b6fd5d186d28e06ff97135cade7f2022-01-14 00:00:00+01:00
11319586lsf_janvier_20227172022-01-07 11:30:35.315895+01:002022-02-03 14:17:27.116171+01:00NaNNaN0.0False788d986905533aba051261497ecffcbb2022-01-07 00:00:00+01:00
21319282Invitation à déjeuner au Mucem | Vernissage « ...5912021-09-28 12:50:24.448752+02:002022-02-03 14:17:27.119582+01:00NaNNaN0.0False3493894fa4ea036cfc6433c3e2ee63b02021-09-28 00:00:00+02:00
31319283Vacances de la Toussaint - centres des loisirs5902021-09-28 18:01:04.692073+02:002022-02-03 14:17:27.124408+01:00NaNNaN0.0False08b255a5d42b89b0585260b6f2360bdd2021-09-28 00:00:00+02:00
41319636ddcp_promo_md_livemag7302022-01-27 18:00:41.053069+01:002022-02-03 14:17:27.127607+01:00NaNNaN0.0Falsed5cfead94f5350c12c322b5b664544c12022-01-27 00:00:00+01:00
\n", + "
" + ], + "text/plain": [ + " id name service_id \\\n", + "0 1319613 newsletter enseignants janvier 2022 721 \n", + "1 1319586 lsf_janvier_2022 717 \n", + "2 1319282 Invitation à déjeuner au Mucem | Vernissage « ... 591 \n", + "3 1319283 Vacances de la Toussaint - centres des loisirs 590 \n", + "4 1319636 ddcp_promo_md_livemag 730 \n", + "\n", + " created_at updated_at \\\n", + "0 2022-01-14 16:06:42.586321+01:00 2022-02-03 14:17:27.112963+01:00 \n", + "1 2022-01-07 11:30:35.315895+01:00 2022-02-03 14:17:27.116171+01:00 \n", + "2 2021-09-28 12:50:24.448752+02:00 2022-02-03 14:17:27.119582+01:00 \n", + "3 2021-09-28 18:01:04.692073+02:00 2022-02-03 14:17:27.124408+01:00 \n", + "4 2022-01-27 18:00:41.053069+01:00 2022-02-03 14:17:27.127607+01:00 \n", + "\n", + " process_id report_url category to_be_synced \\\n", + "0 NaN NaN 0.0 False \n", + "1 NaN NaN 0.0 False \n", + "2 NaN NaN 0.0 False \n", + "3 NaN NaN 0.0 False \n", + "4 NaN NaN 0.0 False \n", + "\n", + " identifier sent_at \n", + "0 aba3b6fd5d186d28e06ff97135cade7f 2022-01-14 00:00:00+01:00 \n", + "1 788d986905533aba051261497ecffcbb 2022-01-07 00:00:00+01:00 \n", + "2 3493894fa4ea036cfc6433c3e2ee63b0 2021-09-28 00:00:00+02:00 \n", + "3 08b255a5d42b89b0585260b6f2360bdd 2021-09-28 00:00:00+02:00 \n", + "4 d5cfead94f5350c12c322b5b664544c1 2022-01-27 00:00:00+01:00 " + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 1. campaigns\n", + "df1_campaigns.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "83eaa447-9144-41ed-9e26-f0f23799a8fd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idcampaign_idcustomer_idopened_atsent_atdelivered_atcreated_atupdated_at
01979358112597NaN2021-03-28 18:01:09+02:002021-03-28 18:24:18+02:002021-03-28 18:34:20.616136+02:002022-04-15 22:52:04.397693+02:00
11421158113666NaN2021-03-28 18:01:09+02:002021-03-28 18:21:02+02:002021-03-28 18:21:04.297213+02:002022-04-15 22:52:04.397693+02:00
21315058280561NaN2021-03-28 18:00:59+02:002021-03-28 18:08:45+02:002021-03-28 18:18:49.991042+02:002022-04-15 22:52:04.397693+02:00
37073581010072021-03-28 20:11:06+02:002021-03-28 18:00:59+02:002021-03-28 18:09:47+02:002021-03-28 18:09:50.915354+02:002022-04-15 22:52:04.397693+02:00
4517558103972NaN2021-03-28 18:01:06+02:002021-03-28 18:05:03+02:002021-03-28 18:05:08.507398+02:002022-04-15 22:52:04.397693+02:00
\n", + "
" + ], + "text/plain": [ + " id campaign_id customer_id opened_at \\\n", + "0 19793 58 112597 NaN \n", + "1 14211 58 113666 NaN \n", + "2 13150 58 280561 NaN \n", + "3 7073 58 101007 2021-03-28 20:11:06+02:00 \n", + "4 5175 58 103972 NaN \n", + "\n", + " sent_at delivered_at \\\n", + "0 2021-03-28 18:01:09+02:00 2021-03-28 18:24:18+02:00 \n", + "1 2021-03-28 18:01:09+02:00 2021-03-28 18:21:02+02:00 \n", + "2 2021-03-28 18:00:59+02:00 2021-03-28 18:08:45+02:00 \n", + "3 2021-03-28 18:00:59+02:00 2021-03-28 18:09:47+02:00 \n", + "4 2021-03-28 18:01:06+02:00 2021-03-28 18:05:03+02:00 \n", + "\n", + " created_at updated_at \n", + "0 2021-03-28 18:34:20.616136+02:00 2022-04-15 22:52:04.397693+02:00 \n", + "1 2021-03-28 18:21:04.297213+02:00 2022-04-15 22:52:04.397693+02:00 \n", + "2 2021-03-28 18:18:49.991042+02:00 2022-04-15 22:52:04.397693+02:00 \n", + "3 2021-03-28 18:09:50.915354+02:00 2022-04-15 22:52:04.397693+02:00 \n", + "4 2021-03-28 18:05:08.507398+02:00 2022-04-15 22:52:04.397693+02:00 " + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 2. campaigns stats\n", + "df1_campaign_stats.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "7f25eb1b-e7c8-4715-bc30-7ac29a7181ac", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idcampaign_idcustomer_idopened_atsent_atdelivered_atcampaign_namecampaign_service_idcampaign_sent_at
01979358112597NaN2021-03-28 18:01:09+02:002021-03-28 18:24:18+02:00Le Mucem chez vous, gardons le lien #224042021-03-28 00:00:00+01:00
11421158113666NaN2021-03-28 18:01:09+02:002021-03-28 18:21:02+02:00Le Mucem chez vous, gardons le lien #224042021-03-28 00:00:00+01:00
21315058280561NaN2021-03-28 18:00:59+02:002021-03-28 18:08:45+02:00Le Mucem chez vous, gardons le lien #224042021-03-28 00:00:00+01:00
37073581010072021-03-28 20:11:06+02:002021-03-28 18:00:59+02:002021-03-28 18:09:47+02:00Le Mucem chez vous, gardons le lien #224042021-03-28 00:00:00+01:00
4517558103972NaN2021-03-28 18:01:06+02:002021-03-28 18:05:03+02:00Le Mucem chez vous, gardons le lien #224042021-03-28 00:00:00+01:00
\n", + "
" + ], + "text/plain": [ + " id campaign_id customer_id opened_at \\\n", + "0 19793 58 112597 NaN \n", + "1 14211 58 113666 NaN \n", + "2 13150 58 280561 NaN \n", + "3 7073 58 101007 2021-03-28 20:11:06+02:00 \n", + "4 5175 58 103972 NaN \n", + "\n", + " sent_at delivered_at \\\n", + "0 2021-03-28 18:01:09+02:00 2021-03-28 18:24:18+02:00 \n", + "1 2021-03-28 18:01:09+02:00 2021-03-28 18:21:02+02:00 \n", + "2 2021-03-28 18:00:59+02:00 2021-03-28 18:08:45+02:00 \n", + "3 2021-03-28 18:00:59+02:00 2021-03-28 18:09:47+02:00 \n", + "4 2021-03-28 18:01:06+02:00 2021-03-28 18:05:03+02:00 \n", + "\n", + " campaign_name campaign_service_id \\\n", + "0 Le Mucem chez vous, gardons le lien #22 404 \n", + "1 Le Mucem chez vous, gardons le lien #22 404 \n", + "2 Le Mucem chez vous, gardons le lien #22 404 \n", + "3 Le Mucem chez vous, gardons le lien #22 404 \n", + "4 Le Mucem chez vous, gardons le lien #22 404 \n", + "\n", + " campaign_sent_at \n", + "0 2021-03-28 00:00:00+01:00 \n", + "1 2021-03-28 00:00:00+01:00 \n", + "2 2021-03-28 00:00:00+01:00 \n", + "3 2021-03-28 00:00:00+01:00 \n", + "4 2021-03-28 00:00:00+01:00 " + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 3. merge campaigns and campaigns stats\n", + "\n", + "df1_campaigns_full = pd.merge(df1_campaign_stats[[\"id\", \"campaign_id\", \"customer_id\", \"opened_at\", \"sent_at\", \"delivered_at\"]], \n", + " df1_campaigns[[\"id\", \"name\", \"service_id\", \"sent_at\"]].add_prefix(\"campaign_\"),\n", + " on = \"campaign_id\", how = \"left\")\n", + "df1_campaigns_full.head()" + ] + }, + { + "cell_type": "markdown", + "id": "87fc686a-4a80-40ab-9987-20d2774f3055", + "metadata": {}, + "source": [ + "## Link stats" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "2f9df2d0-8a23-496b-8e92-617285f64530", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idclicked_atlink_idcustomer_idcreated_atupdated_at
012021-03-26 16:30:36+01:0012840332021-03-26 15:30:37.050161+01:002021-03-26 15:30:37.050161+01:00
122021-03-26 17:16:34+01:0021197682021-03-26 16:16:34.950871+01:002021-03-26 16:16:34.950871+01:00
22722021-03-28 20:03:32+02:00421131052021-03-28 18:03:32.736394+02:002021-03-28 18:03:32.736394+02:00
342021-03-26 17:43:19+01:0032722802021-03-26 16:43:19.338321+01:002021-03-26 16:43:19.338321+01:00
452021-03-26 17:46:00+01:0031050952021-03-26 16:46:00.502945+01:002021-03-26 16:46:00.502945+01:00
.....................
1510462435532023-11-09 16:34:27+01:00146669982023-11-09 15:34:29.425425+01:002023-11-09 15:34:29.425425+01:00
1510472435542023-11-09 16:34:35+01:00146709982023-11-09 15:34:37.505505+01:002023-11-09 15:34:37.505505+01:00
1510482435592023-11-09 16:51:15+01:0014686829232023-11-09 15:51:17.439518+01:002023-11-09 15:51:17.439518+01:00
1510492435612023-11-09 16:59:42+01:0014677829232023-11-09 15:59:44.030922+01:002023-11-09 15:59:44.030922+01:00
1510502435642023-11-09 17:16:41+01:001469112543552023-11-09 16:16:43.012932+01:002023-11-09 16:16:43.012932+01:00
\n", + "

151051 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " id clicked_at link_id customer_id \\\n", + "0 1 2021-03-26 16:30:36+01:00 1 284033 \n", + "1 2 2021-03-26 17:16:34+01:00 2 119768 \n", + "2 272 2021-03-28 20:03:32+02:00 42 113105 \n", + "3 4 2021-03-26 17:43:19+01:00 3 272280 \n", + "4 5 2021-03-26 17:46:00+01:00 3 105095 \n", + "... ... ... ... ... \n", + "151046 243553 2023-11-09 16:34:27+01:00 14666 998 \n", + "151047 243554 2023-11-09 16:34:35+01:00 14670 998 \n", + "151048 243559 2023-11-09 16:51:15+01:00 14686 82923 \n", + "151049 243561 2023-11-09 16:59:42+01:00 14677 82923 \n", + "151050 243564 2023-11-09 17:16:41+01:00 14691 1254355 \n", + "\n", + " created_at updated_at \n", + "0 2021-03-26 15:30:37.050161+01:00 2021-03-26 15:30:37.050161+01:00 \n", + "1 2021-03-26 16:16:34.950871+01:00 2021-03-26 16:16:34.950871+01:00 \n", + "2 2021-03-28 18:03:32.736394+02:00 2021-03-28 18:03:32.736394+02:00 \n", + "3 2021-03-26 16:43:19.338321+01:00 2021-03-26 16:43:19.338321+01:00 \n", + "4 2021-03-26 16:46:00.502945+01:00 2021-03-26 16:46:00.502945+01:00 \n", + "... ... ... \n", + "151046 2023-11-09 15:34:29.425425+01:00 2023-11-09 15:34:29.425425+01:00 \n", + "151047 2023-11-09 15:34:37.505505+01:00 2023-11-09 15:34:37.505505+01:00 \n", + "151048 2023-11-09 15:51:17.439518+01:00 2023-11-09 15:51:17.439518+01:00 \n", + "151049 2023-11-09 15:59:44.030922+01:00 2023-11-09 15:59:44.030922+01:00 \n", + "151050 2023-11-09 16:16:43.012932+01:00 2023-11-09 16:16:43.012932+01:00 \n", + "\n", + "[151051 rows x 6 columns]" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_link_stats" + ] + }, + { + "cell_type": "markdown", + "id": "aad6fb14-9694-4c1e-9885-1ebe0f38afe3", + "metadata": {}, + "source": [ + "## Bonus : peut-on lier link stats et campaign ? 
Non, les dates à laquelle le client clique sur le lie/ouvre la campagne ne permettent pas de faire coincider link_id et campaign_id" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "8be7c974-72c9-4e31-a874-d7e5d2719fb3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idclicked_atlink_idcustomer_idcreated_atupdated_at
012021-03-26 16:30:36+01:0012840332021-03-26 15:30:37.050161+01:002021-03-26 15:30:37.050161+01:00
7526140182021-05-10 18:07:59+02:003122840332021-05-10 16:08:00.541322+02:002021-05-10 16:08:00.541322+02:00
968481334492021-03-25 08:42:22+01:0042840332022-04-15 22:51:01.994343+02:002022-04-15 22:51:01.994343+02:00
1157282075442022-08-23 10:33:04+02:00123652840332022-08-23 08:33:06.498908+02:002022-08-23 08:33:06.498908+02:00
\n", + "
" + ], + "text/plain": [ + " id clicked_at link_id customer_id \\\n", + "0 1 2021-03-26 16:30:36+01:00 1 284033 \n", + "7526 14018 2021-05-10 18:07:59+02:00 312 284033 \n", + "96848 133449 2021-03-25 08:42:22+01:00 4 284033 \n", + "115728 207544 2022-08-23 10:33:04+02:00 12365 284033 \n", + "\n", + " created_at updated_at \n", + "0 2021-03-26 15:30:37.050161+01:00 2021-03-26 15:30:37.050161+01:00 \n", + "7526 2021-05-10 16:08:00.541322+02:00 2021-05-10 16:08:00.541322+02:00 \n", + "96848 2022-04-15 22:51:01.994343+02:00 2022-04-15 22:51:01.994343+02:00 \n", + "115728 2022-08-23 08:33:06.498908+02:00 2022-08-23 08:33:06.498908+02:00 " + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_link_stats[df1_link_stats[\"customer_id\"] == 284033]" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "id": "902e9947-58e1-44f4-b634-1239b0e4df02", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idcampaign_idcustomer_idopened_atsent_atdelivered_atcampaign_namecampaign_service_idcampaign_sent_at
403064340363764284033NaN2021-03-21 18:01:22+01:002021-03-21 18:08:04+01:00Le Mucem chez vous, gardons le lien #213982021-03-21 00:00:00+01:00
\n", + "
" + ], + "text/plain": [ + " id campaign_id customer_id opened_at \\\n", + "4030643 4036376 4 284033 NaN \n", + "\n", + " sent_at delivered_at \\\n", + "4030643 2021-03-21 18:01:22+01:00 2021-03-21 18:08:04+01:00 \n", + "\n", + " campaign_name campaign_service_id \\\n", + "4030643 Le Mucem chez vous, gardons le lien #21 398 \n", + "\n", + " campaign_sent_at \n", + "4030643 2021-03-21 00:00:00+01:00 " + ] + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_campaigns_full[ (df1_campaigns_full[\"customer_id\"] == 284033) & (df1_campaigns_full[\"campaign_id\"] == 4)]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Traitement_Fanta.ipynb b/Traitement_Fanta.ipynb index a456ad0..5e3529d 100644 --- a/Traitement_Fanta.ipynb +++ b/Traitement_Fanta.ipynb @@ -29,7 +29,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "b6035982-9ff4-4013-9792-2d50e10db3d1", "metadata": {}, "outputs": [ @@ -66,7 +66,7 @@ " 'bdc2324-data/1/1type_ofs.csv']" ] }, - "execution_count": 3, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -84,7 +84,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "b86c935d-124f-453f-80dd-83ea6770d09c", "metadata": {}, "outputs": [], @@ -94,7 +94,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 4, "id": "f6d0b27c-0ecd-406b-b042-6c3802dd68fd", "metadata": {}, "outputs": [ @@ -102,7 +102,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_1054/1008972637.py:5: 
DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n", + "/tmp/ipykernel_447/1008972637.py:5: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n", " globals()[nom_base] = pd.read_csv(file_in, sep=\",\")\n" ] } @@ -117,7 +117,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 5, "id": "2a6b5e22-3370-457f-83b7-dd1e13663229", "metadata": {}, "outputs": [ @@ -127,7 +127,7 @@ "'bdc2324-data/1/1type_ofs.csv'" ] }, - "execution_count": 11, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -136,6 +136,22 @@ "FILE_PATH_S3_fanta" ] }, + { + "cell_type": "markdown", + "id": "79012186-ea51-4252-843e-36a9bbe3847e", + "metadata": {}, + "source": [ + "# Analyse exploratoire " + ] + }, + { + "cell_type": "markdown", + "id": "1a365f29-4766-47d8-9796-24a5271867b2", + "metadata": {}, + "source": [ + "## I. Base type_of_pricing_formulas" + ] + }, { "cell_type": "markdown", "id": "bcc14f93-2289-44eb-816b-a51049b258df", @@ -145,21 +161,17 @@ ] }, { - "cell_type": "code", - "execution_count": 12, - "id": "7f8083ec-3d08-4c4e-8d26-a5a4948c1c02", + "cell_type": "raw", + "id": "ab2ec4c4-9d38-4aeb-8202-9116df3cdd66", "metadata": {}, - "outputs": [], "source": [ "dic_prod_princing=['type_of_pricing_formulas','products_groups','pricing_formulas','product_packs','products']" ] }, { - "cell_type": "code", - "execution_count": 16, - "id": "a6de36fa-3d35-4b20-97f2-3e24d54c7f99", + "cell_type": "markdown", + "id": "88759b4a-2633-478d-abce-29abeac376d1", "metadata": {}, - "outputs": [], "source": [ "def verifier_donnees_manquantes(base):\n", " donnees_manquantes = base.isna().sum()\n", @@ -168,24 +180,9 @@ ] }, { - "cell_type": "code", - "execution_count": 17, - "id": "1c261736-11fb-44f4-a4b1-830cae755a65", + "cell_type": "markdown", + "id": "df3075b4-1490-4cf2-a3fe-c6d4e2144ae3", "metadata": {}, - "outputs": [ - { - "ename": "AttributeError", - 
"evalue": "'str' object has no attribute 'isna'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[17], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m nom_base \u001b[38;5;129;01min\u001b[39;00m dic_prod_princing:\n\u001b[0;32m----> 2\u001b[0m \u001b[43mverifier_donnees_manquantes\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnom_base\u001b[49m\u001b[43m)\u001b[49m\n", - "Cell \u001b[0;32mIn[16], line 2\u001b[0m, in \u001b[0;36mverifier_donnees_manquantes\u001b[0;34m(base)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mverifier_donnees_manquantes\u001b[39m(base):\n\u001b[0;32m----> 2\u001b[0m donnees_manquantes \u001b[38;5;241m=\u001b[39m \u001b[43mbase\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43misna\u001b[49m()\u001b[38;5;241m.\u001b[39msum()\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDonnées manquantes pour la base :\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28mprint\u001b[39m(donnees_manquantes)\n", - "\u001b[0;31mAttributeError\u001b[0m: 'str' object has no attribute 'isna'" - ] - } - ], "source": [ "for nom_base in dic_prod_princing:\n", " verifier_donnees_manquantes(nom_base)" @@ -193,7 +190,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 6, "id": "e0c67c01-e837-4772-b070-d1be0d895a36", "metadata": {}, "outputs": [ @@ -209,20 +206,1492 @@ "dtype: int64" ] }, - "execution_count": 14, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "#detection des Nan d\n", + "\n", "type_of_pricing_formulas.isna().sum()" ] }, { "cell_type": "code", "execution_count": null, + "id": "83a6a48d-effe-4537-b4bb-d5a540b610f1", + "metadata": {}, + "outputs": [], + 
"source": [ + "#variable retenu:[[\"id\",\"type_of_id\",\"pricing_formula_id\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "3eaffaa6-1164-4ee9-a671-8b5eb3df797d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtype_of_idpricing_formula_idcreated_atupdated_atidentifier
0111272021-01-05 11:55:51.226960+01:002021-01-05 11:55:51.226960+01:00cf2918b25e6dcf8c30798ca05c8ec8ed
12124252021-01-05 11:55:51.235606+01:002021-01-05 11:55:51.235606+01:002c8ee3f7c1487d792b6c946314e681f2
23129372021-01-05 11:55:51.240114+01:002021-01-05 11:55:51.240114+01:0044e55c85e4eb59b3c3c01c137a6b25fc
341482021-01-05 11:55:51.244638+01:002021-01-05 11:55:51.244638+01:00ee3bb93b7e2217cd86a49d547fedf6c6
45172021-01-05 11:55:51.249409+01:002021-01-05 11:55:51.249409+01:00ae701668574f1a653d2b21ddfd250620
.....................
563564466562022-02-18 16:15:58.872249+01:002022-02-18 16:15:58.872249+01:00f669824cdca9de9697f07ff3ba365a8d
564565466072022-02-18 16:15:59.231018+01:002022-02-18 16:15:59.231018+01:006421c8146a598758139153b0e7b921ea
565566467002022-02-18 16:15:59.724812+01:002022-02-18 16:15:59.724812+01:006823f6d4d80b322fbfb8b83545a9f96d
566567481182022-02-18 16:16:00.163381+01:002022-02-18 16:16:00.163381+01:0035cfc12584b4d1b94795d97fd0aa56e8
5675697481572023-03-13 11:30:29.480161+01:002023-03-13 11:30:29.480161+01:0055863541f33fd229ac9b54d9ec1f4874
\n", + "

568 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " id type_of_id pricing_formula_id created_at \\\n", + "0 1 1 127 2021-01-05 11:55:51.226960+01:00 \n", + "1 2 1 2425 2021-01-05 11:55:51.235606+01:00 \n", + "2 3 1 2937 2021-01-05 11:55:51.240114+01:00 \n", + "3 4 1 48 2021-01-05 11:55:51.244638+01:00 \n", + "4 5 1 7 2021-01-05 11:55:51.249409+01:00 \n", + ".. ... ... ... ... \n", + "563 564 4 6656 2022-02-18 16:15:58.872249+01:00 \n", + "564 565 4 6607 2022-02-18 16:15:59.231018+01:00 \n", + "565 566 4 6700 2022-02-18 16:15:59.724812+01:00 \n", + "566 567 4 8118 2022-02-18 16:16:00.163381+01:00 \n", + "567 569 7 48157 2023-03-13 11:30:29.480161+01:00 \n", + "\n", + " updated_at identifier \n", + "0 2021-01-05 11:55:51.226960+01:00 cf2918b25e6dcf8c30798ca05c8ec8ed \n", + "1 2021-01-05 11:55:51.235606+01:00 2c8ee3f7c1487d792b6c946314e681f2 \n", + "2 2021-01-05 11:55:51.240114+01:00 44e55c85e4eb59b3c3c01c137a6b25fc \n", + "3 2021-01-05 11:55:51.244638+01:00 ee3bb93b7e2217cd86a49d547fedf6c6 \n", + "4 2021-01-05 11:55:51.249409+01:00 ae701668574f1a653d2b21ddfd250620 \n", + ".. ... ... 
\n", + "563 2022-02-18 16:15:58.872249+01:00 f669824cdca9de9697f07ff3ba365a8d \n", + "564 2022-02-18 16:15:59.231018+01:00 6421c8146a598758139153b0e7b921ea \n", + "565 2022-02-18 16:15:59.724812+01:00 6823f6d4d80b322fbfb8b83545a9f96d \n", + "566 2022-02-18 16:16:00.163381+01:00 35cfc12584b4d1b94795d97fd0aa56e8 \n", + "567 2023-03-13 11:30:29.480161+01:00 55863541f33fd229ac9b54d9ec1f4874 \n", + "\n", + "[568 rows x 6 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type_of_pricing_formulas" + ] + }, + { + "cell_type": "code", + "execution_count": 8, "id": "57298669-8d55-40d5-a5aa-4c5df984eec7", "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id int64\n", + "type_of_id int64\n", + "pricing_formula_id int64\n", + "created_at object\n", + "updated_at object\n", + "identifier object\n", + "dtype: object" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#type des variables\n", + "\n", + "type_of_pricing_formulas.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "c11850cb-8833-44c0-a11d-9695d620a42b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtype_of_idpricing_formula_idcreated_atupdated_atidentifier
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [id, type_of_id, pricing_formula_id, created_at, updated_at, identifier]\n", + "Index: []" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Identification des doublons\n", + "type_of_pricing_formulas.loc[type_of_pricing_formulas['id'].duplicated(keep=False),:]" + ] + }, + { + "cell_type": "markdown", + "id": "7a40de03-5e18-4d3d-a0f8-da960c29fad8", + "metadata": {}, + "source": [ + "## II.products_groups" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "89909175-6734-4e8e-8632-d6f8ca812388", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id 0\n", + "percent_price 0\n", + "max_price 0\n", + "min_price 0\n", + "category_id 0\n", + "pricing_formula_id 0\n", + "representation_id 0\n", + "created_at 0\n", + "updated_at 0\n", + "dtype: int64" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#detection des Nan \n", + "\n", + "products_groups.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0518684-c83c-4f0a-89ea-d7dcfd60051d", + "metadata": {}, + "outputs": [], + "source": [ + "#variable retenu:[[\"id\",\"percent_price\",\"max_price\",\"min_price\",\"category_id\",\"pricing_formula_id\",\"representation_id\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "6a187170-96c4-48d2-9568-b270f67e2c27", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id int64\n", + "percent_price float64\n", + "max_price float64\n", + "min_price float64\n", + "category_id int64\n", + "pricing_formula_id int64\n", + "representation_id int64\n", + "created_at object\n", + "updated_at object\n", + "dtype: object" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#type des variables\n", + "\n", + "products_groups.dtypes" + ] + }, + { 
+ "cell_type": "code", + "execution_count": 12, + "id": "2fba2cb0-a6a4-43b2-a854-3be07939c28b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idpercent_pricemax_pricemin_pricecategory_idpricing_formula_idrepresentation_idcreated_atupdated_at
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [id, percent_price, max_price, min_price, category_id, pricing_formula_id, representation_id, created_at, updated_at]\n", + "Index: []" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Identification des doublons\n", + "products_groups.loc[products_groups[['id','pricing_formula_id','representation_id']].duplicated(keep=False),:]" + ] + }, + { + "cell_type": "markdown", + "id": "5312ac13-8fbd-4c3f-a98a-8c28f079a599", + "metadata": {}, + "source": [ + "## III.pricing_formulas" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "3383a773-0817-4b23-84e7-8d5d0c74b179", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamecreated_atupdated_atextra_fieldidentifier
041909visite mécènes 1h302022-07-08 07:08:26.802266+02:002022-07-08 07:08:26.802266+02:00NaN21d4b0043c12b21952b0797d140991a1
1502entree mucem tp( expo picasso)2020-09-03 13:43:59.816765+02:002022-02-18 15:57:55.792581+01:00NaN223b09e6c3f1f75dbf8df019af97a555
2504nombre de personnes cinema2020-09-03 13:43:59.818198+02:002021-01-25 19:16:05.187114+01:00NaNba33b7b6d225a75d713a356b49c4d915
3117spectacle tarif e famille tr2020-09-03 13:21:21.400249+02:002023-03-13 11:30:29.525335+01:00NaNa00b61ad933518856f86e63ca91a5750
41496billet nb famille mecene 1a2020-09-03 14:29:33.320952+02:002021-01-25 19:23:06.816402+01:00NaN7f6013803c242253a5ccde80f780984f
.....................
551529billet nb expo gr2020-09-03 13:43:59.835944+02:002022-02-18 15:57:55.792581+01:00NaN7d888e42abe101fc8b21dc88948c8b74
5523153nb pers visite scolaire rep2020-09-03 16:32:37.068864+02:002022-02-18 15:57:55.792581+01:00NaN3cf21731c25eee650d5b232ee4780563
5535847visite scolaire rep1h002021-06-09 18:10:49.742531+02:002022-02-18 15:55:03.576236+01:00NaNa7bb5a6892d55f0d5ee4ce5786ae5fc6
5545840france billet - entree ts2021-06-09 18:10:49.737576+02:002022-02-18 16:16:00.199543+01:00NaN4c53016fc65847646f600eff853593e5
5555863france billet - entree tp2021-06-09 18:12:49.269924+02:002022-02-18 16:16:00.199543+01:00NaN90e642c0e1ef6bc9f2bc43089798de00
\n", + "

556 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " id name created_at \\\n", + "0 41909 visite mécènes 1h30 2022-07-08 07:08:26.802266+02:00 \n", + "1 502 entree mucem tp( expo picasso) 2020-09-03 13:43:59.816765+02:00 \n", + "2 504 nombre de personnes cinema 2020-09-03 13:43:59.818198+02:00 \n", + "3 117 spectacle tarif e famille tr 2020-09-03 13:21:21.400249+02:00 \n", + "4 1496 billet nb famille mecene 1a 2020-09-03 14:29:33.320952+02:00 \n", + ".. ... ... ... \n", + "551 529 billet nb expo gr 2020-09-03 13:43:59.835944+02:00 \n", + "552 3153 nb pers visite scolaire rep 2020-09-03 16:32:37.068864+02:00 \n", + "553 5847 visite scolaire rep1h00 2021-06-09 18:10:49.742531+02:00 \n", + "554 5840 france billet - entree ts 2021-06-09 18:10:49.737576+02:00 \n", + "555 5863 france billet - entree tp 2021-06-09 18:12:49.269924+02:00 \n", + "\n", + " updated_at extra_field \\\n", + "0 2022-07-08 07:08:26.802266+02:00 NaN \n", + "1 2022-02-18 15:57:55.792581+01:00 NaN \n", + "2 2021-01-25 19:16:05.187114+01:00 NaN \n", + "3 2023-03-13 11:30:29.525335+01:00 NaN \n", + "4 2021-01-25 19:23:06.816402+01:00 NaN \n", + ".. ... ... \n", + "551 2022-02-18 15:57:55.792581+01:00 NaN \n", + "552 2022-02-18 15:57:55.792581+01:00 NaN \n", + "553 2022-02-18 15:55:03.576236+01:00 NaN \n", + "554 2022-02-18 16:16:00.199543+01:00 NaN \n", + "555 2022-02-18 16:16:00.199543+01:00 NaN \n", + "\n", + " identifier \n", + "0 21d4b0043c12b21952b0797d140991a1 \n", + "1 223b09e6c3f1f75dbf8df019af97a555 \n", + "2 ba33b7b6d225a75d713a356b49c4d915 \n", + "3 a00b61ad933518856f86e63ca91a5750 \n", + "4 7f6013803c242253a5ccde80f780984f \n", + ".. ... 
\n", + "551 7d888e42abe101fc8b21dc88948c8b74 \n", + "552 3cf21731c25eee650d5b232ee4780563 \n", + "553 a7bb5a6892d55f0d5ee4ce5786ae5fc6 \n", + "554 4c53016fc65847646f600eff853593e5 \n", + "555 90e642c0e1ef6bc9f2bc43089798de00 \n", + "\n", + "[556 rows x 6 columns]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pricing_formulas" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "d8130c73-6c5f-45b1-93ae-db7679c8ca56", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id 0.0\n", + "name 0.0\n", + "created_at 0.0\n", + "updated_at 0.0\n", + "extra_field 1.0\n", + "identifier 0.0\n", + "dtype: float64" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#detection des Nan \n", + "\n", + "pricing_formulas.isna().sum()/pricing_formulas.shape[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f2909c1-bc6a-443f-a077-84f6ce6b7ab5", + "metadata": {}, + "outputs": [], + "source": [ + "#variable retenu: [[\"id\",\"name\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "44f1dbfd-c3cf-464b-9877-f37fcc61da92", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id int64\n", + "name object\n", + "created_at object\n", + "updated_at object\n", + "extra_field float64\n", + "identifier object\n", + "dtype: object" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#type des variables\n", + "\n", + "pricing_formulas.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "6784b41b-da74-4fae-832e-16641ae710c1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamecreated_atupdated_atextra_fieldidentifier
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [id, name, created_at, updated_at, extra_field, identifier]\n", + "Index: []" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Identification des doublons\n", + "pricing_formulas.loc[pricing_formulas[['id']].duplicated(keep=False),:]" + ] + }, + { + "cell_type": "markdown", + "id": "2145b0a4-b73d-4530-8c12-a78b1cf86eae", + "metadata": {}, + "source": [ + "## IV. product_packs" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "e36b07a7-4f0b-4711-86a0-12a1d8158eef", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id 0.0\n", + "name 1.0\n", + "type_of 0.0\n", + "created_at 0.0\n", + "updated_at 0.0\n", + "identifier 0.0\n", + "dtype: float64" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#detection des Nan \n", + "\n", + "product_packs.isna().sum()/product_packs.shape[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0887a01-51ea-4034-84fe-dc4dbf2ad949", + "metadata": {}, + "outputs": [], + "source": [ + "#variable retenu:[[\"id\",\"name\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "8707396a-f86b-476d-a9f9-c39f8de1d02e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id int64\n", + "name float64\n", + "type_of int64\n", + "created_at object\n", + "updated_at object\n", + "identifier object\n", + "dtype: object" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#type des variables\n", + "\n", + "product_packs.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "4b102bd3-924b-43da-8915-be7664c23f97", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnametype_ofcreated_atupdated_atidentifier
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [id, name, type_of, created_at, updated_at, identifier]\n", + "Index: []" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Identification des doublons\n", + "product_packs.loc[product_packs[['id']].duplicated(keep=False),:]" + ] + }, + { + "cell_type": "markdown", + "id": "cfe0c525-896b-4731-b38e-306ff6ea0c65", + "metadata": {}, + "source": [ + "## V.products" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "968beb24-f70c-4eb6-8b1e-4b04bc7fe9c9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id 0.0\n", + "amount 0.0\n", + "is_full_price 0.0\n", + "representation_id 0.0\n", + "pricing_formula_id 0.0\n", + "created_at 0.0\n", + "updated_at 0.0\n", + "category_id 0.0\n", + "apply_price 0.0\n", + "products_group_id 0.0\n", + "product_pack_id 0.0\n", + "extra_field 1.0\n", + "amount_consumption 1.0\n", + "identifier 0.0\n", + "dtype: float64" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#detection des Nan \n", + "\n", + "products.isna().sum()/products.shape[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "15bc6ac6-67e8-4e2c-9641-7ee8bb2581a3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id int64\n", + "amount float64\n", + "is_full_price bool\n", + "representation_id int64\n", + "pricing_formula_id int64\n", + "created_at object\n", + "updated_at object\n", + "category_id int64\n", + "apply_price float64\n", + "products_group_id int64\n", + "product_pack_id int64\n", + "extra_field float64\n", + "amount_consumption float64\n", + "identifier object\n", + "dtype: object" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#type des variables\n", + "\n", + "products.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 
22, + "id": "7daa4f1a-e429-4daf-a2e1-1e311b487e09", + "metadata": {}, + "outputs": [], + "source": [ + "#dic_prod_princing=['type_of_pricing_formulas','products_groups','pricing_formulas','product_packs','products']" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "dc12b746-6708-4708-826a-acb5a8e665a1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamecreated_atupdated_atextra_fieldidentifier
041909visite mécènes 1h302022-07-08 07:08:26.802266+02:002022-07-08 07:08:26.802266+02:00NaN21d4b0043c12b21952b0797d140991a1
1502entree mucem tp( expo picasso)2020-09-03 13:43:59.816765+02:002022-02-18 15:57:55.792581+01:00NaN223b09e6c3f1f75dbf8df019af97a555
2504nombre de personnes cinema2020-09-03 13:43:59.818198+02:002021-01-25 19:16:05.187114+01:00NaNba33b7b6d225a75d713a356b49c4d915
3117spectacle tarif e famille tr2020-09-03 13:21:21.400249+02:002023-03-13 11:30:29.525335+01:00NaNa00b61ad933518856f86e63ca91a5750
41496billet nb famille mecene 1a2020-09-03 14:29:33.320952+02:002021-01-25 19:23:06.816402+01:00NaN7f6013803c242253a5ccde80f780984f
.....................
551529billet nb expo gr2020-09-03 13:43:59.835944+02:002022-02-18 15:57:55.792581+01:00NaN7d888e42abe101fc8b21dc88948c8b74
5523153nb pers visite scolaire rep2020-09-03 16:32:37.068864+02:002022-02-18 15:57:55.792581+01:00NaN3cf21731c25eee650d5b232ee4780563
5535847visite scolaire rep1h002021-06-09 18:10:49.742531+02:002022-02-18 15:55:03.576236+01:00NaNa7bb5a6892d55f0d5ee4ce5786ae5fc6
5545840france billet - entree ts2021-06-09 18:10:49.737576+02:002022-02-18 16:16:00.199543+01:00NaN4c53016fc65847646f600eff853593e5
5555863france billet - entree tp2021-06-09 18:12:49.269924+02:002022-02-18 16:16:00.199543+01:00NaN90e642c0e1ef6bc9f2bc43089798de00
\n", + "

556 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " id name created_at \\\n", + "0 41909 visite mécènes 1h30 2022-07-08 07:08:26.802266+02:00 \n", + "1 502 entree mucem tp( expo picasso) 2020-09-03 13:43:59.816765+02:00 \n", + "2 504 nombre de personnes cinema 2020-09-03 13:43:59.818198+02:00 \n", + "3 117 spectacle tarif e famille tr 2020-09-03 13:21:21.400249+02:00 \n", + "4 1496 billet nb famille mecene 1a 2020-09-03 14:29:33.320952+02:00 \n", + ".. ... ... ... \n", + "551 529 billet nb expo gr 2020-09-03 13:43:59.835944+02:00 \n", + "552 3153 nb pers visite scolaire rep 2020-09-03 16:32:37.068864+02:00 \n", + "553 5847 visite scolaire rep1h00 2021-06-09 18:10:49.742531+02:00 \n", + "554 5840 france billet - entree ts 2021-06-09 18:10:49.737576+02:00 \n", + "555 5863 france billet - entree tp 2021-06-09 18:12:49.269924+02:00 \n", + "\n", + " updated_at extra_field \\\n", + "0 2022-07-08 07:08:26.802266+02:00 NaN \n", + "1 2022-02-18 15:57:55.792581+01:00 NaN \n", + "2 2021-01-25 19:16:05.187114+01:00 NaN \n", + "3 2023-03-13 11:30:29.525335+01:00 NaN \n", + "4 2021-01-25 19:23:06.816402+01:00 NaN \n", + ".. ... ... \n", + "551 2022-02-18 15:57:55.792581+01:00 NaN \n", + "552 2022-02-18 15:57:55.792581+01:00 NaN \n", + "553 2022-02-18 15:55:03.576236+01:00 NaN \n", + "554 2022-02-18 16:16:00.199543+01:00 NaN \n", + "555 2022-02-18 16:16:00.199543+01:00 NaN \n", + "\n", + " identifier \n", + "0 21d4b0043c12b21952b0797d140991a1 \n", + "1 223b09e6c3f1f75dbf8df019af97a555 \n", + "2 ba33b7b6d225a75d713a356b49c4d915 \n", + "3 a00b61ad933518856f86e63ca91a5750 \n", + "4 7f6013803c242253a5ccde80f780984f \n", + ".. ... 
\n", + "551 7d888e42abe101fc8b21dc88948c8b74 \n", + "552 3cf21731c25eee650d5b232ee4780563 \n", + "553 a7bb5a6892d55f0d5ee4ce5786ae5fc6 \n", + "554 4c53016fc65847646f600eff853593e5 \n", + "555 90e642c0e1ef6bc9f2bc43089798de00 \n", + "\n", + "[556 rows x 6 columns]" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pricing_formulas" + ] + }, + { + "cell_type": "markdown", + "id": "46aad10f-8530-410e-872b-bb253c553a46", + "metadata": {}, + "source": [ + "# jointure entre les bases" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a4c3edd1-6d58-4c57-b3e4-0ef3529f6b8c", + "metadata": {}, + "outputs": [], + "source": [ + "#dic_prod_princing=['type_of_pricing_formulas','products_groups','pricing_formulas','product_packs','products']" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "eac537e1-bbad-45bc-a85c-12b675da1088", + "metadata": {}, + "outputs": [], + "source": [ + "#Merge1 entre products et pricing_formulas\n", + "base1=products.merge(pricing_formulas, how='left', left_on= 'pricing_formula_id', right_on= 'id', suffixes = (\"_products\", \"_pricing_formula\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "75be3a30-3114-432d-87d6-697533c3c871", + "metadata": {}, + "outputs": [], + "source": [ + "#Merge2 entre base1 et products_groups\n", + "base2=base1.merge(products_groups, how='left', left_on= 'id_pricing_formula', right_on= 'id', suffixes = (\"_merge2\", \"_product_group\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "34a169c6-07a8-4ac3-a9e1-d7e7461f7310", + "metadata": {}, + "outputs": [], + "source": [ + "#Merge3 entre base2 et type_of_pricing_formulas\n", + "base3=base2.merge(type_of_pricing_formulas, how='left', left_on= 'id_pricing_formula', right_on= 'pricing_formula_id', suffixes = (\"_merge3\", \"_type_of_pricing_f\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": 
"f44f40d2-5304-4931-b7e6-fcc06b2657b6", + "metadata": {}, + "outputs": [], + "source": [ + "#Merge4 entre base3 et product_packs\n", + "df_product_pricing=base3.merge(product_packs, how='left', left_on= 'product_pack_id', right_on= 'id', suffixes = (\"_merge4\", \"_product_pack\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "id": "a28772c3-7bc1-46b4-acc8-1388dc60ec98", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
id_productsamountis_full_pricerepresentation_id_merge2pricing_formula_id_merge2created_at_productsupdated_at_productscategory_id_merge2apply_priceproducts_group_id...pricing_formula_idcreated_at_type_of_pricing_fupdated_at_type_of_pricing_fidentifier_merge4idname_product_packtype_ofcreated_atupdated_atidentifier_product_pack
0106829.0False9141142020-09-03 14:09:43.119798+02:002020-09-03 14:09:43.119798+02:00410.010655...114.02021-02-15 17:02:27.395376+01:002021-02-15 17:02:27.395376+01:003706121eb9f43b635bef1433c06f679c1NaN02020-09-03 13:11:24.501197+02:002020-09-03 13:11:24.501197+02:00a764b4bf13a360c7ac2a35ec4ca96c95
14789.5False2731312020-09-03 13:21:22.711773+02:002020-09-03 13:21:22.711773+02:0010.0471...131.02021-02-05 11:52:05.923905+01:002021-02-05 11:52:05.923905+01:000aceb248607671792298436004b952751NaN02020-09-03 13:11:24.501197+02:002020-09-03 13:11:24.501197+02:00a764b4bf13a360c7ac2a35ec4ca96c95
22087311.5False2751372020-09-03 14:46:33.589030+02:002020-09-03 14:46:33.589030+02:0010.020825...137.02021-02-05 11:52:05.939898+01:002021-02-05 11:52:05.939898+01:0093002d4637331edd81ffc28b6e8e89c01NaN02020-09-03 13:11:24.501197+02:002020-09-03 13:11:24.501197+02:00a764b4bf13a360c7ac2a35ec4ca96c95
31571428.0False8251992022-01-28 19:29:23.525722+01:002022-01-28 19:29:23.525722+01:0050.0156773...9.02021-02-05 11:52:06.107939+01:002021-02-05 11:52:06.107939+01:007d0b25bdfff9f366da8be820608c81911NaN02020-09-03 13:11:24.501197+02:002020-09-03 13:11:24.501197+02:00a764b4bf13a360c7ac2a35ec4ca96c95
413418.5False9932020-09-03 13:29:30.773089+02:002020-09-03 13:29:30.773089+02:0010.01175...93.02021-02-05 11:52:06.004162+01:002021-02-05 11:52:06.004162+01:001dbb0795e8f47cb75ba7cdb08c06be5f1NaN02020-09-03 13:11:24.501197+02:002020-09-03 13:11:24.501197+02:00a764b4bf13a360c7ac2a35ec4ca96c95
\n", + "

5 rows × 41 columns

\n", + "
" + ], + "text/plain": [ + " id_products amount is_full_price representation_id_merge2 \\\n", + "0 10682 9.0 False 914 \n", + "1 478 9.5 False 273 \n", + "2 20873 11.5 False 275 \n", + "3 157142 8.0 False 82519 \n", + "4 1341 8.5 False 9 \n", + "\n", + " pricing_formula_id_merge2 created_at_products \\\n", + "0 114 2020-09-03 14:09:43.119798+02:00 \n", + "1 131 2020-09-03 13:21:22.711773+02:00 \n", + "2 137 2020-09-03 14:46:33.589030+02:00 \n", + "3 9 2022-01-28 19:29:23.525722+01:00 \n", + "4 93 2020-09-03 13:29:30.773089+02:00 \n", + "\n", + " updated_at_products category_id_merge2 apply_price \\\n", + "0 2020-09-03 14:09:43.119798+02:00 41 0.0 \n", + "1 2020-09-03 13:21:22.711773+02:00 1 0.0 \n", + "2 2020-09-03 14:46:33.589030+02:00 1 0.0 \n", + "3 2022-01-28 19:29:23.525722+01:00 5 0.0 \n", + "4 2020-09-03 13:29:30.773089+02:00 1 0.0 \n", + "\n", + " products_group_id ... pricing_formula_id \\\n", + "0 10655 ... 114.0 \n", + "1 471 ... 131.0 \n", + "2 20825 ... 137.0 \n", + "3 156773 ... 9.0 \n", + "4 1175 ... 
93.0 \n", + "\n", + " created_at_type_of_pricing_f updated_at_type_of_pricing_f \\\n", + "0 2021-02-15 17:02:27.395376+01:00 2021-02-15 17:02:27.395376+01:00 \n", + "1 2021-02-05 11:52:05.923905+01:00 2021-02-05 11:52:05.923905+01:00 \n", + "2 2021-02-05 11:52:05.939898+01:00 2021-02-05 11:52:05.939898+01:00 \n", + "3 2021-02-05 11:52:06.107939+01:00 2021-02-05 11:52:06.107939+01:00 \n", + "4 2021-02-05 11:52:06.004162+01:00 2021-02-05 11:52:06.004162+01:00 \n", + "\n", + " identifier_merge4 id name_product_pack type_of \\\n", + "0 3706121eb9f43b635bef1433c06f679c 1 NaN 0 \n", + "1 0aceb248607671792298436004b95275 1 NaN 0 \n", + "2 93002d4637331edd81ffc28b6e8e89c0 1 NaN 0 \n", + "3 7d0b25bdfff9f366da8be820608c8191 1 NaN 0 \n", + "4 1dbb0795e8f47cb75ba7cdb08c06be5f 1 NaN 0 \n", + "\n", + " created_at updated_at \\\n", + "0 2020-09-03 13:11:24.501197+02:00 2020-09-03 13:11:24.501197+02:00 \n", + "1 2020-09-03 13:11:24.501197+02:00 2020-09-03 13:11:24.501197+02:00 \n", + "2 2020-09-03 13:11:24.501197+02:00 2020-09-03 13:11:24.501197+02:00 \n", + "3 2020-09-03 13:11:24.501197+02:00 2020-09-03 13:11:24.501197+02:00 \n", + "4 2020-09-03 13:11:24.501197+02:00 2020-09-03 13:11:24.501197+02:00 \n", + "\n", + " identifier_product_pack \n", + "0 a764b4bf13a360c7ac2a35ec4ca96c95 \n", + "1 a764b4bf13a360c7ac2a35ec4ca96c95 \n", + "2 a764b4bf13a360c7ac2a35ec4ca96c95 \n", + "3 a764b4bf13a360c7ac2a35ec4ca96c95 \n", + "4 a764b4bf13a360c7ac2a35ec4ca96c95 \n", + "\n", + "[5 rows x 41 columns]" + ] + }, + "execution_count": 90, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_product_pricing.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03442997-806f-4285-a139-3bad46bb4522", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d22a0d75-53c5-4b54-9060-c9e7c307fb13", + "metadata": {}, "outputs": [], "source": [] }