events_theme #2
188
.gitignore
vendored
188
.gitignore
vendored
|
@ -1 +1,187 @@
|
||||||
.ipynb_checkpoints/Clean-Notebook-checkpoint.ipynb
|
# Created by https://www.toptal.com/developers/gitignore/api/jupyternotebooks,python
|
||||||
|
# Edit at https://www.toptal.com/developers/gitignore?templates=jupyternotebooks,python
|
||||||
|
|
||||||
|
### JupyterNotebooks ###
|
||||||
|
# gitignore template for Jupyter Notebooks
|
||||||
|
# website: http://jupyter.org/
|
||||||
|
|
||||||
|
.ipynb_checkpoints
|
||||||
|
*/.ipynb_checkpoints/*
|
||||||
|
|
||||||
|
# IPython
|
||||||
|
profile_default/
|
||||||
|
ipython_config.py
|
||||||
|
|
||||||
|
# Remove previous ipynb_checkpoints
|
||||||
|
# git rm -r .ipynb_checkpoints/
|
||||||
|
|
||||||
|
### Python ###
|
||||||
|
# Byte-compiled / optimized / DLL files
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
|
||||||
|
# C extensions
|
||||||
|
*.so
|
||||||
|
|
||||||
|
# Distribution / packaging
|
||||||
|
.Python
|
||||||
|
build/
|
||||||
|
develop-eggs/
|
||||||
|
dist/
|
||||||
|
downloads/
|
||||||
|
eggs/
|
||||||
|
.eggs/
|
||||||
|
lib/
|
||||||
|
lib64/
|
||||||
|
parts/
|
||||||
|
sdist/
|
||||||
|
var/
|
||||||
|
wheels/
|
||||||
|
share/python-wheels/
|
||||||
|
*.egg-info/
|
||||||
|
.installed.cfg
|
||||||
|
*.egg
|
||||||
|
MANIFEST
|
||||||
|
|
||||||
|
# PyInstaller
|
||||||
|
# Usually these files are written by a python script from a template
|
||||||
|
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||||
|
*.manifest
|
||||||
|
*.spec
|
||||||
|
|
||||||
|
# Installer logs
|
||||||
|
pip-log.txt
|
||||||
|
pip-delete-this-directory.txt
|
||||||
|
|
||||||
|
# Unit test / coverage reports
|
||||||
|
htmlcov/
|
||||||
|
.tox/
|
||||||
|
.nox/
|
||||||
|
.coverage
|
||||||
|
.coverage.*
|
||||||
|
.cache
|
||||||
|
nosetests.xml
|
||||||
|
coverage.xml
|
||||||
|
*.cover
|
||||||
|
*.py,cover
|
||||||
|
.hypothesis/
|
||||||
|
.pytest_cache/
|
||||||
|
cover/
|
||||||
|
|
||||||
|
# Translations
|
||||||
|
*.mo
|
||||||
|
*.pot
|
||||||
|
|
||||||
|
# Django stuff:
|
||||||
|
*.log
|
||||||
|
local_settings.py
|
||||||
|
db.sqlite3
|
||||||
|
db.sqlite3-journal
|
||||||
|
|
||||||
|
# Flask stuff:
|
||||||
|
instance/
|
||||||
|
.webassets-cache
|
||||||
|
|
||||||
|
# Scrapy stuff:
|
||||||
|
.scrapy
|
||||||
|
|
||||||
|
# Sphinx documentation
|
||||||
|
docs/_build/
|
||||||
|
|
||||||
|
# PyBuilder
|
||||||
|
.pybuilder/
|
||||||
|
target/
|
||||||
|
|
||||||
|
# Jupyter Notebook
|
||||||
|
|
||||||
|
# IPython
|
||||||
|
|
||||||
|
# pyenv
|
||||||
|
# For a library or package, you might want to ignore these files since the code is
|
||||||
|
# intended to run in multiple environments; otherwise, check them in:
|
||||||
|
# .python-version
|
||||||
|
|
||||||
|
# pipenv
|
||||||
|
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||||
|
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||||
|
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||||
|
# install all needed dependencies.
|
||||||
|
#Pipfile.lock
|
||||||
|
|
||||||
|
# poetry
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||||
|
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||||
|
# commonly ignored for libraries.
|
||||||
|
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||||
|
#poetry.lock
|
||||||
|
|
||||||
|
# pdm
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||||
|
#pdm.lock
|
||||||
|
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||||
|
# in version control.
|
||||||
|
# https://pdm.fming.dev/#use-with-ide
|
||||||
|
.pdm.toml
|
||||||
|
|
||||||
|
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||||
|
__pypackages__/
|
||||||
|
|
||||||
|
# Celery stuff
|
||||||
|
celerybeat-schedule
|
||||||
|
celerybeat.pid
|
||||||
|
|
||||||
|
# SageMath parsed files
|
||||||
|
*.sage.py
|
||||||
|
|
||||||
|
# Environments
|
||||||
|
.env
|
||||||
|
.venv
|
||||||
|
env/
|
||||||
|
venv/
|
||||||
|
ENV/
|
||||||
|
env.bak/
|
||||||
|
venv.bak/
|
||||||
|
|
||||||
|
# Spyder project settings
|
||||||
|
.spyderproject
|
||||||
|
.spyproject
|
||||||
|
|
||||||
|
# Rope project settings
|
||||||
|
.ropeproject
|
||||||
|
|
||||||
|
# mkdocs documentation
|
||||||
|
/site
|
||||||
|
|
||||||
|
# mypy
|
||||||
|
.mypy_cache/
|
||||||
|
.dmypy.json
|
||||||
|
dmypy.json
|
||||||
|
|
||||||
|
# Pyre type checker
|
||||||
|
.pyre/
|
||||||
|
|
||||||
|
# pytype static type analyzer
|
||||||
|
.pytype/
|
||||||
|
|
||||||
|
# Cython debug symbols
|
||||||
|
cython_debug/
|
||||||
|
|
||||||
|
# PyCharm
|
||||||
|
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||||
|
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||||
|
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||||
|
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||||
|
#.idea/
|
||||||
|
|
||||||
|
### Python Patch ###
|
||||||
|
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
|
||||||
|
poetry.toml
|
||||||
|
|
||||||
|
# ruff
|
||||||
|
.ruff_cache/
|
||||||
|
|
||||||
|
# LSP config files
|
||||||
|
pyrightconfig.json
|
||||||
|
|
||||||
|
# End of https://www.toptal.com/developers/gitignore/api/jupyternotebooks,python
|
187
.gitignore.txt
187
.gitignore.txt
|
@ -1,187 +0,0 @@
|
||||||
# Created by https://www.toptal.com/developers/gitignore/api/jupyternotebooks,python
|
|
||||||
# Edit at https://www.toptal.com/developers/gitignore?templates=jupyternotebooks,python
|
|
||||||
|
|
||||||
### JupyterNotebooks ###
|
|
||||||
# gitignore template for Jupyter Notebooks
|
|
||||||
# website: http://jupyter.org/
|
|
||||||
|
|
||||||
.ipynb_checkpoints
|
|
||||||
*/.ipynb_checkpoints/*
|
|
||||||
|
|
||||||
# IPython
|
|
||||||
profile_default/
|
|
||||||
ipython_config.py
|
|
||||||
|
|
||||||
# Remove previous ipynb_checkpoints
|
|
||||||
# git rm -r .ipynb_checkpoints/
|
|
||||||
|
|
||||||
### Python ###
|
|
||||||
# Byte-compiled / optimized / DLL files
|
|
||||||
__pycache__/
|
|
||||||
*.py[cod]
|
|
||||||
*$py.class
|
|
||||||
|
|
||||||
# C extensions
|
|
||||||
*.so
|
|
||||||
|
|
||||||
# Distribution / packaging
|
|
||||||
.Python
|
|
||||||
build/
|
|
||||||
develop-eggs/
|
|
||||||
dist/
|
|
||||||
downloads/
|
|
||||||
eggs/
|
|
||||||
.eggs/
|
|
||||||
lib/
|
|
||||||
lib64/
|
|
||||||
parts/
|
|
||||||
sdist/
|
|
||||||
var/
|
|
||||||
wheels/
|
|
||||||
share/python-wheels/
|
|
||||||
*.egg-info/
|
|
||||||
.installed.cfg
|
|
||||||
*.egg
|
|
||||||
MANIFEST
|
|
||||||
|
|
||||||
# PyInstaller
|
|
||||||
# Usually these files are written by a python script from a template
|
|
||||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
||||||
*.manifest
|
|
||||||
*.spec
|
|
||||||
|
|
||||||
# Installer logs
|
|
||||||
pip-log.txt
|
|
||||||
pip-delete-this-directory.txt
|
|
||||||
|
|
||||||
# Unit test / coverage reports
|
|
||||||
htmlcov/
|
|
||||||
.tox/
|
|
||||||
.nox/
|
|
||||||
.coverage
|
|
||||||
.coverage.*
|
|
||||||
.cache
|
|
||||||
nosetests.xml
|
|
||||||
coverage.xml
|
|
||||||
*.cover
|
|
||||||
*.py,cover
|
|
||||||
.hypothesis/
|
|
||||||
.pytest_cache/
|
|
||||||
cover/
|
|
||||||
|
|
||||||
# Translations
|
|
||||||
*.mo
|
|
||||||
*.pot
|
|
||||||
|
|
||||||
# Django stuff:
|
|
||||||
*.log
|
|
||||||
local_settings.py
|
|
||||||
db.sqlite3
|
|
||||||
db.sqlite3-journal
|
|
||||||
|
|
||||||
# Flask stuff:
|
|
||||||
instance/
|
|
||||||
.webassets-cache
|
|
||||||
|
|
||||||
# Scrapy stuff:
|
|
||||||
.scrapy
|
|
||||||
|
|
||||||
# Sphinx documentation
|
|
||||||
docs/_build/
|
|
||||||
|
|
||||||
# PyBuilder
|
|
||||||
.pybuilder/
|
|
||||||
target/
|
|
||||||
|
|
||||||
# Jupyter Notebook
|
|
||||||
|
|
||||||
# IPython
|
|
||||||
|
|
||||||
# pyenv
|
|
||||||
# For a library or package, you might want to ignore these files since the code is
|
|
||||||
# intended to run in multiple environments; otherwise, check them in:
|
|
||||||
# .python-version
|
|
||||||
|
|
||||||
# pipenv
|
|
||||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
||||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
||||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
||||||
# install all needed dependencies.
|
|
||||||
#Pipfile.lock
|
|
||||||
|
|
||||||
# poetry
|
|
||||||
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
|
||||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
||||||
# commonly ignored for libraries.
|
|
||||||
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
|
||||||
#poetry.lock
|
|
||||||
|
|
||||||
# pdm
|
|
||||||
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
||||||
#pdm.lock
|
|
||||||
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
|
||||||
# in version control.
|
|
||||||
# https://pdm.fming.dev/#use-with-ide
|
|
||||||
.pdm.toml
|
|
||||||
|
|
||||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
|
||||||
__pypackages__/
|
|
||||||
|
|
||||||
# Celery stuff
|
|
||||||
celerybeat-schedule
|
|
||||||
celerybeat.pid
|
|
||||||
|
|
||||||
# SageMath parsed files
|
|
||||||
*.sage.py
|
|
||||||
|
|
||||||
# Environments
|
|
||||||
.env
|
|
||||||
.venv
|
|
||||||
env/
|
|
||||||
venv/
|
|
||||||
ENV/
|
|
||||||
env.bak/
|
|
||||||
venv.bak/
|
|
||||||
|
|
||||||
# Spyder project settings
|
|
||||||
.spyderproject
|
|
||||||
.spyproject
|
|
||||||
|
|
||||||
# Rope project settings
|
|
||||||
.ropeproject
|
|
||||||
|
|
||||||
# mkdocs documentation
|
|
||||||
/site
|
|
||||||
|
|
||||||
# mypy
|
|
||||||
.mypy_cache/
|
|
||||||
.dmypy.json
|
|
||||||
dmypy.json
|
|
||||||
|
|
||||||
# Pyre type checker
|
|
||||||
.pyre/
|
|
||||||
|
|
||||||
# pytype static type analyzer
|
|
||||||
.pytype/
|
|
||||||
|
|
||||||
# Cython debug symbols
|
|
||||||
cython_debug/
|
|
||||||
|
|
||||||
# PyCharm
|
|
||||||
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
|
||||||
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
|
||||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
|
||||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
|
||||||
#.idea/
|
|
||||||
|
|
||||||
### Python Patch ###
|
|
||||||
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
|
|
||||||
poetry.toml
|
|
||||||
|
|
||||||
# ruff
|
|
||||||
.ruff_cache/
|
|
||||||
|
|
||||||
# LSP config files
|
|
||||||
pyrightconfig.json
|
|
||||||
|
|
||||||
# End of https://www.toptal.com/developers/gitignore/api/jupyternotebooks,python
|
|
|
@ -1,76 +0,0 @@
|
||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 1,
|
|
||||||
"id": "3ba1f385-2a2f-4b0c-be79-66f618469a9f",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"['bdc2324-data/1',\n",
|
|
||||||
" 'bdc2324-data/10',\n",
|
|
||||||
" 'bdc2324-data/101',\n",
|
|
||||||
" 'bdc2324-data/11',\n",
|
|
||||||
" 'bdc2324-data/12',\n",
|
|
||||||
" 'bdc2324-data/13',\n",
|
|
||||||
" 'bdc2324-data/14',\n",
|
|
||||||
" 'bdc2324-data/2',\n",
|
|
||||||
" 'bdc2324-data/3',\n",
|
|
||||||
" 'bdc2324-data/4',\n",
|
|
||||||
" 'bdc2324-data/5',\n",
|
|
||||||
" 'bdc2324-data/6',\n",
|
|
||||||
" 'bdc2324-data/7',\n",
|
|
||||||
" 'bdc2324-data/8',\n",
|
|
||||||
" 'bdc2324-data/9']"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 1,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"import os\n",
|
|
||||||
"import s3fs\n",
|
|
||||||
"\n",
|
|
||||||
"# Create filesystem object\n",
|
|
||||||
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
|
|
||||||
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n",
|
|
||||||
"\n",
|
|
||||||
"BUCKET = \"bdc2324-data\"\n",
|
|
||||||
"fs.ls(BUCKET)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "023bfa2b-97c2-4d53-80fb-e2290c73b92f",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": []
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "Python 3 (ipykernel)",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.10.13"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 5
|
|
||||||
}
|
|
1465
0_Cleaning_and_merge.ipynb
Normal file
1465
0_Cleaning_and_merge.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
695
Brouillon_AJ.ipynb
Normal file
695
Brouillon_AJ.ipynb
Normal file
|
@ -0,0 +1,695 @@
|
||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "8c8e008c-9b92-41f1-88c1-8ec462e4ecab",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Business Data Challenge - Team 1"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "88af2795-8bf9-4df0-a059-be7c28fb4289",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"import numpy as np"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "e05cd2c9-3f76-48e3-b4a6-5055445af2e4",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Configuration de l'accès aux données"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "3ba1f385-2a2f-4b0c-be79-66f618469a9f",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import os\n",
|
||||||
|
"import s3fs\n",
|
||||||
|
"# Create filesystem object\n",
|
||||||
|
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
|
||||||
|
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n",
|
||||||
|
"\n",
|
||||||
|
"BUCKET = \"bdc2324-data\"\n",
|
||||||
|
"fs.ls(BUCKET)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "ba9d04ad-6cc1-4bac-b1a0-44bedfb09763",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Chargement des fichiers campaign_stats.csv\n",
|
||||||
|
"FILE_PATH_S3 = 'bdc2324-data/1/1campaign_stats.csv'\n",
|
||||||
|
"\n",
|
||||||
|
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
||||||
|
" campaign_stats_1 = pd.read_csv(file_in, sep=\",\")\n",
|
||||||
|
"\n",
|
||||||
|
"FILE_PATH_S3 = 'bdc2324-data/2/2campaign_stats.csv'\n",
|
||||||
|
"\n",
|
||||||
|
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
||||||
|
" campaign_stats_2 = pd.read_csv(file_in, sep=\",\")\n",
|
||||||
|
"\n",
|
||||||
|
"FILE_PATH_S3 = 'bdc2324-data/3/3campaign_stats.csv'\n",
|
||||||
|
"\n",
|
||||||
|
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
||||||
|
" campaign_stats_3 = pd.read_csv(file_in, sep=\",\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "cacaecc1-4d8a-4e20-8cd3-b452cf17db56",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Conversion des dates 'sent_at'\n",
|
||||||
|
"campaign_stats_1['sent_at'] = pd.to_datetime(campaign_stats_1['sent_at'], format = 'ISO8601', utc = True)\n",
|
||||||
|
"campaign_stats_2['sent_at'] = pd.to_datetime(campaign_stats_2['sent_at'], format = 'ISO8601', utc = True)\n",
|
||||||
|
"campaign_stats_3['sent_at'] = pd.to_datetime(campaign_stats_3['sent_at'], format = 'ISO8601', utc = True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "2ec4b583-dc64-43e9-b3ae-6bbaee0bc135",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Chaque unité correspond à une période ? --> Non, les dossiers ont juste pour but de réduire la taille des fichiers\n",
|
||||||
|
"print(campaign_stats_1['sent_at'].max())\n",
|
||||||
|
"print(campaign_stats_1['sent_at'].min())\n",
|
||||||
|
"\n",
|
||||||
|
"print(campaign_stats_2['sent_at'].max())\n",
|
||||||
|
"print(campaign_stats_2['sent_at'].min())\n",
|
||||||
|
"\n",
|
||||||
|
"print(campaign_stats_3['sent_at'].max())\n",
|
||||||
|
"print(campaign_stats_3['sent_at'].min())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "77894273-b3e5-4f29-bd63-9f4df8082b9b",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"campaign_stats_1['sent_at']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "31f2edbf-5661-4516-9835-06d4da615c13",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Customersplus.csv"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "4223c873-cbd3-46d1-ac96-c9a3b9e97092",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"FILE_PATH_S3 = 'bdc2324-data/1/1customersplus.csv'\n",
|
||||||
|
"\n",
|
||||||
|
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
||||||
|
" customers_plus_1 = pd.read_csv(file_in, sep=\",\")\n",
|
||||||
|
"\n",
|
||||||
|
"FILE_PATH_S3 = 'bdc2324-data/2/2customersplus.csv'\n",
|
||||||
|
"\n",
|
||||||
|
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
||||||
|
" customers_plus_2 = pd.read_csv(file_in, sep=\",\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "460f853a-68c0-42a7-9877-b83d3aaec813",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"customers_plus_1.columns"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "d5a9398f-72fc-4548-9f53-b20b372144b2",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"customers_plus_1.shape"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "7467ddbe-0bd4-44cc-8a16-84aa41853638",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"customers_plus_1['id'].nunique()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "e15f05f8-3a89-4fc3-84a9-dae70e168440",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"customers_plus_2['id'].nunique()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "b40a653e-013f-48d0-8b57-0284587b36c5",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "32fa2215-3c79-40b5-8643-755865959fc7",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])\n",
|
||||||
|
"# Exemple id commun = caractéristiques communes\n",
|
||||||
|
"print(customers_plus_2[customers_plus_2['id'] == list(common_id)[0]])\n",
|
||||||
|
"\n",
|
||||||
|
"print(customers_plus_1[customers_plus_1['id'] == list(common_id)[0]])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "0eb345e4-69f5-4e16-ac57-e33674c6c43d",
|
||||||
|
"metadata": {
|
||||||
|
"scrolled": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"customers_plus_1.isna().mean()*100"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "6f6ce60d-0912-497d-9108-330acccef394",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Chargement de toutes les données\n",
|
||||||
|
"liste_base = ['customer_target_mappings', 'customersplus', 'target_types', 'tags', 'events', 'tickets', 'representations', 'purchases', 'products']\n",
|
||||||
|
"\n",
|
||||||
|
"for nom_base in liste_base:\n",
|
||||||
|
" FILE_PATH_S3 = 'bdc2324-data/11/11' + nom_base + '.csv'\n",
|
||||||
|
" with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
||||||
|
" globals()[nom_base] = pd.read_csv(file_in, sep=\",\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "fa8ee17d-5092-40ac-8a0a-3790b016dd4e",
|
||||||
|
"metadata": {
|
||||||
|
"scrolled": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Jointure\n",
|
||||||
|
"merge_1 = pd.merge(purchases, tickets, left_on='id', right_on='purchase_id', how='inner')[['id_x', 'customer_id','product_id', 'purchase_date', 'type_of', 'is_from_subscription']]\n",
|
||||||
|
"merge_2 = pd.merge(products, merge_1, left_on='id', right_on='product_id', how='inner')[['id_x', 'customer_id', 'representation_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price']]\n",
|
||||||
|
"merge_3 = pd.merge(representations, merge_2, left_on='id', right_on='representation_id', how='inner')[['id_x', 'customer_id', 'event_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time']]\n",
|
||||||
|
"merge_4 = pd.merge(events, merge_3, left_on='id', right_on='event_id', how='inner')[['id_x', 'customer_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'name']]\n",
|
||||||
|
"merge_4 = merge_4.rename(columns={'name': 'event_name'})\n",
|
||||||
|
"df_customer_event = pd.merge(customersplus, merge_4, left_on = 'id', right_on = 'customer_id', how = 'inner')[['id_x', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'event_name']]\n",
|
||||||
|
"df_customer_event"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "f1d4aeb8-ec74-4d49-989a-9116e01afe2f",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Fusion et exploration"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "22bfad2b-d52a-4077-9b39-bee35004e01c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Jointure\n",
|
||||||
|
"var_choosed = ['id_x', 'customer_id','product_id', 'purchase_date', 'type_of', 'is_from_subscription']\n",
|
||||||
|
"merge_1 = pd.merge(purchases, tickets, left_on='id', right_on='purchase_id', how='inner')[var_choosed]\n",
|
||||||
|
"\n",
|
||||||
|
"var_choosed.extend(['amount', 'is_full_price', 'representation_id'])\n",
|
||||||
|
"merge_2 = pd.merge(products, merge_1, left_on='id', right_on='product_id', how='inner')[var_choosed]\n",
|
||||||
|
"\n",
|
||||||
|
"var_choosed.remove('representation_id')\n",
|
||||||
|
"var_choosed.extend(['start_date_time', 'event_id'])\n",
|
||||||
|
"merge_3 = pd.merge(representations, merge_2, left_on='id', right_on='representation_id', how='inner')[var_choosed]\n",
|
||||||
|
"\n",
|
||||||
|
"var_choosed.remove('event_id')\n",
|
||||||
|
"var_choosed.extend(['name', 'customer_id'])\n",
|
||||||
|
"merge_4 = pd.merge(events, merge_3, left_on='id', right_on='event_id', how='inner')[var_choosed]\n",
|
||||||
|
"\n",
|
||||||
|
"# Changement de nom\n",
|
||||||
|
"merge_4 = merge_4.rename(columns={'name': 'event_name'})\n",
|
||||||
|
"var_choosed[var_choosed.index('name')] = \"event_name\"\n",
|
||||||
|
"\n",
|
||||||
|
"# Base finale\n",
|
||||||
|
"var_choosed.extend(['age', 'gender', 'country', 'fidelity', 'profession'])\n",
|
||||||
|
"df_customer_event = pd.merge(customersplus, merge_4, left_on = 'id', right_on = 'customer_id', how = 'inner')[var_choosed]\n",
|
||||||
|
"df_customer_event"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "4cb08d7a-ff04-4951-863d-20aaf33f0b31",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Type de client au global"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "f47ba14a-8601-4b91-9712-223a5ed8a1d1",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Client\n",
|
||||||
|
"print(customer_target_mappings.columns)\n",
|
||||||
|
"print(customer_target_mappings.shape)\n",
|
||||||
|
"customer_target_mappings.info()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "f11f829e-66b1-4fd0-a46f-5ae7cb78073f",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"customer_target_mappings['extra_field'].unique()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "c240ab80-c746-4a64-ac6a-be8382c4f0ec",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"customer_target_mappings['name'].unique()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "c03c0597-3f21-4673-8a0f-24d7d9bc5ce4",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Segmentation existante\n",
|
||||||
|
"print(target_types.columns)\n",
|
||||||
|
"print(target_types.shape)\n",
|
||||||
|
"target_types.info()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "5adb1773-648d-4683-bc08-d1f2298c1283",
|
||||||
|
"metadata": {
|
||||||
|
"scrolled": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"target_types"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "3d65f74e-47fc-4296-b493-a1ebefb91cde",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Tags = clients\n",
|
||||||
|
"FILE_PATH_S3 = 'bdc2324-data/11/11tags.csv'\n",
|
||||||
|
"\n",
|
||||||
|
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
||||||
|
" tags = pd.read_csv(file_in, sep=\",\")\n",
|
||||||
|
"\n",
|
||||||
|
"print(tags.columns)\n",
|
||||||
|
"print(tags.shape)\n",
|
||||||
|
"tags.info()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "8a689a63-165b-4c4e-bbb0-695b661048d9",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"tags"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "69e38c52-0570-4531-aebb-9deb6db8c40b",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Structure = clients\n",
|
||||||
|
"FILE_PATH_S3 = 'bdc2324-data/11/11structure_tag_mappings.csv'\n",
|
||||||
|
"\n",
|
||||||
|
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
||||||
|
" structure_tag_mappings = pd.read_csv(file_in, sep=\",\")\n",
|
||||||
|
"\n",
|
||||||
|
"print(structure_tag_mappings.columns)\n",
|
||||||
|
"print(structure_tag_mappings.shape)\n",
|
||||||
|
"structure_tag_mappings.info()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "74dc34ad-375b-48df-a900-40d92c5fff13",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"structure_tag_mappings"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "a479ceeb-0135-4899-9cbc-90ed7bf941fe",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Customersplus = clients\n",
|
||||||
|
"FILE_PATH_S3 = 'bdc2324-data/11/11customersplus.csv'\n",
|
||||||
|
"\n",
|
||||||
|
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
||||||
|
" customersplus = pd.read_csv(file_in, sep=\",\")\n",
|
||||||
|
"\n",
|
||||||
|
"print(customersplus.columns)\n",
|
||||||
|
"print(customersplus.shape)\n",
|
||||||
|
"customersplus.info()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "383e892c-606a-45ce-bdd6-b503b3e0be33",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"customersplus"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "70324d06-b855-4386-a7de-eef1eb13dfdf",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# But : lier les caractéristiques socio-demo et les comportements d'achat\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "4bbd743d-51fe-4786-8ad3-5a4a4d09439c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# tickets\n",
|
||||||
|
"FILE_PATH_S3 = 'bdc2324-data/11/11tickets.csv'\n",
|
||||||
|
"\n",
|
||||||
|
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
||||||
|
" tickets = pd.read_csv(file_in, sep=\",\")\n",
|
||||||
|
"\n",
|
||||||
|
"print(tickets.columns)\n",
|
||||||
|
"print(tickets.shape)\n",
|
||||||
|
"tickets.info()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "ea83ea5c-3d47-4a66-a523-04b69b149a20",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"tickets"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "ba15708e-eb84-4b5d-a86c-05ebed188cf6",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"tickets['type_of'].unique()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "bc192b08-30a5-486a-8bea-93e765dbfce6",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Types d'évenement et client"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "e14dcf62-2def-4ed5-834b-cf21abbc2894",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Evenement = events.csv\n",
|
||||||
|
"FILE_PATH_S3 = 'bdc2324-data/11/11events.csv'\n",
|
||||||
|
"\n",
|
||||||
|
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
||||||
|
" events = pd.read_csv(file_in, sep=\",\")\n",
|
||||||
|
"\n",
|
||||||
|
"print(events.columns)\n",
|
||||||
|
"print(events.shape)\n",
|
||||||
|
"events.info()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "d1a1d63c-d7de-4b63-93a8-1c734eb5b316",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"events"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "af80eee8-f717-4159-a0fd-09d47ec96621",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"events['name'].nunique()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "6afc6f3d-4292-4a92-a4d6-14f1edc25df2",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Représentation des évenements = representations.csv\n",
|
||||||
|
"FILE_PATH_S3 = 'bdc2324-data/11/11representations.csv'\n",
|
||||||
|
"\n",
|
||||||
|
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
||||||
|
" representations = pd.read_csv(file_in, sep=\",\")\n",
|
||||||
|
"\n",
|
||||||
|
"print(representations.columns)\n",
|
||||||
|
"print(representations.shape)\n",
|
||||||
|
"representations.info()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "1487402a-a49b-4737-b7d7-40c764d2f0b4",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"representations"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "99b27418-2c15-4a6e-bcf5-d329ca492085",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Produits vendues = products.csv\n",
|
||||||
|
"FILE_PATH_S3 = 'bdc2324-data/11/11products.csv'\n",
|
||||||
|
"\n",
|
||||||
|
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
||||||
|
" products = pd.read_csv(file_in, sep=\",\")\n",
|
||||||
|
"\n",
|
||||||
|
"print(products.columns)\n",
|
||||||
|
"print(products.shape)\n",
|
||||||
|
"products.info()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "c49bcd47-672f-4e0f-aee9-a7475151b97f",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"products"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "a4aec5ce-d0c9-4625-bb29-9ac154818621",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Lieu = facilities.csv\n",
|
||||||
|
"FILE_PATH_S3 = 'bdc2324-data/11/11facilities.csv'\n",
|
||||||
|
"\n",
|
||||||
|
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
||||||
|
" facilities = pd.read_csv(file_in, sep=\",\")\n",
|
||||||
|
"\n",
|
||||||
|
"print(facilities.columns)\n",
|
||||||
|
"print(facilities.shape)\n",
|
||||||
|
"facilities.info()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "b3642483-2879-442a-ad69-efcd2331a200",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"facilities"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "da1e9807-2a8d-4be7-a785-55cffd734f36",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Saisons = seasons.csv période sur deux années consécutives\n",
|
||||||
|
"FILE_PATH_S3 = 'bdc2324-data/11/11seasons.csv'\n",
|
||||||
|
"\n",
|
||||||
|
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
||||||
|
" seasons = pd.read_csv(file_in, sep=\",\")\n",
|
||||||
|
"\n",
|
||||||
|
"print(seasons.columns)\n",
|
||||||
|
"print(seasons.shape)\n",
|
||||||
|
"seasons.info()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "ec8a37b5-2d78-4b1c-aa47-bd923fdc2ba9",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"seasons['name'].unique()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "abb3aa20-774b-4761-983a-df5eb2bc51c6",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Achats = purchases.csv \n",
|
||||||
|
"FILE_PATH_S3 = 'bdc2324-data/11/11purchases.csv'\n",
|
||||||
|
"\n",
|
||||||
|
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
||||||
|
" purchases = pd.read_csv(file_in, sep=\",\")\n",
|
||||||
|
"\n",
|
||||||
|
"print(purchases.columns)\n",
|
||||||
|
"print(purchases.shape)\n",
|
||||||
|
"purchases.info()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "30e204ab-4f63-430c-a818-5c8035b6e17b",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"purchases"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.10.13"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
3921
Clean-Notebook.ipynb
3921
Clean-Notebook.ipynb
File diff suppressed because it is too large
Load Diff
3406
Exploration_billet_AJ.ipynb
Normal file
3406
Exploration_billet_AJ.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
|
@ -1,823 +0,0 @@
|
||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "8c8e008c-9b92-41f1-88c1-8ec462e4ecab",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"# Business Data Challenge - Team 1"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 1,
|
|
||||||
"id": "88af2795-8bf9-4df0-a059-be7c28fb4289",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"import pandas as pd\n",
|
|
||||||
"import numpy as np"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "e05cd2c9-3f76-48e3-b4a6-5055445af2e4",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"Configuration de l'accès aux données"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 2,
|
|
||||||
"id": "3ba1f385-2a2f-4b0c-be79-66f618469a9f",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"['bdc2324-data/1',\n",
|
|
||||||
" 'bdc2324-data/10',\n",
|
|
||||||
" 'bdc2324-data/101',\n",
|
|
||||||
" 'bdc2324-data/11',\n",
|
|
||||||
" 'bdc2324-data/12',\n",
|
|
||||||
" 'bdc2324-data/13',\n",
|
|
||||||
" 'bdc2324-data/14',\n",
|
|
||||||
" 'bdc2324-data/2',\n",
|
|
||||||
" 'bdc2324-data/3',\n",
|
|
||||||
" 'bdc2324-data/4',\n",
|
|
||||||
" 'bdc2324-data/5',\n",
|
|
||||||
" 'bdc2324-data/6',\n",
|
|
||||||
" 'bdc2324-data/7',\n",
|
|
||||||
" 'bdc2324-data/8',\n",
|
|
||||||
" 'bdc2324-data/9']"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 2,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"import os\n",
|
|
||||||
"import s3fs\n",
|
|
||||||
"# Create filesystem object\n",
|
|
||||||
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
|
|
||||||
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n",
|
|
||||||
"\n",
|
|
||||||
"BUCKET = \"bdc2324-data\"\n",
|
|
||||||
"fs.ls(BUCKET)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 4,
|
|
||||||
"id": "ba9d04ad-6cc1-4bac-b1a0-44bedfb09763",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Chargement des fichiers campaign_stats.csv\n",
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/1/1campaign_stats.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" campaign_stats_1 = pd.read_csv(file_in, sep=\",\")\n",
|
|
||||||
"\n",
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/2/2campaign_stats.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" campaign_stats_2 = pd.read_csv(file_in, sep=\",\")\n",
|
|
||||||
"\n",
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/3/3campaign_stats.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" campaign_stats_3 = pd.read_csv(file_in, sep=\",\")"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 5,
|
|
||||||
"id": "cacaecc1-4d8a-4e20-8cd3-b452cf17db56",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Conversion des dates 'sent_at'\n",
|
|
||||||
"campaign_stats_1['sent_at'] = pd.to_datetime(campaign_stats_1['sent_at'], format = 'ISO8601', utc = True)\n",
|
|
||||||
"campaign_stats_2['sent_at'] = pd.to_datetime(campaign_stats_2['sent_at'], format = 'ISO8601', utc = True)\n",
|
|
||||||
"campaign_stats_3['sent_at'] = pd.to_datetime(campaign_stats_3['sent_at'], format = 'ISO8601', utc = True)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 6,
|
|
||||||
"id": "2ec4b583-dc64-43e9-b3ae-6bbaee0bc135",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"2023-11-09 18:10:45+00:00\n",
|
|
||||||
"2020-06-02 08:24:08+00:00\n",
|
|
||||||
"2023-10-12 01:39:48+00:00\n",
|
|
||||||
"2023-10-10 17:06:29+00:00\n",
|
|
||||||
"2023-11-01 09:20:48+00:00\n",
|
|
||||||
"2021-03-31 14:59:02+00:00\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"# Chaque unites correspond à une période ? --> Non, les dossiers ont juste pour but de réduire la taille des fichiers\n",
|
|
||||||
"print(campaign_stats_1['sent_at'].max())\n",
|
|
||||||
"print(campaign_stats_1['sent_at'].min())\n",
|
|
||||||
"\n",
|
|
||||||
"print(campaign_stats_2['sent_at'].max())\n",
|
|
||||||
"print(campaign_stats_2['sent_at'].min())\n",
|
|
||||||
"\n",
|
|
||||||
"print(campaign_stats_3['sent_at'].max())\n",
|
|
||||||
"print(campaign_stats_3['sent_at'].min())"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 7,
|
|
||||||
"id": "77894273-b3e5-4f29-bd63-9f4df8082b9b",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"0 2021-03-28 16:01:09+00:00\n",
|
|
||||||
"1 2021-03-28 16:01:09+00:00\n",
|
|
||||||
"2 2021-03-28 16:00:59+00:00\n",
|
|
||||||
"3 2021-03-28 16:00:59+00:00\n",
|
|
||||||
"4 2021-03-28 16:01:06+00:00\n",
|
|
||||||
" ... \n",
|
|
||||||
"6214803 2023-10-23 09:32:33+00:00\n",
|
|
||||||
"6214804 2023-10-23 09:32:49+00:00\n",
|
|
||||||
"6214805 2023-10-23 09:33:28+00:00\n",
|
|
||||||
"6214806 2023-10-23 09:31:53+00:00\n",
|
|
||||||
"6214807 2023-10-23 09:33:54+00:00\n",
|
|
||||||
"Name: sent_at, Length: 6214808, dtype: datetime64[ns, UTC]"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 7,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"campaign_stats_1['sent_at']"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"id": "31f2edbf-5661-4516-9835-06d4da615c13",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"### Customersplus.csv"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 8,
|
|
||||||
"id": "4223c873-cbd3-46d1-ac96-c9a3b9e97092",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stderr",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"/tmp/ipykernel_1362/4118060109.py:9: DtypeWarning: Columns (20) have mixed types. Specify dtype option on import or set low_memory=False.\n",
|
|
||||||
" customers_plus_2 = pd.read_csv(file_in, sep=\",\")\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/1/1customersplus.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" customers_plus_1 = pd.read_csv(file_in, sep=\",\")\n",
|
|
||||||
"\n",
|
|
||||||
"FILE_PATH_S3 = 'bdc2324-data/2/2customersplus.csv'\n",
|
|
||||||
"\n",
|
|
||||||
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" customers_plus_2 = pd.read_csv(file_in, sep=\",\")"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 10,
|
|
||||||
"id": "460f853a-68c0-42a7-9877-b83d3aaec813",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"Index(['id', 'lastname', 'firstname', 'birthdate', 'email', 'street_id',\n",
|
|
||||||
" 'created_at', 'updated_at', 'civility', 'is_partner', 'extra',\n",
|
|
||||||
" 'deleted_at', 'reference', 'gender', 'is_email_true', 'extra_field',\n",
|
|
||||||
" 'identifier', 'opt_in', 'structure_id', 'note', 'profession',\n",
|
|
||||||
" 'language', 'mcp_contact_id', 'need_reload', 'last_buying_date',\n",
|
|
||||||
" 'max_price', 'ticket_sum', 'average_price', 'fidelity',\n",
|
|
||||||
" 'average_purchase_delay', 'average_price_basket',\n",
|
|
||||||
" 'average_ticket_basket', 'total_price', 'preferred_category',\n",
|
|
||||||
" 'preferred_supplier', 'preferred_formula', 'purchase_count',\n",
|
|
||||||
" 'first_buying_date', 'last_visiting_date', 'zipcode', 'country', 'age',\n",
|
|
||||||
" 'tenant_id'],\n",
|
|
||||||
" dtype='object')"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 10,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"customers_plus_1.columns"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "d5a9398f-72fc-4548-9f53-b20b372144b2",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"customers_plus_1.shape"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "7467ddbe-0bd4-44cc-8a16-84aa41853638",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"customers_plus_1['id'].nunique()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "e15f05f8-3a89-4fc3-84a9-dae70e168440",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"customers_plus_2['id'].nunique()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "b40a653e-013f-48d0-8b57-0284587b36c5",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 61,
|
|
||||||
"id": "32fa2215-3c79-40b5-8643-755865959fc7",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"1"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 61,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])\n",
|
|
||||||
"# Exemple id commun = caractéristiques communes\n",
|
|
||||||
"print(customers_plus_2[customers_plus_2['id'] == list(common_id)[0]])\n",
|
|
||||||
"\n",
|
|
||||||
"print(customers_plus_1[customers_plus_1['id'] == list(common_id)[0]])"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 49,
|
|
||||||
"id": "0eb345e4-69f5-4e16-ac57-e33674c6c43d",
|
|
||||||
"metadata": {
|
|
||||||
"scrolled": true
|
|
||||||
},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"id 0.000000\n",
|
|
||||||
"lastname 43.461341\n",
|
|
||||||
"firstname 44.995588\n",
|
|
||||||
"birthdate 96.419870\n",
|
|
||||||
"email 8.622075\n",
|
|
||||||
"street_id 0.000000\n",
|
|
||||||
"created_at 0.000000\n",
|
|
||||||
"updated_at 0.000000\n",
|
|
||||||
"civility 100.000000\n",
|
|
||||||
"is_partner 0.000000\n",
|
|
||||||
"extra 100.000000\n",
|
|
||||||
"deleted_at 100.000000\n",
|
|
||||||
"reference 100.000000\n",
|
|
||||||
"gender 0.000000\n",
|
|
||||||
"is_email_true 0.000000\n",
|
|
||||||
"extra_field 100.000000\n",
|
|
||||||
"identifier 0.000000\n",
|
|
||||||
"opt_in 0.000000\n",
|
|
||||||
"structure_id 88.072380\n",
|
|
||||||
"note 99.403421\n",
|
|
||||||
"profession 95.913503\n",
|
|
||||||
"language 99.280945\n",
|
|
||||||
"mcp_contact_id 34.876141\n",
|
|
||||||
"need_reload 0.000000\n",
|
|
||||||
"last_buying_date 51.653431\n",
|
|
||||||
"max_price 51.653431\n",
|
|
||||||
"ticket_sum 0.000000\n",
|
|
||||||
"average_price 8.639195\n",
|
|
||||||
"fidelity 0.000000\n",
|
|
||||||
"average_purchase_delay 51.653431\n",
|
|
||||||
"average_price_basket 51.653431\n",
|
|
||||||
"average_ticket_basket 51.653431\n",
|
|
||||||
"total_price 43.014236\n",
|
|
||||||
"preferred_category 100.000000\n",
|
|
||||||
"preferred_supplier 100.000000\n",
|
|
||||||
"preferred_formula 100.000000\n",
|
|
||||||
"purchase_count 0.000000\n",
|
|
||||||
"first_buying_date 51.653431\n",
|
|
||||||
"last_visiting_date 100.000000\n",
|
|
||||||
"zipcode 71.176564\n",
|
|
||||||
"country 5.459418\n",
|
|
||||||
"age 96.419870\n",
|
|
||||||
"tenant_id 0.000000\n",
|
|
||||||
"dtype: float64\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"pd.DataFrame(customers_plus_1.isna().mean()*100)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 11,
|
|
||||||
"id": "6f6ce60d-0912-497d-9108-330acccef394",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Chargement de toutes les données\n",
|
|
||||||
"liste_base = ['customer_target_mappings', 'customersplus', 'target_types', 'tags', 'events', 'tickets', 'representations', 'purchases', 'products']\n",
|
|
||||||
"\n",
|
|
||||||
"for nom_base in liste_base:\n",
|
|
||||||
" FILE_PATH_S3 = 'bdc2324-data/11/11' + nom_base + '.csv'\n",
|
|
||||||
" with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
||||||
" globals()[nom_base] = pd.read_csv(file_in, sep=\",\")"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 12,
|
|
||||||
"id": "fa8ee17d-5092-40ac-8a0a-3790b016dd4e",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/html": [
|
|
||||||
"<div>\n",
|
|
||||||
"<style scoped>\n",
|
|
||||||
" .dataframe tbody tr th:only-of-type {\n",
|
|
||||||
" vertical-align: middle;\n",
|
|
||||||
" }\n",
|
|
||||||
"\n",
|
|
||||||
" .dataframe tbody tr th {\n",
|
|
||||||
" vertical-align: top;\n",
|
|
||||||
" }\n",
|
|
||||||
"\n",
|
|
||||||
" .dataframe thead th {\n",
|
|
||||||
" text-align: right;\n",
|
|
||||||
" }\n",
|
|
||||||
"</style>\n",
|
|
||||||
"<table border=\"1\" class=\"dataframe\">\n",
|
|
||||||
" <thead>\n",
|
|
||||||
" <tr style=\"text-align: right;\">\n",
|
|
||||||
" <th></th>\n",
|
|
||||||
" <th>id</th>\n",
|
|
||||||
" <th>lastname</th>\n",
|
|
||||||
" <th>firstname</th>\n",
|
|
||||||
" <th>birthdate</th>\n",
|
|
||||||
" <th>email</th>\n",
|
|
||||||
" <th>street_id</th>\n",
|
|
||||||
" <th>created_at</th>\n",
|
|
||||||
" <th>updated_at</th>\n",
|
|
||||||
" <th>civility</th>\n",
|
|
||||||
" <th>is_partner</th>\n",
|
|
||||||
" <th>...</th>\n",
|
|
||||||
" <th>tenant_id</th>\n",
|
|
||||||
" <th>id_x</th>\n",
|
|
||||||
" <th>customer_id</th>\n",
|
|
||||||
" <th>purchase_date</th>\n",
|
|
||||||
" <th>type_of</th>\n",
|
|
||||||
" <th>is_from_subscription</th>\n",
|
|
||||||
" <th>amount</th>\n",
|
|
||||||
" <th>is_full_price</th>\n",
|
|
||||||
" <th>start_date_time</th>\n",
|
|
||||||
" <th>event_name</th>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" </thead>\n",
|
|
||||||
" <tbody>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>0</th>\n",
|
|
||||||
" <td>405082</td>\n",
|
|
||||||
" <td>lastname405082</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>6</td>\n",
|
|
||||||
" <td>2023-01-12 06:30:31.197484+01:00</td>\n",
|
|
||||||
" <td>2023-01-12 06:30:31.197484+01:00</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>1556</td>\n",
|
|
||||||
" <td>992423</td>\n",
|
|
||||||
" <td>405082</td>\n",
|
|
||||||
" <td>2023-01-11 17:08:41+01:00</td>\n",
|
|
||||||
" <td>3</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>13.0</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>2023-02-06 20:00:00+01:00</td>\n",
|
|
||||||
" <td>zaide</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>1</th>\n",
|
|
||||||
" <td>405082</td>\n",
|
|
||||||
" <td>lastname405082</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>6</td>\n",
|
|
||||||
" <td>2023-01-12 06:30:31.197484+01:00</td>\n",
|
|
||||||
" <td>2023-01-12 06:30:31.197484+01:00</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>1556</td>\n",
|
|
||||||
" <td>992423</td>\n",
|
|
||||||
" <td>405082</td>\n",
|
|
||||||
" <td>2023-01-11 17:08:41+01:00</td>\n",
|
|
||||||
" <td>3</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>13.0</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>2023-02-06 20:00:00+01:00</td>\n",
|
|
||||||
" <td>zaide</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>2</th>\n",
|
|
||||||
" <td>411168</td>\n",
|
|
||||||
" <td>lastname411168</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>6</td>\n",
|
|
||||||
" <td>2023-03-17 06:30:35.431967+01:00</td>\n",
|
|
||||||
" <td>2023-03-17 06:30:35.431967+01:00</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>1556</td>\n",
|
|
||||||
" <td>1053934</td>\n",
|
|
||||||
" <td>411168</td>\n",
|
|
||||||
" <td>2023-03-16 16:23:10+01:00</td>\n",
|
|
||||||
" <td>3</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>62.0</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>2023-03-19 16:00:00+01:00</td>\n",
|
|
||||||
" <td>luisa miller</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>3</th>\n",
|
|
||||||
" <td>411168</td>\n",
|
|
||||||
" <td>lastname411168</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>6</td>\n",
|
|
||||||
" <td>2023-03-17 06:30:35.431967+01:00</td>\n",
|
|
||||||
" <td>2023-03-17 06:30:35.431967+01:00</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>1556</td>\n",
|
|
||||||
" <td>1053934</td>\n",
|
|
||||||
" <td>411168</td>\n",
|
|
||||||
" <td>2023-03-16 16:23:10+01:00</td>\n",
|
|
||||||
" <td>3</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>62.0</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>2023-03-19 16:00:00+01:00</td>\n",
|
|
||||||
" <td>luisa miller</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>4</th>\n",
|
|
||||||
" <td>4380</td>\n",
|
|
||||||
" <td>lastname4380</td>\n",
|
|
||||||
" <td>firstname4380</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>1</td>\n",
|
|
||||||
" <td>2021-04-22 14:51:55.432952+02:00</td>\n",
|
|
||||||
" <td>2022-04-14 11:41:33.738500+02:00</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>1556</td>\n",
|
|
||||||
" <td>1189141</td>\n",
|
|
||||||
" <td>4380</td>\n",
|
|
||||||
" <td>2020-11-26 13:12:53+01:00</td>\n",
|
|
||||||
" <td>3</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>51.3</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>2020-12-01 20:00:00+01:00</td>\n",
|
|
||||||
" <td>iphigenie en tauride</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>...</th>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>318964</th>\n",
|
|
||||||
" <td>19095</td>\n",
|
|
||||||
" <td>lastname19095</td>\n",
|
|
||||||
" <td>firstname19095</td>\n",
|
|
||||||
" <td>1979-07-16</td>\n",
|
|
||||||
" <td>email19095</td>\n",
|
|
||||||
" <td>6</td>\n",
|
|
||||||
" <td>2021-04-22 15:06:30.120537+02:00</td>\n",
|
|
||||||
" <td>2023-09-12 18:27:36.904104+02:00</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>1556</td>\n",
|
|
||||||
" <td>1090839</td>\n",
|
|
||||||
" <td>19095</td>\n",
|
|
||||||
" <td>2019-05-19 21:18:36+02:00</td>\n",
|
|
||||||
" <td>1</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>4.5</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>2019-05-27 20:00:00+02:00</td>\n",
|
|
||||||
" <td>entre femmes</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>318965</th>\n",
|
|
||||||
" <td>19095</td>\n",
|
|
||||||
" <td>lastname19095</td>\n",
|
|
||||||
" <td>firstname19095</td>\n",
|
|
||||||
" <td>1979-07-16</td>\n",
|
|
||||||
" <td>email19095</td>\n",
|
|
||||||
" <td>6</td>\n",
|
|
||||||
" <td>2021-04-22 15:06:30.120537+02:00</td>\n",
|
|
||||||
" <td>2023-09-12 18:27:36.904104+02:00</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>1556</td>\n",
|
|
||||||
" <td>1090839</td>\n",
|
|
||||||
" <td>19095</td>\n",
|
|
||||||
" <td>2019-05-19 21:18:36+02:00</td>\n",
|
|
||||||
" <td>1</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>4.5</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>2019-05-27 20:00:00+02:00</td>\n",
|
|
||||||
" <td>entre femmes</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>318966</th>\n",
|
|
||||||
" <td>19095</td>\n",
|
|
||||||
" <td>lastname19095</td>\n",
|
|
||||||
" <td>firstname19095</td>\n",
|
|
||||||
" <td>1979-07-16</td>\n",
|
|
||||||
" <td>email19095</td>\n",
|
|
||||||
" <td>6</td>\n",
|
|
||||||
" <td>2021-04-22 15:06:30.120537+02:00</td>\n",
|
|
||||||
" <td>2023-09-12 18:27:36.904104+02:00</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>1556</td>\n",
|
|
||||||
" <td>1090839</td>\n",
|
|
||||||
" <td>19095</td>\n",
|
|
||||||
" <td>2019-05-19 21:18:36+02:00</td>\n",
|
|
||||||
" <td>1</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>4.5</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>2019-05-27 20:00:00+02:00</td>\n",
|
|
||||||
" <td>entre femmes</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>318967</th>\n",
|
|
||||||
" <td>19095</td>\n",
|
|
||||||
" <td>lastname19095</td>\n",
|
|
||||||
" <td>firstname19095</td>\n",
|
|
||||||
" <td>1979-07-16</td>\n",
|
|
||||||
" <td>email19095</td>\n",
|
|
||||||
" <td>6</td>\n",
|
|
||||||
" <td>2021-04-22 15:06:30.120537+02:00</td>\n",
|
|
||||||
" <td>2023-09-12 18:27:36.904104+02:00</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>1556</td>\n",
|
|
||||||
" <td>1244277</td>\n",
|
|
||||||
" <td>19095</td>\n",
|
|
||||||
" <td>2019-12-31 11:04:07+01:00</td>\n",
|
|
||||||
" <td>1</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>5.5</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>2020-02-03 20:00:00+01:00</td>\n",
|
|
||||||
" <td>a boire et a manger</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" <tr>\n",
|
|
||||||
" <th>318968</th>\n",
|
|
||||||
" <td>19095</td>\n",
|
|
||||||
" <td>lastname19095</td>\n",
|
|
||||||
" <td>firstname19095</td>\n",
|
|
||||||
" <td>1979-07-16</td>\n",
|
|
||||||
" <td>email19095</td>\n",
|
|
||||||
" <td>6</td>\n",
|
|
||||||
" <td>2021-04-22 15:06:30.120537+02:00</td>\n",
|
|
||||||
" <td>2023-09-12 18:27:36.904104+02:00</td>\n",
|
|
||||||
" <td>NaN</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>...</td>\n",
|
|
||||||
" <td>1556</td>\n",
|
|
||||||
" <td>1244277</td>\n",
|
|
||||||
" <td>19095</td>\n",
|
|
||||||
" <td>2019-12-31 11:04:07+01:00</td>\n",
|
|
||||||
" <td>1</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>5.5</td>\n",
|
|
||||||
" <td>False</td>\n",
|
|
||||||
" <td>2020-02-03 20:00:00+01:00</td>\n",
|
|
||||||
" <td>a boire et a manger</td>\n",
|
|
||||||
" </tr>\n",
|
|
||||||
" </tbody>\n",
|
|
||||||
"</table>\n",
|
|
||||||
"<p>318969 rows × 52 columns</p>\n",
|
|
||||||
"</div>"
|
|
||||||
],
|
|
||||||
"text/plain": [
|
|
||||||
" id lastname firstname birthdate email \\\n",
|
|
||||||
"0 405082 lastname405082 NaN NaN NaN \n",
|
|
||||||
"1 405082 lastname405082 NaN NaN NaN \n",
|
|
||||||
"2 411168 lastname411168 NaN NaN NaN \n",
|
|
||||||
"3 411168 lastname411168 NaN NaN NaN \n",
|
|
||||||
"4 4380 lastname4380 firstname4380 NaN NaN \n",
|
|
||||||
"... ... ... ... ... ... \n",
|
|
||||||
"318964 19095 lastname19095 firstname19095 1979-07-16 email19095 \n",
|
|
||||||
"318965 19095 lastname19095 firstname19095 1979-07-16 email19095 \n",
|
|
||||||
"318966 19095 lastname19095 firstname19095 1979-07-16 email19095 \n",
|
|
||||||
"318967 19095 lastname19095 firstname19095 1979-07-16 email19095 \n",
|
|
||||||
"318968 19095 lastname19095 firstname19095 1979-07-16 email19095 \n",
|
|
||||||
"\n",
|
|
||||||
" street_id created_at \\\n",
|
|
||||||
"0 6 2023-01-12 06:30:31.197484+01:00 \n",
|
|
||||||
"1 6 2023-01-12 06:30:31.197484+01:00 \n",
|
|
||||||
"2 6 2023-03-17 06:30:35.431967+01:00 \n",
|
|
||||||
"3 6 2023-03-17 06:30:35.431967+01:00 \n",
|
|
||||||
"4 1 2021-04-22 14:51:55.432952+02:00 \n",
|
|
||||||
"... ... ... \n",
|
|
||||||
"318964 6 2021-04-22 15:06:30.120537+02:00 \n",
|
|
||||||
"318965 6 2021-04-22 15:06:30.120537+02:00 \n",
|
|
||||||
"318966 6 2021-04-22 15:06:30.120537+02:00 \n",
|
|
||||||
"318967 6 2021-04-22 15:06:30.120537+02:00 \n",
|
|
||||||
"318968 6 2021-04-22 15:06:30.120537+02:00 \n",
|
|
||||||
"\n",
|
|
||||||
" updated_at civility is_partner ... \\\n",
|
|
||||||
"0 2023-01-12 06:30:31.197484+01:00 NaN False ... \n",
|
|
||||||
"1 2023-01-12 06:30:31.197484+01:00 NaN False ... \n",
|
|
||||||
"2 2023-03-17 06:30:35.431967+01:00 NaN False ... \n",
|
|
||||||
"3 2023-03-17 06:30:35.431967+01:00 NaN False ... \n",
|
|
||||||
"4 2022-04-14 11:41:33.738500+02:00 NaN False ... \n",
|
|
||||||
"... ... ... ... ... \n",
|
|
||||||
"318964 2023-09-12 18:27:36.904104+02:00 NaN False ... \n",
|
|
||||||
"318965 2023-09-12 18:27:36.904104+02:00 NaN False ... \n",
|
|
||||||
"318966 2023-09-12 18:27:36.904104+02:00 NaN False ... \n",
|
|
||||||
"318967 2023-09-12 18:27:36.904104+02:00 NaN False ... \n",
|
|
||||||
"318968 2023-09-12 18:27:36.904104+02:00 NaN False ... \n",
|
|
||||||
"\n",
|
|
||||||
" tenant_id id_x customer_id purchase_date type_of \\\n",
|
|
||||||
"0 1556 992423 405082 2023-01-11 17:08:41+01:00 3 \n",
|
|
||||||
"1 1556 992423 405082 2023-01-11 17:08:41+01:00 3 \n",
|
|
||||||
"2 1556 1053934 411168 2023-03-16 16:23:10+01:00 3 \n",
|
|
||||||
"3 1556 1053934 411168 2023-03-16 16:23:10+01:00 3 \n",
|
|
||||||
"4 1556 1189141 4380 2020-11-26 13:12:53+01:00 3 \n",
|
|
||||||
"... ... ... ... ... ... \n",
|
|
||||||
"318964 1556 1090839 19095 2019-05-19 21:18:36+02:00 1 \n",
|
|
||||||
"318965 1556 1090839 19095 2019-05-19 21:18:36+02:00 1 \n",
|
|
||||||
"318966 1556 1090839 19095 2019-05-19 21:18:36+02:00 1 \n",
|
|
||||||
"318967 1556 1244277 19095 2019-12-31 11:04:07+01:00 1 \n",
|
|
||||||
"318968 1556 1244277 19095 2019-12-31 11:04:07+01:00 1 \n",
|
|
||||||
"\n",
|
|
||||||
" is_from_subscription amount is_full_price start_date_time \\\n",
|
|
||||||
"0 False 13.0 False 2023-02-06 20:00:00+01:00 \n",
|
|
||||||
"1 False 13.0 False 2023-02-06 20:00:00+01:00 \n",
|
|
||||||
"2 False 62.0 False 2023-03-19 16:00:00+01:00 \n",
|
|
||||||
"3 False 62.0 False 2023-03-19 16:00:00+01:00 \n",
|
|
||||||
"4 False 51.3 False 2020-12-01 20:00:00+01:00 \n",
|
|
||||||
"... ... ... ... ... \n",
|
|
||||||
"318964 False 4.5 False 2019-05-27 20:00:00+02:00 \n",
|
|
||||||
"318965 False 4.5 False 2019-05-27 20:00:00+02:00 \n",
|
|
||||||
"318966 False 4.5 False 2019-05-27 20:00:00+02:00 \n",
|
|
||||||
"318967 False 5.5 False 2020-02-03 20:00:00+01:00 \n",
|
|
||||||
"318968 False 5.5 False 2020-02-03 20:00:00+01:00 \n",
|
|
||||||
"\n",
|
|
||||||
" event_name \n",
|
|
||||||
"0 zaide \n",
|
|
||||||
"1 zaide \n",
|
|
||||||
"2 luisa miller \n",
|
|
||||||
"3 luisa miller \n",
|
|
||||||
"4 iphigenie en tauride \n",
|
|
||||||
"... ... \n",
|
|
||||||
"318964 entre femmes \n",
|
|
||||||
"318965 entre femmes \n",
|
|
||||||
"318966 entre femmes \n",
|
|
||||||
"318967 a boire et a manger \n",
|
|
||||||
"318968 a boire et a manger \n",
|
|
||||||
"\n",
|
|
||||||
"[318969 rows x 52 columns]"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 12,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"# Jointure\n",
|
|
||||||
"merge_1 = pd.merge(purchases, tickets, left_on='id', right_on='purchase_id', how='inner')[['id_x', 'customer_id','product_id', 'purchase_date', 'type_of', 'is_from_subscription']]\n",
|
|
||||||
"merge_2 = pd.merge(products, merge_1, left_on='id', right_on='product_id', how='inner')[['id_x', 'customer_id', 'representation_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price']]\n",
|
|
||||||
"merge_3 = pd.merge(representations, merge_2, left_on='id', right_on='representation_id', how='inner')[['id_x', 'customer_id', 'event_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time']]\n",
|
|
||||||
"merge_4 = pd.merge(events, merge_3, left_on='id', right_on='event_id', how='inner')[['id_x', 'customer_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'name']]\n",
|
|
||||||
"merge_4 = merge_4.rename(columns={'name': 'event_name'})\n",
|
|
||||||
"df_customer_event = pd.merge(customersplus, merge_4, left_on = 'id', right_on = 'customer_id', how = 'inner')[['id_x', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'event_name']]\n",
|
|
||||||
"df_customer_event"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "Python 3 (ipykernel)",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.10.13"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 5
|
|
||||||
}
|
|
|
@ -6103,6 +6103,403 @@
|
||||||
"representation_theme.head()"
|
"representation_theme.head()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "e274e3cc-1b41-43e0-8412-1563166060cb",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Price Table"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 112,
|
||||||
|
"id": "c52621e7-01de-48dc-b572-2974542a8be5",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"File path : bdc2324-data/1/1product_packs.csv\n",
|
||||||
|
"Shape : (1, 6)\n",
|
||||||
|
"Number of columns : 4\n",
|
||||||
|
"Columns : Index(['id', 'identifier', 'name', 'type_of'], dtype='object')\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>id</th>\n",
|
||||||
|
" <th>name</th>\n",
|
||||||
|
" <th>type_of</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" id name type_of\n",
|
||||||
|
"0 1 NaN 0"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 112,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"product_packs = load_dataset(\"1product_packs.csv\")\n",
|
||||||
|
"product_packs.head()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 114,
|
||||||
|
"id": "9e4f60ab-9a2c-4090-b0c4-f9a1530b2d39",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"File path : bdc2324-data/1/1pricing_formulas.csv\n",
|
||||||
|
"Shape : (556, 6)\n",
|
||||||
|
"Number of columns : 4\n",
|
||||||
|
"Columns : Index(['id', 'identifier', 'name', 'extra_field'], dtype='object')\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>id</th>\n",
|
||||||
|
" <th>name</th>\n",
|
||||||
|
" <th>extra_field</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>41909</td>\n",
|
||||||
|
" <td>visite mécènes 1h30</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>502</td>\n",
|
||||||
|
" <td>entree mucem tp( expo picasso)</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>504</td>\n",
|
||||||
|
" <td>nombre de personnes cinema</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>117</td>\n",
|
||||||
|
" <td>spectacle tarif e famille tr</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>1496</td>\n",
|
||||||
|
" <td>billet nb famille mecene 1a</td>\n",
|
||||||
|
" <td>NaN</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" id name extra_field\n",
|
||||||
|
"0 41909 visite mécènes 1h30 NaN\n",
|
||||||
|
"1 502 entree mucem tp( expo picasso) NaN\n",
|
||||||
|
"2 504 nombre de personnes cinema NaN\n",
|
||||||
|
"3 117 spectacle tarif e famille tr NaN\n",
|
||||||
|
"4 1496 billet nb famille mecene 1a NaN"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 114,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"pricing_formula = load_dataset(\"1pricing_formulas.csv\")\n",
|
||||||
|
"pricing_formula.head()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 115,
|
||||||
|
"id": "247b5c45-a18a-4cfd-86b4-d3453e157bcd",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"File path : bdc2324-data/1/1type_of_pricing_formulas.csv\n",
|
||||||
|
"Shape : (568, 6)\n",
|
||||||
|
"Number of columns : 4\n",
|
||||||
|
"Columns : Index(['id', 'type_of_id', 'pricing_formula_id', 'identifier'], dtype='object')\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>id</th>\n",
|
||||||
|
" <th>type_of_id</th>\n",
|
||||||
|
" <th>pricing_formula_id</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>127</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>2425</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>3</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>2937</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>4</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>48</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>5</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>7</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" id type_of_id pricing_formula_id\n",
|
||||||
|
"0 1 1 127\n",
|
||||||
|
"1 2 1 2425\n",
|
||||||
|
"2 3 1 2937\n",
|
||||||
|
"3 4 1 48\n",
|
||||||
|
"4 5 1 7"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 115,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"type_pricing_formula = load_dataset(\"1type_of_pricing_formulas.csv\")\n",
|
||||||
|
"type_pricing_formula.head()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 117,
|
||||||
|
"id": "4b48f7b3-0f06-4ef6-9355-5016af82f49c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"File path : bdc2324-data/1/1products_groups.csv\n",
|
||||||
|
"Shape : (92973, 9)\n",
|
||||||
|
"Number of columns : 7\n",
|
||||||
|
"Columns : Index(['id', 'category_id', 'pricing_formula_id', 'representation_id',\n",
|
||||||
|
" 'percent_price', 'max_price', 'min_price'],\n",
|
||||||
|
" dtype='object')\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>id</th>\n",
|
||||||
|
" <th>category_id</th>\n",
|
||||||
|
" <th>pricing_formula_id</th>\n",
|
||||||
|
" <th>representation_id</th>\n",
|
||||||
|
" <th>percent_price</th>\n",
|
||||||
|
" <th>max_price</th>\n",
|
||||||
|
" <th>min_price</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>2735</td>\n",
|
||||||
|
" <td>8</td>\n",
|
||||||
|
" <td>97</td>\n",
|
||||||
|
" <td>1534</td>\n",
|
||||||
|
" <td>100.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>156773</td>\n",
|
||||||
|
" <td>5</td>\n",
|
||||||
|
" <td>9</td>\n",
|
||||||
|
" <td>82519</td>\n",
|
||||||
|
" <td>100.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>14387</td>\n",
|
||||||
|
" <td>16</td>\n",
|
||||||
|
" <td>79</td>\n",
|
||||||
|
" <td>8046</td>\n",
|
||||||
|
" <td>100.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>2770</td>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>37</td>\n",
|
||||||
|
" <td>1563</td>\n",
|
||||||
|
" <td>100.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>27179</td>\n",
|
||||||
|
" <td>13</td>\n",
|
||||||
|
" <td>119</td>\n",
|
||||||
|
" <td>14192</td>\n",
|
||||||
|
" <td>100.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" <td>0.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" id category_id pricing_formula_id representation_id percent_price \\\n",
|
||||||
|
"0 2735 8 97 1534 100.0 \n",
|
||||||
|
"1 156773 5 9 82519 100.0 \n",
|
||||||
|
"2 14387 16 79 8046 100.0 \n",
|
||||||
|
"3 2770 2 37 1563 100.0 \n",
|
||||||
|
"4 27179 13 119 14192 100.0 \n",
|
||||||
|
"\n",
|
||||||
|
" max_price min_price \n",
|
||||||
|
"0 0.0 0.0 \n",
|
||||||
|
"1 0.0 0.0 \n",
|
||||||
|
"2 0.0 0.0 \n",
|
||||||
|
"3 0.0 0.0 \n",
|
||||||
|
"4 0.0 0.0 "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 117,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"product_groups = load_dataset(\"1products_groups.csv\")\n",
|
||||||
|
"product_groups.head()"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "71c26a38-6818-42df-8aee-0135681a5563",
|
"id": "71c26a38-6818-42df-8aee-0135681a5563",
|
||||||
|
@ -6741,6 +7138,9 @@
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def uniform_product_df():\n",
|
"def uniform_product_df():\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" This function returns the uniform product dataset\n",
|
||||||
|
" \"\"\"\n",
|
||||||
" print(\"Products theme columns : \", products_theme.columns)\n",
|
" print(\"Products theme columns : \", products_theme.columns)\n",
|
||||||
" print(\"\\n Representation theme columns : \", representation_theme.columns)\n",
|
" print(\"\\n Representation theme columns : \", representation_theme.columns)\n",
|
||||||
" print(\"\\n Events theme columns : \", events_theme.columns)\n",
|
" print(\"\\n Events theme columns : \", events_theme.columns)\n",
|
||||||
|
|
1760
TP_merge_tables_clean.ipynb
Normal file
1760
TP_merge_tables_clean.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user