BDC-team-1/1_Descriptive_Statistics_Museum.ipynb
2024-03-04 22:30:25 +00:00

3801 lines
451 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"id": "3f41343f-7205-41d9-89dd-88039e301413",
"metadata": {},
"source": [
"# Statistiques descriptives"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "abfaf341-7b35-4407-9133-d21336c04027",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import os\n",
"import s3fs\n",
"import re\n",
"import matplotlib.pyplot as plt\n",
"import matplotlib.dates as mdates\n",
"from datetime import datetime, date, timedelta\n",
"from dateutil.relativedelta import relativedelta\n",
"import warnings"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "7fb72fa3-7940-496f-ac78-c2837f65eefa",
"metadata": {},
"outputs": [],
"source": [
"# Build the S3 filesystem handle from the endpoint given in the environment\n",
"S3_ENDPOINT_URL = f\"https://{os.environ['AWS_S3_ENDPOINT']}\"\n",
"fs = s3fs.S3FileSystem(client_kwargs = dict(endpoint_url = S3_ENDPOINT_URL))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "c34e13f4-e043-43d6-ba8c-2e13d008647c",
"metadata": {},
"outputs": [],
"source": [
"# Import cleaning and merge functions.\n",
"# The script is read inside a `with` block so its file handle is closed, and it is\n",
"# compiled under its real file name so that warnings and tracebacks raised by the\n",
"# script point to 0_KPI_functions.py instead of '<string>'.\n",
"with open('0_KPI_functions.py') as kpi_script:\n",
"    kpi_source = kpi_script.read()\n",
"exec(compile(kpi_source, '0_KPI_functions.py', 'exec'))\n",
"\n",
"# Useful functions :\n",
" # display_databases(directory_path, file_name = ['customerplus_cleaned', 'target_information', 'campaigns_information', 'products_purchased_reduced'], datetime_col = None)\n",
" # campaigns_kpi_function(campaigns_information = None)\n",
" # tickets_kpi_function(tickets_information = None)\n",
" # customerplus_kpi_function(customerplus_clean = None)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "c60505f4-b95b-4c61-b842-26b27af7e280",
"metadata": {},
"outputs": [],
"source": [
"# Show every column when a DataFrame is displayed (no column truncation)\n",
"pd.options.display.max_columns = None"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "aaffd291-2c88-44c8-a951-0ef1f8369ba3",
"metadata": {},
"outputs": [],
"source": [
"# Additional helper to load the initial tables of a company\n",
"def load_dataset_2(directory_path, file_name):\n",
"    \"\"\"\n",
"    Load one csv table from the S3 bucket and return it as a DataFrame.\n",
"\n",
"    The file read is bdc2324-data/<directory_path>/<directory_path><file_name>.csv,\n",
"    opened through the notebook-level `fs` object. The 'identifier' column is\n",
"    removed when the table contains one.\n",
"    \"\"\"\n",
"    csv_name = directory_path + file_name + \".csv\"\n",
"    csv_path = \"/\".join([\"bdc2324-data\", directory_path, csv_name])\n",
"    with fs.open(csv_path, mode=\"rb\") as csv_stream:\n",
"        table = pd.read_csv(csv_stream, sep=\",\")\n",
"\n",
"    # Nothing to remove when the table has no 'identifier' column\n",
"    if 'identifier' not in table.columns:\n",
"        return table\n",
"    return table.drop(columns = 'identifier')"
]
},
{
"cell_type": "markdown",
"id": "ae3c0c33-55a7-4a28-9a62-3ce13496917a",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"# 0 - Spécificité de la company 101"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "f8a8dedc-2f67-407c-9bbf-f70d236fc783",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>street_id</th>\n",
" <th>fixed_capacity</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>1</td>\n",
" <td>atelier des lumieres</td>\n",
" <td>2020-10-12 08:57:27.783770+02:00</td>\n",
" <td>2020-10-12 08:57:27.783770+02:00</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>14007</td>\n",
" <td>fabrique des lumieres</td>\n",
" <td>2022-05-17 09:11:19.416106+02:00</td>\n",
" <td>2022-05-17 09:11:19.416106+02:00</td>\n",
" <td>2</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32</th>\n",
" <td>2</td>\n",
" <td>non défini</td>\n",
" <td>2020-10-12 08:57:27.785329+02:00</td>\n",
" <td>2020-10-12 08:57:27.785329+02:00</td>\n",
" <td>2</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>10755</td>\n",
" <td>NaN</td>\n",
" <td>2022-01-28 12:07:16.602885+01:00</td>\n",
" <td>2022-01-28 12:07:16.602885+01:00</td>\n",
" <td>2</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>13583</td>\n",
" <td>hôtel de caumont</td>\n",
" <td>2022-05-13 10:59:06.829576+02:00</td>\n",
" <td>2022-05-13 10:59:06.829576+02:00</td>\n",
" <td>859</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>16422</td>\n",
" <td>atelier des lumières - cézanne</td>\n",
" <td>2022-08-04 04:03:31.045648+02:00</td>\n",
" <td>2022-08-04 04:03:31.045648+02:00</td>\n",
" <td>859</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>21098</td>\n",
" <td>bassins des lumières - 2022 - venise</td>\n",
" <td>2023-04-08 03:49:46.916777+02:00</td>\n",
" <td>2023-04-08 03:49:46.916777+02:00</td>\n",
" <td>859</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>23460</td>\n",
" <td>immersive box</td>\n",
" <td>2023-08-29 17:39:55.188028+02:00</td>\n",
" <td>2023-08-29 17:39:55.188028+02:00</td>\n",
" <td>859</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>13584</td>\n",
" <td>bassins des lumières - venise</td>\n",
" <td>2022-05-13 11:00:14.943669+02:00</td>\n",
" <td>2022-05-13 11:00:14.943669+02:00</td>\n",
" <td>859</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>21096</td>\n",
" <td>atelier des lumières - 2022 - cézanne</td>\n",
" <td>2023-04-08 03:42:10.395124+02:00</td>\n",
" <td>2023-04-08 03:42:10.395124+02:00</td>\n",
" <td>859</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>260</td>\n",
" <td>musée jacquemart andré</td>\n",
" <td>2020-10-18 01:20:12.738229+02:00</td>\n",
" <td>2020-10-18 01:20:12.738229+02:00</td>\n",
" <td>3525</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>71</td>\n",
" <td>cité de l'automobile</td>\n",
" <td>2020-10-13 11:05:43.705639+02:00</td>\n",
" <td>2020-12-03 08:33:15.576065+01:00</td>\n",
" <td>449992</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>89</td>\n",
" <td>bassins de lumieres</td>\n",
" <td>2020-10-13 14:56:27.206958+02:00</td>\n",
" <td>2020-10-13 14:56:27.206958+02:00</td>\n",
" <td>460754</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>108</td>\n",
" <td>les baux de provence</td>\n",
" <td>2020-10-14 14:16:20.284658+02:00</td>\n",
" <td>2020-10-14 14:16:20.284658+02:00</td>\n",
" <td>481475</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>161</td>\n",
" <td>les carrières de lumières</td>\n",
" <td>2020-10-14 18:06:57.059828+02:00</td>\n",
" <td>2020-10-14 18:06:57.059828+02:00</td>\n",
" <td>483815</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>118</td>\n",
" <td>villa ephrussi de rothschild</td>\n",
" <td>2020-10-14 15:02:40.478501+02:00</td>\n",
" <td>2020-10-14 15:02:40.478501+02:00</td>\n",
" <td>485539</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>128</td>\n",
" <td>théâtre antique orange</td>\n",
" <td>2020-10-14 15:46:44.072307+02:00</td>\n",
" <td>2020-10-14 15:46:44.072307+02:00</td>\n",
" <td>499380</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>3875</td>\n",
" <td>carrieres de lumieres</td>\n",
" <td>2021-06-11 10:52:15.706030+02:00</td>\n",
" <td>2021-06-11 10:52:15.706030+02:00</td>\n",
" <td>535931</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>3866</td>\n",
" <td>baux-de-provence</td>\n",
" <td>2021-06-11 10:28:30.237144+02:00</td>\n",
" <td>2021-06-11 10:28:30.237144+02:00</td>\n",
" <td>569179</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>392</td>\n",
" <td>tour magne de nîmes</td>\n",
" <td>2020-10-19 17:51:45.915572+02:00</td>\n",
" <td>2020-10-19 17:51:45.915572+02:00</td>\n",
" <td>717981</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>263</td>\n",
" <td>musée maillol</td>\n",
" <td>2020-10-18 01:30:23.853673+02:00</td>\n",
" <td>2020-10-18 01:30:23.853673+02:00</td>\n",
" <td>852301</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>264</td>\n",
" <td>cinéma d'aigues mortes</td>\n",
" <td>2020-10-18 01:30:23.863631+02:00</td>\n",
" <td>2020-10-18 01:30:23.863631+02:00</td>\n",
" <td>852302</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>388</td>\n",
" <td>maison carrée de nîmes</td>\n",
" <td>2020-10-19 17:37:09.345955+02:00</td>\n",
" <td>2020-10-19 17:37:09.345955+02:00</td>\n",
" <td>867431</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>333</td>\n",
" <td>les arènes de nîmes</td>\n",
" <td>2020-10-19 10:17:55.757817+02:00</td>\n",
" <td>2020-10-19 10:17:55.757817+02:00</td>\n",
" <td>867431</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>170</td>\n",
" <td>caumont centre d'art</td>\n",
" <td>2020-10-14 19:13:55.213186+02:00</td>\n",
" <td>2022-10-14 06:21:53.310810+02:00</td>\n",
" <td>887751</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1665</td>\n",
" <td>cité de l'auto</td>\n",
" <td>2020-12-08 18:46:15.957997+01:00</td>\n",
" <td>2020-12-08 18:46:15.957997+01:00</td>\n",
" <td>1418086</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>11836</td>\n",
" <td>phoenix des lumières</td>\n",
" <td>2022-03-08 16:30:03.135537+01:00</td>\n",
" <td>2022-03-08 16:30:03.135537+01:00</td>\n",
" <td>3639035</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>13501</td>\n",
" <td>château de boutemont</td>\n",
" <td>2022-05-10 14:56:36.025562+02:00</td>\n",
" <td>2022-05-10 14:56:36.025562+02:00</td>\n",
" <td>4209418</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>13502</td>\n",
" <td>fabrique des lumières</td>\n",
" <td>2022-05-10 15:05:40.443121+02:00</td>\n",
" <td>2022-05-10 15:05:40.443121+02:00</td>\n",
" <td>4209419</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>22219</td>\n",
" <td>immersive box belgique</td>\n",
" <td>2023-06-13 16:17:37.818103+02:00</td>\n",
" <td>2023-06-13 16:17:37.818103+02:00</td>\n",
" <td>7335205</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>22512</td>\n",
" <td>hall des lumières</td>\n",
" <td>2023-06-29 09:31:23.575220+02:00</td>\n",
" <td>2023-06-29 09:31:23.575220+02:00</td>\n",
" <td>7364467</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>22348</td>\n",
" <td>hdl</td>\n",
" <td>2023-06-20 17:58:19.153019+02:00</td>\n",
" <td>2023-06-29 09:38:51.592547+02:00</td>\n",
" <td>7364467</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>22516</td>\n",
" <td>hall des lumieres</td>\n",
" <td>2023-06-29 09:46:44.718839+02:00</td>\n",
" <td>2023-06-29 09:46:44.718839+02:00</td>\n",
" <td>7364467</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>11835</td>\n",
" <td>hdl - ny</td>\n",
" <td>2022-03-08 16:00:20.821212+01:00</td>\n",
" <td>2023-06-29 09:27:59.256591+02:00</td>\n",
" <td>7446203</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id name \\\n",
"26 1 atelier des lumieres \n",
"17 14007 fabrique des lumieres \n",
"32 2 non défini \n",
"10 10755 NaN \n",
"16 13583 hôtel de caumont \n",
"2 16422 atelier des lumières - cézanne \n",
"20 21098 bassins des lumières - 2022 - venise \n",
"18 23460 immersive box \n",
"8 13584 bassins des lumières - venise \n",
"15 21096 atelier des lumières - 2022 - cézanne \n",
"27 260 musée jacquemart andré \n",
"33 71 cité de l'automobile \n",
"30 89 bassins de lumieres \n",
"7 108 les baux de provence \n",
"19 161 les carrières de lumières \n",
"24 118 villa ephrussi de rothschild \n",
"29 128 théâtre antique orange \n",
"28 3875 carrieres de lumieres \n",
"25 3866 baux-de-provence \n",
"22 392 tour magne de nîmes \n",
"3 263 musée maillol \n",
"6 264 cinéma d'aigues mortes \n",
"21 388 maison carrée de nîmes \n",
"23 333 les arènes de nîmes \n",
"31 170 caumont centre d'art \n",
"5 1665 cité de l'auto \n",
"14 11836 phoenix des lumières \n",
"1 13501 château de boutemont \n",
"4 13502 fabrique des lumières \n",
"12 22219 immersive box belgique \n",
"13 22512 hall des lumières \n",
"11 22348 hdl \n",
"0 22516 hall des lumieres \n",
"9 11835 hdl - ny \n",
"\n",
" created_at updated_at \\\n",
"26 2020-10-12 08:57:27.783770+02:00 2020-10-12 08:57:27.783770+02:00 \n",
"17 2022-05-17 09:11:19.416106+02:00 2022-05-17 09:11:19.416106+02:00 \n",
"32 2020-10-12 08:57:27.785329+02:00 2020-10-12 08:57:27.785329+02:00 \n",
"10 2022-01-28 12:07:16.602885+01:00 2022-01-28 12:07:16.602885+01:00 \n",
"16 2022-05-13 10:59:06.829576+02:00 2022-05-13 10:59:06.829576+02:00 \n",
"2 2022-08-04 04:03:31.045648+02:00 2022-08-04 04:03:31.045648+02:00 \n",
"20 2023-04-08 03:49:46.916777+02:00 2023-04-08 03:49:46.916777+02:00 \n",
"18 2023-08-29 17:39:55.188028+02:00 2023-08-29 17:39:55.188028+02:00 \n",
"8 2022-05-13 11:00:14.943669+02:00 2022-05-13 11:00:14.943669+02:00 \n",
"15 2023-04-08 03:42:10.395124+02:00 2023-04-08 03:42:10.395124+02:00 \n",
"27 2020-10-18 01:20:12.738229+02:00 2020-10-18 01:20:12.738229+02:00 \n",
"33 2020-10-13 11:05:43.705639+02:00 2020-12-03 08:33:15.576065+01:00 \n",
"30 2020-10-13 14:56:27.206958+02:00 2020-10-13 14:56:27.206958+02:00 \n",
"7 2020-10-14 14:16:20.284658+02:00 2020-10-14 14:16:20.284658+02:00 \n",
"19 2020-10-14 18:06:57.059828+02:00 2020-10-14 18:06:57.059828+02:00 \n",
"24 2020-10-14 15:02:40.478501+02:00 2020-10-14 15:02:40.478501+02:00 \n",
"29 2020-10-14 15:46:44.072307+02:00 2020-10-14 15:46:44.072307+02:00 \n",
"28 2021-06-11 10:52:15.706030+02:00 2021-06-11 10:52:15.706030+02:00 \n",
"25 2021-06-11 10:28:30.237144+02:00 2021-06-11 10:28:30.237144+02:00 \n",
"22 2020-10-19 17:51:45.915572+02:00 2020-10-19 17:51:45.915572+02:00 \n",
"3 2020-10-18 01:30:23.853673+02:00 2020-10-18 01:30:23.853673+02:00 \n",
"6 2020-10-18 01:30:23.863631+02:00 2020-10-18 01:30:23.863631+02:00 \n",
"21 2020-10-19 17:37:09.345955+02:00 2020-10-19 17:37:09.345955+02:00 \n",
"23 2020-10-19 10:17:55.757817+02:00 2020-10-19 10:17:55.757817+02:00 \n",
"31 2020-10-14 19:13:55.213186+02:00 2022-10-14 06:21:53.310810+02:00 \n",
"5 2020-12-08 18:46:15.957997+01:00 2020-12-08 18:46:15.957997+01:00 \n",
"14 2022-03-08 16:30:03.135537+01:00 2022-03-08 16:30:03.135537+01:00 \n",
"1 2022-05-10 14:56:36.025562+02:00 2022-05-10 14:56:36.025562+02:00 \n",
"4 2022-05-10 15:05:40.443121+02:00 2022-05-10 15:05:40.443121+02:00 \n",
"12 2023-06-13 16:17:37.818103+02:00 2023-06-13 16:17:37.818103+02:00 \n",
"13 2023-06-29 09:31:23.575220+02:00 2023-06-29 09:31:23.575220+02:00 \n",
"11 2023-06-20 17:58:19.153019+02:00 2023-06-29 09:38:51.592547+02:00 \n",
"0 2023-06-29 09:46:44.718839+02:00 2023-06-29 09:46:44.718839+02:00 \n",
"9 2022-03-08 16:00:20.821212+01:00 2023-06-29 09:27:59.256591+02:00 \n",
"\n",
" street_id fixed_capacity \n",
"26 1 NaN \n",
"17 2 NaN \n",
"32 2 NaN \n",
"10 2 NaN \n",
"16 859 NaN \n",
"2 859 NaN \n",
"20 859 NaN \n",
"18 859 NaN \n",
"8 859 NaN \n",
"15 859 NaN \n",
"27 3525 NaN \n",
"33 449992 NaN \n",
"30 460754 NaN \n",
"7 481475 NaN \n",
"19 483815 NaN \n",
"24 485539 NaN \n",
"29 499380 NaN \n",
"28 535931 NaN \n",
"25 569179 NaN \n",
"22 717981 NaN \n",
"3 852301 NaN \n",
"6 852302 NaN \n",
"21 867431 NaN \n",
"23 867431 NaN \n",
"31 887751 NaN \n",
"5 1418086 NaN \n",
"14 3639035 NaN \n",
"1 4209418 NaN \n",
"4 4209419 NaN \n",
"12 7335205 NaN \n",
"13 7364467 NaN \n",
"11 7364467 NaN \n",
"0 7364467 NaN \n",
"9 7446203 NaN "
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Company studied in this section\n",
"company_number = '101'\n",
"\n",
"# Load the facilities table of that company\n",
"facilities = load_dataset_2(directory_path = company_number, file_name = \"facilities\")\n",
"\n",
"# Display it ordered by street_id\n",
"facilities.sort_values('street_id')"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "c8c8eea4-21a2-487b-b20a-15d73616a253",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id_x</th>\n",
" <th>sent_at</th>\n",
" <th>software</th>\n",
" <th>satisfaction</th>\n",
" <th>extra_field</th>\n",
" <th>customer_id</th>\n",
" <th>contribution_site_id</th>\n",
" <th>created_at_x</th>\n",
" <th>updated_at_x</th>\n",
" <th>id_y</th>\n",
" <th>facility_id</th>\n",
" <th>created_at_y</th>\n",
" <th>updated_at_y</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>134910</td>\n",
" <td>2017-07-30 15:50:15+02:00</td>\n",
" <td>NaN</td>\n",
" <td>8.0</td>\n",
" <td>NaN</td>\n",
" <td>91936</td>\n",
" <td>70</td>\n",
" <td>2020-09-25 20:41:07.752795+02:00</td>\n",
" <td>2020-09-25 20:41:07.752795+02:00</td>\n",
" <td>70</td>\n",
" <td>438</td>\n",
" <td>2020-09-25 20:41:07.735280+02:00</td>\n",
" <td>2020-09-25 20:41:07.735280+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>5848427</td>\n",
" <td>2020-03-04 16:18:13.597000+01:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>224450</td>\n",
" <td>3420</td>\n",
" <td>2022-01-21 02:44:34.857144+01:00</td>\n",
" <td>2022-01-21 02:44:34.857144+01:00</td>\n",
" <td>3420</td>\n",
" <td>6650</td>\n",
" <td>2022-01-21 02:44:34.690938+01:00</td>\n",
" <td>2022-01-21 02:44:34.690938+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>918383</td>\n",
" <td>2020-10-24 14:59:22.784000+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>397718</td>\n",
" <td>208</td>\n",
" <td>2020-10-25 02:06:54.048105+02:00</td>\n",
" <td>2020-10-25 02:06:54.048105+02:00</td>\n",
" <td>208</td>\n",
" <td>576</td>\n",
" <td>2020-09-27 18:05:14.671650+02:00</td>\n",
" <td>2020-09-27 18:05:14.671650+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>918384</td>\n",
" <td>2020-10-24 14:35:39.725000+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>397719</td>\n",
" <td>208</td>\n",
" <td>2020-10-25 02:06:54.050218+02:00</td>\n",
" <td>2020-10-25 02:06:54.050218+02:00</td>\n",
" <td>208</td>\n",
" <td>576</td>\n",
" <td>2020-09-27 18:05:14.671650+02:00</td>\n",
" <td>2020-09-27 18:05:14.671650+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>918385</td>\n",
" <td>2020-10-24 12:45:35.225000+02:00</td>\n",
" <td>NaN</td>\n",
" <td>10.0</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>208</td>\n",
" <td>2020-10-25 02:06:54.052201+02:00</td>\n",
" <td>2020-10-25 02:06:54.052201+02:00</td>\n",
" <td>208</td>\n",
" <td>576</td>\n",
" <td>2020-09-27 18:05:14.671650+02:00</td>\n",
" <td>2020-09-27 18:05:14.671650+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25454</th>\n",
" <td>1951</td>\n",
" <td>2018-03-20 09:34:09+01:00</td>\n",
" <td>NaN</td>\n",
" <td>8.0</td>\n",
" <td>NaN</td>\n",
" <td>69694</td>\n",
" <td>1</td>\n",
" <td>2020-09-25 20:06:37.138272+02:00</td>\n",
" <td>2020-09-25 20:06:37.138272+02:00</td>\n",
" <td>1</td>\n",
" <td>369</td>\n",
" <td>2020-09-25 20:06:35.964342+02:00</td>\n",
" <td>2020-09-25 20:06:35.964342+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25455</th>\n",
" <td>1952</td>\n",
" <td>2018-03-20 09:31:56+01:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>69694</td>\n",
" <td>1</td>\n",
" <td>2020-09-25 20:06:37.138874+02:00</td>\n",
" <td>2020-09-25 20:06:37.138874+02:00</td>\n",
" <td>1</td>\n",
" <td>369</td>\n",
" <td>2020-09-25 20:06:35.964342+02:00</td>\n",
" <td>2020-09-25 20:06:35.964342+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25456</th>\n",
" <td>1954</td>\n",
" <td>2018-03-20 09:30:44+01:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>69694</td>\n",
" <td>1</td>\n",
" <td>2020-09-25 20:06:37.140372+02:00</td>\n",
" <td>2020-09-25 20:06:37.140372+02:00</td>\n",
" <td>1</td>\n",
" <td>369</td>\n",
" <td>2020-09-25 20:06:35.964342+02:00</td>\n",
" <td>2020-09-25 20:06:35.964342+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25457</th>\n",
" <td>1955</td>\n",
" <td>2018-03-20 09:28:49+01:00</td>\n",
" <td>NaN</td>\n",
" <td>8.0</td>\n",
" <td>NaN</td>\n",
" <td>69695</td>\n",
" <td>1</td>\n",
" <td>2020-09-25 20:06:37.140966+02:00</td>\n",
" <td>2020-09-25 20:06:37.140966+02:00</td>\n",
" <td>1</td>\n",
" <td>369</td>\n",
" <td>2020-09-25 20:06:35.964342+02:00</td>\n",
" <td>2020-09-25 20:06:35.964342+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25458</th>\n",
" <td>1953</td>\n",
" <td>2018-03-20 09:31:23.361000+01:00</td>\n",
" <td>NaN</td>\n",
" <td>8.0</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>2020-09-25 20:06:37.139437+02:00</td>\n",
" <td>2020-09-25 20:06:37.139437+02:00</td>\n",
" <td>1</td>\n",
" <td>369</td>\n",
" <td>2020-09-25 20:06:35.964342+02:00</td>\n",
" <td>2020-09-25 20:06:35.964342+02:00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>25459 rows × 13 columns</p>\n",
"</div>"
],
"text/plain": [
" id_x sent_at software satisfaction \\\n",
"0 134910 2017-07-30 15:50:15+02:00 NaN 8.0 \n",
"1 5848427 2020-03-04 16:18:13.597000+01:00 NaN NaN \n",
"2 918383 2020-10-24 14:59:22.784000+02:00 NaN NaN \n",
"3 918384 2020-10-24 14:35:39.725000+02:00 NaN NaN \n",
"4 918385 2020-10-24 12:45:35.225000+02:00 NaN 10.0 \n",
"... ... ... ... ... \n",
"25454 1951 2018-03-20 09:34:09+01:00 NaN 8.0 \n",
"25455 1952 2018-03-20 09:31:56+01:00 NaN NaN \n",
"25456 1954 2018-03-20 09:30:44+01:00 NaN NaN \n",
"25457 1955 2018-03-20 09:28:49+01:00 NaN 8.0 \n",
"25458 1953 2018-03-20 09:31:23.361000+01:00 NaN 8.0 \n",
"\n",
" extra_field customer_id contribution_site_id \\\n",
"0 NaN 91936 70 \n",
"1 NaN 224450 3420 \n",
"2 NaN 397718 208 \n",
"3 NaN 397719 208 \n",
"4 NaN 2 208 \n",
"... ... ... ... \n",
"25454 NaN 69694 1 \n",
"25455 NaN 69694 1 \n",
"25456 NaN 69694 1 \n",
"25457 NaN 69695 1 \n",
"25458 NaN 2 1 \n",
"\n",
" created_at_x updated_at_x \\\n",
"0 2020-09-25 20:41:07.752795+02:00 2020-09-25 20:41:07.752795+02:00 \n",
"1 2022-01-21 02:44:34.857144+01:00 2022-01-21 02:44:34.857144+01:00 \n",
"2 2020-10-25 02:06:54.048105+02:00 2020-10-25 02:06:54.048105+02:00 \n",
"3 2020-10-25 02:06:54.050218+02:00 2020-10-25 02:06:54.050218+02:00 \n",
"4 2020-10-25 02:06:54.052201+02:00 2020-10-25 02:06:54.052201+02:00 \n",
"... ... ... \n",
"25454 2020-09-25 20:06:37.138272+02:00 2020-09-25 20:06:37.138272+02:00 \n",
"25455 2020-09-25 20:06:37.138874+02:00 2020-09-25 20:06:37.138874+02:00 \n",
"25456 2020-09-25 20:06:37.140372+02:00 2020-09-25 20:06:37.140372+02:00 \n",
"25457 2020-09-25 20:06:37.140966+02:00 2020-09-25 20:06:37.140966+02:00 \n",
"25458 2020-09-25 20:06:37.139437+02:00 2020-09-25 20:06:37.139437+02:00 \n",
"\n",
" id_y facility_id created_at_y \\\n",
"0 70 438 2020-09-25 20:41:07.735280+02:00 \n",
"1 3420 6650 2022-01-21 02:44:34.690938+01:00 \n",
"2 208 576 2020-09-27 18:05:14.671650+02:00 \n",
"3 208 576 2020-09-27 18:05:14.671650+02:00 \n",
"4 208 576 2020-09-27 18:05:14.671650+02:00 \n",
"... ... ... ... \n",
"25454 1 369 2020-09-25 20:06:35.964342+02:00 \n",
"25455 1 369 2020-09-25 20:06:35.964342+02:00 \n",
"25456 1 369 2020-09-25 20:06:35.964342+02:00 \n",
"25457 1 369 2020-09-25 20:06:35.964342+02:00 \n",
"25458 1 369 2020-09-25 20:06:35.964342+02:00 \n",
"\n",
" updated_at_y \n",
"0 2020-09-25 20:41:07.735280+02:00 \n",
"1 2022-01-21 02:44:34.690938+01:00 \n",
"2 2020-09-27 18:05:14.671650+02:00 \n",
"3 2020-09-27 18:05:14.671650+02:00 \n",
"4 2020-09-27 18:05:14.671650+02:00 \n",
"... ... \n",
"25454 2020-09-25 20:06:35.964342+02:00 \n",
"25455 2020-09-25 20:06:35.964342+02:00 \n",
"25456 2020-09-25 20:06:35.964342+02:00 \n",
"25457 2020-09-25 20:06:35.964342+02:00 \n",
"25458 2020-09-25 20:06:35.964342+02:00 \n",
"\n",
"[25459 rows x 13 columns]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load contributions and contribution sites for the current company\n",
"contributions = load_dataset_2(directory_path = company_number, file_name = \"contributions\")\n",
"contribution_sites = load_dataset_2(directory_path = company_number, file_name = \"contribution_sites\")\n",
"\n",
"# Attach to each contribution the site it refers to (columns shared by both tables get the _x / _y suffixes)\n",
"contributions.merge(contribution_sites, how = 'inner', left_on = 'contribution_site_id', right_on = 'id')"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "85b70219-f753-422e-9f57-a26eb28e7481",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"id 0.000000\n",
"sent_at 0.000000\n",
"software 1.000000\n",
"satisfaction 0.430732\n",
"extra_field 1.000000\n",
"customer_id 0.000000\n",
"contribution_site_id 0.000000\n",
"created_at 0.000000\n",
"updated_at 0.000000\n",
"dtype: float64"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Share of missing values in each column of contributions\n",
"contributions.isna().mean()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "647920c8-da07-4e87-964b-304fd7ff79f5",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>eur</td>\n",
" <td>2023-07-17 15:35:19.957203+02:00</td>\n",
" <td>2023-07-17 15:35:19.957203+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>usd</td>\n",
" <td>2023-07-17 15:35:21.132408+02:00</td>\n",
" <td>2023-07-17 15:35:21.132408+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>gbp</td>\n",
" <td>2023-07-17 15:35:21.843594+02:00</td>\n",
" <td>2023-07-17 15:35:21.843594+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>chf</td>\n",
" <td>2023-07-17 15:35:23.229322+02:00</td>\n",
" <td>2023-07-17 15:35:23.229322+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>cad</td>\n",
" <td>2023-07-17 15:35:24.262466+02:00</td>\n",
" <td>2023-07-17 15:35:24.262466+02:00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id name created_at updated_at\n",
"0 1 eur 2023-07-17 15:35:19.957203+02:00 2023-07-17 15:35:19.957203+02:00\n",
"1 2 usd 2023-07-17 15:35:21.132408+02:00 2023-07-17 15:35:21.132408+02:00\n",
"2 3 gbp 2023-07-17 15:35:21.843594+02:00 2023-07-17 15:35:21.843594+02:00\n",
"3 4 chf 2023-07-17 15:35:23.229322+02:00 2023-07-17 15:35:23.229322+02:00\n",
"4 5 cad 2023-07-17 15:35:24.262466+02:00 2023-07-17 15:35:24.262466+02:00"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# NOTE: company_number is rebound to company 2 here; cells run after this one\n",
"# that read company_number will load company 2 tables\n",
"company_number = \"2\"\n",
"\n",
"load_dataset_2(directory_path = company_number, file_name = \"currencies\")"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "bc1f3d28-7f0c-4e87-baf7-dddcf03a7145",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>percent_price</th>\n",
" <th>max_price</th>\n",
" <th>min_price</th>\n",
" <th>category_id</th>\n",
" <th>pricing_formula_id</th>\n",
" <th>representation_id</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>100.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2023-10-13 13:02:32.517137+02:00</td>\n",
" <td>2023-10-13 13:02:32.517137+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>100.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>2023-10-13 13:02:32.531505+02:00</td>\n",
" <td>2023-10-13 13:02:32.531505+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>100.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>2023-10-13 13:02:32.532172+02:00</td>\n",
" <td>2023-10-13 13:02:32.532172+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>100.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>2023-10-13 13:02:32.532665+02:00</td>\n",
" <td>2023-10-13 13:02:32.532665+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>100.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" <td>2023-10-13 13:02:32.533142+02:00</td>\n",
" <td>2023-10-13 13:02:32.533142+02:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>779980</th>\n",
" <td>810312</td>\n",
" <td>100.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>567254</td>\n",
" <td>2023-11-09 05:14:16.770130+01:00</td>\n",
" <td>2023-11-09 05:14:16.770130+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>779981</th>\n",
" <td>810313</td>\n",
" <td>100.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>567254</td>\n",
" <td>2023-11-09 05:14:16.770538+01:00</td>\n",
" <td>2023-11-09 05:14:16.770538+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>779982</th>\n",
" <td>810314</td>\n",
" <td>100.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>567255</td>\n",
" <td>2023-11-09 05:14:16.770916+01:00</td>\n",
" <td>2023-11-09 05:14:16.770916+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>779983</th>\n",
" <td>810315</td>\n",
" <td>100.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>567256</td>\n",
" <td>2023-11-09 05:14:16.771359+01:00</td>\n",
" <td>2023-11-09 05:14:16.771359+01:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>779984</th>\n",
" <td>810316</td>\n",
" <td>100.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>567257</td>\n",
" <td>2023-11-09 05:14:16.771761+01:00</td>\n",
" <td>2023-11-09 05:14:16.771761+01:00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>779985 rows × 9 columns</p>\n",
"</div>"
],
"text/plain": [
" id percent_price max_price min_price category_id \\\n",
"0 1 100.0 0.0 0.0 1 \n",
"1 2 100.0 0.0 0.0 1 \n",
"2 3 100.0 0.0 0.0 1 \n",
"3 4 100.0 0.0 0.0 1 \n",
"4 5 100.0 0.0 0.0 1 \n",
"... ... ... ... ... ... \n",
"779980 810312 100.0 0.0 0.0 1 \n",
"779981 810313 100.0 0.0 0.0 1 \n",
"779982 810314 100.0 0.0 0.0 1 \n",
"779983 810315 100.0 0.0 0.0 1 \n",
"779984 810316 100.0 0.0 0.0 1 \n",
"\n",
" pricing_formula_id representation_id \\\n",
"0 1 1 \n",
"1 1 2 \n",
"2 1 3 \n",
"3 1 4 \n",
"4 1 5 \n",
"... ... ... \n",
"779980 1 567254 \n",
"779981 4 567254 \n",
"779982 1 567255 \n",
"779983 1 567256 \n",
"779984 1 567257 \n",
"\n",
" created_at updated_at \n",
"0 2023-10-13 13:02:32.517137+02:00 2023-10-13 13:02:32.517137+02:00 \n",
"1 2023-10-13 13:02:32.531505+02:00 2023-10-13 13:02:32.531505+02:00 \n",
"2 2023-10-13 13:02:32.532172+02:00 2023-10-13 13:02:32.532172+02:00 \n",
"3 2023-10-13 13:02:32.532665+02:00 2023-10-13 13:02:32.532665+02:00 \n",
"4 2023-10-13 13:02:32.533142+02:00 2023-10-13 13:02:32.533142+02:00 \n",
"... ... ... \n",
"779980 2023-11-09 05:14:16.770130+01:00 2023-11-09 05:14:16.770130+01:00 \n",
"779981 2023-11-09 05:14:16.770538+01:00 2023-11-09 05:14:16.770538+01:00 \n",
"779982 2023-11-09 05:14:16.770916+01:00 2023-11-09 05:14:16.770916+01:00 \n",
"779983 2023-11-09 05:14:16.771359+01:00 2023-11-09 05:14:16.771359+01:00 \n",
"779984 2023-11-09 05:14:16.771761+01:00 2023-11-09 05:14:16.771761+01:00 \n",
"\n",
"[779985 rows x 9 columns]"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"load_dataset_2(company_number, \"products_groups\")"
]
},
{
"cell_type": "markdown",
"id": "45d5261f-4d46-49cb-8582-dd2121122b05",
"metadata": {},
"source": [
"# 1 - Comportement d'achat"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "8917cc1b-4728-460c-8432-a633de7f039b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : projet-bdc2324-team1/0_Input/Company_1/products_purchased_reduced.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : projet-bdc2324-team1/0_Input/Company_2/products_purchased_reduced.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : projet-bdc2324-team1/0_Input/Company_3/products_purchased_reduced.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : projet-bdc2324-team1/0_Input/Company_4/products_purchased_reduced.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n",
"<string>:13: DtypeWarning: Columns (12) have mixed types. Specify dtype option on import or set low_memory=False.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : projet-bdc2324-team1/0_Input/Company_101/products_purchased_reduced.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : projet-bdc2324-team1/0_Input/Company_101/products_purchased_reduced_1.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"<string>:13: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n"
]
}
],
"source": [
"# Load every company's reduced purchases table into a module-level variable\n",
"# named df<company_number>_tickets; outlier_detection looks them up via globals().\n",
"for company_number in ('1', '2', '3', '4', '101'):\n",
"    nom_dataframe = f'df{company_number}_tickets'\n",
"    globals()[nom_dataframe] = display_databases(\n",
"        company_number,\n",
"        file_name='products_purchased_reduced',\n",
"        datetime_col=['purchase_date'],\n",
"    )\n",
"\n",
"    # Company 101 has an additional purchases file, kept in its own variable\n",
"    if company_number == '101':\n",
"        df101_tickets_1 = display_databases(\n",
"            company_number,\n",
"            file_name='products_purchased_reduced_1',\n",
"            datetime_col=['purchase_date'],\n",
"        )"
]
},
{
"cell_type": "markdown",
"id": "3479960c-0d23-45f1-8fff-d87395205731",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## Outlier"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "9376af51-4320-44b6-8f30-1e1234371556",
"metadata": {},
"outputs": [],
"source": [
"def outlier_detection(directory_path = \"1\", coupure = 1):\n",
"    \"\"\"Plot the share of a company's total purchase amount held by its top customers.\n",
"\n",
"    Draws a pie chart with one slice per customer for the `coupure` largest\n",
"    total amounts and a single 'Autre' slice summing every remaining customer.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    directory_path : str\n",
"        Company number. The tickets table is read from the module-level\n",
"        variable df<directory_path>_tickets, which must already exist.\n",
"    coupure : int\n",
"        Number of top customers drawn as individual slices.\n",
"\n",
"    Returns\n",
"    -------\n",
"    None\n",
"        The figure is shown with plt.show().\n",
"\n",
"    Raises\n",
"    ------\n",
"    KeyError\n",
"        If df<directory_path>_tickets has not been loaded into globals().\n",
"    \"\"\"\n",
"    # Pass a copy, as before, in case tickets_kpi_function modifies its input\n",
"    tickets_kpi = tickets_kpi_function(globals()['df' + directory_path + '_tickets'].copy())\n",
"\n",
"    if directory_path == \"101\":\n",
"        # Company 101 is loaded from two files: append the KPIs of the second one\n",
"        tickets_kpi_1 = tickets_kpi_function(df101_tickets_1.copy())\n",
"        tickets_kpi = pd.concat([tickets_kpi, tickets_kpi_1])\n",
"\n",
"    # Total amount per customer, largest first (index = customer_id)\n",
"    amount_by_customer = (\n",
"        tickets_kpi.groupby('customer_id')['total_amount']\n",
"        .sum()\n",
"        .sort_values(ascending=False)\n",
"    )\n",
"\n",
"    # Split by position: plain [] slicing may be interpreted as label-based\n",
"    # depending on the dtype of the customer_id index, .iloc never is\n",
"    top = amount_by_customer.iloc[:coupure]\n",
"    rest_sum = amount_by_customer.iloc[coupure:].sum()\n",
"\n",
"    # The `coupure` largest customers followed by the aggregated remainder\n",
"    pie_values = pd.concat([top, pd.Series([rest_sum], index=['Autre'])])\n",
"\n",
"    plt.figure(figsize=(3, 3))\n",
"    plt.pie(pie_values, labels=pie_values.index, autopct='%1.1f%%', startangle=140, pctdistance=0.5)\n",
"    plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle\n",
"    plt.title('Répartition des montants totaux')\n",
"    plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "73211efc-b79f-4235-a250-c0699ea277bf",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 300x300 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Company 1: two largest customers shown individually against all the others\n",
"outlier_detection('1', coupure=2)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "5c8e9bb7-a403-4898-b40b-47aa37237bc6",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>lastname</th>\n",
" <th>firstname</th>\n",
" <th>birthdate</th>\n",
" <th>email</th>\n",
" <th>street_id</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>civility</th>\n",
" <th>is_partner</th>\n",
" <th>extra</th>\n",
" <th>deleted_at</th>\n",
" <th>reference</th>\n",
" <th>gender</th>\n",
" <th>is_email_true</th>\n",
" <th>extra_field</th>\n",
" <th>opt_in</th>\n",
" <th>structure_id</th>\n",
" <th>note</th>\n",
" <th>profession</th>\n",
" <th>language</th>\n",
" <th>mcp_contact_id</th>\n",
" <th>need_reload</th>\n",
" <th>last_buying_date</th>\n",
" <th>max_price</th>\n",
" <th>ticket_sum</th>\n",
" <th>average_price</th>\n",
" <th>fidelity</th>\n",
" <th>average_purchase_delay</th>\n",
" <th>average_price_basket</th>\n",
" <th>average_ticket_basket</th>\n",
" <th>total_price</th>\n",
" <th>preferred_category</th>\n",
" <th>preferred_supplier</th>\n",
" <th>preferred_formula</th>\n",
" <th>purchase_count</th>\n",
" <th>first_buying_date</th>\n",
" <th>last_visiting_date</th>\n",
" <th>zipcode</th>\n",
" <th>country</th>\n",
" <th>age</th>\n",
" <th>tenant_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>58201</th>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>2020-09-03 13:11:25.569167+02:00</td>\n",
" <td>2023-03-04 13:27:42.761679+01:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>2023-11-08 03:20:07</td>\n",
" <td>45.0</td>\n",
" <td>1254775</td>\n",
" <td>7.030122</td>\n",
" <td>330831</td>\n",
" <td>-67.790969</td>\n",
" <td>13.75153</td>\n",
" <td>1.956087</td>\n",
" <td>8821221.5</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>641472</td>\n",
" <td>2013-06-10 12:37:58+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>fr</td>\n",
" <td>NaN</td>\n",
" <td>1311</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id lastname firstname birthdate email street_id \\\n",
"58201 1 NaN NaN NaN NaN 2 \n",
"\n",
" created_at updated_at \\\n",
"58201 2020-09-03 13:11:25.569167+02:00 2023-03-04 13:27:42.761679+01:00 \n",
"\n",
" civility is_partner extra deleted_at reference gender \\\n",
"58201 NaN False NaN NaN NaN 2 \n",
"\n",
" is_email_true extra_field opt_in structure_id note profession \\\n",
"58201 True NaN False NaN NaN NaN \n",
"\n",
" language mcp_contact_id need_reload last_buying_date max_price \\\n",
"58201 NaN NaN False 2023-11-08 03:20:07 45.0 \n",
"\n",
" ticket_sum average_price fidelity average_purchase_delay \\\n",
"58201 1254775 7.030122 330831 -67.790969 \n",
"\n",
" average_price_basket average_ticket_basket total_price \\\n",
"58201 13.75153 1.956087 8821221.5 \n",
"\n",
" preferred_category preferred_supplier preferred_formula \\\n",
"58201 NaN NaN NaN \n",
"\n",
" purchase_count first_buying_date last_visiting_date zipcode \\\n",
"58201 641472 2013-06-10 12:37:58+02:00 NaN NaN \n",
"\n",
" country age tenant_id \n",
"58201 fr NaN 1311 "
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Look up customer id 1 in company 1's customersplus table\n",
"# (presumably the dominant customer of the pie chart above - TODO confirm)\n",
"df = load_dataset_2('1', 'customersplus')\n",
"df.loc[df['id'] == 1]"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "4455b6b9-8395-47ea-b976-d98a2d3c782c",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 300x300 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Company 2: two largest customers shown individually against all the others\n",
"outlier_detection('2', coupure=2)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "ee16cf31-18e1-4803-b003-ba1d1a3fc333",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>lastname</th>\n",
" <th>firstname</th>\n",
" <th>birthdate</th>\n",
" <th>email</th>\n",
" <th>street_id</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>civility</th>\n",
" <th>is_partner</th>\n",
" <th>extra</th>\n",
" <th>deleted_at</th>\n",
" <th>reference</th>\n",
" <th>gender</th>\n",
" <th>is_email_true</th>\n",
" <th>extra_field</th>\n",
" <th>opt_in</th>\n",
" <th>structure_id</th>\n",
" <th>note</th>\n",
" <th>profession</th>\n",
" <th>language</th>\n",
" <th>mcp_contact_id</th>\n",
" <th>need_reload</th>\n",
" <th>last_buying_date</th>\n",
" <th>max_price</th>\n",
" <th>ticket_sum</th>\n",
" <th>average_price</th>\n",
" <th>fidelity</th>\n",
" <th>average_purchase_delay</th>\n",
" <th>average_price_basket</th>\n",
" <th>average_ticket_basket</th>\n",
" <th>total_price</th>\n",
" <th>preferred_category</th>\n",
" <th>preferred_supplier</th>\n",
" <th>preferred_formula</th>\n",
" <th>purchase_count</th>\n",
" <th>first_buying_date</th>\n",
" <th>last_visiting_date</th>\n",
" <th>zipcode</th>\n",
" <th>country</th>\n",
" <th>age</th>\n",
" <th>tenant_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>170246</th>\n",
" <td>12184</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>3564</td>\n",
" <td>2023-10-12 12:25:15.438714+02:00</td>\n",
" <td>2023-11-09 05:14:01.944407+01:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>1275.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>2023-11-08 19:17:50.565000</td>\n",
" <td>75.0</td>\n",
" <td>512831</td>\n",
" <td>12.645438</td>\n",
" <td>197358</td>\n",
" <td>0.0</td>\n",
" <td>31.719577</td>\n",
" <td>2.508381</td>\n",
" <td>6484972.4</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>204447</td>\n",
" <td>2020-08-28 08:55:55.710000+02:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1879</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id lastname firstname birthdate email street_id \\\n",
"170246 12184 NaN NaN NaN NaN 3564 \n",
"\n",
" created_at updated_at \\\n",
"170246 2023-10-12 12:25:15.438714+02:00 2023-11-09 05:14:01.944407+01:00 \n",
"\n",
" civility is_partner extra deleted_at reference gender \\\n",
"170246 NaN False NaN NaN NaN 2 \n",
"\n",
" is_email_true extra_field opt_in structure_id note profession \\\n",
"170246 True NaN False 1275.0 NaN NaN \n",
"\n",
" language mcp_contact_id need_reload last_buying_date \\\n",
"170246 NaN NaN False 2023-11-08 19:17:50.565000 \n",
"\n",
" max_price ticket_sum average_price fidelity \\\n",
"170246 75.0 512831 12.645438 197358 \n",
"\n",
" average_purchase_delay average_price_basket average_ticket_basket \\\n",
"170246 0.0 31.719577 2.508381 \n",
"\n",
" total_price preferred_category preferred_supplier \\\n",
"170246 6484972.4 NaN NaN \n",
"\n",
" preferred_formula purchase_count first_buying_date \\\n",
"170246 NaN 204447 2020-08-28 08:55:55.710000+02:00 \n",
"\n",
" last_visiting_date zipcode country age tenant_id \n",
"170246 NaN NaN NaN NaN 1879 "
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Look up customer id 12184 in company 2's customersplus table\n",
"# (presumably the dominant customer of the pie chart above - TODO confirm)\n",
"df = load_dataset_2('2', 'customersplus')\n",
"df.loc[df['id'] == 12184]"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "4073c986-3e2c-4945-8601-220fea747c9c",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>lastname</th>\n",
" <th>firstname</th>\n",
" <th>birthdate</th>\n",
" <th>email</th>\n",
" <th>street_id</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>civility</th>\n",
" <th>is_partner</th>\n",
" <th>extra</th>\n",
" <th>deleted_at</th>\n",
" <th>reference</th>\n",
" <th>gender</th>\n",
" <th>is_email_true</th>\n",
" <th>extra_field</th>\n",
" <th>opt_in</th>\n",
" <th>structure_id</th>\n",
" <th>note</th>\n",
" <th>profession</th>\n",
" <th>language</th>\n",
" <th>mcp_contact_id</th>\n",
" <th>need_reload</th>\n",
" <th>last_buying_date</th>\n",
" <th>max_price</th>\n",
" <th>ticket_sum</th>\n",
" <th>average_price</th>\n",
" <th>fidelity</th>\n",
" <th>average_purchase_delay</th>\n",
" <th>average_price_basket</th>\n",
" <th>average_ticket_basket</th>\n",
" <th>total_price</th>\n",
" <th>preferred_category</th>\n",
" <th>preferred_supplier</th>\n",
" <th>preferred_formula</th>\n",
" <th>purchase_count</th>\n",
" <th>first_buying_date</th>\n",
" <th>last_visiting_date</th>\n",
" <th>zipcode</th>\n",
" <th>country</th>\n",
" <th>age</th>\n",
" <th>tenant_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>102639</th>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>email1</td>\n",
" <td>1</td>\n",
" <td>2023-07-20 17:16:27.062822+02:00</td>\n",
" <td>2023-07-20 17:16:27.074952+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>fr</td>\n",
" <td>NaN</td>\n",
" <td>1879</td>\n",
" </tr>\n",
" <tr>\n",
" <th>224453</th>\n",
" <td>2</td>\n",
" <td>NaN</td>\n",
" <td>firstname2</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>2023-07-21 10:18:44.502496+02:00</td>\n",
" <td>2023-07-21 10:18:44.502496+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>josef</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>ch</td>\n",
" <td>NaN</td>\n",
" <td>1879</td>\n",
" </tr>\n",
" <tr>\n",
" <th>103013</th>\n",
" <td>3</td>\n",
" <td>NaN</td>\n",
" <td>firstname3</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>3</td>\n",
" <td>2023-07-21 10:18:44.503913+02:00</td>\n",
" <td>2023-07-21 10:18:44.503913+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>dominic</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>ch</td>\n",
" <td>NaN</td>\n",
" <td>1879</td>\n",
" </tr>\n",
" <tr>\n",
" <th>138386</th>\n",
" <td>4</td>\n",
" <td>NaN</td>\n",
" <td>firstname4</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>3</td>\n",
" <td>2023-07-21 10:18:44.504404+02:00</td>\n",
" <td>2023-07-21 10:18:44.504404+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>abigail</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>ch</td>\n",
" <td>NaN</td>\n",
" <td>1879</td>\n",
" </tr>\n",
" <tr>\n",
" <th>190087</th>\n",
" <td>5</td>\n",
" <td>NaN</td>\n",
" <td>firstname5</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>3</td>\n",
" <td>2023-07-21 10:18:44.504841+02:00</td>\n",
" <td>2023-07-21 10:18:44.504841+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>sophia</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>ch</td>\n",
" <td>NaN</td>\n",
" <td>1879</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>101868</th>\n",
" <td>601387</td>\n",
" <td>lastname601387</td>\n",
" <td>firstname601387</td>\n",
" <td>NaN</td>\n",
" <td>email601387</td>\n",
" <td>3550</td>\n",
" <td>2023-11-09 05:13:57.358715+01:00</td>\n",
" <td>2023-11-09 05:13:57.358715+01:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>de</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1879</td>\n",
" </tr>\n",
" <tr>\n",
" <th>205168</th>\n",
" <td>601388</td>\n",
" <td>lastname601388</td>\n",
" <td>firstname601388</td>\n",
" <td>NaN</td>\n",
" <td>email601388</td>\n",
" <td>3550</td>\n",
" <td>2023-11-09 05:13:57.359234+01:00</td>\n",
" <td>2023-11-09 05:13:57.359234+01:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>de</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>2023-11-09 00:25:24.716000</td>\n",
" <td>15.0</td>\n",
" <td>2</td>\n",
" <td>14.0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>28.0</td>\n",
" <td>2.0</td>\n",
" <td>28.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1</td>\n",
" <td>2023-11-09 00:25:24.716000+01:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1879</td>\n",
" </tr>\n",
" <tr>\n",
" <th>67641</th>\n",
" <td>601389</td>\n",
" <td>lastname601389</td>\n",
" <td>firstname601389</td>\n",
" <td>NaN</td>\n",
" <td>email601389</td>\n",
" <td>3550</td>\n",
" <td>2023-11-09 05:13:57.360373+01:00</td>\n",
" <td>2023-11-09 05:13:57.360373+01:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>de</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>2023-11-09 00:28:07.511000</td>\n",
" <td>15.0</td>\n",
" <td>2</td>\n",
" <td>15.0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>30.0</td>\n",
" <td>2.0</td>\n",
" <td>30.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1</td>\n",
" <td>2023-11-09 00:28:07.511000+01:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1879</td>\n",
" </tr>\n",
" <tr>\n",
" <th>67639</th>\n",
" <td>601390</td>\n",
" <td>lastname601390</td>\n",
" <td>firstname601390</td>\n",
" <td>NaN</td>\n",
" <td>email601390</td>\n",
" <td>3550</td>\n",
" <td>2023-11-09 05:13:57.360903+01:00</td>\n",
" <td>2023-11-09 05:13:57.360903+01:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1879</td>\n",
" </tr>\n",
" <tr>\n",
" <th>256450</th>\n",
" <td>601391</td>\n",
" <td>lastname601391</td>\n",
" <td>firstname601391</td>\n",
" <td>NaN</td>\n",
" <td>email601391</td>\n",
" <td>3550</td>\n",
" <td>2023-11-09 05:13:57.361432+01:00</td>\n",
" <td>2023-11-09 05:14:18.906054+01:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>2023-11-09 00:36:41.172000</td>\n",
" <td>15.0</td>\n",
" <td>2</td>\n",
" <td>15.0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>30.0</td>\n",
" <td>2.0</td>\n",
" <td>30.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1</td>\n",
" <td>2023-11-09 00:36:41.172000+01:00</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1879</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>275622 rows × 42 columns</p>\n",
"</div>"
],
"text/plain": [
" id lastname firstname birthdate email \\\n",
"102639 1 NaN NaN NaN email1 \n",
"224453 2 NaN firstname2 NaN NaN \n",
"103013 3 NaN firstname3 NaN NaN \n",
"138386 4 NaN firstname4 NaN NaN \n",
"190087 5 NaN firstname5 NaN NaN \n",
"... ... ... ... ... ... \n",
"101868 601387 lastname601387 firstname601387 NaN email601387 \n",
"205168 601388 lastname601388 firstname601388 NaN email601388 \n",
"67641 601389 lastname601389 firstname601389 NaN email601389 \n",
"67639 601390 lastname601390 firstname601390 NaN email601390 \n",
"256450 601391 lastname601391 firstname601391 NaN email601391 \n",
"\n",
" street_id created_at \\\n",
"102639 1 2023-07-20 17:16:27.062822+02:00 \n",
"224453 2 2023-07-21 10:18:44.502496+02:00 \n",
"103013 3 2023-07-21 10:18:44.503913+02:00 \n",
"138386 3 2023-07-21 10:18:44.504404+02:00 \n",
"190087 3 2023-07-21 10:18:44.504841+02:00 \n",
"... ... ... \n",
"101868 3550 2023-11-09 05:13:57.358715+01:00 \n",
"205168 3550 2023-11-09 05:13:57.359234+01:00 \n",
"67641 3550 2023-11-09 05:13:57.360373+01:00 \n",
"67639 3550 2023-11-09 05:13:57.360903+01:00 \n",
"256450 3550 2023-11-09 05:13:57.361432+01:00 \n",
"\n",
" updated_at civility is_partner extra \\\n",
"102639 2023-07-20 17:16:27.074952+02:00 NaN False NaN \n",
"224453 2023-07-21 10:18:44.502496+02:00 NaN False NaN \n",
"103013 2023-07-21 10:18:44.503913+02:00 NaN False NaN \n",
"138386 2023-07-21 10:18:44.504404+02:00 NaN False NaN \n",
"190087 2023-07-21 10:18:44.504841+02:00 NaN False NaN \n",
"... ... ... ... ... \n",
"101868 2023-11-09 05:13:57.358715+01:00 NaN False NaN \n",
"205168 2023-11-09 05:13:57.359234+01:00 NaN False NaN \n",
"67641 2023-11-09 05:13:57.360373+01:00 NaN False NaN \n",
"67639 2023-11-09 05:13:57.360903+01:00 NaN False NaN \n",
"256450 2023-11-09 05:14:18.906054+01:00 NaN False NaN \n",
"\n",
" deleted_at reference gender is_email_true extra_field opt_in \\\n",
"102639 NaN NaN 2 True NaN False \n",
"224453 NaN NaN 1 True NaN False \n",
"103013 NaN NaN 2 True NaN False \n",
"138386 NaN NaN 2 True NaN False \n",
"190087 NaN NaN 1 True NaN False \n",
"... ... ... ... ... ... ... \n",
"101868 NaN NaN 2 True NaN False \n",
"205168 NaN NaN 2 True NaN False \n",
"67641 NaN NaN 2 True NaN False \n",
"67639 NaN NaN 0 True NaN False \n",
"256450 NaN NaN 2 True NaN False \n",
"\n",
" structure_id note profession language mcp_contact_id need_reload \\\n",
"102639 NaN NaN NaN NaN 1.0 False \n",
"224453 NaN NaN NaN josef NaN False \n",
"103013 NaN NaN NaN dominic NaN False \n",
"138386 NaN NaN NaN abigail NaN False \n",
"190087 NaN NaN NaN sophia NaN False \n",
"... ... ... ... ... ... ... \n",
"101868 NaN NaN NaN de NaN False \n",
"205168 NaN NaN NaN de NaN False \n",
"67641 NaN NaN NaN de NaN False \n",
"67639 NaN NaN NaN NaN NaN False \n",
"256450 NaN NaN NaN NaN NaN False \n",
"\n",
" last_buying_date max_price ticket_sum average_price \\\n",
"102639 NaN NaN 0 NaN \n",
"224453 NaN NaN 0 NaN \n",
"103013 NaN NaN 0 NaN \n",
"138386 NaN NaN 0 NaN \n",
"190087 NaN NaN 0 NaN \n",
"... ... ... ... ... \n",
"101868 NaN NaN 0 NaN \n",
"205168 2023-11-09 00:25:24.716000 15.0 2 14.0 \n",
"67641 2023-11-09 00:28:07.511000 15.0 2 15.0 \n",
"67639 NaN NaN 0 NaN \n",
"256450 2023-11-09 00:36:41.172000 15.0 2 15.0 \n",
"\n",
" fidelity average_purchase_delay average_price_basket \\\n",
"102639 0 NaN NaN \n",
"224453 0 NaN NaN \n",
"103013 0 NaN NaN \n",
"138386 0 NaN NaN \n",
"190087 0 NaN NaN \n",
"... ... ... ... \n",
"101868 0 NaN NaN \n",
"205168 1 0.0 28.0 \n",
"67641 1 0.0 30.0 \n",
"67639 0 NaN NaN \n",
"256450 1 0.0 30.0 \n",
"\n",
" average_ticket_basket total_price preferred_category \\\n",
"102639 NaN 0.0 NaN \n",
"224453 NaN 0.0 NaN \n",
"103013 NaN 0.0 NaN \n",
"138386 NaN 0.0 NaN \n",
"190087 NaN 0.0 NaN \n",
"... ... ... ... \n",
"101868 NaN 0.0 NaN \n",
"205168 2.0 28.0 NaN \n",
"67641 2.0 30.0 NaN \n",
"67639 NaN 0.0 NaN \n",
"256450 2.0 30.0 NaN \n",
"\n",
" preferred_supplier preferred_formula purchase_count \\\n",
"102639 NaN NaN 0 \n",
"224453 NaN NaN 0 \n",
"103013 NaN NaN 0 \n",
"138386 NaN NaN 0 \n",
"190087 NaN NaN 0 \n",
"... ... ... ... \n",
"101868 NaN NaN 0 \n",
"205168 NaN NaN 1 \n",
"67641 NaN NaN 1 \n",
"67639 NaN NaN 0 \n",
"256450 NaN NaN 1 \n",
"\n",
" first_buying_date last_visiting_date zipcode country \\\n",
"102639 NaN NaN NaN fr \n",
"224453 NaN NaN NaN ch \n",
"103013 NaN NaN NaN ch \n",
"138386 NaN NaN NaN ch \n",
"190087 NaN NaN NaN ch \n",
"... ... ... ... ... \n",
"101868 NaN NaN NaN NaN \n",
"205168 2023-11-09 00:25:24.716000+01:00 NaN NaN NaN \n",
"67641 2023-11-09 00:28:07.511000+01:00 NaN NaN NaN \n",
"67639 NaN NaN NaN NaN \n",
"256450 2023-11-09 00:36:41.172000+01:00 NaN NaN NaN \n",
"\n",
" age tenant_id \n",
"102639 NaN 1879 \n",
"224453 NaN 1879 \n",
"103013 NaN 1879 \n",
"138386 NaN 1879 \n",
"190087 NaN 1879 \n",
"... ... ... \n",
"101868 NaN 1879 \n",
"205168 NaN 1879 \n",
"67641 NaN 1879 \n",
"67639 NaN 1879 \n",
"256450 NaN 1879 \n",
"\n",
"[275622 rows x 42 columns]"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.sort_values(by = 'id')"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "475030ad-6a69-4c91-9cd6-943a0edeaf01",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : projet-bdc2324-team1/0_Input/Company_3/products_purchased_reduced.csv\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 300x300 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"outlier_detection(directory_path = \"3\", coupure = 2)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "b64d04db-1c3f-4538-9d05-8f7d62c7c046",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>lastname</th>\n",
" <th>firstname</th>\n",
" <th>birthdate</th>\n",
" <th>email</th>\n",
" <th>street_id</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>civility</th>\n",
" <th>is_partner</th>\n",
" <th>extra</th>\n",
" <th>deleted_at</th>\n",
" <th>reference</th>\n",
" <th>gender</th>\n",
" <th>is_email_true</th>\n",
" <th>extra_field</th>\n",
" <th>opt_in</th>\n",
" <th>structure_id</th>\n",
" <th>note</th>\n",
" <th>profession</th>\n",
" <th>language</th>\n",
" <th>mcp_contact_id</th>\n",
" <th>need_reload</th>\n",
" <th>last_buying_date</th>\n",
" <th>max_price</th>\n",
" <th>ticket_sum</th>\n",
" <th>average_price</th>\n",
" <th>fidelity</th>\n",
" <th>average_purchase_delay</th>\n",
" <th>average_price_basket</th>\n",
" <th>average_ticket_basket</th>\n",
" <th>total_price</th>\n",
" <th>preferred_category</th>\n",
" <th>preferred_supplier</th>\n",
" <th>preferred_formula</th>\n",
" <th>purchase_count</th>\n",
" <th>first_buying_date</th>\n",
" <th>last_visiting_date</th>\n",
" <th>zipcode</th>\n",
" <th>country</th>\n",
" <th>age</th>\n",
" <th>tenant_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>105720</th>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1961-12-04</td>\n",
" <td>NaN</td>\n",
" <td>91159</td>\n",
" <td>2021-03-02 15:35:40.452065+01:00</td>\n",
" <td>2023-11-09 01:31:07.539604+01:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>19715.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>2023-11-06 16:57:19</td>\n",
" <td>7500.0</td>\n",
" <td>2297716</td>\n",
" <td>10.152196</td>\n",
" <td>14917</td>\n",
" <td>-39771.165147</td>\n",
" <td>27.514811</td>\n",
" <td>2.710232</td>\n",
" <td>2.332686e+07</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>847793</td>\n",
" <td>2016-01-01 10:23:36+01:00</td>\n",
" <td>2023-11-06 17:12:00</td>\n",
" <td>13090</td>\n",
" <td>fr</td>\n",
" <td>61.0</td>\n",
" <td>1512</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id lastname firstname birthdate email street_id \\\n",
"105720 1 NaN NaN 1961-12-04 NaN 91159 \n",
"\n",
" created_at updated_at \\\n",
"105720 2021-03-02 15:35:40.452065+01:00 2023-11-09 01:31:07.539604+01:00 \n",
"\n",
" civility is_partner extra deleted_at reference gender \\\n",
"105720 NaN False NaN NaN NaN 2 \n",
"\n",
" is_email_true extra_field opt_in structure_id note profession \\\n",
"105720 False NaN False 19715.0 NaN NaN \n",
"\n",
" language mcp_contact_id need_reload last_buying_date max_price \\\n",
"105720 NaN NaN False 2023-11-06 16:57:19 7500.0 \n",
"\n",
" ticket_sum average_price fidelity average_purchase_delay \\\n",
"105720 2297716 10.152196 14917 -39771.165147 \n",
"\n",
" average_price_basket average_ticket_basket total_price \\\n",
"105720 27.514811 2.710232 2.332686e+07 \n",
"\n",
" preferred_category preferred_supplier preferred_formula \\\n",
"105720 NaN NaN NaN \n",
"\n",
" purchase_count first_buying_date last_visiting_date \\\n",
"105720 847793 2016-01-01 10:23:36+01:00 2023-11-06 17:12:00 \n",
"\n",
" zipcode country age tenant_id \n",
"105720 13090 fr 61.0 1512 "
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = load_dataset_2('3', 'customersplus')\n",
"df[df['id'] == 1]"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "1d817bee-3ded-4066-9f91-6cf095591b0e",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : projet-bdc2324-team1/0_Input/Company_4/products_purchased_reduced.csv\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 300x300 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"outlier_detection(directory_path = \"4\", coupure = 2)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "4cc07982-1070-439b-a579-fd3f351778b3",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>lastname</th>\n",
" <th>firstname</th>\n",
" <th>birthdate</th>\n",
" <th>email</th>\n",
" <th>street_id</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>civility</th>\n",
" <th>is_partner</th>\n",
" <th>extra</th>\n",
" <th>deleted_at</th>\n",
" <th>reference</th>\n",
" <th>gender</th>\n",
" <th>is_email_true</th>\n",
" <th>extra_field</th>\n",
" <th>opt_in</th>\n",
" <th>structure_id</th>\n",
" <th>note</th>\n",
" <th>profession</th>\n",
" <th>language</th>\n",
" <th>mcp_contact_id</th>\n",
" <th>need_reload</th>\n",
" <th>last_buying_date</th>\n",
" <th>max_price</th>\n",
" <th>ticket_sum</th>\n",
" <th>average_price</th>\n",
" <th>fidelity</th>\n",
" <th>average_purchase_delay</th>\n",
" <th>average_price_basket</th>\n",
" <th>average_ticket_basket</th>\n",
" <th>total_price</th>\n",
" <th>preferred_category</th>\n",
" <th>preferred_supplier</th>\n",
" <th>preferred_formula</th>\n",
" <th>purchase_count</th>\n",
" <th>first_buying_date</th>\n",
" <th>last_visiting_date</th>\n",
" <th>zipcode</th>\n",
" <th>country</th>\n",
" <th>age</th>\n",
" <th>tenant_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>300754</th>\n",
" <td>2</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>2020-09-25 19:09:07.669208+02:00</td>\n",
" <td>2021-11-30 02:07:28.120188+01:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>2023-11-07 16:33:09</td>\n",
" <td>360.0</td>\n",
" <td>1237224</td>\n",
" <td>6.056248</td>\n",
" <td>236850</td>\n",
" <td>0.015528</td>\n",
" <td>13.493612</td>\n",
" <td>2.228048</td>\n",
" <td>7492935.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>555295</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1342</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id lastname firstname birthdate email street_id \\\n",
"300754 2 NaN NaN NaN NaN 2 \n",
"\n",
" created_at updated_at \\\n",
"300754 2020-09-25 19:09:07.669208+02:00 2021-11-30 02:07:28.120188+01:00 \n",
"\n",
" civility is_partner extra deleted_at reference gender \\\n",
"300754 NaN False NaN NaN NaN 2 \n",
"\n",
" is_email_true extra_field opt_in structure_id note profession \\\n",
"300754 False NaN False NaN NaN NaN \n",
"\n",
" language mcp_contact_id need_reload last_buying_date max_price \\\n",
"300754 NaN NaN False 2023-11-07 16:33:09 360.0 \n",
"\n",
" ticket_sum average_price fidelity average_purchase_delay \\\n",
"300754 1237224 6.056248 236850 0.015528 \n",
"\n",
" average_price_basket average_ticket_basket total_price \\\n",
"300754 13.493612 2.228048 7492935.0 \n",
"\n",
" preferred_category preferred_supplier preferred_formula \\\n",
"300754 NaN NaN NaN \n",
"\n",
" purchase_count first_buying_date last_visiting_date zipcode \\\n",
"300754 555295 1901-01-01 00:09:21+00:09 NaN NaN \n",
"\n",
" country age tenant_id \n",
"300754 NaN NaN 1342 "
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = load_dataset_2('4', 'customersplus')\n",
"df[df['id'] == 2]"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "f74a9e62-a0f7-41cf-9834-78a99204547c",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 300x300 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"outlier_detection(directory_path = \"101\", coupure = 2)"
]
},
{
"cell_type": "markdown",
"id": "dbebfa92-310a-417b-a7fa-36ac3593db06",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## Evolution des commandes"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "06137694-7f50-47ba-8749-68471ececc1e",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_448/3643128924.py:11: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n",
" purchases = pd.read_csv(file_in, sep=\",\", parse_dates = ['purchase_date'], date_parser=custom_date_parser)\n",
"/tmp/ipykernel_448/3643128924.py:19: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n",
" campaigns = pd.read_csv(file_in, sep=\",\", parse_dates = ['sent_at'], date_parser=custom_date_parser)\n"
]
}
],
"source": [
"# Importation - Chargement des données temporaires\n",
"company_number = \"1\"\n",
"nom_dataframe = 'df'+ company_number +'_tickets'\n",
"purchases = globals()[nom_dataframe].copy()\n",
"\n",
"campaigns = display_databases(company_number,'campaigns_information', ['sent_at'])\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "e6b962d4-1a30-4133-ac0f-359f7afef42c",
"metadata": {},
"outputs": [],
"source": [
"# Mois du premier achat\n",
"purchase_min = purchases.groupby(['customer_id'])['purchase_date'].min().reset_index()\n",
"purchase_min.rename(columns = {'purchase_date' : 'first_purchase_event'}, inplace = True)\n",
"purchase_min['first_purchase_event'] = pd.to_datetime(purchase_min['first_purchase_event'])\n",
"purchase_min['first_purchase_month'] = pd.to_datetime(purchase_min['first_purchase_event'].dt.strftime('%Y-%m'))\n",
"\n",
"# Mois du premier mails\n",
"first_mail_received = campaigns.groupby('customer_id')['sent_at'].min().reset_index()\n",
"first_mail_received.rename(columns = {'sent_at' : 'first_email_reception'}, inplace = True)\n",
"first_mail_received['first_email_reception'] = pd.to_datetime(first_mail_received['first_email_reception'])\n",
"first_mail_received['first_email_month'] = pd.to_datetime(first_mail_received['first_email_reception'].dt.strftime('%Y-%m'))\n",
"\n",
"# Fusion \n",
"known_customer = pd.merge(purchase_min[['customer_id', 'first_purchase_month']], \n",
" first_mail_received[['customer_id', 'first_email_month']], on = 'customer_id', how = 'outer')\n",
"\n",
"# Mois à partir duquel le client est considere comme connu\n",
"known_customer['known_date'] = pd.to_datetime(known_customer[['first_email_month', 'first_purchase_month']].min(axis = 1), utc = True, format = 'ISO8601')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "9c56e5ac-cbf4-4343-80ba-be2ab8b60eab",
"metadata": {},
"outputs": [],
"source": [
"# Nombre de commande par mois\n",
"purchases_count = pd.merge(purchases[['customer_id', 'purchase_id', 'purchase_date']].drop_duplicates(), known_customer[['customer_id', 'known_date']], on = ['customer_id'], how = 'inner')\n",
"purchases_count['is_customer_known'] = purchases_count['purchase_date'] > purchases_count['known_date'] + pd.DateOffset(months=1)\n",
"purchases_count['purchase_date_month'] = pd.to_datetime(purchases_count['purchase_date'].dt.strftime('%Y-%m'))\n",
"purchases_count = purchases_count[purchases_count['customer_id'] != 1]\n",
"\n",
"# Nombre de commande par mois par type de client\n",
"nb_purchases_graph = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['purchase_id'].count().reset_index()\n",
"nb_purchases_graph.rename(columns = {'purchase_id' : 'nb_purchases'}, inplace = True)\n",
"\n",
"nb_purchases_graph_2 = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['customer_id'].nunique().reset_index()\n",
"nb_purchases_graph_2.rename(columns = {'customer_id' : 'nb_new_customer'}, inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "8c1aed44-03d3-49f9-b96c-b06a0df03dde",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Graphique en nombre de commande\n",
"purchases_graph = nb_purchases_graph\n",
"\n",
"purchases_graph_used = purchases_graph[purchases_graph[\"purchase_date_month\"] >= datetime(2021,3,1)]\n",
"purchases_graph_used_0 = purchases_graph_used[purchases_graph_used[\"is_customer_known\"]==False]\n",
"purchases_graph_used_1 = purchases_graph_used[purchases_graph_used[\"is_customer_known\"]==True]\n",
"\n",
"\n",
"# Création du barplot\n",
"plt.bar(purchases_graph_used_0[\"purchase_date_month\"], purchases_graph_used_0[\"nb_purchases\"], width=12, label = \"Nouveau client\")\n",
"plt.bar(purchases_graph_used_0[\"purchase_date_month\"], purchases_graph_used_1[\"nb_purchases\"], \n",
" bottom = purchases_graph_used_0[\"nb_purchases\"], width=12, label = \"Ancien client\")\n",
"\n",
"\n",
"# commande pr afficher slt\n",
"plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b%y'))\n",
"\n",
"\n",
"# Ajout de titres et d'étiquettes\n",
"plt.xlabel('Mois')\n",
"plt.ylabel(\"Nombre d'achats\")\n",
"plt.title(\"Nombre d'achats - MUCEM\")\n",
"plt.legend()\n",
"\n",
"# Affichage du barplot\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "d312276c-4c46-4d29-b6d6-ed110f59890d",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# graphique en nombre de client ayant commandé\n",
"purchases_graph = nb_purchases_graph_2\n",
"\n",
"purchases_graph_used = purchases_graph[purchases_graph[\"purchase_date_month\"] >= datetime(2021,4,1)]\n",
"purchases_graph_used_0 = purchases_graph_used[purchases_graph_used[\"is_customer_known\"]==False]\n",
"purchases_graph_used_1 = purchases_graph_used[purchases_graph_used[\"is_customer_known\"]==True]\n",
"\n",
"\n",
"# Création du barplot\n",
"plt.bar(purchases_graph_used_0[\"purchase_date_month\"], purchases_graph_used_0[\"nb_new_customer\"], width=12, label = \"Nouveau client\")\n",
"plt.bar(purchases_graph_used_0[\"purchase_date_month\"], purchases_graph_used_1[\"nb_new_customer\"], \n",
" bottom = purchases_graph_used_0[\"nb_new_customer\"], width=12, label = \"Ancien client\")\n",
"\n",
"\n",
"# commande pr afficher slt\n",
"plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b%y'))\n",
"\n",
"\n",
"# Ajout de titres et d'étiquettes\n",
"plt.xlabel('Mois')\n",
"plt.ylabel(\"Nombre de client ayant commandé\")\n",
"plt.title(\"Nombre de client ayant commandé un ticket pour l'offre 'muséale groupe'\")\n",
"plt.legend()\n",
"\n",
"# Affichage du barplot\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "82895dfc-e5ca-4be0-af24-93c1be8f6248",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## Proportion de tickets de prix 0"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "10828dd8-8ec9-49eb-b450-acca741964c7",
"metadata": {},
"outputs": [],
"source": [
"barplot_prop_free_price = pd.DataFrame()\n",
"for company_number in ['1', '2', '3', '4', '101'] : # \n",
" nom_dataframe = 'df'+ company_number +'_tickets'\n",
" df_tickets = globals()[nom_dataframe].copy()\n",
" df_free_tickets = df_tickets[df_tickets['amount'] == 0 | df_tickets['amount'].isna()]\n",
"\n",
" if company_number == '101' :\n",
" df_free_tickets_1 = df101_tickets_1[df101_tickets_1['amount'] == 0]\n",
" nb_tickets = len(df_tickets) + len(df101_tickets_1)\n",
" nb_free_tickets = len(df_free_tickets) + len(df_free_tickets_1)\n",
" \n",
" graph_dataframe = pd.DataFrame({'company_number' : [company_number], \n",
" 'prop_free_tickets' : [nb_free_tickets / nb_tickets],\n",
" 'nb_tickets' : [nb_tickets]})\n",
" \n",
" else : \n",
" graph_dataframe = pd.DataFrame({'company_number' : [company_number], \n",
" 'prop_free_tickets' : [len(df_free_tickets) / len(df_tickets)],\n",
" 'nb_tickets' : [len(df_tickets)]})\n",
"\n",
" barplot_prop_free_price = pd.concat([barplot_prop_free_price, graph_dataframe])"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "065576ef-2515-43eb-a65d-21f07f228c9e",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"barplot_prop_free_price\n",
"\n",
"df = barplot_prop_free_price.sort_values( by = 'prop_free_tickets')\n",
"\n",
"# Création du barplot\n",
"plt.figure(figsize=(10, 6))\n",
"plt.bar(df['company_number'], df['prop_free_tickets'])\n",
"plt.xlabel('Numéro de la société')\n",
"plt.ylabel('Proportion de billets gratuits')\n",
"plt.title('Proportion de billets gratuits par musée')\n",
"plt.xticks(df['company_number'])\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "d6de664a-a303-48f5-bca6-1e9e9d17c461",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## Répartition des prix de vente"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "150825c6-08b5-44ad-a02e-98ee44192d94",
"metadata": {},
"outputs": [],
"source": [
"boxplot_amount = {} \n",
"\n",
"for company_number in ['1', '2', '3', '4', '101'] :\n",
" nom_dataframe = 'df'+ company_number +'_tickets'\n",
" df_tickets = globals()[nom_dataframe].copy()\n",
" df_notfree_tickets = df_tickets[df_tickets['amount'] > 0]\n",
" \n",
" boxplot_amount[company_number] = df_notfree_tickets['amount']\n",
"\n",
"amount_df = pd.DataFrame(boxplot_amount)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "c6ce46c8-5ad1-42c0-9b9a-a84df52a3411",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>101</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>1.062722e+06</td>\n",
" <td>1.475197e+06</td>\n",
" <td>3.051426e+06</td>\n",
" <td>1.280045e+06</td>\n",
" <td>1.133556e+07</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>1.076436e+01</td>\n",
" <td>1.519766e+01</td>\n",
" <td>1.285360e+01</td>\n",
" <td>1.139475e+01</td>\n",
" <td>1.350509e+01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>9.243106e+00</td>\n",
" <td>5.714467e+00</td>\n",
" <td>1.445236e+01</td>\n",
" <td>1.657010e+01</td>\n",
" <td>1.492325e+01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>2.500000e+00</td>\n",
" <td>5.000000e+00</td>\n",
" <td>3.000000e-01</td>\n",
" <td>1.000000e+00</td>\n",
" <td>2.000000e-02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>9.500000e+00</td>\n",
" <td>1.300000e+01</td>\n",
" <td>6.000000e+00</td>\n",
" <td>6.000000e+00</td>\n",
" <td>1.000000e+01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>1.100000e+01</td>\n",
" <td>1.500000e+01</td>\n",
" <td>1.350000e+01</td>\n",
" <td>1.000000e+01</td>\n",
" <td>1.300000e+01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>1.100000e+01</td>\n",
" <td>1.500000e+01</td>\n",
" <td>1.700000e+01</td>\n",
" <td>1.200000e+01</td>\n",
" <td>1.450000e+01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>3.200000e+02</td>\n",
" <td>3.000000e+02</td>\n",
" <td>7.500000e+03</td>\n",
" <td>1.500000e+03</td>\n",
" <td>1.633000e+03</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 1 2 3 4 101\n",
"count 1.062722e+06 1.475197e+06 3.051426e+06 1.280045e+06 1.133556e+07\n",
"mean 1.076436e+01 1.519766e+01 1.285360e+01 1.139475e+01 1.350509e+01\n",
"std 9.243106e+00 5.714467e+00 1.445236e+01 1.657010e+01 1.492325e+01\n",
"min 2.500000e+00 5.000000e+00 3.000000e-01 1.000000e+00 2.000000e-02\n",
"25% 9.500000e+00 1.300000e+01 6.000000e+00 6.000000e+00 1.000000e+01\n",
"50% 1.100000e+01 1.500000e+01 1.350000e+01 1.000000e+01 1.300000e+01\n",
"75% 1.100000e+01 1.500000e+01 1.700000e+01 1.200000e+01 1.450000e+01\n",
"max 3.200000e+02 3.000000e+02 7.500000e+03 1.500000e+03 1.633000e+03"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"amount_df.describe()"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "a54269c1-9aec-4e49-91ba-d39fa5ece850",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"means = amount_df.mean()\n",
"\n",
"plt.figure(figsize=(10, 6))\n",
"amount_df.boxplot()\n",
"plt.scatter(x=range(1, len(means) + 1), y=means, marker='D', color='red', s=100)\n",
"plt.title('Répartition des prix des billets non gratuits')\n",
"plt.ylabel('Montant')\n",
"plt.xlabel('Compagnie')\n",
"plt.ylim(0, 50) \n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "b41b5434-0e5b-495b-bede-23f5cb45272c",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>purchase_id</th>\n",
" <th>ticket_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>73518.000000</td>\n",
" <td>7.351800e+04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>10.096167</td>\n",
" <td>2.484660e+01</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>2367.702603</td>\n",
" <td>4.636993e+03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>1.000000</td>\n",
" <td>1.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>1.000000</td>\n",
" <td>1.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>1.000000</td>\n",
" <td>2.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>1.000000</td>\n",
" <td>3.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>641981.000000</td>\n",
" <td>1.256574e+06</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" purchase_id ticket_id\n",
"count 73518.000000 7.351800e+04\n",
"mean 10.096167 2.484660e+01\n",
"std 2367.702603 4.636993e+03\n",
"min 1.000000 1.000000e+00\n",
"25% 1.000000 1.000000e+00\n",
"50% 1.000000 2.000000e+00\n",
"75% 1.000000 3.000000e+00\n",
"max 641981.000000 1.256574e+06"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"purchases.groupby('customer_id')[['purchase_id', 'ticket_id']].nunique().describe()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "d1212b10-3933-450a-b001-9e2cbf308f79",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ticket_id</th>\n",
" <th>customer_id</th>\n",
" <th>purchase_id</th>\n",
" <th>event_type_id</th>\n",
" <th>supplier_name</th>\n",
" <th>purchase_date</th>\n",
" <th>type_of_ticket_name</th>\n",
" <th>amount</th>\n",
" <th>children</th>\n",
" <th>is_full_price</th>\n",
" <th>name_event_types</th>\n",
" <th>name_facilities</th>\n",
" <th>name_categories</th>\n",
" <th>name_events</th>\n",
" <th>name_seasons</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>13070859</td>\n",
" <td>48187</td>\n",
" <td>5107462</td>\n",
" <td>4</td>\n",
" <td>vente en ligne</td>\n",
" <td>2018-12-28 14:47:50+00:00</td>\n",
" <td>Atelier</td>\n",
" <td>8.0</td>\n",
" <td>pricing_formula</td>\n",
" <td>False</td>\n",
" <td>spectacle vivant</td>\n",
" <td>mucem</td>\n",
" <td>indiv prog enfant</td>\n",
" <td>l'école des magiciens</td>\n",
" <td>2018</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>13070860</td>\n",
" <td>48187</td>\n",
" <td>5107462</td>\n",
" <td>4</td>\n",
" <td>vente en ligne</td>\n",
" <td>2018-12-28 14:47:50+00:00</td>\n",
" <td>Atelier</td>\n",
" <td>4.0</td>\n",
" <td>pricing_formula</td>\n",
" <td>False</td>\n",
" <td>spectacle vivant</td>\n",
" <td>mucem</td>\n",
" <td>indiv prog enfant</td>\n",
" <td>l'école des magiciens</td>\n",
" <td>2018</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>13070861</td>\n",
" <td>48187</td>\n",
" <td>5107462</td>\n",
" <td>4</td>\n",
" <td>vente en ligne</td>\n",
" <td>2018-12-28 14:47:50+00:00</td>\n",
" <td>Atelier</td>\n",
" <td>4.0</td>\n",
" <td>pricing_formula</td>\n",
" <td>False</td>\n",
" <td>spectacle vivant</td>\n",
" <td>mucem</td>\n",
" <td>indiv prog enfant</td>\n",
" <td>l'école des magiciens</td>\n",
" <td>2018</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>13070862</td>\n",
" <td>48187</td>\n",
" <td>5107462</td>\n",
" <td>4</td>\n",
" <td>vente en ligne</td>\n",
" <td>2018-12-28 14:47:50+00:00</td>\n",
" <td>Atelier</td>\n",
" <td>4.0</td>\n",
" <td>pricing_formula</td>\n",
" <td>False</td>\n",
" <td>spectacle vivant</td>\n",
" <td>mucem</td>\n",
" <td>indiv prog enfant</td>\n",
" <td>l'école des magiciens</td>\n",
" <td>2018</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>13070863</td>\n",
" <td>48187</td>\n",
" <td>5107462</td>\n",
" <td>4</td>\n",
" <td>vente en ligne</td>\n",
" <td>2018-12-28 14:47:50+00:00</td>\n",
" <td>Atelier</td>\n",
" <td>4.0</td>\n",
" <td>pricing_formula</td>\n",
" <td>False</td>\n",
" <td>spectacle vivant</td>\n",
" <td>mucem</td>\n",
" <td>indiv prog enfant</td>\n",
" <td>l'école des magiciens</td>\n",
" <td>2018</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1826667</th>\n",
" <td>20662815</td>\n",
" <td>1256135</td>\n",
" <td>8007697</td>\n",
" <td>5</td>\n",
" <td>vente en ligne</td>\n",
" <td>2023-11-08 17:23:54+00:00</td>\n",
" <td>Atelier</td>\n",
" <td>11.0</td>\n",
" <td>pricing_formula</td>\n",
" <td>False</td>\n",
" <td>offre muséale groupe</td>\n",
" <td>mucem</td>\n",
" <td>indiv entrées tp</td>\n",
" <td>NaN</td>\n",
" <td>2023</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1826668</th>\n",
" <td>20662816</td>\n",
" <td>1256136</td>\n",
" <td>8007698</td>\n",
" <td>5</td>\n",
" <td>vente en ligne</td>\n",
" <td>2023-11-08 18:32:18+00:00</td>\n",
" <td>Atelier</td>\n",
" <td>11.0</td>\n",
" <td>pricing_formula</td>\n",
" <td>False</td>\n",
" <td>offre muséale groupe</td>\n",
" <td>mucem</td>\n",
" <td>indiv entrées tp</td>\n",
" <td>NaN</td>\n",
" <td>2023</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1826669</th>\n",
" <td>20662817</td>\n",
" <td>1256136</td>\n",
" <td>8007698</td>\n",
" <td>5</td>\n",
" <td>vente en ligne</td>\n",
" <td>2023-11-08 18:32:18+00:00</td>\n",
" <td>Atelier</td>\n",
" <td>11.0</td>\n",
" <td>pricing_formula</td>\n",
" <td>False</td>\n",
" <td>offre muséale groupe</td>\n",
" <td>mucem</td>\n",
" <td>indiv entrées tp</td>\n",
" <td>NaN</td>\n",
" <td>2023</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1826670</th>\n",
" <td>20662818</td>\n",
" <td>1256137</td>\n",
" <td>8007699</td>\n",
" <td>5</td>\n",
" <td>vente en ligne</td>\n",
" <td>2023-11-08 19:30:28+00:00</td>\n",
" <td>Atelier</td>\n",
" <td>11.0</td>\n",
" <td>pricing_formula</td>\n",
" <td>False</td>\n",
" <td>offre muséale groupe</td>\n",
" <td>mucem</td>\n",
" <td>indiv entrées tp</td>\n",
" <td>NaN</td>\n",
" <td>2023</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1826671</th>\n",
" <td>20662819</td>\n",
" <td>1256137</td>\n",
" <td>8007699</td>\n",
" <td>5</td>\n",
" <td>vente en ligne</td>\n",
" <td>2023-11-08 19:30:28+00:00</td>\n",
" <td>Atelier</td>\n",
" <td>11.0</td>\n",
" <td>pricing_formula</td>\n",
" <td>False</td>\n",
" <td>offre muséale groupe</td>\n",
" <td>mucem</td>\n",
" <td>indiv entrées tp</td>\n",
" <td>NaN</td>\n",
" <td>2023</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1826672 rows × 15 columns</p>\n",
"</div>"
],
"text/plain": [
" ticket_id customer_id purchase_id event_type_id supplier_name \\\n",
"0 13070859 48187 5107462 4 vente en ligne \n",
"1 13070860 48187 5107462 4 vente en ligne \n",
"2 13070861 48187 5107462 4 vente en ligne \n",
"3 13070862 48187 5107462 4 vente en ligne \n",
"4 13070863 48187 5107462 4 vente en ligne \n",
"... ... ... ... ... ... \n",
"1826667 20662815 1256135 8007697 5 vente en ligne \n",
"1826668 20662816 1256136 8007698 5 vente en ligne \n",
"1826669 20662817 1256136 8007698 5 vente en ligne \n",
"1826670 20662818 1256137 8007699 5 vente en ligne \n",
"1826671 20662819 1256137 8007699 5 vente en ligne \n",
"\n",
" purchase_date type_of_ticket_name amount \\\n",
"0 2018-12-28 14:47:50+00:00 Atelier 8.0 \n",
"1 2018-12-28 14:47:50+00:00 Atelier 4.0 \n",
"2 2018-12-28 14:47:50+00:00 Atelier 4.0 \n",
"3 2018-12-28 14:47:50+00:00 Atelier 4.0 \n",
"4 2018-12-28 14:47:50+00:00 Atelier 4.0 \n",
"... ... ... ... \n",
"1826667 2023-11-08 17:23:54+00:00 Atelier 11.0 \n",
"1826668 2023-11-08 18:32:18+00:00 Atelier 11.0 \n",
"1826669 2023-11-08 18:32:18+00:00 Atelier 11.0 \n",
"1826670 2023-11-08 19:30:28+00:00 Atelier 11.0 \n",
"1826671 2023-11-08 19:30:28+00:00 Atelier 11.0 \n",
"\n",
" children is_full_price name_event_types name_facilities \\\n",
"0 pricing_formula False spectacle vivant mucem \n",
"1 pricing_formula False spectacle vivant mucem \n",
"2 pricing_formula False spectacle vivant mucem \n",
"3 pricing_formula False spectacle vivant mucem \n",
"4 pricing_formula False spectacle vivant mucem \n",
"... ... ... ... ... \n",
"1826667 pricing_formula False offre muséale groupe mucem \n",
"1826668 pricing_formula False offre muséale groupe mucem \n",
"1826669 pricing_formula False offre muséale groupe mucem \n",
"1826670 pricing_formula False offre muséale groupe mucem \n",
"1826671 pricing_formula False offre muséale groupe mucem \n",
"\n",
" name_categories name_events name_seasons \n",
"0 indiv prog enfant l'école des magiciens 2018 \n",
"1 indiv prog enfant l'école des magiciens 2018 \n",
"2 indiv prog enfant l'école des magiciens 2018 \n",
"3 indiv prog enfant l'école des magiciens 2018 \n",
"4 indiv prog enfant l'école des magiciens 2018 \n",
"... ... ... ... \n",
"1826667 indiv entrées tp NaN 2023 \n",
"1826668 indiv entrées tp NaN 2023 \n",
"1826669 indiv entrées tp NaN 2023 \n",
"1826670 indiv entrées tp NaN 2023 \n",
"1826671 indiv entrées tp NaN 2023 \n",
"\n",
"[1826672 rows x 15 columns]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"purchases"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "49d5fd2d-9bc1-43ac-9270-1efd73759854",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 800x500 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Total number of tickets purchased online, broken down by event type.\n",
"\n",
"# Sum the `nb_tickets_internet` column within each `name_event_types` group.\n",
"nb_tickets_internet = (\n",
"    customer\n",
"    .groupby('name_event_types')['nb_tickets_internet']\n",
"    .sum()\n",
")\n",
"\n",
"# Draw the bar chart; axis labels, title and tick rotation are handed to pandas directly.\n",
"nb_tickets_internet.plot.bar(\n",
"    figsize=(8, 5),\n",
"    rot=45,\n",
"    xlabel=\"Type d'évènements\",\n",
"    ylabel='Nombre Total de tickets achetés sur Internet',\n",
"    title=\"Nombre Total de tickets achetés sur Internet par Type d'évènements\",\n",
")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dc071992-cf4d-4b9f-9c3b-3f0e98e20eff",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}