2024-03-23 00:04:49 +01:00
{
"cells": [
{
"cell_type": "markdown",
"id": "84b6e27e-4bda-4d38-8689-ec7fc0da1848",
"metadata": {},
"source": [
"# Define segments and predict the associated sales"
]
},
{
"cell_type": "markdown",
"id": "ec059482-45d3-4ae6-99bc-9b4ced115db3",
"metadata": {},
"source": [
"## Package imports"
]
},
{
"cell_type": "code",
2024-03-30 12:00:49 +01:00
"execution_count": 1,
2024-03-23 00:04:49 +01:00
"id": "9771bf29-d08e-4674-8c23-9a2672fbef8f",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
2024-03-23 10:18:43 +01:00
"from pandas import DataFrame\n",
2024-03-23 00:04:49 +01:00
"import numpy as np\n",
"import os\n",
"import s3fs\n",
"import re\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score\n",
"from sklearn.utils import class_weight\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler\n",
"from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score\n",
"from sklearn.exceptions import ConvergenceWarning, DataConversionWarning\n",
"from sklearn.naive_bayes import GaussianNB\n",
"from scipy.optimize import fsolve\n",
2024-03-24 10:42:44 +01:00
"import io\n",
2024-03-23 00:04:49 +01:00
"\n",
"import pickle\n",
"import warnings"
]
},
{
"cell_type": "markdown",
"id": "048fcd7c-800a-4a6b-b725-faf8410f924a",
"metadata": {},
"source": [
"## Load the datasets"
]
},
{
"cell_type": "code",
2024-03-30 12:00:49 +01:00
"execution_count": 2,
2024-03-23 00:04:49 +01:00
"id": "539ccbdf-f29f-4f04-99c1-8c88d0efe514",
"metadata": {},
"outputs": [],
"source": [
"# Build the S3 filesystem client from the endpoint exposed in the environment\n",
"S3_ENDPOINT_URL = f\"https://{os.environ['AWS_S3_ENDPOINT']}\"\n",
"fs = s3fs.S3FileSystem(client_kwargs=dict(endpoint_url=S3_ENDPOINT_URL))"
]
},
{
"cell_type": "code",
2024-03-30 12:00:49 +01:00
"execution_count": 75,
2024-03-27 18:58:30 +01:00
"id": "d6017ed0-6233-4888-85a7-05dec50a255b",
"metadata": {},
"outputs": [],
"source": [
2024-03-30 12:00:49 +01:00
"type_of_activity = \"musique\""
2024-03-27 18:58:30 +01:00
]
},
{
"cell_type": "code",
2024-03-30 12:00:49 +01:00
"execution_count": 4,
2024-03-23 00:04:49 +01:00
"id": "0c3a6ddc-9345-4a42-b6bf-a20a95de3028",
"metadata": {},
"outputs": [],
"source": [
2024-03-27 18:58:30 +01:00
"def load_train_test(type_of_activity):\n",
2024-03-30 12:00:49 +01:00
" # BUCKET = f\"projet-bdc2324-team1/Generalization/{type_of_activity}\"\n",
" BUCKET = f\"projet-bdc2324-team1/Generalization_v2/{type_of_activity}\"\n",
2024-03-23 00:04:49 +01:00
" File_path_train = BUCKET + \"/Train_set.csv\"\n",
" File_path_test = BUCKET + \"/Test_set.csv\"\n",
" \n",
" with fs.open( File_path_train, mode=\"rb\") as file_in:\n",
" dataset_train = pd.read_csv(file_in, sep=\",\")\n",
" # dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)\n",
"\n",
" with fs.open(File_path_test, mode=\"rb\") as file_in:\n",
" dataset_test = pd.read_csv(file_in, sep=\",\")\n",
" # dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)\n",
" \n",
" return dataset_train, dataset_test"
]
},
{
"cell_type": "code",
2024-03-30 12:00:49 +01:00
"execution_count": 76,
2024-03-23 00:04:49 +01:00
"id": "2831d546-b365-498b-8248-c618bd9c3057",
"metadata": {},
2024-03-23 10:18:43 +01:00
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
2024-03-30 12:00:49 +01:00
"/tmp/ipykernel_552/3983721681.py:8: DtypeWarning: Columns (10,19,20,21,24) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" dataset_train = pd.read_csv(file_in, sep=\",\")\n",
"/tmp/ipykernel_552/3983721681.py:12: DtypeWarning: Columns (19,20,21,24) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" dataset_test = pd.read_csv(file_in, sep=\",\")\n"
2024-03-23 10:18:43 +01:00
]
},
{
"data": {
"text/plain": [
2024-03-30 12:00:49 +01:00
"customer_id 0\n",
"street_id 0\n",
"structure_id 327020\n",
"mcp_contact_id 135470\n",
"fidelity 0\n",
" ... \n",
"purchases_8_2021 113963\n",
"purchases_8_2022 0\n",
"purchases_9_2021 113963\n",
"purchases_9_2022 0\n",
"y_has_purchased 0\n",
"Length: 87, dtype: int64"
2024-03-23 10:18:43 +01:00
]
},
2024-03-30 12:00:49 +01:00
"execution_count": 76,
2024-03-23 10:18:43 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
2024-03-23 00:04:49 +01:00
"source": [
2024-03-27 18:58:30 +01:00
"# Load both splits, then count the missing values of each training column\n",
"dataset_train, dataset_test = load_train_test(type_of_activity)\n",
2024-03-23 00:04:49 +01:00
"dataset_train.isna().sum()"
]
},
{
"cell_type": "code",
2024-03-30 12:00:49 +01:00
"execution_count": 17,
2024-03-23 00:04:49 +01:00
"id": "b8827f7b-b304-4f51-9814-c7a98ed88cf0",
"metadata": {},
"outputs": [],
"source": [
"def features_target_split(dataset_train, dataset_test):\n",
" \n",
2024-03-30 12:00:49 +01:00
" features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'purchase_date_min', 'purchase_date_max', \n",
" 'time_between_purchase', 'fidelity', 'is_email_true', 'opt_in', #'is_partner', 'nb_tickets_internet',, 'vente_internet_max'\n",
2024-03-23 00:04:49 +01:00
" 'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']\n",
"\n",
" # we suppress fidelity, time between purchase, and gender other (colinearity issue)\n",
" \"\"\"\n",
" features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', \n",
" 'purchase_date_min', 'purchase_date_max', 'nb_tickets_internet', 'is_email_true', \n",
" 'opt_in', 'gender_female', 'gender_male', 'nb_campaigns', 'nb_campaigns_opened']\n",
" \"\"\"\n",
" \n",
2024-03-30 12:00:49 +01:00
" X_train = dataset_train # [features_l]\n",
2024-03-23 00:04:49 +01:00
" y_train = dataset_train[['y_has_purchased']]\n",
"\n",
2024-03-30 12:00:49 +01:00
" X_test = dataset_test # [features_l]\n",
2024-03-23 00:04:49 +01:00
" y_test = dataset_test[['y_has_purchased']]\n",
" return X_train, X_test, y_train, y_test"
]
},
{
"cell_type": "code",
2024-03-30 12:00:49 +01:00
"execution_count": 77,
2024-03-23 00:04:49 +01:00
"id": "c18195fc-ed40-4e39-a59e-c9ecc5a8e6c3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-03-30 12:00:49 +01:00
"Shape train : (354365, 87)\n",
"Shape test : (151874, 87)\n"
2024-03-23 00:04:49 +01:00
]
}
],
"source": [
"X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)\n",
"print(\"Shape train : \", X_train.shape)\n",
"print(\"Shape test : \", X_test.shape)"
]
},
{
"cell_type": "markdown",
"id": "74eda066-5e01-43aa-b0cf-cc6d9bbf770e",
"metadata": {},
"source": [
"## Get results from the cross-validated logit model"
]
},
{
"cell_type": "code",
2024-03-30 12:00:49 +01:00
"execution_count": 78,
2024-03-23 00:04:49 +01:00
"id": "7c81390e-598c-4f02-bd56-dd03b00dcb33",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
2024-03-30 12:00:49 +01:00
" <th>customer_id</th>\n",
" <th>street_id</th>\n",
" <th>structure_id</th>\n",
" <th>mcp_contact_id</th>\n",
2024-03-23 00:04:49 +01:00
" <th>fidelity</th>\n",
2024-03-30 12:00:49 +01:00
" <th>tenant_id</th>\n",
" <th>is_partner</th>\n",
" <th>deleted_at</th>\n",
2024-03-23 00:04:49 +01:00
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
2024-03-30 12:00:49 +01:00
" <th>...</th>\n",
" <th>purchases_5_2022</th>\n",
" <th>purchases_6_2021</th>\n",
" <th>purchases_6_2022</th>\n",
" <th>purchases_7_2021</th>\n",
" <th>purchases_7_2022</th>\n",
" <th>purchases_8_2021</th>\n",
" <th>purchases_8_2022</th>\n",
" <th>purchases_9_2021</th>\n",
" <th>purchases_9_2022</th>\n",
" <th>y_has_purchased</th>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
2024-03-30 12:00:49 +01:00
" <td>10_699783</td>\n",
" <td>139</td>\n",
" <td>NaN</td>\n",
" <td>186852.0</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>875</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
2024-03-30 12:00:49 +01:00
" <td>10_38307</td>\n",
" <td>862</td>\n",
" <td>NaN</td>\n",
" <td>17621.0</td>\n",
" <td>7</td>\n",
" <td>875</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
2024-03-23 00:04:49 +01:00
" <td>True</td>\n",
" <td>0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
2024-03-30 12:00:49 +01:00
" <td>10_556101</td>\n",
" <td>1063</td>\n",
" <td>NaN</td>\n",
" <td>136909.0</td>\n",
" <td>0</td>\n",
" <td>875</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
2024-03-23 00:04:49 +01:00
" <td>True</td>\n",
2024-03-27 18:58:30 +01:00
" <td>1</td>\n",
2024-03-30 12:00:49 +01:00
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
2024-03-30 12:00:49 +01:00
" <td>10_686663</td>\n",
" <td>443226</td>\n",
" <td>NaN</td>\n",
" <td>186611.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>1</td>\n",
2024-03-30 12:00:49 +01:00
" <td>875</td>\n",
2024-03-28 11:37:23 +01:00
" <td>False</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
" <td>True</td>\n",
2024-03-23 00:04:49 +01:00
" <td>1</td>\n",
2024-03-30 12:00:49 +01:00
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>0.0</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>1.0</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>10_91656</td>\n",
" <td>316684</td>\n",
" <td>NaN</td>\n",
" <td>21559.0</td>\n",
" <td>2</td>\n",
" <td>875</td>\n",
2024-03-28 11:37:23 +01:00
" <td>False</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
" <td>True</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>0.0</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
2024-03-30 12:00:49 +01:00
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
2024-03-30 12:00:49 +01:00
" <th>151869</th>\n",
" <td>14_1843791</td>\n",
" <td>718883</td>\n",
" <td>224.0</td>\n",
" <td>394849.0</td>\n",
" <td>1</td>\n",
" <td>862</td>\n",
2024-03-23 00:04:49 +01:00
" <td>False</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
" <td>True</td>\n",
2024-03-28 11:37:23 +01:00
" <td>1</td>\n",
2024-03-30 12:00:49 +01:00
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
2024-03-30 12:00:49 +01:00
" <th>151870</th>\n",
" <td>14_4630858</td>\n",
" <td>741826</td>\n",
" <td>NaN</td>\n",
" <td>1555631.0</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>862</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
2024-03-23 00:04:49 +01:00
" <td>1</td>\n",
2024-03-30 12:00:49 +01:00
" <td>...</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
2024-03-30 12:00:49 +01:00
" <th>151871</th>\n",
" <td>14_4659926</td>\n",
" <td>871477</td>\n",
" <td>NaN</td>\n",
" <td>1542180.0</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>862</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
2024-03-28 11:37:23 +01:00
" <td>1</td>\n",
2024-03-30 12:00:49 +01:00
" <td>...</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>151872</th>\n",
" <td>14_4881492</td>\n",
" <td>917272</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1</td>\n",
" <td>862</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>151873</th>\n",
" <td>14_8124</td>\n",
" <td>2762</td>\n",
" <td>NaN</td>\n",
" <td>10077.0</td>\n",
" <td>2</td>\n",
" <td>862</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
2024-03-28 11:37:23 +01:00
" <td>0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" </tbody>\n",
"</table>\n",
2024-03-30 12:00:49 +01:00
"<p>151874 rows × 87 columns</p>\n",
2024-03-23 00:04:49 +01:00
"</div>"
],
"text/plain": [
2024-03-30 12:00:49 +01:00
" customer_id street_id structure_id mcp_contact_id fidelity \\\n",
"0 10_699783 139 NaN 186852.0 0 \n",
"1 10_38307 862 NaN 17621.0 7 \n",
"2 10_556101 1063 NaN 136909.0 0 \n",
"3 10_686663 443226 NaN 186611.0 1 \n",
"4 10_91656 316684 NaN 21559.0 2 \n",
"... ... ... ... ... ... \n",
"151869 14_1843791 718883 224.0 394849.0 1 \n",
"151870 14_4630858 741826 NaN 1555631.0 0 \n",
"151871 14_4659926 871477 NaN 1542180.0 0 \n",
"151872 14_4881492 917272 NaN NaN 1 \n",
"151873 14_8124 2762 NaN 10077.0 2 \n",
"\n",
" tenant_id is_partner deleted_at is_email_true opt_in ... \\\n",
"0 875 False NaN True 0 ... \n",
"1 875 False NaN True 0 ... \n",
"2 875 False NaN True 1 ... \n",
"3 875 False NaN True 1 ... \n",
"4 875 False NaN True 0 ... \n",
"... ... ... ... ... ... ... \n",
"151869 862 False NaN True 1 ... \n",
"151870 862 False NaN True 1 ... \n",
"151871 862 False NaN True 1 ... \n",
"151872 862 False NaN True 1 ... \n",
"151873 862 False NaN True 0 ... \n",
"\n",
" purchases_5_2022 purchases_6_2021 purchases_6_2022 purchases_7_2021 \\\n",
"0 0.0 0.0 0.0 0.0 \n",
"1 0.0 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 0.0 \n",
"4 0.0 0.0 0.0 0.0 \n",
"... ... ... ... ... \n",
"151869 0.0 NaN 0.0 NaN \n",
"151870 0.0 NaN 0.0 NaN \n",
"151871 0.0 NaN 0.0 NaN \n",
"151872 0.0 NaN 0.0 NaN \n",
"151873 0.0 NaN 0.0 NaN \n",
"\n",
" purchases_7_2022 purchases_8_2021 purchases_8_2022 \\\n",
"0 0.0 0.0 0.0 \n",
"1 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 \n",
"3 0.0 0.0 1.0 \n",
"4 0.0 0.0 0.0 \n",
"... ... ... ... \n",
"151869 0.0 NaN 0.0 \n",
"151870 0.0 NaN 0.0 \n",
"151871 0.0 NaN 0.0 \n",
"151872 0.0 NaN 0.0 \n",
"151873 0.0 NaN 0.0 \n",
"\n",
" purchases_9_2021 purchases_9_2022 y_has_purchased \n",
"0 0.0 0.0 0.0 \n",
"1 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 \n",
"4 0.0 0.0 0.0 \n",
"... ... ... ... \n",
"151869 NaN 0.0 0.0 \n",
"151870 NaN 0.0 0.0 \n",
"151871 NaN 0.0 0.0 \n",
"151872 NaN 1.0 0.0 \n",
"151873 NaN 0.0 0.0 \n",
"\n",
"[151874 rows x 87 columns]"
2024-03-23 00:04:49 +01:00
]
},
2024-03-30 12:00:49 +01:00
"execution_count": 78,
2024-03-23 00:04:49 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test"
]
},
{
"cell_type": "code",
2024-03-30 12:00:49 +01:00
"execution_count": 20,
2024-03-23 00:04:49 +01:00
"id": "c708f439-bb75-4688-bf4f-4c04e13deaae",
"metadata": {},
"outputs": [],
"source": [
"def load_model(type_of_activity, model):\n",
"    \"\"\"Load a pickled model from S3.\n",
"\n",
"    Relies on the module-level `fs` filesystem object.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    type_of_activity : str\n",
"        Activity type; it names the folder read under\n",
"        `projet-bdc2324-team1/basique`.\n",
"    model : str\n",
"        Model name; both the sub-folder and the `<model>.pkl` file carry it.\n",
"\n",
"    Returns\n",
"    -------\n",
"    object\n",
"        The object stored in the pickle file.\n",
"    \"\"\"\n",
"    BUCKET = f\"projet-bdc2324-team1/basique/{type_of_activity}/{model}/\"\n",
"    file_path = BUCKET + model + '.pkl'\n",
"    with fs.open(file_path, mode=\"rb\") as f:\n",
"        model_bytes = f.read()\n",
"\n",
"    # SECURITY: unpickling can execute arbitrary code; only load files that\n",
"    # come from a trusted bucket.\n",
"    # The result is returned directly so the `model` name argument is not\n",
"    # rebound to an object of a different kind.\n",
"    return pickle.loads(model_bytes)"
]
},
{
"cell_type": "code",
2024-03-30 12:00:49 +01:00
"execution_count": 92,
2024-03-23 00:04:49 +01:00
"id": "5261a803-05b8-41a0-968c-dc7bde48ddd3",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
2024-03-30 12:00:49 +01:00
"<style>#sk-container-id-7 {\n",
2024-03-23 00:04:49 +01:00
" /* Definition of color scheme common for light and dark mode */\n",
" --sklearn-color-text: black;\n",
" --sklearn-color-line: gray;\n",
" /* Definition of color scheme for unfitted estimators */\n",
" --sklearn-color-unfitted-level-0: #fff5e6;\n",
" --sklearn-color-unfitted-level-1: #f6e4d2;\n",
" --sklearn-color-unfitted-level-2: #ffe0b3;\n",
" --sklearn-color-unfitted-level-3: chocolate;\n",
" /* Definition of color scheme for fitted estimators */\n",
" --sklearn-color-fitted-level-0: #f0f8ff;\n",
" --sklearn-color-fitted-level-1: #d4ebff;\n",
" --sklearn-color-fitted-level-2: #b3dbfd;\n",
" --sklearn-color-fitted-level-3: cornflowerblue;\n",
"\n",
" /* Specific color for light theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-icon: #696969;\n",
"\n",
" @media (prefers-color-scheme: dark) {\n",
" /* Redefinition of color scheme for dark theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-icon: #878787;\n",
" }\n",
"}\n",
"\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 {\n",
2024-03-23 00:04:49 +01:00
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 pre {\n",
2024-03-23 00:04:49 +01:00
" padding: 0;\n",
"}\n",
"\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 input.sk-hidden--visually {\n",
2024-03-23 00:04:49 +01:00
" border: 0;\n",
" clip: rect(1px 1px 1px 1px);\n",
" clip: rect(1px, 1px, 1px, 1px);\n",
" height: 1px;\n",
" margin: -1px;\n",
" overflow: hidden;\n",
" padding: 0;\n",
" position: absolute;\n",
" width: 1px;\n",
"}\n",
"\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 div.sk-dashed-wrapped {\n",
2024-03-23 00:04:49 +01:00
" border: 1px dashed var(--sklearn-color-line);\n",
" margin: 0 0.4em 0.5em 0.4em;\n",
" box-sizing: border-box;\n",
" padding-bottom: 0.4em;\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 div.sk-container {\n",
2024-03-23 00:04:49 +01:00
" /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
" but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
" so we also need the `!important` here to be able to override the\n",
" default hidden behavior on the sphinx rendered scikit-learn.org.\n",
" See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
" display: inline-block !important;\n",
" position: relative;\n",
"}\n",
"\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 div.sk-text-repr-fallback {\n",
2024-03-23 00:04:49 +01:00
" display: none;\n",
"}\n",
"\n",
"div.sk-parallel-item,\n",
"div.sk-serial,\n",
"div.sk-item {\n",
" /* draw centered vertical line to link estimators */\n",
" background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
" background-size: 2px 100%;\n",
" background-repeat: no-repeat;\n",
" background-position: center center;\n",
"}\n",
"\n",
"/* Parallel-specific style estimator block */\n",
"\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 div.sk-parallel-item::after {\n",
2024-03-23 00:04:49 +01:00
" content: \"\";\n",
" width: 100%;\n",
" border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
" flex-grow: 1;\n",
"}\n",
"\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 div.sk-parallel {\n",
2024-03-23 00:04:49 +01:00
" display: flex;\n",
" align-items: stretch;\n",
" justify-content: center;\n",
" background-color: var(--sklearn-color-background);\n",
" position: relative;\n",
"}\n",
"\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 div.sk-parallel-item {\n",
2024-03-23 00:04:49 +01:00
" display: flex;\n",
" flex-direction: column;\n",
"}\n",
"\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 div.sk-parallel-item:first-child::after {\n",
2024-03-23 00:04:49 +01:00
" align-self: flex-end;\n",
" width: 50%;\n",
"}\n",
"\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 div.sk-parallel-item:last-child::after {\n",
2024-03-23 00:04:49 +01:00
" align-self: flex-start;\n",
" width: 50%;\n",
"}\n",
"\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 div.sk-parallel-item:only-child::after {\n",
2024-03-23 00:04:49 +01:00
" width: 0;\n",
"}\n",
"\n",
"/* Serial-specific style estimator block */\n",
"\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 div.sk-serial {\n",
2024-03-23 00:04:49 +01:00
" display: flex;\n",
" flex-direction: column;\n",
" align-items: center;\n",
" background-color: var(--sklearn-color-background);\n",
" padding-right: 1em;\n",
" padding-left: 1em;\n",
"}\n",
"\n",
"\n",
"/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
"clickable and can be expanded/collapsed.\n",
"- Pipeline and ColumnTransformer use this feature and define the default style\n",
"- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
"*/\n",
"\n",
"/* Pipeline and ColumnTransformer style (default) */\n",
"\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 div.sk-toggleable {\n",
2024-03-23 00:04:49 +01:00
" /* Default theme specific background. It is overwritten whether we have a\n",
" specific estimator or a Pipeline/ColumnTransformer */\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"/* Toggleable label */\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 label.sk-toggleable__label {\n",
2024-03-23 00:04:49 +01:00
" cursor: pointer;\n",
" display: block;\n",
" width: 100%;\n",
" margin-bottom: 0;\n",
" padding: 0.5em;\n",
" box-sizing: border-box;\n",
" text-align: center;\n",
"}\n",
"\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 label.sk-toggleable__label-arrow:before {\n",
2024-03-23 00:04:49 +01:00
" /* Arrow on the left of the label */\n",
" content: \"▸\";\n",
" float: left;\n",
" margin-right: 0.25em;\n",
" color: var(--sklearn-color-icon);\n",
"}\n",
"\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 label.sk-toggleable__label-arrow:hover:before {\n",
2024-03-23 00:04:49 +01:00
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"/* Toggleable content - dropdown */\n",
"\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 div.sk-toggleable__content {\n",
2024-03-23 00:04:49 +01:00
" max-height: 0;\n",
" max-width: 0;\n",
" overflow: hidden;\n",
" text-align: left;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 div.sk-toggleable__content.fitted {\n",
2024-03-23 00:04:49 +01:00
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 div.sk-toggleable__content pre {\n",
2024-03-23 00:04:49 +01:00
" margin: 0.2em;\n",
" border-radius: 0.25em;\n",
" color: var(--sklearn-color-text);\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 div.sk-toggleable__content.fitted pre {\n",
2024-03-23 00:04:49 +01:00
" /* unfitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
2024-03-23 00:04:49 +01:00
" /* Expand drop-down */\n",
" max-height: 200px;\n",
" max-width: 100%;\n",
" overflow: auto;\n",
"}\n",
"\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
2024-03-23 00:04:49 +01:00
" content: \"▾\";\n",
"}\n",
"\n",
"/* Pipeline/ColumnTransformer-specific style */\n",
"\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
2024-03-23 00:04:49 +01:00
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
2024-03-23 00:04:49 +01:00
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator-specific style */\n",
"\n",
"/* Colorize estimator box */\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
2024-03-23 00:04:49 +01:00
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
2024-03-23 00:04:49 +01:00
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 div.sk-label label.sk-toggleable__label,\n",
"#sk-container-id-7 div.sk-label label {\n",
2024-03-23 00:04:49 +01:00
" /* The background is the default theme color */\n",
" color: var(--sklearn-color-text-on-default-background);\n",
"}\n",
"\n",
"/* On hover, darken the color of the background */\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 div.sk-label:hover label.sk-toggleable__label {\n",
2024-03-23 00:04:49 +01:00
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"/* Label box, darken color on hover, fitted */\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
2024-03-23 00:04:49 +01:00
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator label */\n",
"\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 div.sk-label label {\n",
2024-03-23 00:04:49 +01:00
" font-family: monospace;\n",
" font-weight: bold;\n",
" display: inline-block;\n",
" line-height: 1.2em;\n",
"}\n",
"\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 div.sk-label-container {\n",
2024-03-23 00:04:49 +01:00
" text-align: center;\n",
"}\n",
"\n",
"/* Estimator-specific */\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 div.sk-estimator {\n",
2024-03-23 00:04:49 +01:00
" font-family: monospace;\n",
" border: 1px dotted var(--sklearn-color-border-box);\n",
" border-radius: 0.25em;\n",
" box-sizing: border-box;\n",
" margin-bottom: 0.5em;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 div.sk-estimator.fitted {\n",
2024-03-23 00:04:49 +01:00
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"/* on hover */\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 div.sk-estimator:hover {\n",
2024-03-23 00:04:49 +01:00
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 div.sk-estimator.fitted:hover {\n",
2024-03-23 00:04:49 +01:00
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
"\n",
"/* Common style for \"i\" and \"?\" */\n",
"\n",
".sk-estimator-doc-link,\n",
"a:link.sk-estimator-doc-link,\n",
"a:visited.sk-estimator-doc-link {\n",
" float: right;\n",
" font-size: smaller;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1em;\n",
" height: 1em;\n",
" width: 1em;\n",
" text-decoration: none !important;\n",
" margin-left: 1ex;\n",
" /* unfitted */\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted,\n",
"a:link.sk-estimator-doc-link.fitted,\n",
"a:visited.sk-estimator-doc-link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"/* Span, style for the box shown on hovering the info icon */\n",
".sk-estimator-doc-link span {\n",
" display: none;\n",
" z-index: 9999;\n",
" position: relative;\n",
" font-weight: normal;\n",
" right: .2ex;\n",
" padding: .5ex;\n",
" margin: .5ex;\n",
" width: min-content;\n",
" min-width: 20ex;\n",
" max-width: 50ex;\n",
" color: var(--sklearn-color-text);\n",
" box-shadow: 2pt 2pt 4pt #999;\n",
" /* unfitted */\n",
" background: var(--sklearn-color-unfitted-level-0);\n",
" border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted span {\n",
" /* fitted */\n",
" background: var(--sklearn-color-fitted-level-0);\n",
" border: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link:hover span {\n",
" display: block;\n",
"}\n",
"\n",
"/* \"?\"-specific style due to the `<a>` HTML tag */\n",
"\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 a.estimator_doc_link {\n",
2024-03-23 00:04:49 +01:00
" float: right;\n",
" font-size: 1rem;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1rem;\n",
" height: 1rem;\n",
" width: 1rem;\n",
" text-decoration: none;\n",
" /* unfitted */\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
"}\n",
"\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 a.estimator_doc_link.fitted {\n",
2024-03-23 00:04:49 +01:00
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 a.estimator_doc_link:hover {\n",
2024-03-23 00:04:49 +01:00
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
2024-03-30 12:00:49 +01:00
"#sk-container-id-7 a.estimator_doc_link.fitted:hover {\n",
2024-03-23 00:04:49 +01:00
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
"}\n",
2024-03-30 12:00:49 +01:00
"</style><div id=\"sk-container-id-7\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>Pipeline(steps=[('preprocessor',\n",
" ColumnTransformer(transformers=[('num',\n",
" Pipeline(steps=[('imputer',\n",
" SimpleImputer(fill_value=0,\n",
" strategy='constant')),\n",
" ('scaler',\n",
" StandardScaler())]),\n",
" ['nb_campaigns',\n",
" 'taux_ouverture_mail',\n",
" 'prop_purchases_internet',\n",
" 'nb_tickets', 'nb_purchases',\n",
" 'total_amount',\n",
" 'nb_suppliers',\n",
" 'purchases_10_2021',\n",
" 'purchases_10_2022',\n",
" 'purchases_...\n",
" 'categorie_age_40_50',\n",
" 'categorie_age_50_60',\n",
" 'categorie_age_60_70',\n",
" 'categorie_age_70_80',\n",
" 'categorie_age_plus_80',\n",
" 'categorie_age_inconnue',\n",
" 'country_fr',\n",
" 'is_profession_known',\n",
" 'is_zipcode_known',\n",
" 'opt_in'])])),\n",
" ('LogisticRegression_Benchmark',\n",
" LogisticRegression(class_weight={0.0: 0.5480249666729557,\n",
" 1.0: 5.705625684291879},\n",
" max_iter=5000, n_jobs=-1, solver='saga'))])</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-50\" type=\"checkbox\" ><label for=\"sk-estimator-id-50\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\"> Pipeline<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.pipeline.Pipeline.html\">?<span>Documentation for Pipeline</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></label><div class=\"sk-toggleable__content fitted\"><pre>Pipeline(steps=[('preprocessor',\n",
2024-03-23 00:04:49 +01:00
" ColumnTransformer(transformers=[('num',\n",
2024-03-30 12:00:49 +01:00
" Pipeline(steps=[('imputer',\n",
" SimpleImputer(fill_value=0,\n",
" strategy='constant')),\n",
" ('scaler',\n",
2024-03-23 00:04:49 +01:00
" StandardScaler())]),\n",
2024-03-30 12:00:49 +01:00
" ['nb_campaigns',\n",
" 'taux_ouverture_mail',\n",
" 'prop_purchases_internet',\n",
" 'nb_tickets', 'nb_purchases',\n",
2024-03-23 00:04:49 +01:00
" 'total_amount',\n",
" 'nb_suppliers',\n",
2024-03-30 12:00:49 +01:00
" 'purchases_10_2021',\n",
" 'purchases_10_2022',\n",
" 'purchases_...\n",
" 'categorie_age_40_50',\n",
" 'categorie_age_50_60',\n",
" 'categorie_age_60_70',\n",
" 'categorie_age_70_80',\n",
" 'categorie_age_plus_80',\n",
" 'categorie_age_inconnue',\n",
" 'country_fr',\n",
" 'is_profession_known',\n",
" 'is_zipcode_known',\n",
" 'opt_in'])])),\n",
" ('LogisticRegression_Benchmark',\n",
" LogisticRegression(class_weight={0.0: 0.5480249666729557,\n",
" 1.0: 5.705625684291879},\n",
" max_iter=5000, n_jobs=-1, solver='saga'))])</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-51\" type=\"checkbox\" ><label for=\"sk-estimator-id-51\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\"> preprocessor: ColumnTransformer<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.compose.ColumnTransformer.html\">?<span>Documentation for preprocessor: ColumnTransformer</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>ColumnTransformer(transformers=[('num',\n",
" Pipeline(steps=[('imputer',\n",
" SimpleImputer(fill_value=0,\n",
" strategy='constant')),\n",
" ('scaler', StandardScaler())]),\n",
" ['nb_campaigns', 'taux_ouverture_mail',\n",
" 'prop_purchases_internet', 'nb_tickets',\n",
" 'nb_purchases', 'total_amount',\n",
" 'nb_suppliers', 'purchases_10_2021',\n",
" 'purchases_10_2022', 'purchases_11_2021',\n",
" 'purchases_12_2021', 'pu...\n",
" SimpleImputer(strategy='most_frequent'))]),\n",
" ['gender_female', 'gender_male',\n",
" 'achat_internet', 'categorie_age_0_10',\n",
" 'categorie_age_10_20', 'categorie_age_20_30',\n",
" 'categorie_age_30_40', 'categorie_age_40_50',\n",
" 'categorie_age_50_60', 'categorie_age_60_70',\n",
" 'categorie_age_70_80',\n",
" 'categorie_age_plus_80',\n",
" 'categorie_age_inconnue', 'country_fr',\n",
" 'is_profession_known', 'is_zipcode_known',\n",
" 'opt_in'])])</pre></div> </div></div><div class=\"sk-parallel\"><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-52\" type=\"checkbox\" ><label for=\"sk-estimator-id-52\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">num</label><div class=\"sk-toggleable__content fitted\"><pre>['nb_campaigns', 'taux_ouverture_mail', 'prop_purchases_internet', 'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'purchases_10_2021', 'purchases_10_2022', 'purchases_11_2021', 'purchases_12_2021', 'purchases_1_2022', 'purchases_2_2022', 'purchases_3_2022', 'purchases_4_2022', 'purchases_5_2021', 'purchases_5_2022', 'purchases_6_2021', 'purchases_6_2022', 'purchases_7_2021', 'purchases_7_2022', 'purchases_8_2021', 'purchases_8_2022', 'purchases_9_2021', 'purchases_9_2022', 'purchase_date_min', 'purchase_date_max', 'nb_targets', 'time_to_open']</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-53\" type=\"checkbox\" ><label for=\"sk-estimator-id-53\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\"> SimpleImputer<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.impute.SimpleImputer.html\">?<span>Documentation for SimpleImputer</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>SimpleImputer(fill_value=0, strategy='constant')</pre></div> </div></div><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-54\" type=\"checkbox\" ><label for=\"sk-estimator-id-54\" 
class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\"> StandardScaler<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.preprocessing.StandardScaler.html\">?<span>Documentation for StandardScaler</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>StandardScaler()</pre></div> </div></div></div></div></div></div></div><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-55\" type=\"checkbox\" ><label for=\"sk-estimator-id-55\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">bin</label><div class=\"sk-toggleable__content fitted\"><pre>['gender_female', 'gender_male', 'achat_internet', 'categorie_age_0_10', 'categorie_age_10_20', 'categorie_age_20_30', 'categorie_age_30_40', 'categorie_age_40_50', 'categorie_age_50_60', 'categorie_age_60_70', 'categorie_age_70_80', 'categorie_age_plus_80', 'categorie_age_inconnue', 'country_fr', 'is_profession_known', 'is_zipcode_known', 'opt_in']</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-56\" type=\"checkbox\" ><label for=\"sk-estimator-id-56\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fi
" 1.0: 5.705625684291879},\n",
" max_iter=5000, n_jobs=-1, solver='saga')</pre></div> </div></div></div></div></div></div>"
2024-03-23 00:04:49 +01:00
],
"text/plain": [
2024-03-30 12:00:49 +01:00
"Pipeline(steps=[('preprocessor',\n",
" ColumnTransformer(transformers=[('num',\n",
" Pipeline(steps=[('imputer',\n",
" SimpleImputer(fill_value=0,\n",
" strategy='constant')),\n",
" ('scaler',\n",
" StandardScaler())]),\n",
" ['nb_campaigns',\n",
" 'taux_ouverture_mail',\n",
" 'prop_purchases_internet',\n",
" 'nb_tickets', 'nb_purchases',\n",
" 'total_amount',\n",
" 'nb_suppliers',\n",
" 'purchases_10_2021',\n",
" 'purchases_10_2022',\n",
" 'purchases_...\n",
" 'categorie_age_40_50',\n",
" 'categorie_age_50_60',\n",
" 'categorie_age_60_70',\n",
" 'categorie_age_70_80',\n",
" 'categorie_age_plus_80',\n",
" 'categorie_age_inconnue',\n",
" 'country_fr',\n",
" 'is_profession_known',\n",
" 'is_zipcode_known',\n",
" 'opt_in'])])),\n",
" ('LogisticRegression_Benchmark',\n",
" LogisticRegression(class_weight={0.0: 0.5480249666729557,\n",
" 1.0: 5.705625684291879},\n",
" max_iter=5000, n_jobs=-1, solver='saga'))])"
2024-03-23 00:04:49 +01:00
]
},
2024-03-30 12:00:49 +01:00
"execution_count": 92,
2024-03-23 00:04:49 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2024-03-30 12:00:49 +01:00
"# Load the stored benchmark pipeline for the selected activity type and display it\n",
"model = load_model(type_of_activity, \"LogisticRegression_Benchmark\")\n",
2024-03-28 11:37:23 +01:00
"# Alternative stored model (presumably the cross-validated random forest -- TODO confirm):\n",
"# model = load_model(type_of_activity, \"randomF_cv\")\n",
"model"
2024-03-23 00:04:49 +01:00
]
},
{
"cell_type": "markdown",
"id": "006819e7-e9c5-48d9-85ee-aa43d5e4c9c2",
"metadata": {},
"source": [
"## Quartile clustering"
]
},
{
"cell_type": "code",
2024-03-30 12:00:49 +01:00
"execution_count": 93,
2024-03-23 00:04:49 +01:00
"id": "018d8ff4-3436-4eec-8507-d1a265cbabf1",
"metadata": {},
"outputs": [],
"source": [
2024-03-28 11:37:23 +01:00
"# Score = column 1 of predict_proba, i.e. the probability of the second entry of model.classes_\n",
"y_pred_prob = model.predict_proba(X_test)[:, 1]\n",
"# Hard class labels predicted for the same rows\n",
"y_pred = model.predict(X_test)"
2024-03-23 00:04:49 +01:00
]
},
{
"cell_type": "code",
2024-03-30 12:00:49 +01:00
"execution_count": 94,
2024-03-23 00:04:49 +01:00
"id": "846f53b9-73c2-4a8b-9d9e-f11bf59ce9ba",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
2024-03-30 12:00:49 +01:00
" <th>customer_id</th>\n",
" <th>street_id</th>\n",
" <th>structure_id</th>\n",
" <th>mcp_contact_id</th>\n",
2024-03-23 00:04:49 +01:00
" <th>fidelity</th>\n",
2024-03-30 12:00:49 +01:00
" <th>tenant_id</th>\n",
" <th>is_partner</th>\n",
" <th>deleted_at</th>\n",
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
2024-03-23 00:04:49 +01:00
" <th>...</th>\n",
2024-03-30 12:00:49 +01:00
" <th>purchases_8_2021</th>\n",
" <th>purchases_8_2022</th>\n",
" <th>purchases_9_2021</th>\n",
" <th>purchases_9_2022</th>\n",
" <th>y_has_purchased</th>\n",
2024-03-23 00:04:49 +01:00
" <th>has_purchased</th>\n",
" <th>has_purchased_estim</th>\n",
" <th>score</th>\n",
" <th>quartile</th>\n",
2024-03-27 18:58:30 +01:00
" <th>score_adjusted</th>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
2024-03-30 12:00:49 +01:00
" <td>10_699783</td>\n",
" <td>139</td>\n",
" <td>NaN</td>\n",
" <td>186852.0</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>875</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>...</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>0.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.131180</td>\n",
" <td>1</td>\n",
" <td>0.017574</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
2024-03-30 12:00:49 +01:00
" <td>10_38307</td>\n",
" <td>862</td>\n",
" <td>NaN</td>\n",
" <td>17621.0</td>\n",
" <td>7</td>\n",
" <td>875</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>...</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.0</td>\n",
" <td>0.321635</td>\n",
" <td>2</td>\n",
" <td>0.042466</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>10_556101</td>\n",
" <td>1063</td>\n",
" <td>NaN</td>\n",
" <td>136909.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>875</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
2024-03-28 11:37:23 +01:00
" <td>1</td>\n",
2024-03-30 12:00:49 +01:00
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.005068</td>\n",
2024-03-28 11:37:23 +01:00
" <td>1</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.000676</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
2024-03-30 12:00:49 +01:00
" <td>10_686663</td>\n",
" <td>443226</td>\n",
" <td>NaN</td>\n",
" <td>186611.0</td>\n",
" <td>1</td>\n",
" <td>875</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>1.0</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.166979</td>\n",
2024-03-27 18:58:30 +01:00
" <td>1</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.018397</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
2024-03-30 12:00:49 +01:00
" <td>10_91656</td>\n",
" <td>316684</td>\n",
" <td>NaN</td>\n",
" <td>21559.0</td>\n",
" <td>2</td>\n",
" <td>875</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.161523</td>\n",
2024-03-28 11:37:23 +01:00
" <td>1</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.018632</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
2024-03-30 12:00:49 +01:00
" <td>10_35956</td>\n",
" <td>106204</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1</td>\n",
" <td>875</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.098139</td>\n",
" <td>1</td>\n",
" <td>0.010129</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
2024-03-30 12:00:49 +01:00
" <td>10_560058</td>\n",
" <td>1063</td>\n",
" <td>NaN</td>\n",
" <td>161812.0</td>\n",
" <td>0</td>\n",
" <td>875</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.005377</td>\n",
2024-03-27 18:58:30 +01:00
" <td>1</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.000715</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
2024-03-30 12:00:49 +01:00
" <td>10_38603</td>\n",
" <td>513642</td>\n",
" <td>1865.0</td>\n",
" <td>7660.0</td>\n",
" <td>4</td>\n",
" <td>875</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-23 00:04:49 +01:00
" <td>1.0</td>\n",
" <td>1.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>1.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.906698</td>\n",
" <td>4</td>\n",
" <td>0.461388</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>10_563294</td>\n",
" <td>1063</td>\n",
" <td>NaN</td>\n",
" <td>167549.0</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>875</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
2024-03-28 11:37:23 +01:00
" <td>1</td>\n",
2024-03-30 12:00:49 +01:00
" <td>...</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.007399</td>\n",
" <td>1</td>\n",
" <td>0.000974</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
2024-03-30 12:00:49 +01:00
" <td>10_548983</td>\n",
" <td>268636</td>\n",
" <td>NaN</td>\n",
" <td>173318.0</td>\n",
" <td>1</td>\n",
" <td>875</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.163529</td>\n",
2024-03-27 18:58:30 +01:00
" <td>1</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.022102</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" </tbody>\n",
"</table>\n",
2024-03-30 12:00:49 +01:00
"<p>10 rows × 92 columns</p>\n",
2024-03-23 00:04:49 +01:00
"</div>"
],
"text/plain": [
2024-03-30 12:00:49 +01:00
" customer_id street_id structure_id mcp_contact_id fidelity tenant_id \\\n",
"0 10_699783 139 NaN 186852.0 0 875 \n",
"1 10_38307 862 NaN 17621.0 7 875 \n",
"2 10_556101 1063 NaN 136909.0 0 875 \n",
"3 10_686663 443226 NaN 186611.0 1 875 \n",
"4 10_91656 316684 NaN 21559.0 2 875 \n",
"5 10_35956 106204 NaN NaN 1 875 \n",
"6 10_560058 1063 NaN 161812.0 0 875 \n",
"7 10_38603 513642 1865.0 7660.0 4 875 \n",
"8 10_563294 1063 NaN 167549.0 0 875 \n",
"9 10_548983 268636 NaN 173318.0 1 875 \n",
"\n",
" is_partner deleted_at is_email_true opt_in ... purchases_8_2021 \\\n",
"0 False NaN True 0 ... 0.0 \n",
"1 False NaN True 0 ... 0.0 \n",
"2 False NaN True 1 ... 0.0 \n",
"3 False NaN True 1 ... 0.0 \n",
"4 False NaN True 0 ... 0.0 \n",
"5 False NaN True 0 ... 0.0 \n",
"6 False NaN True 1 ... 0.0 \n",
"7 False NaN True 1 ... 0.0 \n",
"8 False NaN True 1 ... 0.0 \n",
"9 False NaN True 0 ... 0.0 \n",
"\n",
" purchases_8_2022 purchases_9_2021 purchases_9_2022 y_has_purchased \\\n",
"0 0.0 0.0 0.0 0.0 \n",
"1 0.0 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 0.0 \n",
"3 1.0 0.0 0.0 0.0 \n",
"4 0.0 0.0 0.0 0.0 \n",
"5 0.0 0.0 0.0 0.0 \n",
"6 0.0 0.0 0.0 0.0 \n",
"7 0.0 0.0 0.0 1.0 \n",
"8 0.0 0.0 0.0 0.0 \n",
"9 0.0 0.0 0.0 0.0 \n",
"\n",
" has_purchased has_purchased_estim score quartile score_adjusted \n",
"0 0.0 0.0 0.131180 1 0.017574 \n",
"1 0.0 0.0 0.321635 2 0.042466 \n",
"2 0.0 0.0 0.005068 1 0.000676 \n",
"3 0.0 0.0 0.166979 1 0.018397 \n",
"4 0.0 0.0 0.161523 1 0.018632 \n",
"5 0.0 0.0 0.098139 1 0.010129 \n",
"6 0.0 0.0 0.005377 1 0.000715 \n",
"7 1.0 1.0 0.906698 4 0.461388 \n",
"8 0.0 0.0 0.007399 1 0.000974 \n",
"9 0.0 0.0 0.163529 1 0.022102 \n",
"\n",
"[10 rows x 92 columns]"
2024-03-23 00:04:49 +01:00
]
},
2024-03-30 12:00:49 +01:00
"execution_count": 94,
2024-03-23 00:04:49 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# NOTE: plain assignment, not a copy -- X_test_segment and X_test are the same object,\n",
"# so every column added below also shows up on X_test.\n",
"X_test_segment = X_test\n",
"\n",
"# Observed outcome, predicted label and predicted score side by side\n",
"X_test_segment[\"has_purchased\"] = y_test\n",
"X_test_segment[\"has_purchased_estim\"] = y_pred\n",
"X_test_segment[\"score\"] = y_pred_prob\n",
"\n",
"# Fixed-width score bands (not data quantiles): the first matching condition wins,\n",
"# anything not below 0.75 falls into band '4'.\n",
"X_test_segment[\"quartile\"] = np.select(\n",
"    [\n",
"        X_test_segment[\"score\"] < 0.25,\n",
"        X_test_segment[\"score\"] < 0.5,\n",
"        X_test_segment[\"score\"] < 0.75,\n",
"    ],\n",
"    [\"1\", \"2\", \"3\"],\n",
"    default=\"4\",\n",
")\n",
"X_test_segment.head(10)"
]
},
2024-03-27 18:58:30 +01:00
{
"cell_type": "code",
2024-03-30 12:00:49 +01:00
"execution_count": 24,
2024-03-27 18:58:30 +01:00
"id": "fb592fe3-ea40-4e83-8fe9-c52b9ee42f2a",
"metadata": {},
"outputs": [],
"source": [
"def df_segment(df, y, model) :\n",
"    \"\"\"Score `df` with `model` and return a copy carrying the segmentation columns.\n",
"\n",
"    The input frame is left untouched: results are written to a copy, which also\n",
"    avoids pandas' SettingWithCopyWarning when `df` is a slice of another frame.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df : pandas.DataFrame\n",
"        Feature frame passed as-is to `model.predict` and `model.predict_proba`.\n",
"    y : array-like or Series aligned with `df`\n",
"        Observed outcome, stored in the `has_purchased` column.\n",
"    model : object exposing `predict` and `predict_proba`\n",
"        Column 1 of `predict_proba` is used as the score.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        Copy of `df` with four extra columns: `has_purchased`, `has_purchased_estim`,\n",
"        `score` and `quartile`. Despite its name, `quartile` is a fixed-width score\n",
"        band stored as a string: '1' for score < 0.25, '2' for < 0.5, '3' for < 0.75\n",
"        and '4' otherwise (a NaN score therefore lands in '4').\n",
"    \"\"\"\n",
"    # Predict first, so the model sees exactly the frame the caller passed in\n",
"    y_pred = model.predict(df)\n",
"    y_pred_prob = model.predict_proba(df)[:, 1]\n",
"\n",
"    # Explicit copy: a plain assignment would only alias (and mutate) the caller's frame\n",
"    segmented = df.copy()\n",
"\n",
"    segmented[\"has_purchased\"] = y\n",
"    segmented[\"has_purchased_estim\"] = y_pred\n",
"    segmented[\"score\"] = y_pred_prob\n",
"    # First matching condition wins, which reproduces the nested if/else banding\n",
"    segmented[\"quartile\"] = np.select(\n",
"        [segmented[\"score\"] < 0.25, segmented[\"score\"] < 0.5, segmented[\"score\"] < 0.75],\n",
"        [\"1\", \"2\", \"3\"],\n",
"        default=\"4\",\n",
"    )\n",
"\n",
"    return segmented"
]
},
{
"cell_type": "code",
2024-03-28 11:37:23 +01:00
"execution_count": 88,
2024-03-27 18:58:30 +01:00
"id": "968645d5-58cc-485a-bd8b-99f4cfc26fec",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_1080/2624515794.py:8: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_segment[\"has_purchased\"] = y\n",
"/tmp/ipykernel_1080/2624515794.py:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_segment[\"has_purchased_estim\"] = y_pred\n",
"/tmp/ipykernel_1080/2624515794.py:10: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_segment[\"score\"] = y_pred_prob\n",
"/tmp/ipykernel_1080/2624515794.py:11: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_segment[\"quartile\"] = np.where(df_segment['score']<0.25, '1',\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_tickets</th>\n",
" <th>nb_purchases</th>\n",
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>time_between_purchase</th>\n",
" <th>nb_tickets_internet</th>\n",
" <th>fidelity</th>\n",
" <th>...</th>\n",
2024-03-28 11:37:23 +01:00
" <th>opt_in</th>\n",
2024-03-27 18:58:30 +01:00
" <th>gender_female</th>\n",
" <th>gender_male</th>\n",
" <th>gender_other</th>\n",
" <th>nb_campaigns</th>\n",
" <th>nb_campaigns_opened</th>\n",
" <th>has_purchased</th>\n",
" <th>has_purchased_estim</th>\n",
" <th>score</th>\n",
" <th>quartile</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
2024-03-28 11:37:23 +01:00
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>100.00</td>\n",
" <td>1.0</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>5.177187</td>\n",
" <td>5.177187</td>\n",
" <td>0.000000</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>1</td>\n",
2024-03-27 18:58:30 +01:00
" <td>...</td>\n",
2024-03-28 11:37:23 +01:00
" <td>False</td>\n",
2024-03-27 18:58:30 +01:00
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>0.0</td>\n",
" <td>0.006066</td>\n",
" <td>1</td>\n",
2024-03-27 18:58:30 +01:00
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
2024-03-28 11:37:23 +01:00
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>55.00</td>\n",
" <td>1.0</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>426.265613</td>\n",
" <td>426.265613</td>\n",
" <td>0.000000</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>2</td>\n",
2024-03-27 18:58:30 +01:00
" <td>...</td>\n",
2024-03-28 11:37:23 +01:00
" <td>True</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>1</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.288847</td>\n",
2024-03-27 18:58:30 +01:00
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
2024-03-28 11:37:23 +01:00
" <td>17.0</td>\n",
2024-03-27 18:58:30 +01:00
" <td>1.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>80.00</td>\n",
2024-03-27 18:58:30 +01:00
" <td>1.0</td>\n",
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>436.033437</td>\n",
" <td>436.033437</td>\n",
" <td>0.000000</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>2</td>\n",
2024-03-27 18:58:30 +01:00
" <td>...</td>\n",
2024-03-28 11:37:23 +01:00
" <td>True</td>\n",
2024-03-27 18:58:30 +01:00
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.103264</td>\n",
" <td>1</td>\n",
2024-03-27 18:58:30 +01:00
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
2024-03-28 11:37:23 +01:00
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>120.00</td>\n",
" <td>1.0</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>5.196412</td>\n",
" <td>5.196412</td>\n",
" <td>0.000000</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>1</td>\n",
2024-03-27 18:58:30 +01:00
" <td>...</td>\n",
2024-03-28 11:37:23 +01:00
" <td>False</td>\n",
" <td>1</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>0.0</td>\n",
" <td>0.008928</td>\n",
2024-03-27 18:58:30 +01:00
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
2024-03-28 11:37:23 +01:00
" <td>34.0</td>\n",
" <td>2.0</td>\n",
" <td>416.00</td>\n",
" <td>1.0</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>478.693148</td>\n",
" <td>115.631470</td>\n",
" <td>363.061678</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>4</td>\n",
2024-03-27 18:58:30 +01:00
" <td>...</td>\n",
2024-03-28 11:37:23 +01:00
" <td>False</td>\n",
2024-03-27 18:58:30 +01:00
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.992809</td>\n",
" <td>4</td>\n",
2024-03-27 18:58:30 +01:00
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
2024-03-28 11:37:23 +01:00
" <th>96091</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>67.31</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>278.442257</td>\n",
" <td>278.442257</td>\n",
" <td>0.000000</td>\n",
" <td>1.0</td>\n",
" <td>2</td>\n",
2024-03-27 18:58:30 +01:00
" <td>...</td>\n",
2024-03-28 11:37:23 +01:00
" <td>False</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0</td>\n",
" <td>1</td>\n",
2024-03-28 11:37:23 +01:00
" <td>0</td>\n",
" <td>15.0</td>\n",
" <td>5.0</td>\n",
2024-03-27 18:58:30 +01:00
" <td>1.0</td>\n",
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>0.351762</td>\n",
2024-03-27 18:58:30 +01:00
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
2024-03-28 11:37:23 +01:00
" <th>96092</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>61.41</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>189.207373</td>\n",
" <td>189.207373</td>\n",
" <td>0.000000</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
2024-03-27 18:58:30 +01:00
" <td>...</td>\n",
2024-03-28 11:37:23 +01:00
" <td>False</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>12.0</td>\n",
" <td>9.0</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
" <td>1.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>0.567814</td>\n",
2024-03-27 18:58:30 +01:00
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
2024-03-28 11:37:23 +01:00
" <th>96093</th>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>0.00</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
2024-03-28 11:37:23 +01:00
" <td>-1.000000</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
2024-03-28 11:37:23 +01:00
" <td>True</td>\n",
2024-03-27 18:58:30 +01:00
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>29.0</td>\n",
" <td>3.0</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>0.0</td>\n",
" <td>0.004652</td>\n",
" <td>1</td>\n",
2024-03-27 18:58:30 +01:00
" </tr>\n",
" <tr>\n",
2024-03-28 11:37:23 +01:00
" <th>96094</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>79.43</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>279.312905</td>\n",
" <td>279.312905</td>\n",
" <td>0.000000</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
2024-03-27 18:58:30 +01:00
" <td>...</td>\n",
2024-03-28 11:37:23 +01:00
" <td>False</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>20.0</td>\n",
" <td>4.0</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>0.293042</td>\n",
" <td>2</td>\n",
2024-03-27 18:58:30 +01:00
" </tr>\n",
" <tr>\n",
2024-03-28 11:37:23 +01:00
" <th>96095</th>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>0.00</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
2024-03-28 11:37:23 +01:00
" <td>-1.000000</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>2</td>\n",
2024-03-27 18:58:30 +01:00
" <td>...</td>\n",
2024-03-28 11:37:23 +01:00
" <td>False</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>31.0</td>\n",
" <td>4.0</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>1.0</td>\n",
" <td>0.787852</td>\n",
" <td>4</td>\n",
2024-03-27 18:58:30 +01:00
" </tr>\n",
" </tbody>\n",
"</table>\n",
2024-03-28 11:37:23 +01:00
"<p>96096 rows × 21 columns</p>\n",
2024-03-27 18:58:30 +01:00
"</div>"
],
"text/plain": [
2024-03-28 11:37:23 +01:00
" nb_tickets nb_purchases total_amount nb_suppliers \\\n",
"0 4.0 1.0 100.00 1.0 \n",
"1 1.0 1.0 55.00 1.0 \n",
"2 17.0 1.0 80.00 1.0 \n",
"3 4.0 1.0 120.00 1.0 \n",
"4 34.0 2.0 416.00 1.0 \n",
"... ... ... ... ... \n",
"96091 1.0 1.0 67.31 1.0 \n",
"96092 1.0 1.0 61.41 1.0 \n",
"96093 0.0 0.0 0.00 0.0 \n",
"96094 1.0 1.0 79.43 1.0 \n",
"96095 0.0 0.0 0.00 0.0 \n",
"\n",
" vente_internet_max purchase_date_min purchase_date_max \\\n",
"0 0.0 5.177187 5.177187 \n",
"1 0.0 426.265613 426.265613 \n",
"2 0.0 436.033437 436.033437 \n",
"3 0.0 5.196412 5.196412 \n",
"4 0.0 478.693148 115.631470 \n",
"... ... ... ... \n",
"96091 1.0 278.442257 278.442257 \n",
"96092 1.0 189.207373 189.207373 \n",
"96093 0.0 550.000000 550.000000 \n",
"96094 1.0 279.312905 279.312905 \n",
"96095 0.0 550.000000 550.000000 \n",
"\n",
" time_between_purchase nb_tickets_internet fidelity ... opt_in \\\n",
"0 0.000000 0.0 1 ... False \n",
"1 0.000000 0.0 2 ... True \n",
"2 0.000000 0.0 2 ... True \n",
"3 0.000000 0.0 1 ... False \n",
"4 363.061678 0.0 4 ... False \n",
"... ... ... ... ... ... \n",
"96091 0.000000 1.0 2 ... False \n",
"96092 0.000000 1.0 1 ... False \n",
"96093 -1.000000 0.0 1 ... True \n",
"96094 0.000000 1.0 1 ... False \n",
"96095 -1.000000 0.0 2 ... False \n",
"\n",
" gender_female gender_male gender_other nb_campaigns \\\n",
"0 1 0 0 0.0 \n",
"1 0 1 0 0.0 \n",
"2 1 0 0 0.0 \n",
"3 1 0 0 0.0 \n",
"4 1 0 0 0.0 \n",
"... ... ... ... ... \n",
"96091 0 1 0 15.0 \n",
"96092 0 1 0 12.0 \n",
"96093 1 0 0 29.0 \n",
"96094 0 1 0 20.0 \n",
"96095 0 1 0 31.0 \n",
"\n",
" nb_campaigns_opened has_purchased has_purchased_estim score \\\n",
"0 0.0 0.0 0.0 0.006066 \n",
"1 0.0 1.0 0.0 0.288847 \n",
"2 0.0 0.0 0.0 0.103264 \n",
"3 0.0 0.0 0.0 0.008928 \n",
"4 0.0 1.0 1.0 0.992809 \n",
"... ... ... ... ... \n",
"96091 5.0 1.0 0.0 0.351762 \n",
"96092 9.0 0.0 1.0 0.567814 \n",
"96093 3.0 0.0 0.0 0.004652 \n",
"96094 4.0 0.0 0.0 0.293042 \n",
"96095 4.0 0.0 1.0 0.787852 \n",
"\n",
" quartile \n",
"0 1 \n",
"1 2 \n",
"2 1 \n",
"3 1 \n",
"4 4 \n",
"... ... \n",
"96091 2 \n",
"96092 3 \n",
"96093 1 \n",
"96094 2 \n",
"96095 4 \n",
"\n",
"[96096 rows x 21 columns]"
2024-03-27 18:58:30 +01:00
]
},
2024-03-28 11:37:23 +01:00
"execution_count": 88,
2024-03-27 18:58:30 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2024-03-28 11:37:23 +01:00
"df_segment(X_test, y_test, model)"
2024-03-27 18:58:30 +01:00
]
},
2024-03-23 00:04:49 +01:00
{
"cell_type": "markdown",
"id": "ad16b8ab-7e01-404b-971e-866e9b9d5aa4",
"metadata": {},
"source": [
"## definition of functions to compute the bias of scores and adjust it \n",
"\n",
"Le biais est calculé de la façon suivante. \n",
"En notant $\\hat{p(x_i)}$ le score calculé (estimé par la modélisation) et $p(x_i)$ le vrai score (sans biais), et $\\beta$ le logarithme du biais, on a : \\\n",
"$\\ln{\\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}}} = \\beta + \\ln{\\frac{p(x_i)}{1-p(x_i)}}$ \\\n",
"$ \\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}} = \\exp(\\beta) . \\frac{p(x_i)}{1-p(x_i)} $ , soit : \\\n",
"$p(x_i) = {\\frac{\\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}}}{B+\\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}}}}$ \\\n",
"Ce qu'on appelle biais et qu'on estime dans le code par la suite est : $B=\\exp(\\beta) $. Les probabilités ne sont donc pas biaisées si $B=1$. Il y a surestimation si $B>1$. \n",
"\n",
"On cherche le B qui permette d'ajuster les probabilités de telle sorte que la somme des scores soit égale à la somme des y_has_purchased. Cela revient à résoudre : \n",
"\n",
"\\begin{equation}\n",
"\\sum_{i}{\\frac{\\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}}}{B+\\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}}}} = \\sum_{i}{Y_i}\n",
"\\end{equation}\n",
"\n",
"C'est ce que fait la fonction find_bias. \n",
"\n",
"Note sur les notations : \\\n",
"$\\hat{p(x_i)}$ correspond à ce qu'on appelle le score et $p(x_i)$ à ce qu'on appellera le score adjusted"
]
},
{
"cell_type": "code",
2024-03-30 12:00:49 +01:00
"execution_count": 25,
2024-03-23 00:04:49 +01:00
"id": "f0379536-a6c5-4b16-bde5-d0319ec1b140",
"metadata": {},
"outputs": [],
"source": [
"# compute adjusted score from odd ratios (cf formula above)\n",
"def adjusted_score(odd_ratio, bias) :\n",
" adjusted_score = odd_ratio/(bias+odd_ratio)\n",
" return adjusted_score"
]
},
{
"cell_type": "code",
2024-03-30 12:00:49 +01:00
"execution_count": 26,
2024-03-23 00:04:49 +01:00
"id": "32a0dfd0-f49d-4785-a56f-706d381bfe41",
"metadata": {},
"outputs": [],
"source": [
"# when the score is 1 we cannot compute the odd ratio, so we adjust scores equal to 1\n",
"# we set the second best score instead\n",
"\n",
"def adjust_score_1(score) :\n",
" second_best_score = np.array([element for element in score if element !=1]).max()\n",
" new_score = np.array([element if element!=1 else second_best_score for element in score]) \n",
" return new_score"
]
},
{
"cell_type": "code",
2024-03-30 12:00:49 +01:00
"execution_count": 27,
2024-03-23 00:04:49 +01:00
"id": "2dff1def-02df-413e-afce-b4aeaf7752b6",
"metadata": {},
"outputs": [],
"source": [
"def odd_ratio(score) :\n",
" return score / (1 - score)"
]
},
{
"cell_type": "code",
2024-03-30 12:00:49 +01:00
"execution_count": 28,
2024-03-23 00:04:49 +01:00
"id": "683d71fc-7442-4028-869c-49c57592d6e9",
"metadata": {},
"outputs": [],
"source": [
"# definition of a function that automatically detects the bias\n",
"\n",
"def find_bias(odd_ratios, y_objective, initial_guess=6) :\n",
" \"\"\"\n",
" results = minimize(lambda bias : (sum([adjusted_score(element, bias) for element in list(odd_ratios)]) - y_objective)**2 ,\n",
" initial_guess , method = \"BFGS\")\n",
"\n",
" estimated_bias = results.x[0]\n",
" \"\"\"\n",
"\n",
" # faster method\n",
" bias_estimated = fsolve(lambda bias : sum([adjusted_score(element, bias) for element in list(odd_ratios)]) - y_objective, x0=6)\n",
" \n",
" return bias_estimated[0]"
]
},
{
"cell_type": "code",
2024-03-30 12:00:49 +01:00
"execution_count": 95,
2024-03-28 11:37:23 +01:00
"id": "f17dc6ca-7a48-441b-8c04-11c47b8b9741",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-03-30 12:00:49 +01:00
"0.3000275047453295 0.08797424180570736\n"
2024-03-28 11:37:23 +01:00
]
},
{
"data": {
"text/plain": [
2024-03-30 12:00:49 +01:00
"0.08763280798047211"
2024-03-28 11:37:23 +01:00
]
},
2024-03-30 12:00:49 +01:00
"execution_count": 95,
2024-03-28 11:37:23 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print(X_test_segment[\"score\"].mean(), y_test[\"y_has_purchased\"].mean())\n",
"y_train[\"y_has_purchased\"].mean()"
]
},
{
"cell_type": "code",
2024-03-30 12:00:49 +01:00
"execution_count": 96,
2024-03-23 00:04:49 +01:00
"id": "781b0d40-c954-4c54-830a-e709c8667328",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
2024-03-30 12:00:49 +01:00
"10.698758485840244"
2024-03-23 00:04:49 +01:00
]
},
2024-03-30 12:00:49 +01:00
"execution_count": 96,
2024-03-23 00:04:49 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# computation with the function defined\n",
"\n",
"bias_test_set = find_bias(odd_ratios = odd_ratio(adjust_score_1(X_test_segment[\"score\"])), \n",
" y_objective = y_test[\"y_has_purchased\"].sum(),\n",
" initial_guess=6)\n",
"bias_test_set"
]
},
{
"cell_type": "code",
2024-03-30 12:00:49 +01:00
"execution_count": 97,
2024-03-23 00:04:49 +01:00
"id": "248cb862-418e-4767-9933-70c4885ecf40",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
2024-03-30 12:00:49 +01:00
"10.688693734338177"
2024-03-23 00:04:49 +01:00
]
},
2024-03-30 12:00:49 +01:00
"execution_count": 97,
2024-03-23 00:04:49 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# comparison with bias of the train set\n",
2024-03-28 11:37:23 +01:00
"X_train_score = model.predict_proba(X_train)[:, 1]\n",
2024-03-23 00:04:49 +01:00
"\n",
"bias_train_set = find_bias(odd_ratios = odd_ratio(adjust_score_1(X_train_score)), \n",
" y_objective = y_train[\"y_has_purchased\"].sum(),\n",
2024-03-30 12:00:49 +01:00
" initial_guess=10)\n",
2024-03-23 00:04:49 +01:00
"bias_train_set"
]
},
{
"cell_type": "code",
2024-03-30 12:00:49 +01:00
"execution_count": 98,
2024-03-23 00:04:49 +01:00
"id": "fff6cbe6-7bb3-4732-9b81-b9ac5383bbcf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-03-30 12:00:49 +01:00
"betâ test - betâ train = 0.00094118290869078\n"
2024-03-23 00:04:49 +01:00
]
}
],
"source": [
"print(\"betâ test - betâ train = \",np.log(bias_test_set/bias_train_set))"
]
},
{
"cell_type": "code",
2024-03-30 12:00:49 +01:00
"execution_count": 99,
2024-03-23 00:04:49 +01:00
"id": "f506870d-4a8a-4b2c-8f0b-e0789080b20c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-03-30 12:00:49 +01:00
"mean absolute erreur 4.674943825828751e-05\n"
2024-03-23 00:04:49 +01:00
]
}
],
"source": [
"# impact of considering a bias computed on train set instead of test set - totally neglectable\n",
"\n",
"score_adjusted_test = adjusted_score(odd_ratio(adjust_score_1(X_test_segment[\"score\"])), bias = bias_test_set)\n",
"score_adjusted_train = adjusted_score(odd_ratio(adjust_score_1(X_test_segment[\"score\"])), bias = bias_train_set)\n",
"\n",
"print(\"mean absolute erreur\",abs(score_adjusted_test-score_adjusted_train).mean())"
]
},
{
"cell_type": "code",
2024-03-30 12:00:49 +01:00
"execution_count": 100,
2024-03-23 00:04:49 +01:00
"id": "8213d0e4-063b-49fa-90b7-677fc34f4c01",
"metadata": {},
2024-03-30 12:00:49 +01:00
"outputs": [],
2024-03-23 17:23:59 +01:00
"source": [
"# adjust scores accordingly \n",
2024-03-23 00:04:49 +01:00
"\n",
2024-03-23 17:23:59 +01:00
"# X_test_segment[\"score_adjusted\"] = adjusted_score(odd_ratio(adjust_score_1(X_test_segment[\"score\"])), bias = bias_test_set)\n",
"\n",
"# actually, we are not supposed to have X_test, so the biais is estimated on X_train\n",
"# X_test_segment[\"score_adjusted\"] = adjusted_score(odd_ratio(adjust_score_1(X_test_segment[\"score\"])), bias = bias_train_set)\n",
"X_test_segment[\"score_adjusted\"] = score_adjusted_train"
2024-03-23 00:04:49 +01:00
]
},
{
2024-03-23 17:23:59 +01:00
"cell_type": "code",
2024-03-30 12:00:49 +01:00
"execution_count": 101,
2024-03-23 17:23:59 +01:00
"id": "834d3723-2e72-4c65-9c62-e2d595c69461",
2024-03-23 00:04:49 +01:00
"metadata": {},
2024-03-23 17:23:59 +01:00
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-03-30 12:00:49 +01:00
"MSE for score : 0.12309116071575532\n",
"MSE for ajusted score : 0.05482346713233594\n",
"sum of y_has_purchased : 13361.0\n",
"sum of adjusted scores : 13368.100024185826\n"
2024-03-23 17:23:59 +01:00
]
}
],
2024-03-23 00:04:49 +01:00
"source": [
2024-03-23 17:23:59 +01:00
"# check \n",
"\n",
"MSE_score = ((X_test_segment[\"score\"]-X_test_segment[\"has_purchased\"])**2).mean()\n",
"MSE_ajusted_score = ((X_test_segment[\"score_adjusted\"]-X_test_segment[\"has_purchased\"])**2).mean()\n",
"print(f\"MSE for score : {MSE_score}\")\n",
"print(f\"MSE for ajusted score : {MSE_ajusted_score}\")\n",
"\n",
"print(\"sum of y_has_purchased :\",y_test[\"y_has_purchased\"].sum())\n",
"print(\"sum of adjusted scores :\", X_test_segment[\"score_adjusted\"].sum())"
2024-03-23 00:04:49 +01:00
]
},
{
"cell_type": "code",
2024-03-30 12:00:49 +01:00
"execution_count": 102,
2024-03-23 17:23:59 +01:00
"id": "9f30a4dd-a9d8-405a-a7d5-5324ae88cf70",
2024-03-23 00:04:49 +01:00
"metadata": {},
"outputs": [
{
2024-03-23 17:23:59 +01:00
"name": "stdout",
"output_type": "stream",
"text": [
2024-03-30 12:00:49 +01:00
"MAE for score : 0.25695361997840177\n",
"MAE for adjusted score : 0.10450649550597542\n"
2024-03-23 17:23:59 +01:00
]
2024-03-23 00:04:49 +01:00
}
],
"source": [
2024-03-23 17:23:59 +01:00
"# mean absolute error - divided by 2 with out method\n",
"\n",
"MAE_score = abs(X_test_segment[\"score\"]-X_test_segment[\"has_purchased\"]).mean()\n",
"MAE_ajusted_score = abs(X_test_segment[\"score_adjusted\"]-X_test_segment[\"has_purchased\"]).mean()\n",
"print(f\"MAE for score : {MAE_score}\")\n",
"print(f\"MAE for adjusted score : {MAE_ajusted_score}\")"
2024-03-23 00:04:49 +01:00
]
},
{
"cell_type": "code",
2024-03-30 12:00:49 +01:00
"execution_count": 37,
2024-03-23 17:23:59 +01:00
"id": "6f9396db-e213-408c-a596-eaeec3bc79f3",
2024-03-23 00:04:49 +01:00
"metadata": {},
2024-03-24 11:05:28 +01:00
"outputs": [],
2024-03-23 00:04:49 +01:00
"source": [
2024-03-23 17:23:59 +01:00
"# visualization\n",
"\n",
"# histogramme des probas et des probas ajustées\n",
"\n",
2024-03-24 10:42:44 +01:00
"def plot_hist_scores(df, score, score_adjusted, type_of_activity) :\n",
2024-03-23 17:23:59 +01:00
"\n",
" plt.figure()\n",
" plt.hist(df[score], label = \"score\", alpha=0.6)\n",
" plt.hist(df[score_adjusted], label=\"adjusted score\", alpha=0.6)\n",
" plt.legend()\n",
" plt.xlabel(\"probability of a future purchase\")\n",
" plt.ylabel(\"count\")\n",
2024-03-24 10:42:44 +01:00
" plt.title(f\"Comparison between score and adjusted score for {type_of_activity} companies\")\n",
2024-03-24 11:44:22 +01:00
" # plt.show()"
2024-03-24 10:42:44 +01:00
]
},
{
"cell_type": "code",
2024-03-24 11:44:22 +01:00
"execution_count": 64,
"id": "def64c16-f4dd-493c-909c-d886d7f53947",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'projet-bdc2324-team1/Output_expected_CA/sport/hist_score_adjustedsport.png'"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"PATH + file_name + type_of_activity + \".png\""
]
},
{
"cell_type": "code",
2024-03-30 12:00:49 +01:00
"execution_count": 103,
2024-03-24 10:42:44 +01:00
"id": "b478d40d-9677-4204-87bd-16fb0bc1fe9a",
"metadata": {},
"outputs": [
{
"data": {
2024-03-30 12:00:49 +01:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAoYAAAHFCAYAAABvrjgmAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAABq2klEQVR4nO3deVgVZfsH8O9hX5Qj+yZupSiCK6lIiiu4oFmZFoWihgsqorjWa6L5Su6mlpZvLqmFlWKaSuCGIqKI4opLhYIJYsqiqKzP7w9hfg0HEJBVv5/r4qp55p6Ze54zZ87tM2fmKIQQAkRERET0ylOr6QSIiIiIqHZgYUhEREREAFgYEhEREVEBFoZEREREBICFIREREREVYGFIRERERABYGBIRERFRARaGRERERASAhSERERERFajzheGFCxcwatQoNG3aFDo6OqhXrx46dOiAJUuW4MGDBzWdXpXz8vJCkyZNajqNYm3evBkKhQJnzpyptHVGRkYiICAAaWlplbZOqtuOHj0KhUKBo0ePVvt2qvr998MPP2DVqlVVsu4mTZrAy8urStZdG928eRMDBw6EkZERFAoF/Pz8ajqlalN4Lr5582ZNp0JlUF3ntJLU6cJww4YN6NixI6KjozFjxgyEhIQgODgY7733HtavX48xY8bUdIpVbu7cuQgODq7pNKpNZGQk5s+fz8KQaoWqfv9VZWH4qpk6dSpOnTqFjRs34uTJk5g6dWpNp1RtBg4ciJMnT8LS0rKmU6Ey6NChA06ePIkOHTrUyPY1amSrleDkyZOYMGEC+vbti927d0NbW1ua17dvX/j7+yMkJKQGM6xajx8/hp6eHl577bWaToVeMkIIPH36FLq6ujWdSq3H91/Vy8vLQ25uruwcXxGXLl1Cp06dMGTIkFqVV3UwNTWFqalpTadBZWRgYIAuXbrU2Pbr7IjhokWLoFAo8O233xb7xtTS0sLgwYOl6fz8fCxZsgQtW7aEtrY2zMzMMGLECNy+fVu2XI8ePWBvb4+TJ0+ia9eu0NXVRZMmTbBp0yYAwL59+9ChQwfo6enBwcFBpfgMCAiAQqHAuXPn8M4778DAwABKpRIfffQR7t27J4vdsWMHXF1dYWlpCV1dXbRq1QqzZ89GZmamLM7Lywv16tXDxYsX4erqivr166N3797SvKKXsn7++Wd07twZSqUSenp6aNasGUaPHi2LSUhIwEcffQQzMzNoa2ujVatWWL58OfLz86WYmzdvQqFQYNmyZVixYgWaNm2KevXqwcnJCVFRUaW9PDKpqakYNWoUjIyMoK+vj0GDBuGvv/5SiTt48CB69+4NAwMD6OnpwdnZGYcOHZL17YwZMwAATZs2hUKhkIbbZ8yYAaVSiby8PCl+8uTJUCgUWLp0qdR2//59qKmpYc2aNVJbRkYGpk+fjqZNm0JLSwvW1tbw8/NTeR2EEPj666/Rrl076OrqwtDQEEOHDlXZl8JjKDo6Gt26dZNegy+++ELWvyUpy+uXlpYGf39/NGvWTDqeBwwYgKtXr0oxDx48gI+PD6ytraGlpYVmzZrh008/RVZWlmxdCoUCkyZNwvr169GqVStoa2tjy5YtAIAbN27Aw8NDdpx89dVXz90HAPjqq6/QvXt3mJmZQV9fHw4ODliyZAlycnIq3F9Xr15Fv379oKenBxMTE4wfPx4PHz4sUz5//PEHRo0ahebNm0NPTw/W1tYYNGgQLl68qBJb1u0Uff8Vvmc2b96sEqtQKBAQECBN37t3D2PHjoWNjQ20tbVhamoKZ2dnHDx4UOqXffv24datW9KxrlAopOWzs7OxcOFC6ZxmamqKUaNGqZxncnJyMHPmTFhYWEBPTw9vvvkmTp8+XaY+A4B169ahbdu2qFevHurXr4+WLVvik08+kcX8/fff0r5oaWnBysoKQ4cOxd27d6WY8pxzlixZgoULF6Jp06bQ1tbGkSNHAABnzpzB4MGDYWRkBB
0dHbRv3x4//fRTqfkXXpb7448/cODAAakfCy+rVkZexSl8X23atAm2trbQ1dWFo6MjoqKiIITA0qVLpXNqr1698Mcff8iWL+lSf48ePdCjRw9pOj8/HwsXLpS20aBBA7Rp0wZffvmlFFPcpWQhBJYsWYLGjRtDR0cHHTp0wIEDB1TWX9Jl6JIudz7vPF6aqjivVbT/C89Lx48fR5cuXaCrqwtra2vMnTtX9jkDAPPnz0fnzp1hZGQEAwMDdOjQAd999x2EELK4Jk2awN3dHSEhIejQoQN0dXXRsmVLbNy4sUx9W5bj//Hjx9LnmY6ODoyMjODo6Igff/yxTK8BAEDUQbm5uUJPT0907ty5zMuMHTtWABCTJk0SISEhYv369cLU1FTY2NiIe/fuSXEuLi7C2NhY2Nraiu+++078/vvvwt3dXQAQ8+fPFw4ODuLHH38U+/fvF126dBHa2tri77//lpafN2+eACAaN24sZsyYIX7//XexYsUKoa+vL9q3by+ys7Ol2M8//1ysXLlS7Nu3Txw9elSsX79eNG3aVPTs2VOW+8iRI4WmpqZo0qSJCAwMFIcOHRK///67NK9x48ZSbGRkpFAoFOL9998X+/fvF4cPHxabNm0Snp6eUkxKSoqwtrYWpqamYv369SIkJERMmjRJABATJkyQ4uLj4wUA0aRJE9GvXz+xe/dusXv3buHg4CAMDQ1FWlpaqX2+adMmAUDY2NiI0aNHiwMHDohvv/1WmJmZCRsbG5GamirFbt26VSgUCjFkyBCxa9cusXfvXuHu7i7U1dXFwYMHhRBCJCYmismTJwsAYteuXeLkyZPi5MmTIj09XYSEhAgAIjIyUlpny5Ytha6urujbt6/UtmPHDgFAXLlyRQghRGZmpmjXrp0wMTERK1asEAcPHhRffvmlUCqVolevXiI/P19a1tvbW2hqagp/f38REhIifvjhB9GyZUthbm4ukpOTVY6h5s2bi/Xr14uwsDDh4+MjAIgtW7aU2mdlef0yMjJE69athb6+vliwYIH4/fffxc6dO8WUKVPE4cOHhRBCPHnyRLRp00bo6+uLZcuWidDQUDF37lyhoaEhBgwYINsmAGFtbS3atGkjfvjhB3H48GFx6dIlcfnyZaFUKoWDg4P4/vvvRWhoqPD39xdqamoiICCg1P0QQoipU6eKdevWiZCQEHH48GGxcuVKYWJiIkaNGiWLK2t/JScnCzMzM2FtbS02bdok9u/fLz788EPRqFEjAUAcOXKk1HzCw8OFv7+/+OWXX0R4eLgIDg4WQ4YMEbq6uuLq1asV2k7R91/he2bTpk0q2wcg5s2bJ027ubkJU1NT8e2334qjR4+K3bt3i88++0wEBQUJIYS4fPmycHZ2FhYWFtKxfvLkSSGEEHl5eaJfv35CX19fzJ8/X4SFhYn//e9/wtraWtjZ2YnHjx/LclQoFGLGjBkiNDRUrFixQlhbWwsDAwMxcuTIUvvsxx9/FADE5MmTRWhoqDh48KBYv3698PX1lWJu374tLC0tZe+hHTt2iNGjR4u4uDghRPnPOdbW1qJnz57il19+EaGhoSI+Pl4cPnxYaGlpiW7duokdO3aIkJAQ4eXlVWJ/F0pPTxcnT54UFhYWwtnZWerHp0+fVkpeJSn8HOjatavYtWuXCA4OFi1atBBGRkZi6tSp4q233hK//fab2L59uzA3Nxdt2rSRnW8aN25c7Ovj4uIiXFxcpOnAwEChrq4u5s2bJw4dOiRCQkLEqlWrZO/RwnPxv/Mt/KwaM2aMdG62trYWFhYWsvUXt6wQQhw5ckTl/VCW83hJquK89iL9X3hesrKyEqtXrxa///678PX1FQDExIkTZdvy8vIS3333nQgLCxNhYWHi888/F7q6umL+/PmyuMaNG4uGDRsKOzs78f3334vff/9dvPfeewKACA8PL7Vvy3r8jxs3Tujp6YkVK1aII0eOiN9++0188cUXYs2aNaX2v6zvyhxZiy
QnJwsA4v333y9TfFxcnAAgfHx8ZO2nTp0SAMQnn3witbm4uAgA4syZM1Lb/fv3hbq6utDV1ZUVgbGxsQKAWL16tdR
2024-03-24 10:42:44 +01:00
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
2024-03-24 11:44:22 +01:00
"plot_hist_scores(X_test_segment, score = \"score\", score_adjusted = \"score_adjusted\", type_of_activity = type_of_activity)"
2024-03-23 00:04:49 +01:00
]
},
{
2024-03-24 10:42:44 +01:00
"cell_type": "code",
2024-03-24 11:05:28 +01:00
"execution_count": 40,
2024-03-24 10:42:44 +01:00
"id": "add631d7-0757-45a5-bb5b-f7f4b4baa961",
2024-03-23 00:04:49 +01:00
"metadata": {},
2024-03-24 10:42:44 +01:00
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"projet-bdc2324-team1/Output_expected_CA/sport/\n"
]
}
],
2024-03-23 00:04:49 +01:00
"source": [
2024-03-24 10:42:44 +01:00
"# define path so save graphics\n",
"\n",
"# define type of activity \n",
"type_of_activity = \"sport\"\n",
"PATH = f\"projet-bdc2324-team1/Output_expected_CA/{type_of_activity}/\"\n",
"print(PATH)"
2024-03-23 00:04:49 +01:00
]
},
{
"cell_type": "code",
2024-03-24 11:44:22 +01:00
"execution_count": 68,
2024-03-24 10:42:44 +01:00
"id": "3a5b5bd9-e033-4436-8c56-bf5fb61df87f",
2024-03-23 00:04:49 +01:00
"metadata": {},
2024-03-24 11:44:22 +01:00
"outputs": [],
2024-03-23 17:23:59 +01:00
"source": [
2024-03-24 10:42:44 +01:00
"# export png \n",
"\n",
"# plot adjusted scores and save (to be tested)\n",
"plot_hist_scores(X_test_segment, score = \"score\", score_adjusted = \"score_adjusted\", type_of_activity = type_of_activity)\n",
"\n",
"image_buffer = io.BytesIO()\n",
"plt.savefig(image_buffer, format='png')\n",
"image_buffer.seek(0)\n",
2024-03-24 11:44:22 +01:00
"file_name = \"hist_score_adjusted_\"\n",
2024-03-24 10:42:44 +01:00
"FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + \".png\"\n",
"with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:\n",
" s3_file.write(image_buffer.read())\n",
"plt.close()"
2024-03-23 17:23:59 +01:00
]
},
2024-03-24 10:42:44 +01:00
{
"cell_type": "markdown",
"id": "e6fae260-fab8-4f51-90dc-9b6d7314c77b",
"metadata": {},
"source": [
"## Compute number of tickets and CA by segment with the recalibrated score"
]
},
{
"cell_type": "code",
2024-03-30 12:00:49 +01:00
"execution_count": 104,
2024-03-24 10:42:44 +01:00
"id": "90c4c2b5-0ede-4001-889f-749cfbd9df04",
2024-03-23 17:23:59 +01:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
2024-03-23 00:04:49 +01:00
"\n",
2024-03-23 17:23:59 +01:00
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
2024-03-23 00:04:49 +01:00
"\n",
2024-03-23 17:23:59 +01:00
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>quartile</th>\n",
2024-03-24 10:42:44 +01:00
" <th>score (%)</th>\n",
" <th>score adjusted (%)</th>\n",
" <th>has purchased (%)</th>\n",
2024-03-23 17:23:59 +01:00
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
2024-03-30 12:00:49 +01:00
" <td>8.80</td>\n",
" <td>0.94</td>\n",
" <td>1.02</td>\n",
2024-03-23 17:23:59 +01:00
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
2024-03-30 12:00:49 +01:00
" <td>36.16</td>\n",
" <td>5.17</td>\n",
" <td>4.70</td>\n",
2024-03-23 17:23:59 +01:00
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
2024-03-30 12:00:49 +01:00
" <td>61.06</td>\n",
" <td>13.33</td>\n",
" <td>14.62</td>\n",
2024-03-23 17:23:59 +01:00
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
2024-03-30 12:00:49 +01:00
" <td>89.86</td>\n",
" <td>53.74</td>\n",
" <td>53.19</td>\n",
2024-03-23 17:23:59 +01:00
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
2024-03-24 10:42:44 +01:00
" quartile score (%) score adjusted (%) has purchased (%)\n",
2024-03-30 12:00:49 +01:00
"0 1 8.80 0.94 1.02\n",
"1 2 36.16 5.17 4.70\n",
"2 3 61.06 13.33 14.62\n",
"3 4 89.86 53.74 53.19"
2024-03-23 00:04:49 +01:00
]
},
2024-03-30 12:00:49 +01:00
"execution_count": 104,
2024-03-23 00:04:49 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2024-03-24 10:42:44 +01:00
"X_test_table_adjusted_scores = (100 * X_test_segment.groupby(\"quartile\")[[\"score\",\"score_adjusted\", \"has_purchased\"]].mean()).round(2).reset_index()\n",
"X_test_table_adjusted_scores = X_test_table_adjusted_scores.rename(columns = {col : f\"{col.replace('_', ' ')} (%)\" for col in X_test_table_adjusted_scores.columns if col in [\"score\",\"score_adjusted\", \"has_purchased\"]})\n",
"X_test_table_adjusted_scores"
2024-03-23 00:04:49 +01:00
]
},
{
"cell_type": "code",
2024-03-24 10:42:44 +01:00
"execution_count": 162,
"id": "d0b8740c-cf48-4a3e-83cb-23d95059f62f",
2024-03-23 00:04:49 +01:00
"metadata": {},
"outputs": [
{
2024-03-23 17:23:59 +01:00
"data": {
"text/plain": [
2024-03-24 10:42:44 +01:00
"'\\\\begin{tabular}{lrrr}\\n\\\\toprule\\nquartile & score (%) & score adjusted (%) & has purchased (%) \\\\\\\\\\n\\\\midrule\\n1 & 13.250000 & 2.510000 & 1.570000 \\\\\\\\\\n2 & 33.890000 & 8.000000 & 9.850000 \\\\\\\\\\n3 & 63.060000 & 22.580000 & 21.470000 \\\\\\\\\\n4 & 90.520000 & 66.200000 & 65.010000 \\\\\\\\\\n\\\\bottomrule\\n\\\\end{tabular}\\n'"
2024-03-23 17:23:59 +01:00
]
},
2024-03-24 10:42:44 +01:00
"execution_count": 162,
2024-03-23 17:23:59 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2024-03-24 10:42:44 +01:00
"X_test_table_adjusted_scores.to_latex(index=False)"
]
},
{
"cell_type": "code",
2024-03-24 11:05:28 +01:00
"execution_count": 43,
2024-03-24 10:42:44 +01:00
"id": "d6a04d3e-c454-43e4-ae4c-0746e928575b",
"metadata": {},
2024-03-24 11:05:28 +01:00
"outputs": [],
2024-03-24 10:42:44 +01:00
"source": [
"# comparison between score and adjusted score - export csv associated\n",
"\n",
2024-03-26 12:20:03 +01:00
"file_name = \"table_adjusted_score_\"\n",
2024-03-24 10:42:44 +01:00
"FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + \".csv\"\n",
"with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n",
" X_test_table_adjusted_scores.to_csv(file_out, index = False)"
2024-03-23 17:23:59 +01:00
]
},
{
"cell_type": "code",
2024-03-30 12:00:49 +01:00
"execution_count": 40,
2024-03-23 17:23:59 +01:00
"id": "a974589f-7952-4db2-bebf-7b69c6b09372",
"metadata": {},
"outputs": [],
"source": [
2024-03-26 12:20:03 +01:00
"def project_tickets_CA (df, nb_purchases, nb_tickets, total_amount, score_adjusted, duration_ref, duration_projection) :\n",
2024-03-23 17:23:59 +01:00
" \n",
" duration_ratio = duration_ref/duration_projection\n",
"\n",
" df_output = df\n",
"\n",
" df_output.loc[:,\"nb_tickets_projected\"] = df_output.loc[:,nb_tickets] / duration_ratio\n",
" df_output.loc[:,\"total_amount_projected\"] = df_output.loc[:,total_amount] / duration_ratio\n",
" \n",
" df_output.loc[:,\"nb_tickets_expected\"] = df_output.loc[:,score_adjusted] * df_output.loc[:,\"nb_tickets_projected\"]\n",
" df_output.loc[:,\"total_amount_expected\"] = df_output.loc[:,score_adjusted] * df_output.loc[:,\"total_amount_projected\"]\n",
"\n",
2024-03-26 12:20:03 +01:00
" df_output.loc[:,\"pace_purchase\"] = (duration_ref/df_output.loc[:,nb_purchases]).apply(lambda x : np.nan if x==np.inf else x)\n",
" \n",
2024-03-23 17:23:59 +01:00
" return df_output\n"
]
},
{
"cell_type": "code",
2024-03-30 12:00:49 +01:00
"execution_count": 41,
2024-03-23 17:23:59 +01:00
"id": "dd8a52e1-d06e-4790-8687-8e58e3e6b84e",
"metadata": {},
"outputs": [
2024-03-23 00:04:49 +01:00
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
2024-03-30 12:00:49 +01:00
" <th>customer_id</th>\n",
" <th>street_id</th>\n",
" <th>structure_id</th>\n",
" <th>mcp_contact_id</th>\n",
2024-03-23 00:04:49 +01:00
" <th>fidelity</th>\n",
2024-03-30 12:00:49 +01:00
" <th>tenant_id</th>\n",
" <th>is_partner</th>\n",
" <th>deleted_at</th>\n",
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
2024-03-23 00:04:49 +01:00
" <th>...</th>\n",
" <th>has_purchased</th>\n",
" <th>has_purchased_estim</th>\n",
" <th>score</th>\n",
" <th>quartile</th>\n",
" <th>score_adjusted</th>\n",
" <th>nb_tickets_projected</th>\n",
" <th>total_amount_projected</th>\n",
" <th>nb_tickets_expected</th>\n",
" <th>total_amount_expected</th>\n",
2024-03-26 12:20:03 +01:00
" <th>pace_purchase</th>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
2024-03-30 12:00:49 +01:00
" <td>1_8191</td>\n",
" <td>8114</td>\n",
" <td>NaN</td>\n",
" <td>834.0</td>\n",
" <td>0</td>\n",
" <td>1311</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
2024-03-28 11:37:23 +01:00
" <td>1</td>\n",
2024-03-23 00:04:49 +01:00
" <td>...</td>\n",
" <td>0.0</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.408546</td>\n",
" <td>2</td>\n",
" <td>0.027066</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
2024-03-30 12:00:49 +01:00
" <td>1_14792</td>\n",
2024-03-28 11:37:23 +01:00
" <td>2</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
" <td>251178.0</td>\n",
" <td>0</td>\n",
" <td>1311</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>1</td>\n",
2024-03-23 00:04:49 +01:00
" <td>...</td>\n",
" <td>0.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.0</td>\n",
" <td>0.027046</td>\n",
" <td>1</td>\n",
" <td>0.001118</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
2024-03-30 12:00:49 +01:00
" <td>1_30466</td>\n",
2024-03-28 11:37:23 +01:00
" <td>2</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
" <td>2355.0</td>\n",
" <td>0</td>\n",
" <td>1311</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>1</td>\n",
2024-03-23 00:04:49 +01:00
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.180851</td>\n",
2024-03-28 11:37:23 +01:00
" <td>1</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.008813</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
2024-03-30 12:00:49 +01:00
" <td>1_41898</td>\n",
" <td>20244</td>\n",
" <td>203714.0</td>\n",
" <td>97973.0</td>\n",
" <td>0</td>\n",
" <td>1311</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
2024-03-28 11:37:23 +01:00
" <td>1</td>\n",
2024-03-23 00:04:49 +01:00
" <td>...</td>\n",
" <td>0.0</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.220872</td>\n",
2024-03-27 18:58:30 +01:00
" <td>1</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.011288</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
2024-03-30 12:00:49 +01:00
" <td>1_58746</td>\n",
" <td>2</td>\n",
" <td>NaN</td>\n",
" <td>82026.0</td>\n",
" <td>1</td>\n",
" <td>1311</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.100951</td>\n",
" <td>1</td>\n",
" <td>0.004502</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
2024-03-30 12:00:49 +01:00
" <th>186115</th>\n",
" <td>4_24295</td>\n",
" <td>103884</td>\n",
" <td>NaN</td>\n",
" <td>96913.0</td>\n",
" <td>0</td>\n",
" <td>1342</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>1</td>\n",
2024-03-27 18:58:30 +01:00
" <td>...</td>\n",
" <td>0.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.0</td>\n",
" <td>0.466644</td>\n",
2024-03-23 00:04:49 +01:00
" <td>2</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.034037</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.000000</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
2024-03-30 12:00:49 +01:00
" <th>186116</th>\n",
" <td>4_44443</td>\n",
" <td>43315</td>\n",
" <td>NaN</td>\n",
" <td>234734.0</td>\n",
" <td>0</td>\n",
" <td>1342</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
2024-03-23 00:04:49 +01:00
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.427641</td>\n",
" <td>2</td>\n",
" <td>0.029211</td>\n",
2024-03-28 11:37:23 +01:00
" <td>0.000000</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
2024-03-26 12:20:03 +01:00
" <td>NaN</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
2024-03-30 12:00:49 +01:00
" <th>186117</th>\n",
" <td>4_3343947</td>\n",
" <td>2</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
2024-03-28 11:37:23 +01:00
" <td>1</td>\n",
2024-03-30 12:00:49 +01:00
" <td>1342</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>...</td>\n",
" <td>0.0</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.468464</td>\n",
2024-03-28 11:37:23 +01:00
" <td>2</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.034278</td>\n",
2024-03-28 11:37:23 +01:00
" <td>0.705882</td>\n",
2024-03-30 12:00:49 +01:00
" <td>20.470588</td>\n",
" <td>0.024196</td>\n",
" <td>0.701686</td>\n",
2024-03-28 11:37:23 +01:00
" <td>17.0</td>\n",
" </tr>\n",
" <tr>\n",
2024-03-30 12:00:49 +01:00
" <th>186118</th>\n",
" <td>4_47752</td>\n",
" <td>46460</td>\n",
" <td>NaN</td>\n",
" <td>89791.0</td>\n",
" <td>0</td>\n",
" <td>1342</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.360100</td>\n",
2024-03-28 11:37:23 +01:00
" <td>2</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.022161</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>186119</th>\n",
" <td>4_35449</td>\n",
" <td>34592</td>\n",
" <td>NaN</td>\n",
" <td>119197.0</td>\n",
" <td>0</td>\n",
" <td>1342</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>1</td>\n",
2024-03-23 00:04:49 +01:00
" <td>...</td>\n",
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>1.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.728907</td>\n",
" <td>3</td>\n",
" <td>0.097705</td>\n",
2024-03-28 11:37:23 +01:00
" <td>0.000000</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
2024-03-26 12:20:03 +01:00
" <td>NaN</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" </tbody>\n",
"</table>\n",
2024-03-30 12:00:49 +01:00
"<p>186120 rows × 97 columns</p>\n",
2024-03-23 00:04:49 +01:00
"</div>"
],
"text/plain": [
2024-03-30 12:00:49 +01:00
" customer_id street_id structure_id mcp_contact_id fidelity \\\n",
"0 1_8191 8114 NaN 834.0 0 \n",
"1 1_14792 2 NaN 251178.0 0 \n",
"2 1_30466 2 NaN 2355.0 0 \n",
"3 1_41898 20244 203714.0 97973.0 0 \n",
"4 1_58746 2 NaN 82026.0 1 \n",
"... ... ... ... ... ... \n",
"186115 4_24295 103884 NaN 96913.0 0 \n",
"186116 4_44443 43315 NaN 234734.0 0 \n",
"186117 4_3343947 2 NaN NaN 1 \n",
"186118 4_47752 46460 NaN 89791.0 0 \n",
"186119 4_35449 34592 NaN 119197.0 0 \n",
"\n",
" tenant_id is_partner deleted_at is_email_true opt_in ... \\\n",
"0 1311 False NaN True 1 ... \n",
"1 1311 False NaN True 1 ... \n",
"2 1311 False NaN True 1 ... \n",
"3 1311 False NaN True 1 ... \n",
"4 1311 False NaN True 1 ... \n",
"... ... ... ... ... ... ... \n",
"186115 1342 False NaN True 1 ... \n",
"186116 1342 False NaN True 0 ... \n",
"186117 1342 False NaN True 0 ... \n",
"186118 1342 False NaN True 1 ... \n",
"186119 1342 False NaN True 1 ... \n",
"\n",
" has_purchased has_purchased_estim score quartile score_adjusted \\\n",
"0 0.0 0.0 0.408546 2 0.027066 \n",
"1 0.0 0.0 0.027046 1 0.001118 \n",
"2 0.0 0.0 0.180851 1 0.008813 \n",
"3 0.0 0.0 0.220872 1 0.011288 \n",
"4 0.0 0.0 0.100951 1 0.004502 \n",
"... ... ... ... ... ... \n",
"186115 0.0 0.0 0.466644 2 0.034037 \n",
"186116 0.0 0.0 0.427641 2 0.029211 \n",
"186117 0.0 0.0 0.468464 2 0.034278 \n",
"186118 0.0 0.0 0.360100 2 0.022161 \n",
"186119 0.0 1.0 0.728907 3 0.097705 \n",
"\n",
" nb_tickets_projected total_amount_projected nb_tickets_expected \\\n",
"0 0.000000 0.000000 0.000000 \n",
"1 0.000000 0.000000 0.000000 \n",
"2 0.000000 0.000000 0.000000 \n",
"3 0.000000 0.000000 0.000000 \n",
"4 0.000000 0.000000 0.000000 \n",
"... ... ... ... \n",
"186115 0.000000 0.000000 0.000000 \n",
"186116 0.000000 0.000000 0.000000 \n",
"186117 0.705882 20.470588 0.024196 \n",
"186118 0.000000 0.000000 0.000000 \n",
"186119 0.000000 0.000000 0.000000 \n",
"\n",
" total_amount_expected pace_purchase \n",
"0 0.000000 NaN \n",
"1 0.000000 NaN \n",
"2 0.000000 NaN \n",
"3 0.000000 NaN \n",
"4 0.000000 NaN \n",
"... ... ... \n",
"186115 0.000000 NaN \n",
"186116 0.000000 NaN \n",
"186117 0.701686 17.0 \n",
"186118 0.000000 NaN \n",
"186119 0.000000 NaN \n",
"\n",
"[186120 rows x 97 columns]"
2024-03-23 00:04:49 +01:00
]
},
2024-03-30 12:00:49 +01:00
"execution_count": 41,
2024-03-23 00:04:49 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2024-03-26 12:20:03 +01:00
"X_test_segment = project_tickets_CA (X_test_segment, \"nb_purchases\", \"nb_tickets\", \"total_amount\", \"score_adjusted\", \n",
" duration_ref=17, duration_projection=12)\n",
2024-03-23 00:04:49 +01:00
"X_test_segment"
]
},
{
"cell_type": "code",
2024-03-30 12:00:49 +01:00
"execution_count": 42,
2024-03-26 12:20:03 +01:00
"id": "cb66a8ea-65f7-460f-b3fc-ba76a3b91faa",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"quartile\n",
2024-03-30 12:00:49 +01:00
"1 16.722853\n",
"2 16.568788\n",
"3 15.765899\n",
"4 13.263500\n",
2024-03-26 12:20:03 +01:00
"Name: pace_purchase, dtype: float64"
]
},
2024-03-30 12:00:49 +01:00
"execution_count": 42,
2024-03-26 12:20:03 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test_segment.groupby(\"quartile\")[\"pace_purchase\"].mean()"
]
},
{
"cell_type": "code",
2024-03-30 12:00:49 +01:00
"execution_count": 43,
2024-03-23 17:23:59 +01:00
"id": "f58f9151-2f91-45df-abb7-1ddcf0652adc",
"metadata": {},
"outputs": [],
"source": [
"# generalization with a function\n",
"\n",
2024-03-26 12:20:03 +01:00
"def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, total_amount, pace_purchase,\n",
2024-03-27 19:59:05 +01:00
" duration_ref=17, duration_projection=12) :\n",
2024-03-23 17:23:59 +01:00
" \n",
" # compute nb tickets estimated and total amount expected\n",
" df_expected_CA = df.groupby(segment)[[nb_tickets_expected, total_amount_expected]].sum().reset_index()\n",
" \n",
" # number of customers by segment\n",
" df_expected_CA.insert(1, \"size\", df.groupby(segment).size().values)\n",
" \n",
" # size in percent of all customers\n",
" df_expected_CA.insert(2, \"size_perct\", 100 * df_expected_CA[\"size\"]/df_expected_CA[\"size\"].sum())\n",
" \n",
" # compute share of CA recovered\n",
" duration_ratio=duration_ref/duration_projection\n",
" \n",
2024-03-24 10:42:44 +01:00
" df_expected_CA[\"revenue_recovered_perct\"] = 100 * duration_ratio * df_expected_CA[total_amount_expected] / \\\n",
2024-03-23 17:23:59 +01:00
" df.groupby(segment)[total_amount].sum().values\n",
2024-03-26 12:20:03 +01:00
"\n",
" df_drop_null_pace = df.dropna(subset=[pace_purchase])\n",
" df_expected_CA[\"pace_purchase\"] = df_drop_null_pace.groupby(segment)[pace_purchase].mean().values\n",
2024-03-23 17:23:59 +01:00
" \n",
" return df_expected_CA"
]
},
{
"cell_type": "code",
2024-03-30 12:00:49 +01:00
"execution_count": 44,
2024-03-23 17:23:59 +01:00
"id": "c8df6c80-43e8-4f00-9cd3-eb9022744313",
2024-03-23 00:04:49 +01:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>quartile</th>\n",
2024-03-23 10:18:43 +01:00
" <th>size</th>\n",
" <th>size_perct</th>\n",
2024-03-23 00:04:49 +01:00
" <th>nb_tickets_expected</th>\n",
" <th>total_amount_expected</th>\n",
2024-03-24 10:42:44 +01:00
" <th>revenue_recovered_perct</th>\n",
2024-03-26 12:20:03 +01:00
" <th>pace_purchase</th>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
2024-03-30 12:00:49 +01:00
" <td>81622</td>\n",
" <td>43.85</td>\n",
" <td>263.12</td>\n",
" <td>3258.54</td>\n",
" <td>0.88</td>\n",
" <td>16.72</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
2024-03-30 12:00:49 +01:00
" <td>60811</td>\n",
" <td>32.67</td>\n",
" <td>1984.56</td>\n",
" <td>27052.82</td>\n",
" <td>2.47</td>\n",
" <td>16.57</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
2024-03-30 12:00:49 +01:00
" <td>28913</td>\n",
" <td>15.53</td>\n",
" <td>3476.63</td>\n",
" <td>43945.79</td>\n",
" <td>6.34</td>\n",
" <td>15.77</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
2024-03-30 12:00:49 +01:00
" <td>14774</td>\n",
" <td>7.94</td>\n",
" <td>58598.68</td>\n",
" <td>523568.93</td>\n",
" <td>60.03</td>\n",
" <td>13.26</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" quartile size size_perct nb_tickets_expected total_amount_expected \\\n",
2024-03-30 12:00:49 +01:00
"0 1 81622 43.85 263.12 3258.54 \n",
"1 2 60811 32.67 1984.56 27052.82 \n",
"2 3 28913 15.53 3476.63 43945.79 \n",
"3 4 14774 7.94 58598.68 523568.93 \n",
2024-03-26 12:20:03 +01:00
"\n",
" revenue_recovered_perct pace_purchase \n",
2024-03-30 12:00:49 +01:00
"0 0.88 16.72 \n",
"1 2.47 16.57 \n",
"2 6.34 15.77 \n",
"3 60.03 13.26 "
2024-03-23 00:04:49 +01:00
]
},
2024-03-30 12:00:49 +01:00
"execution_count": 44,
2024-03-23 00:04:49 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# per-quartile summary of the expected tickets / revenue, rounded to two decimals\n",
"X_test_expected_CA = summary_expected_CA(\n",
"    df=X_test_segment,\n",
"    segment=\"quartile\",\n",
"    nb_tickets_expected=\"nb_tickets_expected\",\n",
"    total_amount_expected=\"total_amount_expected\",\n",
"    total_amount=\"total_amount\",\n",
"    pace_purchase=\"pace_purchase\",\n",
").round(2)\n",
"\n",
"X_test_expected_CA"
]
},
2024-03-23 10:18:43 +01:00
{
"cell_type": "code",
2024-03-27 18:58:30 +01:00
"execution_count": 64,
2024-03-23 17:23:59 +01:00
"id": "ac706ed7-defa-4df1-82e1-06f12fc1b6ad",
2024-03-23 10:18:43 +01:00
"metadata": {},
2024-03-23 17:23:59 +01:00
"outputs": [
{
"data": {
"text/plain": [
2024-03-27 18:58:30 +01:00
"'\\\\begin{tabular}{lrrrrrr}\\n\\\\toprule\\nquartile & size & size (%) & nb tickets expected & total amount expected & revenue recovered (%) & pace purchase \\\\\\\\\\n\\\\midrule\\n1 & 53626 & 35.310000 & 398.260000 & 13949.330000 & 2.350000 & 16.480000 \\\\\\\\\\n2 & 55974 & 36.860000 & 3113.770000 & 101639.450000 & 6.240000 & 16.470000 \\\\\\\\\\n3 & 30435 & 20.040000 & 6214.350000 & 208267.220000 & 14.270000 & 15.710000 \\\\\\\\\\n4 & 11839 & 7.800000 & 72929.460000 & 1835702.430000 & 75.380000 & 11.480000 \\\\\\\\\\n\\\\bottomrule\\n\\\\end{tabular}\\n'"
2024-03-23 17:23:59 +01:00
]
},
2024-03-27 18:58:30 +01:00
"execution_count": 64,
2024-03-23 17:23:59 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
2024-03-23 10:18:43 +01:00
"source": [
"# Build the column-name mapping for the LaTeX header: \"perct\" becomes \"(%)\" and underscores become spaces\n",
"mapping_dict = {col: col.replace(\"perct\", \"(%)\").replace(\"_\", \" \") for col in X_test_expected_CA.columns}\n",
"\n",
"# float_format keeps two decimals in the generated table (35.31 rather than 35.310000)\n",
"X_test_expected_CA.rename(columns=mapping_dict).to_latex(index=False, float_format=\"%.2f\")"
]
},
{
"cell_type": "code",
2024-03-26 12:20:03 +01:00
"execution_count": 122,
2024-03-24 10:42:44 +01:00
"id": "771da0cf-c49f-4e7e-b52f-ebcfb0fb2df3",
"metadata": {},
"outputs": [],
"source": [
"# export summary table to the MinIO storage\n",
"\n",
2024-03-26 12:20:03 +01:00
"file_name = \"table_expected_CA_\"\n",
2024-03-24 10:42:44 +01:00
"FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + \".csv\"\n",
"with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n",
" X_test_expected_CA.to_csv(file_out, index = False)"
2024-03-23 10:18:43 +01:00
]
},
2024-03-27 18:58:30 +01:00
{
"cell_type": "code",
"execution_count": 53,
"id": "c805dc10-4d07-4f7d-a677-5461a92845d7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'projet-bdc2324-team1/Output_expected_CA/musique/table_expected_CA_musique.csv'"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"PATH = f\"projet-bdc2324-team1/Output_expected_CA/{type_of_activity}/\"\n",
"file_name = \"table_expected_CA_\"\n",
"FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + \".csv\"\n",
"FILE_PATH_OUT_S3"
]
},
{
"cell_type": "markdown",
"id": "e35ccfff-1845-41f0-9bde-f09b09b67877",
"metadata": {},
"source": [
"## Test : vizu tables saved"
]
},
{
"cell_type": "code",
"execution_count": 66,
"id": "4e9e88e4-ea10-41f4-9bf1-20b55269a20d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>quartile</th>\n",
" <th>score (%)</th>\n",
" <th>score adjusted (%)</th>\n",
" <th>has purchased (%)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>13.25</td>\n",
" <td>2.51</td>\n",
" <td>1.57</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>33.89</td>\n",
" <td>8.00</td>\n",
" <td>9.85</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>63.06</td>\n",
" <td>22.58</td>\n",
" <td>21.47</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>90.52</td>\n",
" <td>66.20</td>\n",
" <td>65.01</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" quartile score (%) score adjusted (%) has purchased (%)\n",
"0 1 13.25 2.51 1.57\n",
"1 2 33.89 8.00 9.85\n",
"2 3 63.06 22.58 21.47\n",
"3 4 90.52 66.20 65.01"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"path = 'projet-bdc2324-team1/Output_expected_CA/sport/table_adjusted_scoresport.csv'\n",
"\n",
"with fs.open( path, mode=\"rb\") as file_in:\n",
" df = pd.read_csv(file_in, sep=\",\")\n",
"df"
]
},
2024-03-23 00:04:49 +01:00
{
"cell_type": "markdown",
"id": "9c471bdd-25c2-420a-a8a1-3add9f003cbc",
"metadata": {},
"source": [
"## Just to try, same computation with score instead of score adjusted\n",
"\n",
"seems overestimated : if only 14% of customers come back, how can we recover 22% of the revenue from the segment that is least likely to buy ?? ..."
]
},
{
"cell_type": "code",
2024-03-23 17:23:59 +01:00
"execution_count": 80,
2024-03-23 00:04:49 +01:00
"id": "53684a24-1809-465f-8e21-b9295e34582a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>quartile</th>\n",
" <th>size</th>\n",
" <th>size_perct</th>\n",
" <th>nb_tickets_expected</th>\n",
" <th>total_amount_expected</th>\n",
" <th>perct_revenue_recovered</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>37410</td>\n",
2024-03-23 17:23:59 +01:00
" <td>38.93</td>\n",
" <td>419.76</td>\n",
" <td>9245.08</td>\n",
" <td>21.71</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>29517</td>\n",
2024-03-23 17:23:59 +01:00
" <td>30.72</td>\n",
" <td>11549.06</td>\n",
" <td>296522.02</td>\n",
" <td>39.24</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>20137</td>\n",
2024-03-23 17:23:59 +01:00
" <td>20.96</td>\n",
" <td>29997.85</td>\n",
" <td>954751.91</td>\n",
" <td>63.34</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>9032</td>\n",
2024-03-23 17:23:59 +01:00
" <td>9.40</td>\n",
" <td>244655.82</td>\n",
" <td>10736011.95</td>\n",
" <td>97.72</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" quartile size size_perct nb_tickets_expected total_amount_expected \\\n",
2024-03-23 17:23:59 +01:00
"0 1 37410 38.93 419.76 9245.08 \n",
"1 2 29517 30.72 11549.06 296522.02 \n",
"2 3 20137 20.96 29997.85 954751.91 \n",
"3 4 9032 9.40 244655.82 10736011.95 \n",
2024-03-23 00:04:49 +01:00
"\n",
" perct_revenue_recovered \n",
2024-03-23 17:23:59 +01:00
"0 21.71 \n",
"1 39.24 \n",
"2 63.34 \n",
"3 97.72 "
2024-03-23 00:04:49 +01:00
]
},
2024-03-23 17:23:59 +01:00
"execution_count": 80,
2024-03-23 00:04:49 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test_segment_bis = project_tickets_CA (X_test_segment, \"nb_tickets\", \"total_amount\", \"score\", duration_ref=1.5, duration_projection=1)\n",
"\n",
2024-03-23 17:23:59 +01:00
"X_test_expected_CA_bis = round(summary_expected_CA(df=X_test_segment_bis, segment=\"quartile\", nb_tickets_expected=\"nb_tickets_expected\", \n",
" total_amount_expected=\"total_amount_expected\", total_amount=\"total_amount\"),2)\n",
2024-03-23 00:04:49 +01:00
"\n",
"X_test_expected_CA_bis"
]
},
{
"cell_type": "code",
2024-03-23 17:23:59 +01:00
"execution_count": 81,
2024-03-23 00:04:49 +01:00
"id": "7dc66d1e-da03-4513-96e4-d9a43ac0a2c8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"overall share of revenue recovered : 90.26 %\n"
]
}
],
"source": [
"print(\"overall share of revenue recovered : \", round(100 * duration_ratio * X_test_expected_CA_bis[\"total_amount_expected\"].sum() / \\\n",
"X_test_segment_bis[\"total_amount\"].sum(),2), \"%\")"
]
},
{
"cell_type": "markdown",
"id": "673f2969-7b9a-44c1-abf5-5679fca877ce",
"metadata": {},
"source": [
"## Last pieces of analysis"
]
},
{
"cell_type": "code",
"execution_count": 161,
"id": "2365bb13-0f3f-49d5-bf91-52c92abebcee",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"overall share of revenue recovered : 77.64%\n"
]
}
],
"source": [
"# global revenue recovered\n",
"global_revenue_recovered = round(100 * duration_ratio * X_test_expected_CA[\"total_amount_expected\"].sum() / \\\n",
"X_test_segment[\"total_amount\"].sum(),2)\n",
"print(f\"overall share of revenue recovered : {global_revenue_recovered}%\")"
]
},
{
"cell_type": "code",
"execution_count": 163,
"id": "16b17f35-57dd-459a-8989-129143dc0952",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 0.018093\n",
"1 0.721519\n",
"2 3.336101\n",
"3 95.924287\n",
"Name: total_amount_expected, dtype: float64"
]
},
"execution_count": 163,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"100 * X_test_expected_CA[\"total_amount_expected\"]/X_test_expected_CA[\"total_amount_expected\"].sum()"
]
},
{
"cell_type": "code",
"execution_count": 166,
"id": "dee4a200-eefe-4377-8e80-59ad33edd3c0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"quartile\n",
"1 0.320407\n",
"2 5.685020\n",
"3 11.339715\n",
"4 82.654858\n",
"Name: total_amount, dtype: float64"
]
},
"execution_count": 166,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# le segment 4 représente 83% du CA actuel et 96% du CA lié aux anciens clients pour l'année prochaine\n",
"100 * X_test_segment.groupby(\"quartile\")[\"total_amount\"].sum()/X_test_segment[\"total_amount\"].sum()"
]
},
{
"cell_type": "code",
"execution_count": 177,
"id": "c1e6f020-ef18-40b4-bfc1-19f98cb2796e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 96096.000000\n",
"mean 207.475735\n",
"std 4720.046248\n",
"min -48831.800000\n",
"25% 0.000000\n",
"50% 0.000000\n",
"75% 60.000000\n",
"max 624890.000000\n",
"Name: total_amount, dtype: float64"
]
},
"execution_count": 177,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test_segment[\"total_amount\"].describe() # total amount négatif ???\n"
]
},
{
"cell_type": "code",
"execution_count": 184,
"id": "d301a50e-7c68-40f0-9245-a4eea64c387b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 -4.883180e+04\n",
"1 -6.483180e+04\n",
"2 -7.683860e+04\n",
"3 -8.683860e+04\n",
"4 -9.683860e+04\n",
" ... \n",
"96091 1.802247e+07\n",
"96092 1.839238e+07\n",
"96093 1.877219e+07\n",
"96094 1.931270e+07\n",
"96095 1.993759e+07\n",
"Name: total_amount, Length: 96096, dtype: float64"
]
},
"execution_count": 184,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.cumsum(X_test_segment[\"total_amount\"].sort_values()).reset_index()[\"total_amount\"]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}