2024-03-23 00:04:49 +01:00
{
"cells": [
{
"cell_type": "markdown",
"id": "84b6e27e-4bda-4d38-8689-ec7fc0da1848",
"metadata": {},
"source": [
"# Define customer segments and predict the associated sales"
]
},
{
"cell_type": "markdown",
"id": "ec059482-45d3-4ae6-99bc-9b4ced115db3",
"metadata": {},
"source": [
"## Import packages"
]
},
{
"cell_type": "code",
2024-03-23 10:18:43 +01:00
"execution_count": 48,
2024-03-23 00:04:49 +01:00
"id": "9771bf29-d08e-4674-8c23-9a2672fbef8f",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
2024-03-23 10:18:43 +01:00
"from pandas import DataFrame\n",
2024-03-23 00:04:49 +01:00
"import numpy as np\n",
"import os\n",
"import s3fs\n",
"import re\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score\n",
"from sklearn.utils import class_weight\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler\n",
"from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score\n",
"from sklearn.exceptions import ConvergenceWarning, DataConversionWarning\n",
"from sklearn.naive_bayes import GaussianNB\n",
"from scipy.optimize import fsolve\n",
"\n",
"import pickle\n",
"import warnings"
]
},
{
"cell_type": "markdown",
"id": "048fcd7c-800a-4a6b-b725-faf8410f924a",
"metadata": {},
"source": [
"## Load the datasets"
]
},
{
"cell_type": "code",
2024-03-23 10:18:43 +01:00
"execution_count": 3,
2024-03-23 00:04:49 +01:00
"id": "539ccbdf-f29f-4f04-99c1-8c88d0efe514",
"metadata": {},
"outputs": [],
"source": [
"# Build the S3 filesystem client from the endpoint exposed in the environment\n",
"S3_ENDPOINT_URL = f\"https://{os.environ['AWS_S3_ENDPOINT']}\"\n",
"fs = s3fs.S3FileSystem(client_kwargs=dict(endpoint_url=S3_ENDPOINT_URL))"
]
},
{
"cell_type": "code",
2024-03-23 10:18:43 +01:00
"execution_count": 4,
2024-03-23 00:04:49 +01:00
"id": "0c3a6ddc-9345-4a42-b6bf-a20a95de3028",
"metadata": {},
"outputs": [],
"source": [
"def load_train_test():\n",
"    \"\"\"Load the train and test sets of the sport sector from S3.\n",
"\n",
"    Reads `Train_set.csv` and `Test_set.csv` located under\n",
"    `projet-bdc2324-team1/Generalization/sport`, through the module-level\n",
"    `fs` filesystem object.\n",
"\n",
"    Returns\n",
"    -------\n",
"    tuple of (pd.DataFrame, pd.DataFrame)\n",
"        `(dataset_train, dataset_test)`.\n",
"    \"\"\"\n",
"    BUCKET = \"projet-bdc2324-team1/Generalization/sport\"\n",
"\n",
"    def _read_csv(file_name):\n",
"        # low_memory=False makes pandas parse the file in a single pass, so the\n",
"        # dtype of each column is inferred from the whole column; this removes\n",
"        # the mixed-types DtypeWarning raised by the default chunked parsing.\n",
"        with fs.open(f\"{BUCKET}/{file_name}\", mode=\"rb\") as file_in:\n",
"            return pd.read_csv(file_in, sep=\",\", low_memory=False)\n",
"\n",
"    dataset_train = _read_csv(\"Train_set.csv\")\n",
"    dataset_test = _read_csv(\"Test_set.csv\")\n",
"    return dataset_train, dataset_test"
]
},
{
"cell_type": "code",
2024-03-23 10:18:43 +01:00
"execution_count": 5,
2024-03-23 00:04:49 +01:00
"id": "2831d546-b365-498b-8248-c618bd9c3057",
"metadata": {},
2024-03-23 10:18:43 +01:00
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_519/2459610029.py:7: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" dataset_train = pd.read_csv(file_in, sep=\",\")\n"
]
},
{
"data": {
"text/plain": [
"customer_id 0\n",
"nb_tickets 0\n",
"nb_purchases 0\n",
"total_amount 0\n",
"nb_suppliers 0\n",
"vente_internet_max 0\n",
"purchase_date_min 0\n",
"purchase_date_max 0\n",
"time_between_purchase 0\n",
"nb_tickets_internet 0\n",
"street_id 0\n",
"structure_id 222825\n",
"mcp_contact_id 70874\n",
"fidelity 0\n",
"tenant_id 0\n",
"is_partner 0\n",
"deleted_at 224213\n",
"gender 0\n",
"is_email_true 0\n",
"opt_in 0\n",
"last_buying_date 66139\n",
"max_price 66139\n",
"ticket_sum 0\n",
"average_price 66023\n",
"average_purchase_delay 66139\n",
"average_price_basket 66139\n",
"average_ticket_basket 66139\n",
"total_price 116\n",
"purchase_count 0\n",
"first_buying_date 66139\n",
"country 23159\n",
"gender_label 0\n",
"gender_female 0\n",
"gender_male 0\n",
"gender_other 0\n",
"country_fr 23159\n",
"nb_campaigns 0\n",
"nb_campaigns_opened 0\n",
"time_to_open 123159\n",
"y_has_purchased 0\n",
"dtype: int64"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
2024-03-23 00:04:49 +01:00
"source": [
"# Load both splits, then count the missing values in each training column\n",
"dataset_train, dataset_test = load_train_test()\n",
"dataset_train.isna().sum()"
]
},
{
"cell_type": "code",
2024-03-23 10:18:43 +01:00
"execution_count": 6,
2024-03-23 00:04:49 +01:00
"id": "b8827f7b-b304-4f51-9814-c7a98ed88cf0",
"metadata": {},
"outputs": [],
"source": [
"def features_target_split(dataset_train, dataset_test):\n",
"    \"\"\"Split the train and test sets into feature frames and target frames.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    dataset_train, dataset_test : pd.DataFrame\n",
"        Frames holding every column of the feature list below plus\n",
"        `y_has_purchased`.\n",
"\n",
"    Returns\n",
"    -------\n",
"    X_train, X_test, y_train, y_test : pd.DataFrame\n",
"        Features restricted to the selected columns, and the target kept\n",
"        as a one-column frame.\n",
"    \"\"\"\n",
"    # Purchase history, then customer profile ('is_partner' is left out),\n",
"    # then e-mail campaign activity.\n",
"    # A reduced list without 'fidelity', 'time_between_purchase' and\n",
"    # 'gender_other' (collinearity issue) was also considered; it is not used here.\n",
"    purchase_columns = [\n",
"        'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',\n",
"        'vente_internet_max', 'purchase_date_min', 'purchase_date_max',\n",
"        'time_between_purchase', 'nb_tickets_internet',\n",
"    ]\n",
"    profile_columns = [\n",
"        'fidelity', 'is_email_true', 'opt_in',\n",
"        'gender_female', 'gender_male', 'gender_other',\n",
"    ]\n",
"    campaign_columns = ['nb_campaigns', 'nb_campaigns_opened']\n",
"\n",
"    feature_columns = purchase_columns + profile_columns + campaign_columns\n",
"    target_columns = ['y_has_purchased']\n",
"\n",
"    def _features_and_target(dataset):\n",
"        return dataset[feature_columns], dataset[target_columns]\n",
"\n",
"    X_train, y_train = _features_and_target(dataset_train)\n",
"    X_test, y_test = _features_and_target(dataset_test)\n",
"    return X_train, X_test, y_train, y_test"
]
},
{
"cell_type": "code",
2024-03-23 10:18:43 +01:00
"execution_count": 7,
2024-03-23 00:04:49 +01:00
"id": "c18195fc-ed40-4e39-a59e-c9ecc5a8e6c3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Shape train : (224213, 17)\n",
"Shape test : (96096, 17)\n"
]
}
],
"source": [
"# Separate the features from the target for both splits\n",
"X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)\n",
"print(\"Shape train : \", X_train.shape)\n",
"print(\"Shape test : \", X_test.shape)"
]
},
{
"cell_type": "markdown",
"id": "74eda066-5e01-43aa-b0cf-cc6d9bbf770e",
"metadata": {},
"source": [
"## Get results from the cross-validated logit model"
]
},
{
"cell_type": "code",
2024-03-23 10:18:43 +01:00
"execution_count": 8,
2024-03-23 00:04:49 +01:00
"id": "7c81390e-598c-4f02-bd56-dd03b00dcb33",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_tickets</th>\n",
" <th>nb_purchases</th>\n",
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>time_between_purchase</th>\n",
" <th>nb_tickets_internet</th>\n",
" <th>fidelity</th>\n",
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
" <th>gender_female</th>\n",
" <th>gender_male</th>\n",
" <th>gender_other</th>\n",
" <th>nb_campaigns</th>\n",
" <th>nb_campaigns_opened</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>100.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.177187</td>\n",
" <td>5.177187</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>55.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>426.265613</td>\n",
" <td>426.265613</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>17.0</td>\n",
" <td>1.0</td>\n",
" <td>80.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>436.033437</td>\n",
" <td>436.033437</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>120.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.196412</td>\n",
" <td>5.196412</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>34.0</td>\n",
" <td>2.0</td>\n",
" <td>416.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>478.693148</td>\n",
" <td>115.631470</td>\n",
" <td>363.061678</td>\n",
" <td>0.0</td>\n",
" <td>4</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96091</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>67.31</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>278.442257</td>\n",
" <td>278.442257</td>\n",
" <td>0.000000</td>\n",
" <td>1.0</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>15.0</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96092</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>61.41</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>189.207373</td>\n",
" <td>189.207373</td>\n",
" <td>0.000000</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>12.0</td>\n",
" <td>9.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96093</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>-1.000000</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>29.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96094</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>79.43</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>279.312905</td>\n",
" <td>279.312905</td>\n",
" <td>0.000000</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>20.0</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96095</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>-1.000000</td>\n",
" <td>0.0</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>31.0</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>96096 rows × 17 columns</p>\n",
"</div>"
],
"text/plain": [
" nb_tickets nb_purchases total_amount nb_suppliers \\\n",
"0 4.0 1.0 100.00 1.0 \n",
"1 1.0 1.0 55.00 1.0 \n",
"2 17.0 1.0 80.00 1.0 \n",
"3 4.0 1.0 120.00 1.0 \n",
"4 34.0 2.0 416.00 1.0 \n",
"... ... ... ... ... \n",
"96091 1.0 1.0 67.31 1.0 \n",
"96092 1.0 1.0 61.41 1.0 \n",
"96093 0.0 0.0 0.00 0.0 \n",
"96094 1.0 1.0 79.43 1.0 \n",
"96095 0.0 0.0 0.00 0.0 \n",
"\n",
" vente_internet_max purchase_date_min purchase_date_max \\\n",
"0 0.0 5.177187 5.177187 \n",
"1 0.0 426.265613 426.265613 \n",
"2 0.0 436.033437 436.033437 \n",
"3 0.0 5.196412 5.196412 \n",
"4 0.0 478.693148 115.631470 \n",
"... ... ... ... \n",
"96091 1.0 278.442257 278.442257 \n",
"96092 1.0 189.207373 189.207373 \n",
"96093 0.0 550.000000 550.000000 \n",
"96094 1.0 279.312905 279.312905 \n",
"96095 0.0 550.000000 550.000000 \n",
"\n",
" time_between_purchase nb_tickets_internet fidelity is_email_true \\\n",
"0 0.000000 0.0 1 True \n",
"1 0.000000 0.0 2 True \n",
"2 0.000000 0.0 2 True \n",
"3 0.000000 0.0 1 True \n",
"4 363.061678 0.0 4 True \n",
"... ... ... ... ... \n",
"96091 0.000000 1.0 2 True \n",
"96092 0.000000 1.0 1 True \n",
"96093 -1.000000 0.0 1 True \n",
"96094 0.000000 1.0 1 True \n",
"96095 -1.000000 0.0 2 True \n",
"\n",
" opt_in gender_female gender_male gender_other nb_campaigns \\\n",
"0 False 1 0 0 0.0 \n",
"1 True 0 1 0 0.0 \n",
"2 True 1 0 0 0.0 \n",
"3 False 1 0 0 0.0 \n",
"4 False 1 0 0 0.0 \n",
"... ... ... ... ... ... \n",
"96091 False 0 1 0 15.0 \n",
"96092 False 0 1 0 12.0 \n",
"96093 True 1 0 0 29.0 \n",
"96094 False 0 1 0 20.0 \n",
"96095 False 0 1 0 31.0 \n",
"\n",
" nb_campaigns_opened \n",
"0 0.0 \n",
"1 0.0 \n",
"2 0.0 \n",
"3 0.0 \n",
"4 0.0 \n",
"... ... \n",
"96091 5.0 \n",
"96092 9.0 \n",
"96093 3.0 \n",
"96094 4.0 \n",
"96095 4.0 \n",
"\n",
"[96096 rows x 17 columns]"
]
},
2024-03-23 10:18:43 +01:00
"execution_count": 8,
2024-03-23 00:04:49 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the test features\n",
"X_test"
]
},
{
"cell_type": "code",
2024-03-23 10:18:43 +01:00
"execution_count": 9,
2024-03-23 00:04:49 +01:00
"id": "c708f439-bb75-4688-bf4f-4c04e13deaae",
"metadata": {},
"outputs": [],
"source": [
"def load_model(type_of_activity, model):\n",
"    \"\"\"Fetch a pickled model from S3 and unpickle it.\n",
"\n",
"    The file read is\n",
"    `projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/{model}.pkl`,\n",
"    opened through the module-level `fs` object.\n",
"\n",
"    NOTE(review): unpickling can execute arbitrary code, so the bucket\n",
"    content has to be trusted.\n",
"\n",
"    Returns the object stored in the pickle file.\n",
"    \"\"\"\n",
"    model_directory = f\"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/\"\n",
"    pickle_path = model_directory + (model + '.pkl')\n",
"\n",
"    with fs.open(pickle_path, mode=\"rb\") as pickle_file:\n",
"        serialized_model = pickle_file.read()\n",
"    return pickle.loads(serialized_model)"
]
},
{
"cell_type": "code",
2024-03-23 10:18:43 +01:00
"execution_count": 10,
2024-03-23 00:04:49 +01:00
"id": "5261a803-05b8-41a0-968c-dc7bde48ddd3",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
2024-03-23 10:18:43 +01:00
"<style>#sk-container-id-1 {\n",
2024-03-23 00:04:49 +01:00
" /* Definition of color scheme common for light and dark mode */\n",
" --sklearn-color-text: black;\n",
" --sklearn-color-line: gray;\n",
" /* Definition of color scheme for unfitted estimators */\n",
" --sklearn-color-unfitted-level-0: #fff5e6;\n",
" --sklearn-color-unfitted-level-1: #f6e4d2;\n",
" --sklearn-color-unfitted-level-2: #ffe0b3;\n",
" --sklearn-color-unfitted-level-3: chocolate;\n",
" /* Definition of color scheme for fitted estimators */\n",
" --sklearn-color-fitted-level-0: #f0f8ff;\n",
" --sklearn-color-fitted-level-1: #d4ebff;\n",
" --sklearn-color-fitted-level-2: #b3dbfd;\n",
" --sklearn-color-fitted-level-3: cornflowerblue;\n",
"\n",
" /* Specific color for light theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-icon: #696969;\n",
"\n",
" @media (prefers-color-scheme: dark) {\n",
" /* Redefinition of color scheme for dark theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-icon: #878787;\n",
" }\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 {\n",
2024-03-23 00:04:49 +01:00
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 pre {\n",
2024-03-23 00:04:49 +01:00
" padding: 0;\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 input.sk-hidden--visually {\n",
2024-03-23 00:04:49 +01:00
" border: 0;\n",
" clip: rect(1px 1px 1px 1px);\n",
" clip: rect(1px, 1px, 1px, 1px);\n",
" height: 1px;\n",
" margin: -1px;\n",
" overflow: hidden;\n",
" padding: 0;\n",
" position: absolute;\n",
" width: 1px;\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 div.sk-dashed-wrapped {\n",
2024-03-23 00:04:49 +01:00
" border: 1px dashed var(--sklearn-color-line);\n",
" margin: 0 0.4em 0.5em 0.4em;\n",
" box-sizing: border-box;\n",
" padding-bottom: 0.4em;\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 div.sk-container {\n",
2024-03-23 00:04:49 +01:00
" /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
" but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
" so we also need the `!important` here to be able to override the\n",
" default hidden behavior on the sphinx rendered scikit-learn.org.\n",
" See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
" display: inline-block !important;\n",
" position: relative;\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 div.sk-text-repr-fallback {\n",
2024-03-23 00:04:49 +01:00
" display: none;\n",
"}\n",
"\n",
"div.sk-parallel-item,\n",
"div.sk-serial,\n",
"div.sk-item {\n",
" /* draw centered vertical line to link estimators */\n",
" background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
" background-size: 2px 100%;\n",
" background-repeat: no-repeat;\n",
" background-position: center center;\n",
"}\n",
"\n",
"/* Parallel-specific style estimator block */\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 div.sk-parallel-item::after {\n",
2024-03-23 00:04:49 +01:00
" content: \"\";\n",
" width: 100%;\n",
" border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
" flex-grow: 1;\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 div.sk-parallel {\n",
2024-03-23 00:04:49 +01:00
" display: flex;\n",
" align-items: stretch;\n",
" justify-content: center;\n",
" background-color: var(--sklearn-color-background);\n",
" position: relative;\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 div.sk-parallel-item {\n",
2024-03-23 00:04:49 +01:00
" display: flex;\n",
" flex-direction: column;\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 div.sk-parallel-item:first-child::after {\n",
2024-03-23 00:04:49 +01:00
" align-self: flex-end;\n",
" width: 50%;\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 div.sk-parallel-item:last-child::after {\n",
2024-03-23 00:04:49 +01:00
" align-self: flex-start;\n",
" width: 50%;\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 div.sk-parallel-item:only-child::after {\n",
2024-03-23 00:04:49 +01:00
" width: 0;\n",
"}\n",
"\n",
"/* Serial-specific style estimator block */\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 div.sk-serial {\n",
2024-03-23 00:04:49 +01:00
" display: flex;\n",
" flex-direction: column;\n",
" align-items: center;\n",
" background-color: var(--sklearn-color-background);\n",
" padding-right: 1em;\n",
" padding-left: 1em;\n",
"}\n",
"\n",
"\n",
"/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
"clickable and can be expanded/collapsed.\n",
"- Pipeline and ColumnTransformer use this feature and define the default style\n",
"- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
"*/\n",
"\n",
"/* Pipeline and ColumnTransformer style (default) */\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 div.sk-toggleable {\n",
2024-03-23 00:04:49 +01:00
" /* Default theme specific background. It is overwritten whether we have a\n",
" specific estimator or a Pipeline/ColumnTransformer */\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"/* Toggleable label */\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 label.sk-toggleable__label {\n",
2024-03-23 00:04:49 +01:00
" cursor: pointer;\n",
" display: block;\n",
" width: 100%;\n",
" margin-bottom: 0;\n",
" padding: 0.5em;\n",
" box-sizing: border-box;\n",
" text-align: center;\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 label.sk-toggleable__label-arrow:before {\n",
2024-03-23 00:04:49 +01:00
" /* Arrow on the left of the label */\n",
" content: \"▸\";\n",
" float: left;\n",
" margin-right: 0.25em;\n",
" color: var(--sklearn-color-icon);\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {\n",
2024-03-23 00:04:49 +01:00
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"/* Toggleable content - dropdown */\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 div.sk-toggleable__content {\n",
2024-03-23 00:04:49 +01:00
" max-height: 0;\n",
" max-width: 0;\n",
" overflow: hidden;\n",
" text-align: left;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 div.sk-toggleable__content.fitted {\n",
2024-03-23 00:04:49 +01:00
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 div.sk-toggleable__content pre {\n",
2024-03-23 00:04:49 +01:00
" margin: 0.2em;\n",
" border-radius: 0.25em;\n",
" color: var(--sklearn-color-text);\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 div.sk-toggleable__content.fitted pre {\n",
2024-03-23 00:04:49 +01:00
" /* unfitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
2024-03-23 00:04:49 +01:00
" /* Expand drop-down */\n",
" max-height: 200px;\n",
" max-width: 100%;\n",
" overflow: auto;\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
2024-03-23 00:04:49 +01:00
" content: \"▾\";\n",
"}\n",
"\n",
"/* Pipeline/ColumnTransformer-specific style */\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
2024-03-23 00:04:49 +01:00
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
2024-03-23 00:04:49 +01:00
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator-specific style */\n",
"\n",
"/* Colorize estimator box */\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
2024-03-23 00:04:49 +01:00
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
2024-03-23 00:04:49 +01:00
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 div.sk-label label.sk-toggleable__label,\n",
"#sk-container-id-1 div.sk-label label {\n",
2024-03-23 00:04:49 +01:00
" /* The background is the default theme color */\n",
" color: var(--sklearn-color-text-on-default-background);\n",
"}\n",
"\n",
"/* On hover, darken the color of the background */\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label {\n",
2024-03-23 00:04:49 +01:00
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"/* Label box, darken color on hover, fitted */\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
2024-03-23 00:04:49 +01:00
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator label */\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 div.sk-label label {\n",
2024-03-23 00:04:49 +01:00
" font-family: monospace;\n",
" font-weight: bold;\n",
" display: inline-block;\n",
" line-height: 1.2em;\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 div.sk-label-container {\n",
2024-03-23 00:04:49 +01:00
" text-align: center;\n",
"}\n",
"\n",
"/* Estimator-specific */\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 div.sk-estimator {\n",
2024-03-23 00:04:49 +01:00
" font-family: monospace;\n",
" border: 1px dotted var(--sklearn-color-border-box);\n",
" border-radius: 0.25em;\n",
" box-sizing: border-box;\n",
" margin-bottom: 0.5em;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 div.sk-estimator.fitted {\n",
2024-03-23 00:04:49 +01:00
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"/* on hover */\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 div.sk-estimator:hover {\n",
2024-03-23 00:04:49 +01:00
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 div.sk-estimator.fitted:hover {\n",
2024-03-23 00:04:49 +01:00
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
"\n",
"/* Common style for \"i\" and \"?\" */\n",
"\n",
".sk-estimator-doc-link,\n",
"a:link.sk-estimator-doc-link,\n",
"a:visited.sk-estimator-doc-link {\n",
" float: right;\n",
" font-size: smaller;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1em;\n",
" height: 1em;\n",
" width: 1em;\n",
" text-decoration: none !important;\n",
" margin-left: 1ex;\n",
" /* unfitted */\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted,\n",
"a:link.sk-estimator-doc-link.fitted,\n",
"a:visited.sk-estimator-doc-link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"/* Span, style for the box shown on hovering the info icon */\n",
".sk-estimator-doc-link span {\n",
" display: none;\n",
" z-index: 9999;\n",
" position: relative;\n",
" font-weight: normal;\n",
" right: .2ex;\n",
" padding: .5ex;\n",
" margin: .5ex;\n",
" width: min-content;\n",
" min-width: 20ex;\n",
" max-width: 50ex;\n",
" color: var(--sklearn-color-text);\n",
" box-shadow: 2pt 2pt 4pt #999;\n",
" /* unfitted */\n",
" background: var(--sklearn-color-unfitted-level-0);\n",
" border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted span {\n",
" /* fitted */\n",
" background: var(--sklearn-color-fitted-level-0);\n",
" border: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link:hover span {\n",
" display: block;\n",
"}\n",
"\n",
"/* \"?\"-specific style due to the `<a>` HTML tag */\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 a.estimator_doc_link {\n",
2024-03-23 00:04:49 +01:00
" float: right;\n",
" font-size: 1rem;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1rem;\n",
" height: 1rem;\n",
" width: 1rem;\n",
" text-decoration: none;\n",
" /* unfitted */\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 a.estimator_doc_link.fitted {\n",
2024-03-23 00:04:49 +01:00
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 a.estimator_doc_link:hover {\n",
2024-03-23 00:04:49 +01:00
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-1 a.estimator_doc_link.fitted:hover {\n",
2024-03-23 00:04:49 +01:00
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
"}\n",
2024-03-23 10:18:43 +01:00
"</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>GridSearchCV(cv=3, error_score='raise',\n",
2024-03-23 00:04:49 +01:00
" estimator=Pipeline(steps=[('preprocessor',\n",
" ColumnTransformer(transformers=[('num',\n",
" Pipeline(steps=[('scaler',\n",
" StandardScaler())]),\n",
" ['nb_tickets',\n",
" 'nb_purchases',\n",
" 'total_amount',\n",
" 'nb_suppliers',\n",
" 'vente_internet_max',\n",
" 'purchase_date_min',\n",
" 'purchase_date_max',\n",
" 'time_between_purchase',\n",
" 'nb_tickets_internet',\n",
" 'nb_campaigns',\n",
" 'nb_...\n",
" 1.562500e-02, 3.125000e-02, 6.250000e-02, 1.250000e-01,\n",
" 2.500000e-01, 5.000000e-01, 1.000000e+00, 2.000000e+00,\n",
" 4.000000e+00, 8.000000e+00, 1.600000e+01, 3.200000e+01,\n",
" 6.400000e+01]),\n",
" 'LogisticRegression_cv__class_weight': ['balanced',\n",
" {0.0: 0.5837086520288036,\n",
" 1.0: 3.486549107420539}],\n",
" 'LogisticRegression_cv__penalty': ['l1', 'l2']},\n",
2024-03-23 10:18:43 +01:00
" scoring=make_scorer(recall_score, response_method='predict'))</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" ><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\"> GridSearchCV<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.model_selection.GridSearchCV.html\">?<span>Documentation for GridSearchCV</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></label><div class=\"sk-toggleable__content fitted\"><pre>GridSearchCV(cv=3, error_score='raise',\n",
2024-03-23 00:04:49 +01:00
" estimator=Pipeline(steps=[('preprocessor',\n",
" ColumnTransformer(transformers=[('num',\n",
" Pipeline(steps=[('scaler',\n",
" StandardScaler())]),\n",
" ['nb_tickets',\n",
" 'nb_purchases',\n",
" 'total_amount',\n",
" 'nb_suppliers',\n",
" 'vente_internet_max',\n",
" 'purchase_date_min',\n",
" 'purchase_date_max',\n",
" 'time_between_purchase',\n",
" 'nb_tickets_internet',\n",
" 'nb_campaigns',\n",
" 'nb_...\n",
" 1.562500e-02, 3.125000e-02, 6.250000e-02, 1.250000e-01,\n",
" 2.500000e-01, 5.000000e-01, 1.000000e+00, 2.000000e+00,\n",
" 4.000000e+00, 8.000000e+00, 1.600000e+01, 3.200000e+01,\n",
" 6.400000e+01]),\n",
" 'LogisticRegression_cv__class_weight': ['balanced',\n",
" {0.0: 0.5837086520288036,\n",
" 1.0: 3.486549107420539}],\n",
" 'LogisticRegression_cv__penalty': ['l1', 'l2']},\n",
2024-03-23 10:18:43 +01:00
" scoring=make_scorer(recall_score, response_method='predict'))</pre></div> </div></div><div class=\"sk-parallel\"><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-2\" type=\"checkbox\" ><label for=\"sk-estimator-id-2\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">estimator: Pipeline</label><div class=\"sk-toggleable__content fitted\"><pre>Pipeline(steps=[('preprocessor',\n",
2024-03-23 00:04:49 +01:00
" ColumnTransformer(transformers=[('num',\n",
" Pipeline(steps=[('scaler',\n",
" StandardScaler())]),\n",
" ['nb_tickets', 'nb_purchases',\n",
" 'total_amount',\n",
" 'nb_suppliers',\n",
" 'vente_internet_max',\n",
" 'purchase_date_min',\n",
" 'purchase_date_max',\n",
" 'time_between_purchase',\n",
" 'nb_tickets_internet',\n",
" 'nb_campaigns',\n",
" 'nb_campaigns_opened']),\n",
" ('cat',\n",
" Pipeline(steps=[('onehot',\n",
" OneHotEncoder(handle_unknown='ignore',\n",
" sparse_output=False))]),\n",
" ['opt_in', 'gender_male',\n",
" 'gender_female'])])),\n",
" ('LogisticRegression_cv',\n",
2024-03-23 10:18:43 +01:00
" LogisticRegression(max_iter=5000, solver='saga'))])</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-serial\"><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-3\" type=\"checkbox\" ><label for=\"sk-estimator-id-3\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\"> preprocessor: ColumnTransformer<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.compose.ColumnTransformer.html\">?<span>Documentation for preprocessor: ColumnTransformer</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>ColumnTransformer(transformers=[('num',\n",
2024-03-23 00:04:49 +01:00
" Pipeline(steps=[('scaler', StandardScaler())]),\n",
" ['nb_tickets', 'nb_purchases', 'total_amount',\n",
" 'nb_suppliers', 'vente_internet_max',\n",
" 'purchase_date_min', 'purchase_date_max',\n",
" 'time_between_purchase',\n",
" 'nb_tickets_internet', 'nb_campaigns',\n",
" 'nb_campaigns_opened']),\n",
" ('cat',\n",
" Pipeline(steps=[('onehot',\n",
" OneHotEncoder(handle_unknown='ignore',\n",
" sparse_output=False))]),\n",
2024-03-23 10:18:43 +01:00
" ['opt_in', 'gender_male', 'gender_female'])])</pre></div> </div></div><div class=\"sk-parallel\"><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-4\" type=\"checkbox\" ><label for=\"sk-estimator-id-4\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">num</label><div class=\"sk-toggleable__content fitted\"><pre>['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max', 'time_between_purchase', 'nb_tickets_internet', 'nb_campaigns', 'nb_campaigns_opened']</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-5\" type=\"checkbox\" ><label for=\"sk-estimator-id-5\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\"> StandardScaler<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.preprocessing.StandardScaler.html\">?<span>Documentation for StandardScaler</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>StandardScaler()</pre></div> </div></div></div></div></div></div></div><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-6\" type=\"checkbox\" ><label for=\"sk-estimator-id-6\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">cat</label><div class=\"sk-toggleable__content fitted\"><pre>['opt_in', 'gender_male', 'gender_female']</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div 
class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-7\" type=\"checkbox\" ><label for=\"sk-estimator-id-7\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\"> OneHotEncoder<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.preprocessing.OneHotEncoder.html\">?<span>Documentation for OneHotEncoder</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>OneHotEncoder(handle_unknown='ignore', sparse_output=False)</pre></div> </div></div></div></div></div></div></div></div></div><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-8\" type=\"checkbox\" ><label for=\"sk-estimator-id-8\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\"> LogisticRegression<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.linear_model.LogisticRegression.html\">?<span>Documentation for LogisticRegression</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>LogisticRegression(max_iter=5000, solver='saga')</pre></div> </div></div></div></div></div></div></div></div></div></div></div>"
2024-03-23 00:04:49 +01:00
],
"text/plain": [
"GridSearchCV(cv=3, error_score='raise',\n",
" estimator=Pipeline(steps=[('preprocessor',\n",
" ColumnTransformer(transformers=[('num',\n",
" Pipeline(steps=[('scaler',\n",
" StandardScaler())]),\n",
" ['nb_tickets',\n",
" 'nb_purchases',\n",
" 'total_amount',\n",
" 'nb_suppliers',\n",
" 'vente_internet_max',\n",
" 'purchase_date_min',\n",
" 'purchase_date_max',\n",
" 'time_between_purchase',\n",
" 'nb_tickets_internet',\n",
" 'nb_campaigns',\n",
" 'nb_...\n",
" 1.562500e-02, 3.125000e-02, 6.250000e-02, 1.250000e-01,\n",
" 2.500000e-01, 5.000000e-01, 1.000000e+00, 2.000000e+00,\n",
" 4.000000e+00, 8.000000e+00, 1.600000e+01, 3.200000e+01,\n",
" 6.400000e+01]),\n",
" 'LogisticRegression_cv__class_weight': ['balanced',\n",
" {0.0: 0.5837086520288036,\n",
" 1.0: 3.486549107420539}],\n",
" 'LogisticRegression_cv__penalty': ['l1', 'l2']},\n",
" scoring=make_scorer(recall_score, response_method='predict'))"
]
},
2024-03-23 10:18:43 +01:00
"execution_count": 10,
2024-03-23 00:04:49 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"logit_cv = load_model(\"sport\", \"LogisticRegression_cv\")\n",
"logit_cv"
]
},
{
"cell_type": "code",
2024-03-23 10:18:43 +01:00
"execution_count": 11,
2024-03-23 00:04:49 +01:00
"id": "6f3e584d-c70d-4b45-b947-4414ff416e17",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
2024-03-23 10:18:43 +01:00
"<style>#sk-container-id-2 {\n",
2024-03-23 00:04:49 +01:00
" /* Definition of color scheme common for light and dark mode */\n",
" --sklearn-color-text: black;\n",
" --sklearn-color-line: gray;\n",
" /* Definition of color scheme for unfitted estimators */\n",
" --sklearn-color-unfitted-level-0: #fff5e6;\n",
" --sklearn-color-unfitted-level-1: #f6e4d2;\n",
" --sklearn-color-unfitted-level-2: #ffe0b3;\n",
" --sklearn-color-unfitted-level-3: chocolate;\n",
" /* Definition of color scheme for fitted estimators */\n",
" --sklearn-color-fitted-level-0: #f0f8ff;\n",
" --sklearn-color-fitted-level-1: #d4ebff;\n",
" --sklearn-color-fitted-level-2: #b3dbfd;\n",
" --sklearn-color-fitted-level-3: cornflowerblue;\n",
"\n",
" /* Specific color for light theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-icon: #696969;\n",
"\n",
" @media (prefers-color-scheme: dark) {\n",
" /* Redefinition of color scheme for dark theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-icon: #878787;\n",
" }\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 {\n",
2024-03-23 00:04:49 +01:00
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 pre {\n",
2024-03-23 00:04:49 +01:00
" padding: 0;\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 input.sk-hidden--visually {\n",
2024-03-23 00:04:49 +01:00
" border: 0;\n",
" clip: rect(1px 1px 1px 1px);\n",
" clip: rect(1px, 1px, 1px, 1px);\n",
" height: 1px;\n",
" margin: -1px;\n",
" overflow: hidden;\n",
" padding: 0;\n",
" position: absolute;\n",
" width: 1px;\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 div.sk-dashed-wrapped {\n",
2024-03-23 00:04:49 +01:00
" border: 1px dashed var(--sklearn-color-line);\n",
" margin: 0 0.4em 0.5em 0.4em;\n",
" box-sizing: border-box;\n",
" padding-bottom: 0.4em;\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 div.sk-container {\n",
2024-03-23 00:04:49 +01:00
" /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
" but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
" so we also need the `!important` here to be able to override the\n",
" default hidden behavior on the sphinx rendered scikit-learn.org.\n",
" See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
" display: inline-block !important;\n",
" position: relative;\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 div.sk-text-repr-fallback {\n",
2024-03-23 00:04:49 +01:00
" display: none;\n",
"}\n",
"\n",
"div.sk-parallel-item,\n",
"div.sk-serial,\n",
"div.sk-item {\n",
" /* draw centered vertical line to link estimators */\n",
" background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
" background-size: 2px 100%;\n",
" background-repeat: no-repeat;\n",
" background-position: center center;\n",
"}\n",
"\n",
"/* Parallel-specific style estimator block */\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 div.sk-parallel-item::after {\n",
2024-03-23 00:04:49 +01:00
" content: \"\";\n",
" width: 100%;\n",
" border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
" flex-grow: 1;\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 div.sk-parallel {\n",
2024-03-23 00:04:49 +01:00
" display: flex;\n",
" align-items: stretch;\n",
" justify-content: center;\n",
" background-color: var(--sklearn-color-background);\n",
" position: relative;\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 div.sk-parallel-item {\n",
2024-03-23 00:04:49 +01:00
" display: flex;\n",
" flex-direction: column;\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 div.sk-parallel-item:first-child::after {\n",
2024-03-23 00:04:49 +01:00
" align-self: flex-end;\n",
" width: 50%;\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 div.sk-parallel-item:last-child::after {\n",
2024-03-23 00:04:49 +01:00
" align-self: flex-start;\n",
" width: 50%;\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 div.sk-parallel-item:only-child::after {\n",
2024-03-23 00:04:49 +01:00
" width: 0;\n",
"}\n",
"\n",
"/* Serial-specific style estimator block */\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 div.sk-serial {\n",
2024-03-23 00:04:49 +01:00
" display: flex;\n",
" flex-direction: column;\n",
" align-items: center;\n",
" background-color: var(--sklearn-color-background);\n",
" padding-right: 1em;\n",
" padding-left: 1em;\n",
"}\n",
"\n",
"\n",
"/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
"clickable and can be expanded/collapsed.\n",
"- Pipeline and ColumnTransformer use this feature and define the default style\n",
"- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
"*/\n",
"\n",
"/* Pipeline and ColumnTransformer style (default) */\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 div.sk-toggleable {\n",
2024-03-23 00:04:49 +01:00
" /* Default theme specific background. It is overwritten whether we have a\n",
" specific estimator or a Pipeline/ColumnTransformer */\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"/* Toggleable label */\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 label.sk-toggleable__label {\n",
2024-03-23 00:04:49 +01:00
" cursor: pointer;\n",
" display: block;\n",
" width: 100%;\n",
" margin-bottom: 0;\n",
" padding: 0.5em;\n",
" box-sizing: border-box;\n",
" text-align: center;\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 label.sk-toggleable__label-arrow:before {\n",
2024-03-23 00:04:49 +01:00
" /* Arrow on the left of the label */\n",
" content: \"▸\";\n",
" float: left;\n",
" margin-right: 0.25em;\n",
" color: var(--sklearn-color-icon);\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 label.sk-toggleable__label-arrow:hover:before {\n",
2024-03-23 00:04:49 +01:00
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"/* Toggleable content - dropdown */\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 div.sk-toggleable__content {\n",
2024-03-23 00:04:49 +01:00
" max-height: 0;\n",
" max-width: 0;\n",
" overflow: hidden;\n",
" text-align: left;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 div.sk-toggleable__content.fitted {\n",
2024-03-23 00:04:49 +01:00
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 div.sk-toggleable__content pre {\n",
2024-03-23 00:04:49 +01:00
" margin: 0.2em;\n",
" border-radius: 0.25em;\n",
" color: var(--sklearn-color-text);\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 div.sk-toggleable__content.fitted pre {\n",
2024-03-23 00:04:49 +01:00
" /* unfitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
2024-03-23 00:04:49 +01:00
" /* Expand drop-down */\n",
" max-height: 200px;\n",
" max-width: 100%;\n",
" overflow: auto;\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
2024-03-23 00:04:49 +01:00
" content: \"▾\";\n",
"}\n",
"\n",
"/* Pipeline/ColumnTransformer-specific style */\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
2024-03-23 00:04:49 +01:00
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
2024-03-23 00:04:49 +01:00
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator-specific style */\n",
"\n",
"/* Colorize estimator box */\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
2024-03-23 00:04:49 +01:00
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
2024-03-23 00:04:49 +01:00
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 div.sk-label label.sk-toggleable__label,\n",
"#sk-container-id-2 div.sk-label label {\n",
2024-03-23 00:04:49 +01:00
" /* The background is the default theme color */\n",
" color: var(--sklearn-color-text-on-default-background);\n",
"}\n",
"\n",
"/* On hover, darken the color of the background */\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 div.sk-label:hover label.sk-toggleable__label {\n",
2024-03-23 00:04:49 +01:00
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"/* Label box, darken color on hover, fitted */\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
2024-03-23 00:04:49 +01:00
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator label */\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 div.sk-label label {\n",
2024-03-23 00:04:49 +01:00
" font-family: monospace;\n",
" font-weight: bold;\n",
" display: inline-block;\n",
" line-height: 1.2em;\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 div.sk-label-container {\n",
2024-03-23 00:04:49 +01:00
" text-align: center;\n",
"}\n",
"\n",
"/* Estimator-specific */\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 div.sk-estimator {\n",
2024-03-23 00:04:49 +01:00
" font-family: monospace;\n",
" border: 1px dotted var(--sklearn-color-border-box);\n",
" border-radius: 0.25em;\n",
" box-sizing: border-box;\n",
" margin-bottom: 0.5em;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 div.sk-estimator.fitted {\n",
2024-03-23 00:04:49 +01:00
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"/* on hover */\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 div.sk-estimator:hover {\n",
2024-03-23 00:04:49 +01:00
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 div.sk-estimator.fitted:hover {\n",
2024-03-23 00:04:49 +01:00
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
"\n",
"/* Common style for \"i\" and \"?\" */\n",
"\n",
".sk-estimator-doc-link,\n",
"a:link.sk-estimator-doc-link,\n",
"a:visited.sk-estimator-doc-link {\n",
" float: right;\n",
" font-size: smaller;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1em;\n",
" height: 1em;\n",
" width: 1em;\n",
" text-decoration: none !important;\n",
" margin-left: 1ex;\n",
" /* unfitted */\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted,\n",
"a:link.sk-estimator-doc-link.fitted,\n",
"a:visited.sk-estimator-doc-link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"/* Span, style for the box shown on hovering the info icon */\n",
".sk-estimator-doc-link span {\n",
" display: none;\n",
" z-index: 9999;\n",
" position: relative;\n",
" font-weight: normal;\n",
" right: .2ex;\n",
" padding: .5ex;\n",
" margin: .5ex;\n",
" width: min-content;\n",
" min-width: 20ex;\n",
" max-width: 50ex;\n",
" color: var(--sklearn-color-text);\n",
" box-shadow: 2pt 2pt 4pt #999;\n",
" /* unfitted */\n",
" background: var(--sklearn-color-unfitted-level-0);\n",
" border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted span {\n",
" /* fitted */\n",
" background: var(--sklearn-color-fitted-level-0);\n",
" border: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link:hover span {\n",
" display: block;\n",
"}\n",
"\n",
"/* \"?\"-specific style due to the `<a>` HTML tag */\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 a.estimator_doc_link {\n",
2024-03-23 00:04:49 +01:00
" float: right;\n",
" font-size: 1rem;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1rem;\n",
" height: 1rem;\n",
" width: 1rem;\n",
" text-decoration: none;\n",
" /* unfitted */\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 a.estimator_doc_link.fitted {\n",
2024-03-23 00:04:49 +01:00
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 a.estimator_doc_link:hover {\n",
2024-03-23 00:04:49 +01:00
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
2024-03-23 10:18:43 +01:00
"#sk-container-id-2 a.estimator_doc_link.fitted:hover {\n",
2024-03-23 00:04:49 +01:00
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
"}\n",
2024-03-23 10:18:43 +01:00
"</style><div id=\"sk-container-id-2\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>GridSearchCV(cv=3, error_score='raise',\n",
2024-03-23 00:04:49 +01:00
" estimator=Pipeline(steps=[('preprocessor',\n",
" ColumnTransformer(transformers=[('num',\n",
" Pipeline(steps=[('scaler',\n",
" StandardScaler())]),\n",
" ['nb_tickets',\n",
" 'nb_purchases',\n",
" 'total_amount',\n",
" 'nb_suppliers',\n",
" 'vente_internet_max',\n",
" 'purchase_date_min',\n",
" 'purchase_date_max',\n",
" 'time_between_purchase',\n",
" 'nb_tickets_internet',\n",
" 'nb_campaigns',\n",
" 'nb_...\n",
" 1.562500e-02, 3.125000e-02, 6.250000e-02, 1.250000e-01,\n",
" 2.500000e-01, 5.000000e-01, 1.000000e+00, 2.000000e+00,\n",
" 4.000000e+00, 8.000000e+00, 1.600000e+01, 3.200000e+01,\n",
" 6.400000e+01]),\n",
" 'LogisticRegression_cv__class_weight': ['balanced',\n",
" {0.0: 0.5837086520288036,\n",
" 1.0: 3.486549107420539}],\n",
" 'LogisticRegression_cv__penalty': ['l1', 'l2']},\n",
2024-03-23 10:18:43 +01:00
" scoring=make_scorer(recall_score, response_method='predict'))</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-9\" type=\"checkbox\" ><label for=\"sk-estimator-id-9\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\"> GridSearchCV<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.model_selection.GridSearchCV.html\">?<span>Documentation for GridSearchCV</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></label><div class=\"sk-toggleable__content fitted\"><pre>GridSearchCV(cv=3, error_score='raise',\n",
2024-03-23 00:04:49 +01:00
" estimator=Pipeline(steps=[('preprocessor',\n",
" ColumnTransformer(transformers=[('num',\n",
" Pipeline(steps=[('scaler',\n",
" StandardScaler())]),\n",
" ['nb_tickets',\n",
" 'nb_purchases',\n",
" 'total_amount',\n",
" 'nb_suppliers',\n",
" 'vente_internet_max',\n",
" 'purchase_date_min',\n",
" 'purchase_date_max',\n",
" 'time_between_purchase',\n",
" 'nb_tickets_internet',\n",
" 'nb_campaigns',\n",
" 'nb_...\n",
" 1.562500e-02, 3.125000e-02, 6.250000e-02, 1.250000e-01,\n",
" 2.500000e-01, 5.000000e-01, 1.000000e+00, 2.000000e+00,\n",
" 4.000000e+00, 8.000000e+00, 1.600000e+01, 3.200000e+01,\n",
" 6.400000e+01]),\n",
" 'LogisticRegression_cv__class_weight': ['balanced',\n",
" {0.0: 0.5837086520288036,\n",
" 1.0: 3.486549107420539}],\n",
" 'LogisticRegression_cv__penalty': ['l1', 'l2']},\n",
2024-03-23 10:18:43 +01:00
" scoring=make_scorer(recall_score, response_method='predict'))</pre></div> </div></div><div class=\"sk-parallel\"><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-10\" type=\"checkbox\" ><label for=\"sk-estimator-id-10\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">estimator: Pipeline</label><div class=\"sk-toggleable__content fitted\"><pre>Pipeline(steps=[('preprocessor',\n",
2024-03-23 00:04:49 +01:00
" ColumnTransformer(transformers=[('num',\n",
" Pipeline(steps=[('scaler',\n",
" StandardScaler())]),\n",
" ['nb_tickets', 'nb_purchases',\n",
" 'total_amount',\n",
" 'nb_suppliers',\n",
" 'vente_internet_max',\n",
" 'purchase_date_min',\n",
" 'purchase_date_max',\n",
" 'time_between_purchase',\n",
" 'nb_tickets_internet',\n",
" 'nb_campaigns',\n",
" 'nb_campaigns_opened']),\n",
" ('cat',\n",
" Pipeline(steps=[('onehot',\n",
" OneHotEncoder(handle_unknown='ignore',\n",
" sparse_output=False))]),\n",
" ['opt_in', 'gender_male',\n",
" 'gender_female'])])),\n",
" ('LogisticRegression_cv',\n",
2024-03-23 10:18:43 +01:00
" LogisticRegression(max_iter=5000, solver='saga'))])</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-serial\"><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-11\" type=\"checkbox\" ><label for=\"sk-estimator-id-11\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\"> preprocessor: ColumnTransformer<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.compose.ColumnTransformer.html\">?<span>Documentation for preprocessor: ColumnTransformer</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>ColumnTransformer(transformers=[('num',\n",
2024-03-23 00:04:49 +01:00
" Pipeline(steps=[('scaler', StandardScaler())]),\n",
" ['nb_tickets', 'nb_purchases', 'total_amount',\n",
" 'nb_suppliers', 'vente_internet_max',\n",
" 'purchase_date_min', 'purchase_date_max',\n",
" 'time_between_purchase',\n",
" 'nb_tickets_internet', 'nb_campaigns',\n",
" 'nb_campaigns_opened']),\n",
" ('cat',\n",
" Pipeline(steps=[('onehot',\n",
" OneHotEncoder(handle_unknown='ignore',\n",
" sparse_output=False))]),\n",
2024-03-23 10:18:43 +01:00
" ['opt_in', 'gender_male', 'gender_female'])])</pre></div> </div></div><div class=\"sk-parallel\"><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-12\" type=\"checkbox\" ><label for=\"sk-estimator-id-12\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">num</label><div class=\"sk-toggleable__content fitted\"><pre>['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max', 'time_between_purchase', 'nb_tickets_internet', 'nb_campaigns', 'nb_campaigns_opened']</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-13\" type=\"checkbox\" ><label for=\"sk-estimator-id-13\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\"> StandardScaler<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.preprocessing.StandardScaler.html\">?<span>Documentation for StandardScaler</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>StandardScaler()</pre></div> </div></div></div></div></div></div></div><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-14\" type=\"checkbox\" ><label for=\"sk-estimator-id-14\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">cat</label><div class=\"sk-toggleable__content fitted\"><pre>['opt_in', 'gender_male', 'gender_female']</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div 
class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-15\" type=\"checkbox\" ><label for=\"sk-estimator-id-15\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\"> OneHotEncoder<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.preprocessing.OneHotEncoder.html\">?<span>Documentation for OneHotEncoder</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>OneHotEncoder(handle_unknown='ignore', sparse_output=False)</pre></div> </div></div></div></div></div></div></div></div></div><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-16\" type=\"checkbox\" ><label for=\"sk-estimator-id-16\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\"> LogisticRegression<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.linear_model.LogisticRegression.html\">?<span>Documentation for LogisticRegression</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>LogisticRegression(max_iter=5000, solver='saga')</pre></div> </div></div></div></div></div></div></div></div></div></div></div>"
2024-03-23 00:04:49 +01:00
],
"text/plain": [
"GridSearchCV(cv=3, error_score='raise',\n",
" estimator=Pipeline(steps=[('preprocessor',\n",
" ColumnTransformer(transformers=[('num',\n",
" Pipeline(steps=[('scaler',\n",
" StandardScaler())]),\n",
" ['nb_tickets',\n",
" 'nb_purchases',\n",
" 'total_amount',\n",
" 'nb_suppliers',\n",
" 'vente_internet_max',\n",
" 'purchase_date_min',\n",
" 'purchase_date_max',\n",
" 'time_between_purchase',\n",
" 'nb_tickets_internet',\n",
" 'nb_campaigns',\n",
" 'nb_...\n",
" 1.562500e-02, 3.125000e-02, 6.250000e-02, 1.250000e-01,\n",
" 2.500000e-01, 5.000000e-01, 1.000000e+00, 2.000000e+00,\n",
" 4.000000e+00, 8.000000e+00, 1.600000e+01, 3.200000e+01,\n",
" 6.400000e+01]),\n",
" 'LogisticRegression_cv__class_weight': ['balanced',\n",
" {0.0: 0.5837086520288036,\n",
" 1.0: 3.486549107420539}],\n",
" 'LogisticRegression_cv__penalty': ['l1', 'l2']},\n",
" scoring=make_scorer(recall_score, response_method='predict'))"
]
},
2024-03-23 10:18:43 +01:00
"execution_count": 11,
2024-03-23 00:04:49 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"logit_cv"
]
},
{
"cell_type": "markdown",
"id": "006819e7-e9c5-48d9-85ee-aa43d5e4c9c2",
"metadata": {},
"source": [
"## Quartile clustering"
]
},
{
"cell_type": "code",
2024-03-23 10:18:43 +01:00
"execution_count": 12,
2024-03-23 00:04:49 +01:00
"id": "018d8ff4-3436-4eec-8507-d1a265cbabf1",
"metadata": {},
"outputs": [],
"source": [
"y_pred = logit_cv.predict(X_test)\n",
"y_pred_prob = logit_cv.predict_proba(X_test)[:, 1]"
]
},
{
"cell_type": "code",
2024-03-23 10:18:43 +01:00
"execution_count": 13,
2024-03-23 00:04:49 +01:00
"id": "846f53b9-73c2-4a8b-9d9e-f11bf59ce9ba",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
2024-03-23 10:18:43 +01:00
"/tmp/ipykernel_519/375041546.py:3: SettingWithCopyWarning: \n",
2024-03-23 00:04:49 +01:00
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" X_test_segment[\"has_purchased\"] = y_test\n",
2024-03-23 10:18:43 +01:00
"/tmp/ipykernel_519/375041546.py:4: SettingWithCopyWarning: \n",
2024-03-23 00:04:49 +01:00
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" X_test_segment[\"has_purchased_estim\"] = y_pred\n",
2024-03-23 10:18:43 +01:00
"/tmp/ipykernel_519/375041546.py:5: SettingWithCopyWarning: \n",
2024-03-23 00:04:49 +01:00
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" X_test_segment[\"score\"] = y_pred_prob\n",
2024-03-23 10:18:43 +01:00
"/tmp/ipykernel_519/375041546.py:6: SettingWithCopyWarning: \n",
2024-03-23 00:04:49 +01:00
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" X_test_segment[\"quartile\"] = np.where(X_test['score']<0.25, '1',\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_tickets</th>\n",
" <th>nb_purchases</th>\n",
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>time_between_purchase</th>\n",
" <th>nb_tickets_internet</th>\n",
" <th>fidelity</th>\n",
" <th>...</th>\n",
" <th>opt_in</th>\n",
" <th>gender_female</th>\n",
" <th>gender_male</th>\n",
" <th>gender_other</th>\n",
" <th>nb_campaigns</th>\n",
" <th>nb_campaigns_opened</th>\n",
" <th>has_purchased</th>\n",
" <th>has_purchased_estim</th>\n",
" <th>score</th>\n",
" <th>quartile</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>100.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.177187</td>\n",
" <td>5.177187</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.657671</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>55.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>426.265613</td>\n",
" <td>426.265613</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>2</td>\n",
" <td>...</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.266538</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>17.0</td>\n",
" <td>1.0</td>\n",
" <td>80.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>436.033437</td>\n",
" <td>436.033437</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>2</td>\n",
" <td>...</td>\n",
" <td>True</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.214668</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>120.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.196412</td>\n",
" <td>5.196412</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.657770</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>34.0</td>\n",
" <td>2.0</td>\n",
" <td>416.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>478.693148</td>\n",
" <td>115.631470</td>\n",
" <td>363.061678</td>\n",
" <td>0.0</td>\n",
" <td>4</td>\n",
" <td>...</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.894173</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>60.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.140069</td>\n",
" <td>5.140069</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.717482</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>5.0</td>\n",
" <td>1.0</td>\n",
" <td>61.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>105.053773</td>\n",
" <td>105.053773</td>\n",
" <td>0.000000</td>\n",
" <td>5.0</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.541855</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>80.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>63.206030</td>\n",
" <td>63.206030</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.461164</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>10.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>44.698090</td>\n",
" <td>44.698090</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.310828</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>3.0</td>\n",
" <td>3.0</td>\n",
" <td>165.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>266.012106</td>\n",
" <td>258.012106</td>\n",
" <td>8.000000</td>\n",
" <td>3.0</td>\n",
" <td>2</td>\n",
" <td>...</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.452877</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10 rows × 21 columns</p>\n",
"</div>"
],
"text/plain": [
" nb_tickets nb_purchases total_amount nb_suppliers vente_internet_max \\\n",
"0 4.0 1.0 100.0 1.0 0.0 \n",
"1 1.0 1.0 55.0 1.0 0.0 \n",
"2 17.0 1.0 80.0 1.0 0.0 \n",
"3 4.0 1.0 120.0 1.0 0.0 \n",
"4 34.0 2.0 416.0 1.0 0.0 \n",
"5 2.0 1.0 60.0 1.0 0.0 \n",
"6 5.0 1.0 61.0 1.0 1.0 \n",
"7 4.0 1.0 80.0 1.0 0.0 \n",
"8 1.0 1.0 10.0 1.0 0.0 \n",
"9 3.0 3.0 165.0 1.0 1.0 \n",
"\n",
" purchase_date_min purchase_date_max time_between_purchase \\\n",
"0 5.177187 5.177187 0.000000 \n",
"1 426.265613 426.265613 0.000000 \n",
"2 436.033437 436.033437 0.000000 \n",
"3 5.196412 5.196412 0.000000 \n",
"4 478.693148 115.631470 363.061678 \n",
"5 5.140069 5.140069 0.000000 \n",
"6 105.053773 105.053773 0.000000 \n",
"7 63.206030 63.206030 0.000000 \n",
"8 44.698090 44.698090 0.000000 \n",
"9 266.012106 258.012106 8.000000 \n",
"\n",
" nb_tickets_internet fidelity ... opt_in gender_female gender_male \\\n",
"0 0.0 1 ... False 1 0 \n",
"1 0.0 2 ... True 0 1 \n",
"2 0.0 2 ... True 1 0 \n",
"3 0.0 1 ... False 1 0 \n",
"4 0.0 4 ... False 1 0 \n",
"5 0.0 1 ... False 0 1 \n",
"6 5.0 1 ... False 0 0 \n",
"7 0.0 1 ... True 0 1 \n",
"8 0.0 1 ... True 0 0 \n",
"9 3.0 2 ... False 0 0 \n",
"\n",
" gender_other nb_campaigns nb_campaigns_opened has_purchased \\\n",
"0 0 0.0 0.0 0.0 \n",
"1 0 0.0 0.0 1.0 \n",
"2 0 0.0 0.0 0.0 \n",
"3 0 0.0 0.0 0.0 \n",
"4 0 0.0 0.0 1.0 \n",
"5 0 0.0 0.0 0.0 \n",
"6 1 0.0 0.0 0.0 \n",
"7 0 0.0 0.0 0.0 \n",
"8 1 0.0 0.0 0.0 \n",
"9 1 0.0 0.0 0.0 \n",
"\n",
" has_purchased_estim score quartile \n",
"0 1.0 0.657671 3 \n",
"1 0.0 0.266538 2 \n",
"2 0.0 0.214668 1 \n",
"3 1.0 0.657770 3 \n",
"4 1.0 0.894173 4 \n",
"5 1.0 0.717482 3 \n",
"6 1.0 0.541855 3 \n",
"7 0.0 0.461164 2 \n",
"8 0.0 0.310828 2 \n",
"9 0.0 0.452877 2 \n",
"\n",
"[10 rows x 21 columns]"
]
},
2024-03-23 10:18:43 +01:00
"execution_count": 13,
2024-03-23 00:04:49 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test_segment = X_test\n",
"\n",
"X_test_segment[\"has_purchased\"] = y_test\n",
"X_test_segment[\"has_purchased_estim\"] = y_pred\n",
"X_test_segment[\"score\"] = y_pred_prob\n",
"X_test_segment[\"quartile\"] = np.where(X_test['score']<0.25, '1',\n",
" np.where(X_test['score']<0.5, '2',\n",
" np.where(X_test['score']<0.75, '3', '4')))\n",
"X_test_segment.head(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0916f099-3faa-4c47-9b60-d1ee797b3c9d",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "ad16b8ab-7e01-404b-971e-866e9b9d5aa4",
"metadata": {},
"source": [
"## definition of functions to compute the bias of scores and adjust it \n",
"\n",
"Le biais est calculé de la façon suivante. \n",
"En notant $\\hat{p(x_i)}$ le score calculé (estimé par la modélisation) et $p(x_i)$ le vrai score (sans biais), et $\\beta$ le logarithme du biais, on a : \\\n",
"$\\ln{\\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}}} = \\beta + \\ln{\\frac{p(x_i)}{1-p(x_i)}}$ \\\n",
"$ \\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}} = \\exp(\\beta) . \\frac{p(x_i)}{1-p(x_i)} $ , soit : \\\n",
"$p(x_i) = {\\frac{\\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}}}{B+\\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}}}}$ \\\n",
"Ce qu'on appelle biais et qu'on estime dans le code par la suite est : $B=\\exp(\\beta) $. Les probabilités ne sont donc pas biaisées si $B=1$. Il y a surestimation si $B>1$. \n",
"\n",
"On cherche le B qui permette d'ajuster les probabilités de telle sorte que la somme des scores soit égale à la somme des y_has_purchased. Cela revient à résoudre : \n",
"\n",
"\\begin{equation}\n",
"\\sum_{i}{\\frac{\\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}}}{B+\\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}}}} = \\sum_{i}{Y_i}\n",
"\\end{equation}\n",
"\n",
"C'est ce que fait la fonction find_bias. \n",
"\n",
"Note sur les notations : \\\n",
"$\\hat{p(x_i)}$ correspond à ce qu'on appelle le score et $p(x_i)$ à ce qu'on appellera le score adjusted"
]
},
{
"cell_type": "code",
2024-03-23 10:18:43 +01:00
"execution_count": 14,
2024-03-23 00:04:49 +01:00
"id": "f0379536-a6c5-4b16-bde5-d0319ec1b140",
"metadata": {},
"outputs": [],
"source": [
"# compute adjusted score from odd ratios (cf formula above)\n",
"def adjusted_score(odd_ratio, bias) :\n",
" adjusted_score = odd_ratio/(bias+odd_ratio)\n",
" return adjusted_score"
]
},
{
"cell_type": "code",
2024-03-23 10:18:43 +01:00
"execution_count": 15,
2024-03-23 00:04:49 +01:00
"id": "32a0dfd0-f49d-4785-a56f-706d381bfe41",
"metadata": {},
"outputs": [],
"source": [
"# when the score is 1 we cannot compute the odd ratio, so we adjust scores equal to 1\n",
"# we set the second best score instead\n",
"\n",
"def adjust_score_1(score) :\n",
" second_best_score = np.array([element for element in score if element !=1]).max()\n",
" new_score = np.array([element if element!=1 else second_best_score for element in score]) \n",
" return new_score"
]
},
{
"cell_type": "code",
2024-03-23 10:18:43 +01:00
"execution_count": 16,
2024-03-23 00:04:49 +01:00
"id": "2dff1def-02df-413e-afce-b4aeaf7752b6",
"metadata": {},
"outputs": [],
"source": [
"def odd_ratio(score) :\n",
" return score / (1 - score)"
]
},
{
"cell_type": "code",
2024-03-23 10:18:43 +01:00
"execution_count": 17,
2024-03-23 00:04:49 +01:00
"id": "683d71fc-7442-4028-869c-49c57592d6e9",
"metadata": {},
"outputs": [],
"source": [
"# definition of a function that automatically detects the bias\n",
"\n",
"def find_bias(odd_ratios, y_objective, initial_guess=6) :\n",
" \"\"\"\n",
" results = minimize(lambda bias : (sum([adjusted_score(element, bias) for element in list(odd_ratios)]) - y_objective)**2 ,\n",
" initial_guess , method = \"BFGS\")\n",
"\n",
" estimated_bias = results.x[0]\n",
" \"\"\"\n",
"\n",
" # faster method\n",
" bias_estimated = fsolve(lambda bias : sum([adjusted_score(element, bias) for element in list(odd_ratios)]) - y_objective, x0=6)\n",
" \n",
" return bias_estimated[0]"
]
},
{
"cell_type": "code",
2024-03-23 10:18:43 +01:00
"execution_count": 18,
2024-03-23 00:04:49 +01:00
"id": "781b0d40-c954-4c54-830a-e709c8667328",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"6.172331113516847"
]
},
2024-03-23 10:18:43 +01:00
"execution_count": 18,
2024-03-23 00:04:49 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# computation with the function defined\n",
"\n",
"bias_test_set = find_bias(odd_ratios = odd_ratio(adjust_score_1(X_test_segment[\"score\"])), \n",
" y_objective = y_test[\"y_has_purchased\"].sum(),\n",
" initial_guess=6)\n",
"bias_test_set"
]
},
{
"cell_type": "code",
2024-03-23 10:18:43 +01:00
"execution_count": 19,
2024-03-23 00:04:49 +01:00
"id": "248cb862-418e-4767-9933-70c4885ecf40",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"6.070461139075353"
]
},
2024-03-23 10:18:43 +01:00
"execution_count": 19,
2024-03-23 00:04:49 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# comparison with bias of the train set\n",
"X_train_score = logit_cv.predict_proba(X_train)[:, 1]\n",
"\n",
"bias_train_set = find_bias(odd_ratios = odd_ratio(adjust_score_1(X_train_score)), \n",
" y_objective = y_train[\"y_has_purchased\"].sum(),\n",
" initial_guess=6)\n",
"bias_train_set"
]
},
{
"cell_type": "code",
2024-03-23 10:18:43 +01:00
"execution_count": 20,
2024-03-23 00:04:49 +01:00
"id": "fff6cbe6-7bb3-4732-9b81-b9ac5383bbcf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"betâ test - betâ train = 0.016642008368292337\n"
]
}
],
"source": [
"print(\"betâ test - betâ train = \",np.log(bias_test_set/bias_train_set))"
]
},
{
"cell_type": "code",
2024-03-23 10:18:43 +01:00
"execution_count": 21,
2024-03-23 00:04:49 +01:00
"id": "f506870d-4a8a-4b2c-8f0b-e0789080b20c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"mean absolute erreur 0.001409799678121875\n"
]
}
],
"source": [
"# impact of considering a bias computed on train set instead of test set - totally neglectable\n",
"\n",
"score_adjusted_test = adjusted_score(odd_ratio(adjust_score_1(X_test_segment[\"score\"])), bias = bias_test_set)\n",
"score_adjusted_train = adjusted_score(odd_ratio(adjust_score_1(X_test_segment[\"score\"])), bias = bias_train_set)\n",
"\n",
"print(\"mean absolute erreur\",abs(score_adjusted_test-score_adjusted_train).mean())"
]
},
{
"cell_type": "code",
2024-03-23 10:18:43 +01:00
"execution_count": 22,
2024-03-23 00:04:49 +01:00
"id": "8213d0e4-063b-49fa-90b7-677fc34f4c01",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
2024-03-23 10:18:43 +01:00
"/tmp/ipykernel_519/1825363704.py:7: SettingWithCopyWarning: \n",
2024-03-23 00:04:49 +01:00
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" X_test_segment[\"score_adjusted\"] = score_adjusted_train\n"
]
}
],
"source": [
"# adjust scores accordingly \n",
"\n",
"# X_test_segment[\"score_adjusted\"] = adjusted_score(odd_ratio(adjust_score_1(X_test_segment[\"score\"])), bias = bias_test_set)\n",
"\n",
"# actually, we are not supposed to have X_test, so the biais is estimated on X_train\n",
"# X_test_segment[\"score_adjusted\"] = adjusted_score(odd_ratio(adjust_score_1(X_test_segment[\"score\"])), bias = bias_train_set)\n",
"X_test_segment[\"score_adjusted\"] = score_adjusted_train"
]
},
{
"cell_type": "code",
2024-03-23 10:18:43 +01:00
"execution_count": 23,
2024-03-23 00:04:49 +01:00
"id": "834d3723-2e72-4c65-9c62-e2d595c69461",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MSE for score : 0.15494387585189107\n",
"MSE for ajusted score : 0.08851697393139933\n",
"sum of y_has_purchased : 13690.0\n",
"sum of adjusted scores : 13825.476109871417\n"
]
}
],
"source": [
"# check \n",
"\n",
"MSE_score = ((X_test_segment[\"score\"]-X_test_segment[\"has_purchased\"])**2).mean()\n",
"MSE_ajusted_score = ((X_test_segment[\"score_adjusted\"]-X_test_segment[\"has_purchased\"])**2).mean()\n",
"print(f\"MSE for score : {MSE_score}\")\n",
"print(f\"MSE for ajusted score : {MSE_ajusted_score}\")\n",
"\n",
"print(\"sum of y_has_purchased :\",y_test[\"y_has_purchased\"].sum())\n",
"print(\"sum of adjusted scores :\", X_test_segment[\"score_adjusted\"].sum())"
]
},
{
"cell_type": "code",
2024-03-23 10:18:43 +01:00
"execution_count": 25,
2024-03-23 00:04:49 +01:00
"id": "9f30a4dd-a9d8-405a-a7d5-5324ae88cf70",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MAE for score : 0.32116357895490416\n",
"MAE for adjusted score : 0.17359227315595824\n"
]
}
],
"source": [
"# mean absolute error - divided by 2 with out method\n",
"\n",
"MAE_score = abs(X_test_segment[\"score\"]-X_test_segment[\"has_purchased\"]).mean()\n",
"MAE_ajusted_score = abs(X_test_segment[\"score_adjusted\"]-X_test_segment[\"has_purchased\"]).mean()\n",
"print(f\"MAE for score : {MAE_score}\")\n",
"print(f\"MAE for adjusted score : {MAE_ajusted_score}\")"
]
},
{
"cell_type": "code",
2024-03-23 10:18:43 +01:00
"execution_count": 26,
2024-03-23 00:04:49 +01:00
"id": "6f9396db-e213-408c-a596-eaeec3bc79f3",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAk0AAAHFCAYAAADv8c1wAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAABh7ElEQVR4nO3deVgVZf8/8PdhFRCOyo6SYiGK4IaJSIoruOCSmRaFa7igIgpqPj4mmo/mrmmpWS65hOaWphK4kYgLoriBS0qKCWKyKSIg3L8//DHfBhAHRBZ7v67rXHVmPjNzz32Gc97eZ2aOSgghQEREREQl0qjsBhARERFVBwxNRERERAowNBEREREpwNBEREREpABDExEREZECDE1ERERECjA0ERERESnA0ERERESkAEMTERERkQIMTaTYxYsXMWzYMNjY2KBGjRqoWbMmWrVqhQULFiAlJaWym/faDR06FA0aNKjsZhRrw4YNUKlUOHv2bLmtMzIyEkFBQUhLSyu3dVL1duzYMahUKhw7dqzCt/O6//62bt2KZcuWvZZ1N2jQAEOHDn0t66aKxdBEiqxduxZOTk6IiorC5MmTERISgt27d+PDDz/E6tWrMWLEiMpu4ms3Y8YM7N69u7KbUWEiIyMxa9YshiaqEl7339/rDE305tCq7AZQ1Xfy5EmMGTMG3bp1w549e6CrqyvN69atGwICAhASElKJLXy9njx5An19fbz99tuV3RR6wwgh8PTpU+jp6VV2U6o8/v29fnl5eXj27JnsPZ7kONJELzV37lyoVCp89913xf4x6ejooE+fPtLz/Px8LFiwAI0bN4auri7MzMwwePBg3L17V7Zcx44d4eDggJMnT6Jdu3bQ09NDgwYNsH79egDA/v370apVK+jr68PR0bFIMAsKCoJKpcL58+fRv39/GBkZQa1W49NPP8WDBw9ktdu2bYO7uzssLS2hp6eHJk2a4PPPP0dmZqasbujQoahZsyYuXboEd3d3GBoaokuXLtK8wl8P/Pzzz3B2doZarYa+vj4aNmyI4cOHy2ru3LmDTz/9FGZmZtDV1UWTJk2wePFi5OfnSzV//vknVCoVFi1ahCVLlsDGxgY1a9aEi4sLTp06VdLLI5Oamophw4ahTp06MDAwQO/evXHr1q0idYcOHUKXLl1gZGQEfX19uLq64vDhw7K+nTx5MgDAxsYGKpVK+rpk8uTJUKvVyMvLk+rHjx8PlUqFhQsXStMePnwIDQ0NrFixQpqWkZGBwMBA2NjYQEdHB3Xr1oW/v3+R10EIgW+//RYtWrSAnp4eateujQEDBhTZl4JjKCoqCu3bt5deg6+++krWvy+i5PVLS0tDQEAAGjZsKB3PPXv2xNWrV6WalJQU+Pr6om7dutDR0UHDhg0xffp0ZGdny9alUqkwbtw4rF69Gk2aNIGuri42btwIALhx4wa8vLxkx8k333zz0n0AgG+++QYdOnSAmZkZDAwM4OjoiAULFiA3N7fM/XX16lV0794d+vr6MDExwejRo/Ho0SNF7fnjjz8wbNgw2NraQl9fH3Xr1kXv3r1x6dKlIrVKt1P476/gb2bDhg1FalUqFYKCgqTnDx48wMiRI2FtbQ1dXV2YmprC1dUVhw4dkvpl//79uH37tnSsq1QqafmcnBzMmTNHek8zNTXFsGHDirzP5ObmYsqUKbCwsIC+vj7ee+89nDlzRlGfAcCqVavQvHlz1KxZE4aGhmjcuDH+85//yGr++usvaV90dHRgZWWFAQMG4P79+1JNad5zFixYgDlz5sDGxga6uro4evQoAODs2bPo06cP6tSpgxo1aqBly5bYvn274n15YwmiEjx79kzo6+sLZ2dnxcuMHDlSABDjxo0TISEhYvXq1cLU1FRYW1uLBw8eSHVubm7C2NhY2NnZiR9++EH89ttvwtPTUwAQs2bNEo6OjuKnn34SBw4cEG3bthW6urrir7/+kpafOXOmAC
Dq168vJk+eLH777TexZMkSYWBgIFq2bClycnKk2i+//FIsXbpU7N+/Xxw7dkysXr1a2NjYiE6dOsnaPmTIEKGtrS0aNGgg5s2bJw4fPix+++03aV79+vWl2sjISKFSqcRHH30kDhw4II4cOSLWr18vvL29pZrk5GRRt25dYWpqKlavXi1CQkLEuHHjBAAxZswYqS4+Pl4AEA0aNBDdu3cXe/bsEXv27BGOjo6idu3aIi0trcQ+X79+vQAgrK2txfDhw8XBgwfFd999J8zMzIS1tbVITU2Vajdt2iRUKpXo16+f2LVrl9i3b5/w9PQUmpqa4tChQ0IIIRISEsT48eMFALFr1y5x8uRJcfLkSZGeni5CQkIEABEZGSmts3HjxkJPT09069ZNmrZt2zYBQMTGxgohhMjMzBQtWrQQJiYmYsmSJeLQoUNi+fLlQq1Wi86dO4v8/HxpWR8fH6GtrS0CAgJESEiI2Lp1q2jcuLEwNzcXSUlJRY4hW1tbsXr1ahEWFiZ8fX0FALFx48YS+0zJ65eRkSGaNm0qDAwMxOzZs8Vvv/0mdu7cKSZMmCCOHDkihBAiKytLNGvWTBgYGIhFixaJ0NBQMWPGDKGlpSV69uwp2yYAUbduXdGsWTOxdetWceTIEXH58mVx5coVoVarhaOjo/jxxx9FaGioCAgIEBoaGiIoKKjE/RBCiIkTJ4pVq1aJkJAQceTIEbF06VJhYmIihg0bJqtT2l9JSUnCzMxM1K1bV6xfv14cOHBAfPLJJ+Ktt94SAMTRo0dLbE94eLgICAgQO3bsEOHh4WL37t2iX79+Qk9PT1y9erVM2yn891fwN7N+/foi2wcgZs6cKT338PAQpqam4rvvvhPHjh0Te/bsEV988YUIDg4WQghx5coV4erqKiwsLKRj/eTJk0IIIfLy8kT37t2FgYGBmDVrlggLCxPff/+9qFu3rrC3txdPnjyRtVGlUonJkyeL0NBQsWTJElG3bl1hZGQkhgwZUmKf/fTTTwKAGD9+vAgNDRWHDh0Sq1evFn5+flLN3bt3haWlpexvaNu2bWL48OEiLi5OCFH695y6deuKTp06iR07dojQ0FARHx8vjhw5InR0dET79u3Ftm3bREhIiBg6dOgL+/vfhKGJSpSUlCQAiI8++khRfVxcnAAgfH19ZdNPnz4tAIj//Oc/0jQ3NzcBQJw9e1aa9vDhQ6GpqSn09PRkASkmJkYAEF9//bU0rSA0TZw4UbatLVu2CABi8+bNxbYxPz9f5ObmivDwcAFAXLhwQZo3ZMgQAUCsW7euyHKF37QXLVokAJQYaD7//HMBQJw+fVo2fcyYMUKlUolr164JIf7vDczR0VE8e/ZMqjtz5owAIH766acXbkOI/wtN77//vmz6iRMnBAAxZ84cIcTz4FKnTh3Ru3dvWV1eXp5o3ry5aNOmjTRt4cKFAoCIj4+X1WZmZgodHR0xe/ZsIcTzN3IAYurUqUJPT088ffpUCPE8+FhZWUnLzZs3T2hoaIioqCjZ+nbs2CEAiAMHDgghhDh58qQAIBYvXiyrS0hIEHp6emLKlCnStIJjqHD/2tvbCw8PjxL7TMnrN3v2bAFAhIWFvbBm9erVAoDYvn27bPr8+fMFABEaGipNAyDUarVISUmR1Xp4eIh69eqJ9PR02fRx48aJGjVqFKkvSV5ensjNzRU//vij0NTUlC2rtL+mTp0qVCqViImJkdV169ZNUWgq7NmzZyInJ0fY2trK/l5Ls51XCU01a9YU/v7+JbaxV69esvUXKAgzO3fulE2PiooSAMS3334rhPi/974XvR+9LDSNGzdO1KpVq8Sa4cOHC21tbekfIsUp7XvO22+/LfsHphDP/xHUsmVLkZubK5vu6ekpLC0tRV5eXontfJPx6zkqVwVDu4WvFGnTpg2aNGki+woIACwtLeHk5CQ9r1OnDszMzNCiRQtYWVlJ05s0aQIAuH37dpFtfvLJJ7LnAwcOhJaWlt
QWALh16xa8vLxgYWEBTU1NaGtrw83NDQAQFxdXZJ0ffPDBS/f13Xfflba3fft2/PXXX0Vqjhw5Ant7e7Rp00Y2fej
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# visualization\n",
"\n",
"# histogramme des probas et des probas ajustées\n",
"\n",
2024-03-23 10:18:43 +01:00
"def plot_hist_scores(df, score, score_adjusted) :\n",
2024-03-23 00:04:49 +01:00
"\n",
" plt.figure()\n",
" plt.hist(df[score], label = \"score\", alpha=0.6)\n",
" plt.hist(df[score_adjusted], label=\"adjusted score\", alpha=0.6)\n",
" plt.legend()\n",
" plt.xlabel(\"probability of a future purchase\")\n",
" plt.ylabel(\"count\")\n",
" plt.title(\"Comparison between score and adjusted score\")\n",
" plt.show()\n",
"\n",
2024-03-23 10:18:43 +01:00
"plot_hist_scores(X_test_segment, score = \"score\", score_adjusted = \"score_adjusted\")"
2024-03-23 00:04:49 +01:00
]
},
{
"cell_type": "markdown",
"id": "e6fae260-fab8-4f51-90dc-9b6d7314c77b",
"metadata": {},
"source": [
"## Compute number of tickets and CA by segment with the recalibrated score"
]
},
{
"cell_type": "code",
2024-03-23 10:18:43 +01:00
"execution_count": 27,
2024-03-23 00:04:49 +01:00
"id": "c618cebc-c295-47f7-bd76-b7e18778a17c",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_tickets</th>\n",
" <th>nb_purchases</th>\n",
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>time_between_purchase</th>\n",
" <th>nb_tickets_internet</th>\n",
" <th>fidelity</th>\n",
" <th>...</th>\n",
" <th>gender_female</th>\n",
" <th>gender_male</th>\n",
" <th>gender_other</th>\n",
" <th>nb_campaigns</th>\n",
" <th>nb_campaigns_opened</th>\n",
" <th>has_purchased</th>\n",
" <th>has_purchased_estim</th>\n",
" <th>score</th>\n",
" <th>quartile</th>\n",
" <th>score_adjusted</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>100.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.177187</td>\n",
" <td>5.177187</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.657671</td>\n",
" <td>3</td>\n",
" <td>0.240397</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>55.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>426.265613</td>\n",
" <td>426.265613</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>2</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.266538</td>\n",
" <td>2</td>\n",
" <td>0.056482</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>17.0</td>\n",
" <td>1.0</td>\n",
" <td>80.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>436.033437</td>\n",
" <td>436.033437</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>2</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.214668</td>\n",
" <td>1</td>\n",
" <td>0.043089</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>120.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.196412</td>\n",
" <td>5.196412</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.657770</td>\n",
" <td>3</td>\n",
" <td>0.240478</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>34.0</td>\n",
" <td>2.0</td>\n",
" <td>416.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>478.693148</td>\n",
" <td>115.631470</td>\n",
" <td>363.061678</td>\n",
" <td>0.0</td>\n",
" <td>4</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.894173</td>\n",
" <td>4</td>\n",
" <td>0.581920</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 22 columns</p>\n",
"</div>"
],
"text/plain": [
" nb_tickets nb_purchases total_amount nb_suppliers vente_internet_max \\\n",
"0 4.0 1.0 100.0 1.0 0.0 \n",
"1 1.0 1.0 55.0 1.0 0.0 \n",
"2 17.0 1.0 80.0 1.0 0.0 \n",
"3 4.0 1.0 120.0 1.0 0.0 \n",
"4 34.0 2.0 416.0 1.0 0.0 \n",
"\n",
" purchase_date_min purchase_date_max time_between_purchase \\\n",
"0 5.177187 5.177187 0.000000 \n",
"1 426.265613 426.265613 0.000000 \n",
"2 436.033437 436.033437 0.000000 \n",
"3 5.196412 5.196412 0.000000 \n",
"4 478.693148 115.631470 363.061678 \n",
"\n",
" nb_tickets_internet fidelity ... gender_female gender_male \\\n",
"0 0.0 1 ... 1 0 \n",
"1 0.0 2 ... 0 1 \n",
"2 0.0 2 ... 1 0 \n",
"3 0.0 1 ... 1 0 \n",
"4 0.0 4 ... 1 0 \n",
"\n",
" gender_other nb_campaigns nb_campaigns_opened has_purchased \\\n",
"0 0 0.0 0.0 0.0 \n",
"1 0 0.0 0.0 1.0 \n",
"2 0 0.0 0.0 0.0 \n",
"3 0 0.0 0.0 0.0 \n",
"4 0 0.0 0.0 1.0 \n",
"\n",
" has_purchased_estim score quartile score_adjusted \n",
"0 1.0 0.657671 3 0.240397 \n",
"1 0.0 0.266538 2 0.056482 \n",
"2 0.0 0.214668 1 0.043089 \n",
"3 1.0 0.657770 3 0.240478 \n",
"4 1.0 0.894173 4 0.581920 \n",
"\n",
"[5 rows x 22 columns]"
]
},
2024-03-23 10:18:43 +01:00
"execution_count": 27,
2024-03-23 00:04:49 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test_segment.head()"
]
},
{
"cell_type": "code",
2024-03-23 10:18:43 +01:00
"execution_count": 28,
2024-03-23 00:04:49 +01:00
"id": "29633dd2-8b4b-48dc-be02-52f4015e686d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>score</th>\n",
" <th>score_adjusted</th>\n",
" <th>has_purchased</th>\n",
" </tr>\n",
" <tr>\n",
" <th>quartile</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.132457</td>\n",
" <td>0.025105</td>\n",
" <td>0.015691</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.338914</td>\n",
" <td>0.079990</td>\n",
" <td>0.098486</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.630647</td>\n",
" <td>0.225757</td>\n",
" <td>0.214729</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.905216</td>\n",
" <td>0.661997</td>\n",
" <td>0.650133</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" score score_adjusted has_purchased\n",
"quartile \n",
"1 0.132457 0.025105 0.015691\n",
"2 0.338914 0.079990 0.098486\n",
"3 0.630647 0.225757 0.214729\n",
"4 0.905216 0.661997 0.650133"
]
},
2024-03-23 10:18:43 +01:00
"execution_count": 28,
2024-03-23 00:04:49 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test_segment.groupby(\"quartile\")[[\"score\",\"score_adjusted\", \"has_purchased\"]].mean()"
]
},
{
"cell_type": "code",
2024-03-23 10:18:43 +01:00
"execution_count": 57,
2024-03-23 00:04:49 +01:00
"id": "a974589f-7952-4db2-bebf-7b69c6b09372",
"metadata": {},
"outputs": [],
"source": [
"def project_tickets_CA (df, nb_tickets, total_amount, score_adjusted, duration_ref, duration_projection) :\n",
"    \"\"\"Rescale ticket counts and amounts to a projection period and weight them by a score.\n",
"\n",
"    Four columns are added to a copy of `df`:\n",
"\n",
"    - `nb_tickets_projected`   = df[nb_tickets] / (duration_ref / duration_projection)\n",
"    - `total_amount_projected` = df[total_amount] / (duration_ref / duration_projection)\n",
"    - `nb_tickets_expected`    = df[score_adjusted] * nb_tickets_projected\n",
"    - `total_amount_expected`  = df[score_adjusted] * total_amount_projected\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df : pandas.DataFrame\n",
"        Input table; it is not modified.\n",
"    nb_tickets, total_amount, score_adjusted : str\n",
"        Names of the columns of `df` used as ticket count, amount and multiplicative weight.\n",
"    duration_ref, duration_projection : float\n",
"        Presumably the lengths of the observed and of the projected period, expressed in\n",
"        the same unit -- TODO confirm. Only their ratio is used.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        A new DataFrame holding the columns of `df` plus the four columns listed above.\n",
"    \"\"\"\n",
"    duration_ratio = duration_ref / duration_projection\n",
"\n",
"    # Work on a copy: assigning columns on the caller's frame would silently change it\n",
"    # (and raises SettingWithCopyWarning when that frame is itself a slice of another one)\n",
"    df_output = df.copy()\n",
"\n",
"    df_output[\"nb_tickets_projected\"] = df_output[nb_tickets] / duration_ratio\n",
"    df_output[\"total_amount_projected\"] = df_output[total_amount] / duration_ratio\n",
"\n",
"    df_output[\"nb_tickets_expected\"] = df_output[score_adjusted] * df_output[\"nb_tickets_projected\"]\n",
"    df_output[\"total_amount_expected\"] = df_output[score_adjusted] * df_output[\"total_amount_projected\"]\n",
"\n",
"    return df_output\n"
]
},
{
"cell_type": "code",
"execution_count": 124,
"id": "1e000901-717d-4851-9db2-df90998d35ed",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_tickets</th>\n",
" <th>nb_purchases</th>\n",
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>time_between_purchase</th>\n",
" <th>nb_tickets_internet</th>\n",
" <th>fidelity</th>\n",
" <th>...</th>\n",
" <th>gender_female</th>\n",
" <th>gender_male</th>\n",
" <th>gender_other</th>\n",
" <th>nb_campaigns</th>\n",
" <th>nb_campaigns_opened</th>\n",
" <th>has_purchased</th>\n",
" <th>has_purchased_estim</th>\n",
" <th>score</th>\n",
" <th>quartile</th>\n",
" <th>score_adjusted</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>100.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.177187</td>\n",
" <td>5.177187</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.657671</td>\n",
" <td>3</td>\n",
" <td>0.240397</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>55.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>426.265613</td>\n",
" <td>426.265613</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>2</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.266538</td>\n",
" <td>2</td>\n",
" <td>0.056482</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>17.0</td>\n",
" <td>1.0</td>\n",
" <td>80.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>436.033437</td>\n",
" <td>436.033437</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>2</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.214668</td>\n",
" <td>1</td>\n",
" <td>0.043089</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>120.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.196412</td>\n",
" <td>5.196412</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.657770</td>\n",
" <td>3</td>\n",
" <td>0.240478</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>34.0</td>\n",
" <td>2.0</td>\n",
" <td>416.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>478.693148</td>\n",
" <td>115.631470</td>\n",
" <td>363.061678</td>\n",
" <td>0.0</td>\n",
" <td>4</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.894173</td>\n",
" <td>4</td>\n",
" <td>0.581920</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96091</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>67.31</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>278.442257</td>\n",
" <td>278.442257</td>\n",
" <td>0.000000</td>\n",
" <td>1.0</td>\n",
" <td>2</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>15.0</td>\n",
" <td>5.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.623551</td>\n",
" <td>3</td>\n",
" <td>0.214369</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96092</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>61.41</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>189.207373</td>\n",
" <td>189.207373</td>\n",
" <td>0.000000</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>12.0</td>\n",
" <td>9.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.682521</td>\n",
" <td>3</td>\n",
" <td>0.261526</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96093</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>-1.000000</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>29.0</td>\n",
" <td>3.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.117192</td>\n",
" <td>1</td>\n",
" <td>0.021400</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96094</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>79.43</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>279.312905</td>\n",
" <td>279.312905</td>\n",
" <td>0.000000</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>20.0</td>\n",
" <td>4.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.625185</td>\n",
" <td>3</td>\n",
" <td>0.215545</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96095</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>-1.000000</td>\n",
" <td>0.0</td>\n",
" <td>2</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>31.0</td>\n",
" <td>4.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.319585</td>\n",
" <td>2</td>\n",
" <td>0.071817</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>96096 rows × 22 columns</p>\n",
"</div>"
],
"text/plain": [
" nb_tickets nb_purchases total_amount nb_suppliers \\\n",
"0 4.0 1.0 100.00 1.0 \n",
"1 1.0 1.0 55.00 1.0 \n",
"2 17.0 1.0 80.00 1.0 \n",
"3 4.0 1.0 120.00 1.0 \n",
"4 34.0 2.0 416.00 1.0 \n",
"... ... ... ... ... \n",
"96091 1.0 1.0 67.31 1.0 \n",
"96092 1.0 1.0 61.41 1.0 \n",
"96093 0.0 0.0 0.00 0.0 \n",
"96094 1.0 1.0 79.43 1.0 \n",
"96095 0.0 0.0 0.00 0.0 \n",
"\n",
" vente_internet_max purchase_date_min purchase_date_max \\\n",
"0 0.0 5.177187 5.177187 \n",
"1 0.0 426.265613 426.265613 \n",
"2 0.0 436.033437 436.033437 \n",
"3 0.0 5.196412 5.196412 \n",
"4 0.0 478.693148 115.631470 \n",
"... ... ... ... \n",
"96091 1.0 278.442257 278.442257 \n",
"96092 1.0 189.207373 189.207373 \n",
"96093 0.0 550.000000 550.000000 \n",
"96094 1.0 279.312905 279.312905 \n",
"96095 0.0 550.000000 550.000000 \n",
"\n",
" time_between_purchase nb_tickets_internet fidelity ... \\\n",
"0 0.000000 0.0 1 ... \n",
"1 0.000000 0.0 2 ... \n",
"2 0.000000 0.0 2 ... \n",
"3 0.000000 0.0 1 ... \n",
"4 363.061678 0.0 4 ... \n",
"... ... ... ... ... \n",
"96091 0.000000 1.0 2 ... \n",
"96092 0.000000 1.0 1 ... \n",
"96093 -1.000000 0.0 1 ... \n",
"96094 0.000000 1.0 1 ... \n",
"96095 -1.000000 0.0 2 ... \n",
"\n",
" gender_female gender_male gender_other nb_campaigns \\\n",
"0 1 0 0 0.0 \n",
"1 0 1 0 0.0 \n",
"2 1 0 0 0.0 \n",
"3 1 0 0 0.0 \n",
"4 1 0 0 0.0 \n",
"... ... ... ... ... \n",
"96091 0 1 0 15.0 \n",
"96092 0 1 0 12.0 \n",
"96093 1 0 0 29.0 \n",
"96094 0 1 0 20.0 \n",
"96095 0 1 0 31.0 \n",
"\n",
" nb_campaigns_opened has_purchased has_purchased_estim score \\\n",
"0 0.0 0.0 1.0 0.657671 \n",
"1 0.0 1.0 0.0 0.266538 \n",
"2 0.0 0.0 0.0 0.214668 \n",
"3 0.0 0.0 1.0 0.657770 \n",
"4 0.0 1.0 1.0 0.894173 \n",
"... ... ... ... ... \n",
"96091 5.0 1.0 1.0 0.623551 \n",
"96092 9.0 0.0 1.0 0.682521 \n",
"96093 3.0 0.0 0.0 0.117192 \n",
"96094 4.0 0.0 1.0 0.625185 \n",
"96095 4.0 0.0 0.0 0.319585 \n",
"\n",
" quartile score_adjusted \n",
"0 3 0.240397 \n",
"1 2 0.056482 \n",
"2 1 0.043089 \n",
"3 3 0.240478 \n",
"4 4 0.581920 \n",
"... ... ... \n",
"96091 3 0.214369 \n",
"96092 3 0.261526 \n",
"96093 1 0.021400 \n",
"96094 3 0.215545 \n",
"96095 2 0.071817 \n",
"\n",
"[96096 rows x 22 columns]"
]
},
"execution_count": 124,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test_segment"
]
},
{
"cell_type": "code",
2024-03-23 10:18:43 +01:00
"execution_count": 56,
2024-03-23 00:04:49 +01:00
"id": "dd8a52e1-d06e-4790-8687-8e58e3e6b84e",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
2024-03-23 10:18:43 +01:00
"/tmp/ipykernel_519/3509011500.py:7: SettingWithCopyWarning: \n",
2024-03-23 00:04:49 +01:00
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_output[\"nb_tickets_projected\"] = df_output[nb_tickets] / duration_ratio\n",
2024-03-23 10:18:43 +01:00
"/tmp/ipykernel_519/3509011500.py:8: SettingWithCopyWarning: \n",
2024-03-23 00:04:49 +01:00
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_output[\"total_amount_projected\"] = df_output[total_amount] / duration_ratio\n",
2024-03-23 10:18:43 +01:00
"/tmp/ipykernel_519/3509011500.py:10: SettingWithCopyWarning: \n",
2024-03-23 00:04:49 +01:00
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_output[\"nb_tickets_expected\"] = df_output[score_adjusted] * df_output[\"nb_tickets_projected\"]\n",
2024-03-23 10:18:43 +01:00
"/tmp/ipykernel_519/3509011500.py:11: SettingWithCopyWarning: \n",
2024-03-23 00:04:49 +01:00
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_output[\"total_amount_expected\"] = df_output[score_adjusted] * df_output[\"total_amount_projected\"]\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_tickets</th>\n",
" <th>nb_purchases</th>\n",
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>time_between_purchase</th>\n",
" <th>nb_tickets_internet</th>\n",
" <th>fidelity</th>\n",
" <th>...</th>\n",
" <th>nb_campaigns_opened</th>\n",
" <th>has_purchased</th>\n",
" <th>has_purchased_estim</th>\n",
" <th>score</th>\n",
" <th>quartile</th>\n",
" <th>score_adjusted</th>\n",
" <th>nb_tickets_projected</th>\n",
" <th>total_amount_projected</th>\n",
" <th>nb_tickets_expected</th>\n",
" <th>total_amount_expected</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>100.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.177187</td>\n",
" <td>5.177187</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.657671</td>\n",
" <td>3</td>\n",
" <td>0.240397</td>\n",
" <td>2.666667</td>\n",
" <td>66.666667</td>\n",
" <td>0.641059</td>\n",
" <td>16.026472</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>55.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>426.265613</td>\n",
" <td>426.265613</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>2</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.266538</td>\n",
" <td>2</td>\n",
" <td>0.056482</td>\n",
" <td>0.666667</td>\n",
" <td>36.666667</td>\n",
" <td>0.037655</td>\n",
" <td>2.071006</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>17.0</td>\n",
" <td>1.0</td>\n",
" <td>80.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>436.033437</td>\n",
" <td>436.033437</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>2</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.214668</td>\n",
" <td>1</td>\n",
" <td>0.043089</td>\n",
" <td>11.333333</td>\n",
" <td>53.333333</td>\n",
" <td>0.488340</td>\n",
" <td>2.298068</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>120.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.196412</td>\n",
" <td>5.196412</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.657770</td>\n",
" <td>3</td>\n",
" <td>0.240478</td>\n",
" <td>2.666667</td>\n",
" <td>80.000000</td>\n",
" <td>0.641273</td>\n",
" <td>19.238202</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>34.0</td>\n",
" <td>2.0</td>\n",
" <td>416.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>478.693148</td>\n",
" <td>115.631470</td>\n",
" <td>363.061678</td>\n",
" <td>0.0</td>\n",
" <td>4</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.894173</td>\n",
" <td>4</td>\n",
" <td>0.581920</td>\n",
" <td>22.666667</td>\n",
" <td>277.333333</td>\n",
" <td>13.190183</td>\n",
" <td>161.385771</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96091</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>67.31</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>278.442257</td>\n",
" <td>278.442257</td>\n",
" <td>0.000000</td>\n",
" <td>1.0</td>\n",
" <td>2</td>\n",
" <td>...</td>\n",
" <td>5.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.623551</td>\n",
" <td>3</td>\n",
" <td>0.214369</td>\n",
" <td>0.666667</td>\n",
" <td>44.873333</td>\n",
" <td>0.142913</td>\n",
" <td>9.619467</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96092</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>61.41</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>189.207373</td>\n",
" <td>189.207373</td>\n",
" <td>0.000000</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>9.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.682521</td>\n",
" <td>3</td>\n",
" <td>0.261526</td>\n",
" <td>0.666667</td>\n",
" <td>40.940000</td>\n",
" <td>0.174351</td>\n",
" <td>10.706885</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96093</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>-1.000000</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>3.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.117192</td>\n",
" <td>1</td>\n",
" <td>0.021400</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96094</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>79.43</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>279.312905</td>\n",
" <td>279.312905</td>\n",
" <td>0.000000</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>4.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.625185</td>\n",
" <td>3</td>\n",
" <td>0.215545</td>\n",
" <td>0.666667</td>\n",
" <td>52.953333</td>\n",
" <td>0.143697</td>\n",
" <td>11.413840</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96095</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>-1.000000</td>\n",
" <td>0.0</td>\n",
" <td>2</td>\n",
" <td>...</td>\n",
" <td>4.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.319585</td>\n",
" <td>2</td>\n",
" <td>0.071817</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>96096 rows × 26 columns</p>\n",
"</div>"
],
"text/plain": [
" nb_tickets nb_purchases total_amount nb_suppliers \\\n",
"0 4.0 1.0 100.00 1.0 \n",
"1 1.0 1.0 55.00 1.0 \n",
"2 17.0 1.0 80.00 1.0 \n",
"3 4.0 1.0 120.00 1.0 \n",
"4 34.0 2.0 416.00 1.0 \n",
"... ... ... ... ... \n",
"96091 1.0 1.0 67.31 1.0 \n",
"96092 1.0 1.0 61.41 1.0 \n",
"96093 0.0 0.0 0.00 0.0 \n",
"96094 1.0 1.0 79.43 1.0 \n",
"96095 0.0 0.0 0.00 0.0 \n",
"\n",
" vente_internet_max purchase_date_min purchase_date_max \\\n",
"0 0.0 5.177187 5.177187 \n",
"1 0.0 426.265613 426.265613 \n",
"2 0.0 436.033437 436.033437 \n",
"3 0.0 5.196412 5.196412 \n",
"4 0.0 478.693148 115.631470 \n",
"... ... ... ... \n",
"96091 1.0 278.442257 278.442257 \n",
"96092 1.0 189.207373 189.207373 \n",
"96093 0.0 550.000000 550.000000 \n",
"96094 1.0 279.312905 279.312905 \n",
"96095 0.0 550.000000 550.000000 \n",
"\n",
" time_between_purchase nb_tickets_internet fidelity ... \\\n",
"0 0.000000 0.0 1 ... \n",
"1 0.000000 0.0 2 ... \n",
"2 0.000000 0.0 2 ... \n",
"3 0.000000 0.0 1 ... \n",
"4 363.061678 0.0 4 ... \n",
"... ... ... ... ... \n",
"96091 0.000000 1.0 2 ... \n",
"96092 0.000000 1.0 1 ... \n",
"96093 -1.000000 0.0 1 ... \n",
"96094 0.000000 1.0 1 ... \n",
"96095 -1.000000 0.0 2 ... \n",
"\n",
" nb_campaigns_opened has_purchased has_purchased_estim score \\\n",
"0 0.0 0.0 1.0 0.657671 \n",
"1 0.0 1.0 0.0 0.266538 \n",
"2 0.0 0.0 0.0 0.214668 \n",
"3 0.0 0.0 1.0 0.657770 \n",
"4 0.0 1.0 1.0 0.894173 \n",
"... ... ... ... ... \n",
"96091 5.0 1.0 1.0 0.623551 \n",
"96092 9.0 0.0 1.0 0.682521 \n",
"96093 3.0 0.0 0.0 0.117192 \n",
"96094 4.0 0.0 1.0 0.625185 \n",
"96095 4.0 0.0 0.0 0.319585 \n",
"\n",
" quartile score_adjusted nb_tickets_projected total_amount_projected \\\n",
"0 3 0.240397 2.666667 66.666667 \n",
"1 2 0.056482 0.666667 36.666667 \n",
"2 1 0.043089 11.333333 53.333333 \n",
"3 3 0.240478 2.666667 80.000000 \n",
"4 4 0.581920 22.666667 277.333333 \n",
"... ... ... ... ... \n",
"96091 3 0.214369 0.666667 44.873333 \n",
"96092 3 0.261526 0.666667 40.940000 \n",
"96093 1 0.021400 0.000000 0.000000 \n",
"96094 3 0.215545 0.666667 52.953333 \n",
"96095 2 0.071817 0.000000 0.000000 \n",
"\n",
" nb_tickets_expected total_amount_expected \n",
"0 0.641059 16.026472 \n",
"1 0.037655 2.071006 \n",
"2 0.488340 2.298068 \n",
"3 0.641273 19.238202 \n",
"4 13.190183 161.385771 \n",
"... ... ... \n",
"96091 0.142913 9.619467 \n",
"96092 0.174351 10.706885 \n",
"96093 0.000000 0.000000 \n",
"96094 0.143697 11.413840 \n",
"96095 0.000000 0.000000 \n",
"\n",
"[96096 rows x 26 columns]"
]
},
2024-03-23 10:18:43 +01:00
"execution_count": 56,
2024-03-23 00:04:49 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Add the projected / expected ticket and revenue columns, weighted by score_adjusted\n",
"X_test_segment = project_tickets_CA(\n",
"    df=X_test_segment,\n",
"    nb_tickets=\"nb_tickets\",\n",
"    total_amount=\"total_amount\",\n",
"    score_adjusted=\"score_adjusted\",\n",
"    duration_ref=1.5,\n",
"    duration_projection=1,\n",
")\n",
"X_test_segment"
]
},
{
"cell_type": "code",
2024-03-23 10:18:43 +01:00
"execution_count": 169,
"id": "78d12889-b310-4eca-8a2a-8f2535c7b2e5",
2024-03-23 00:04:49 +01:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>quartile</th>\n",
2024-03-23 10:18:43 +01:00
" <th>size</th>\n",
" <th>size_perct</th>\n",
2024-03-23 00:04:49 +01:00
" <th>nb_tickets_expected</th>\n",
" <th>total_amount_expected</th>\n",
2024-03-23 10:18:43 +01:00
" <th>perct_revenue_recovered</th>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
2024-03-23 10:18:43 +01:00
" <td>37410</td>\n",
" <td>38.929820</td>\n",
" <td>84.764915</td>\n",
" <td>1.867190e+03</td>\n",
" <td>4.384354</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
2024-03-23 10:18:43 +01:00
" <td>29517</td>\n",
" <td>30.716159</td>\n",
" <td>2899.288091</td>\n",
" <td>7.446102e+04</td>\n",
" <td>9.854069</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>20137</td>\n",
" <td>20.955087</td>\n",
" <td>10876.786661</td>\n",
" <td>3.442867e+05</td>\n",
" <td>22.842135</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>9032</td>\n",
" <td>9.398934</td>\n",
" <td>215194.829104</td>\n",
" <td>9.899418e+06</td>\n",
" <td>90.107285</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" quartile size size_perct nb_tickets_expected total_amount_expected \\\n",
"0 1 37410 38.929820 84.764915 1.867190e+03 \n",
"1 2 29517 30.716159 2899.288091 7.446102e+04 \n",
"2 3 20137 20.955087 10876.786661 3.442867e+05 \n",
"3 4 9032 9.398934 215194.829104 9.899418e+06 \n",
"\n",
" perct_revenue_recovered \n",
"0 4.384354 \n",
"1 9.854069 \n",
"2 22.842135 \n",
"3 90.107285 "
]
},
"execution_count": 169,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# compute nb tickets estimated and total amount expected\n",
"X_test_expected_CA = X_test_segment.groupby(\"quartile\")[[\"nb_tickets_expected\", \"total_amount_expected\"]].sum().reset_index()\n",
"\n",
"# number of customers by segment\n",
"X_test_expected_CA.insert(1, \"size\", X_test_segment.groupby(\"quartile\").size().values)\n",
"\n",
"# size in percent of all customers\n",
"X_test_expected_CA.insert(2, \"size_perct\", 100 * X_test_expected_CA[\"size\"]/X_test_expected_CA[\"size\"].sum())\n",
"\n",
"# compute share of CA recovered\n",
"duration_ref=1.5\n",
"duration_projection=1\n",
"duration_ratio=duration_ref/duration_projection\n",
"\n",
"X_test_expected_CA[\"perct_revenue_recovered\"] = 100 * duration_ratio * X_test_expected_CA[\"total_amount_expected\"] / \\\n",
"X_test_segment.groupby(\"quartile\")[\"total_amount\"].sum().values\n",
"\n",
"X_test_expected_CA"
]
},
2024-03-23 10:18:43 +01:00
{
"cell_type": "code",
"execution_count": 31,
"id": "f58f9151-2f91-45df-abb7-1ddcf0652adc",
"metadata": {},
"outputs": [],
"source": [
"# generalization with a function\n",
"\n",
"def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, total_amount,\n",
"                        duration_ref=1.5, duration_projection=1) :\n",
"    \"\"\"Summarise expected tickets and revenue per segment.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df : pandas.DataFrame\n",
"        Input table; it is not modified.\n",
"    segment : str\n",
"        Column used to group the rows.\n",
"    nb_tickets_expected, total_amount_expected : str\n",
"        Columns summed within each segment.\n",
"    total_amount : str\n",
"        Column whose per-segment sum is the denominator of `perct_revenue_recovered`.\n",
"    duration_ref, duration_projection : float, optional\n",
"        Only their ratio is used, as a multiplier in `perct_revenue_recovered`.\n",
"        The defaults (1.5 and 1) are the values that were previously hard-coded.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        One row per segment with the columns `segment`, `size` (number of rows),\n",
"        `size_perct` (percentage of all rows), the two summed columns and\n",
"        `perct_revenue_recovered`\n",
"        (100 * duration_ratio * summed total_amount_expected / summed total_amount).\n",
"        A segment whose summed `total_amount` is 0 gets a non-finite percentage.\n",
"    \"\"\"\n",
"    duration_ratio = duration_ref / duration_projection\n",
"\n",
"    # group once; every aggregate below then shares the same segment order\n",
"    segments = df.groupby(segment)\n",
"\n",
"    # compute nb tickets estimated and total amount expected\n",
"    df_expected_CA = segments[[nb_tickets_expected, total_amount_expected]].sum().reset_index()\n",
"\n",
"    # number of customers by segment\n",
"    df_expected_CA.insert(1, \"size\", segments.size().values)\n",
"\n",
"    # size in percent of all customers\n",
"    df_expected_CA.insert(2, \"size_perct\", 100 * df_expected_CA[\"size\"] / df_expected_CA[\"size\"].sum())\n",
"\n",
"    # compute share of CA recovered\n",
"    df_expected_CA[\"perct_revenue_recovered\"] = (\n",
"        100 * duration_ratio * df_expected_CA[total_amount_expected]\n",
"        / segments[total_amount].sum().values\n",
"    )\n",
"\n",
"    return df_expected_CA"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "c8df6c80-43e8-4f00-9cd3-eb9022744313",
"metadata": {},
"outputs": [],
"source": [
"# per-quartile summary of expected tickets and revenue, rounded to 2 decimals\n",
"summary_expected_CA(\n",
"    df=X_test_segment,\n",
"    segment=\"quartile\",\n",
"    nb_tickets_expected=\"nb_tickets_expected\",\n",
"    total_amount_expected=\"total_amount_expected\",\n",
"    total_amount=\"total_amount\",\n",
").round(2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d45dbf34-26f4-4340-91b9-ab6389b5466f",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "88ea1b3d-01ba-4edf-aecf-0a6747a86ca6",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "ac706ed7-defa-4df1-82e1-06f12fc1b6ad",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "26cc273c-17b5-4f46-89e9-773092d6e53a",
"metadata": {},
"outputs": [],
"source": []
},
2024-03-23 00:04:49 +01:00
{
"cell_type": "markdown",
"id": "9c471bdd-25c2-420a-a8a1-3add9f003cbc",
"metadata": {},
"source": [
"## Just to try: same computation with the raw score instead of the adjusted score\n",
"\n",
"The result seems overestimated: if only 14% of customers come back, how could we recover 22% of the revenue from the segment that is least likely to buy?"
]
},
{
"cell_type": "code",
"execution_count": 201,
"id": "53684a24-1809-465f-8e21-b9295e34582a",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_620/3599949626.py:7: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_output[\"nb_tickets_projected\"] = df_output[nb_tickets] / duration_ratio\n",
"/tmp/ipykernel_620/3599949626.py:8: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_output[\"total_amount_projected\"] = df_output[total_amount] / duration_ratio\n",
"/tmp/ipykernel_620/3599949626.py:10: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_output[\"nb_tickets_expected\"] = df_output[score_adjusted] * df_output[\"nb_tickets_projected\"]\n",
"/tmp/ipykernel_620/3599949626.py:11: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_output[\"total_amount_expected\"] = df_output[score_adjusted] * df_output[\"total_amount_projected\"]\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>quartile</th>\n",
" <th>size</th>\n",
" <th>size_perct</th>\n",
" <th>nb_tickets_expected</th>\n",
" <th>total_amount_expected</th>\n",
" <th>perct_revenue_recovered</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>37410</td>\n",
" <td>38.929820</td>\n",
" <td>419.757918</td>\n",
" <td>9.245081e+03</td>\n",
" <td>21.708404</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>29517</td>\n",
" <td>30.716159</td>\n",
" <td>11549.060736</td>\n",
" <td>2.965220e+05</td>\n",
" <td>39.241320</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>20137</td>\n",
" <td>20.955087</td>\n",
" <td>29997.854731</td>\n",
" <td>9.547519e+05</td>\n",
" <td>63.344224</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>9032</td>\n",
" <td>9.398934</td>\n",
" <td>244655.821195</td>\n",
" <td>1.073601e+07</td>\n",
" <td>97.722201</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" quartile size size_perct nb_tickets_expected total_amount_expected \\\n",
"0 1 37410 38.929820 419.757918 9.245081e+03 \n",
"1 2 29517 30.716159 11549.060736 2.965220e+05 \n",
"2 3 20137 20.955087 29997.854731 9.547519e+05 \n",
"3 4 9032 9.398934 244655.821195 1.073601e+07 \n",
"\n",
" perct_revenue_recovered \n",
"0 21.708404 \n",
"1 39.241320 \n",
"2 63.344224 \n",
"3 97.722201 "
]
},
"execution_count": 201,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# durations used for the projection; their ratio enters the revenue share below\n",
"duration_ref = 1.5\n",
"duration_projection = 1\n",
"duration_ratio = duration_ref / duration_projection\n",
"\n",
"# Project on a copy, weighting by the raw score: without the copy, the score-based\n",
"# nb_tickets_expected / total_amount_expected would overwrite the score_adjusted-based\n",
"# columns already stored in X_test_segment\n",
"X_test_segment_bis = project_tickets_CA(\n",
"    X_test_segment.copy(), \"nb_tickets\", \"total_amount\", \"score\",\n",
"    duration_ref=duration_ref, duration_projection=duration_projection,\n",
")\n",
"\n",
"# compute nb tickets estimated and total amount expected\n",
"X_test_expected_CA_bis = (\n",
"    X_test_segment_bis\n",
"    .groupby(\"quartile\")[[\"nb_tickets_expected\", \"total_amount_expected\"]]\n",
"    .sum()\n",
"    .reset_index()\n",
")\n",
"\n",
"# number of customers by segment\n",
"X_test_expected_CA_bis.insert(1, \"size\", X_test_segment_bis.groupby(\"quartile\").size().values)\n",
"\n",
"# size in percent of all customers\n",
"X_test_expected_CA_bis.insert(\n",
"    2, \"size_perct\", 100 * X_test_expected_CA_bis[\"size\"] / X_test_expected_CA_bis[\"size\"].sum()\n",
")\n",
"\n",
"# compute share of CA recovered\n",
"X_test_expected_CA_bis[\"perct_revenue_recovered\"] = (\n",
"    100 * duration_ratio * X_test_expected_CA_bis[\"total_amount_expected\"]\n",
"    / X_test_segment_bis.groupby(\"quartile\")[\"total_amount\"].sum().values\n",
")\n",
"\n",
"X_test_expected_CA_bis"
]
},
{
"cell_type": "code",
"execution_count": 203,
"id": "7dc66d1e-da03-4513-96e4-d9a43ac0a2c8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"overall share of revenue recovered : 90.26 %\n"
]
}
],
"source": [
"# share of revenue recovered over all segments, for the *_bis projection\n",
"print(\n",
"    \"overall share of revenue recovered : \",\n",
"    round(\n",
"        100 * duration_ratio * X_test_expected_CA_bis[\"total_amount_expected\"].sum()\n",
"        / X_test_segment_bis[\"total_amount\"].sum(),\n",
"        2,\n",
"    ),\n",
"    \"%\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "67cc9c5c-fff2-4d3c-8bfc-b59e06fa6e3a",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "aab045f6-81a1-4c02-9724-eec32b30a355",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "673f2969-7b9a-44c1-abf5-5679fca877ce",
"metadata": {},
"source": [
"## Last pieces of analysis"
]
},
{
"cell_type": "code",
"execution_count": 161,
"id": "2365bb13-0f3f-49d5-bf91-52c92abebcee",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"overall share of revenue recovered : 77.64%\n"
]
}
],
"source": [
"# share of revenue recovered, all segments together\n",
"# NOTE(review): duration_ratio is a notebook-level variable that the *_bis cell above also assigns;\n",
"# confirm it holds the value matching X_test_expected_CA when the notebook is run top to bottom\n",
"global_revenue_recovered = round(\n",
"    100 * duration_ratio * X_test_expected_CA[\"total_amount_expected\"].sum()\n",
"    / X_test_segment[\"total_amount\"].sum(),\n",
"    2,\n",
")\n",
"print(f\"overall share of revenue recovered : {global_revenue_recovered}%\")"
]
},
{
"cell_type": "code",
"execution_count": 163,
"id": "16b17f35-57dd-459a-8989-129143dc0952",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 0.018093\n",
"1 0.721519\n",
"2 3.336101\n",
"3 95.924287\n",
"Name: total_amount_expected, dtype: float64"
]
},
"execution_count": 163,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# percentage of the total expected revenue carried by each segment\n",
"X_test_expected_CA[\"total_amount_expected\"].mul(100).div(X_test_expected_CA[\"total_amount_expected\"].sum())"
]
},
{
"cell_type": "code",
"execution_count": 166,
"id": "dee4a200-eefe-4377-8e80-59ad33edd3c0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"quartile\n",
"1 0.320407\n",
"2 5.685020\n",
"3 11.339715\n",
"4 82.654858\n",
"Name: total_amount, dtype: float64"
]
},
"execution_count": 166,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# segment 4 accounts for 83% of the current revenue and 96% of next year's expected revenue from existing customers\n",
"X_test_segment.groupby(\"quartile\")[\"total_amount\"].sum().mul(100).div(X_test_segment[\"total_amount\"].sum())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6a30506c-2175-4efd-b3cb-349ad3aaa3e3",
"metadata": {},
"outputs": [],
"source": [
"# plot - Pareto principle applied to the revenue generated\n"
]
},
{
"cell_type": "code",
"execution_count": 177,
"id": "c1e6f020-ef18-40b4-bfc1-19f98cb2796e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 96096.000000\n",
"mean 207.475735\n",
"std 4720.046248\n",
"min -48831.800000\n",
"25% 0.000000\n",
"50% 0.000000\n",
"75% 60.000000\n",
"max 624890.000000\n",
"Name: total_amount, dtype: float64"
]
},
"execution_count": 177,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test_segment[\"total_amount\"].describe() # negative total amount ??? (the minimum is below zero) - TODO(review): check whether these are refunds or data errors\n"
]
},
{
"cell_type": "code",
"execution_count": 184,
"id": "d301a50e-7c68-40f0-9245-a4eea64c387b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 -4.883180e+04\n",
"1 -6.483180e+04\n",
"2 -7.683860e+04\n",
"3 -8.683860e+04\n",
"4 -9.683860e+04\n",
" ... \n",
"96091 1.802247e+07\n",
"96092 1.839238e+07\n",
"96093 1.877219e+07\n",
"96094 1.931270e+07\n",
"96095 1.993759e+07\n",
"Name: total_amount, Length: 96096, dtype: float64"
]
},
"execution_count": 184,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# running total of total_amount, customers ordered from lowest to highest amount, re-indexed 0..n-1\n",
"X_test_segment[\"total_amount\"].sort_values().cumsum().reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 200,
"id": "864d0206-7f5e-4d33-8f4b-fe685c3bd916",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjcAAAGwCAYAAABVdURTAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAABHRElEQVR4nO3de1xUdf4/8NdwmeE+yh0EQRQMNDVhvWBmWmBabuq2umveLxvdvJBarLtZVl9LV8Xylq2XdTPTTFvrRyZp3lBLELQUFQVBZJBA5SJym/n8/iDGRkBmcIbDDK/n4zGPmjPn8p7j1Hn5+XzO58iEEAJEREREFsJK6gKIiIiIjInhhoiIiCwKww0RERFZFIYbIiIisigMN0RERGRRGG6IiIjIojDcEBERkUWxkbqAlqbRaJCXlwdnZ2fIZDKpyyEiIiI9CCFQWloKX19fWFndv22mzYWbvLw8+Pv7S10GERERNcPVq1fh5+d333XaXLhxdnYGUHtyXFxcJK6GiIiI9FFSUgJ/f3/tdfx+2ly4qeuKcnFxYbghIiIyM/oMKeGAYiIiIrIoDDdERERkURhuiIiIyKIw3BAREZFFYbghIiIii8JwQ0RERBaF4YaIiIgsCsMNERERWRSGGyIiIrIoDDdERERkURhuiIiIyKIw3BAREZFFaXMPziQi/QghUFmjQWW1BhU1alSrNVKX1KimHqTX1GP29HgOH2RN7KWpfTR5iCa3b7rIB63B1OdRn+/Q5HmQ+DvqV8OD/VZMfXz99tHU9o2vIQNgZfWAX/IBSRpuDh8+jKVLlyIlJQUqlQq7d+/GyJEj77vNoUOHEBsbi7Nnz8LX1xfz589HTExMyxRMZCaq1RrcuF2FW+XVKL5zz6u8qt6yssoaVFRrUFGtRmXN3X8SERlKJgNmDgnGnKgQyWqQNNzcvn0bPXv2xJQpU/CnP/2pyfWzsrIwfPhwzJgxA59++imSkpLw0ksvwcPDQ6/ticxdjVqDottVuF5SgYKSSlwvrcD1kkoUlFTULiutxPWSShTdroQQxjuuTAbYWlvp9bfalqbX12xiJaHHXpo6n/rUIZrYiX770GMlIgkJAfwv7VrbDTfDhg3DsGHD9F5/3bp16NixI+Lj4wEAoaGhSE5Oxr/+9a9Gw01lZSUqKyu170tKSh6oZiJTEkLg17JKXL1Rjqs37iDnRjmu3ihHzo1y5N68A1XxHWj0vLhZyQClve3dl4P8t3+30V1ubwtnO1vY2VpDYWOl8087WysobKxhay1rsjmfzEeTIUuP31hTqzR1DP32oU8dD/5dmjxGWwrHD3jOf7lWgvEbfkRFtbQtv2Y15ub48eOIjo7WWTZ06FBs2LAB1dXVsLW1rbfN4sWL8fbbb7dUiUR6qahW40rRbVwuuI3MX8tw+dcyZBbeRuavt1FWWXPfba1kgIezAl4udvB0toOniwJeznbwcqldVveZq6Mc1hL3e1Pr1OS4E6P8bPjba4tcHeUAALXETYxmFW7y8/Ph5eWls8zLyws1NTUoLCyEj49PvW3i4uIQGxurfV9SUgJ/f3+T10oE1IaYi9dLcV5VivP5pb+FmDLk3rzT6N+grGSAj9Ie/q728G/vgI6uDujo5gC/9g7wb28PNycFQwsRtUr+rvb4ZGIE7GylvRnbrMINUP9vHHXNdI39TUShUEChUJi8LmrbhBBQFVfgfH4J0lWlSFeVIF1VgqzC2412I7nY2aCzpxOC3J0Q5OGIzh5O6OzhiI5uDlDYWLfsFyAiMgJnO1tEhXk1vaKJmVW48fb2Rn5+vs6ygoIC2NjYwM3NTaKqqK0RQuDarTs4k1uM01dv4UxuMc6pSlB8p7rB9V0d5Qj1cUZXLxcEezkhyN0RnT2d4OYo5zgWIiITMKtw079/f3z99dc6y/bt24eIiIgGx9sQGcON21U4nXtLG2ROX72FottV9daztpKhs4cjQn1c8JC3C0J9nBHm4wIPZw
VDDBFRC5I03JSVleHSpUva91lZWUhLS4Orqys6duyIuLg4XLt2DVu2bAEAxMTEYNWqVYiNjcWMGTNw/PhxbNiwAdu2bZPqK5CFqVFrcD6/FKdybiIl+yZO5dzE1Rt36q1nYyVDV29n9PRvh55+SnTzVaKLpxPsbNmdREQkNUnDTXJyMgYPHqx9Xzfwd9KkSdi8eTNUKhVycnK0n3fq1AkJCQmYM2cOVq9eDV9fX3z44Yec44aa7VZ5FVJzbiEluzbMnM69hfIqdb31gtwd0dO/HXr4KdHTvx3CfFwYZIiIWimZ0GcyAgtSUlICpVKJ4uJiuLi4SF0OtbAbt6twIrMIxy8X4URmETIKyuqt46ywwSMB7RHesT16B7RDD792UNqz25OISEqGXL/NaswNkaFulVfhROYNnMisDTPn80vrrdPJ3RG9O7ZHeEDtK9jTSfLnohARUfMx3JBFKa+qwfHLRTh2ubZ1Jj2/pN58Ml29nNG/sxv6BbniD4GucHPiVAFERJaE4YbMmhAClwrKcOjirzh44Vf8lHUDVfc8vbqLpxP6B7mhX5Ab+ga5wp1hhojIojHckNkpq6xB0qVCHLr4Kw5d+BXXbunezeTX3h4Dgz20rTOeznYSVUpERFJguKFWTwiBjIIyHDhfgEMXfkVy9g1Uq+/2NcltrNAvyA2DQjzweFcPBLk7cl4ZIqI2jOGGWqUatQbJ2Tfx/bnrSEy/juyicp3PA90c8HhXTwwK8UC/IDfYy3lbNhER1WK4oVajolqNIxmF+PYXFX44X4Cb5XcfZyC3sUJkZzcM/i3QBLo7SlgpERG1Zgw3JKnKGjWOZhTi/51RIfHcdZRW1mg/a+dgiyEPeSI6zAsDgz3gqODPlYiImsarBbW4arUGRzMK8c0ZFfady0dpxd1A4+1ih2EPe2NoN29EBLSHjbWVhJUSEZE5YrihFnM2rxhfplzDntPXUFh298GTXi4KDH/YB8/08MEj/u05gR4RET0QhhsyqYLSCvwvNQ9fnsrVmR3Y3UmOpx/2wTM9fRHekYGGiIiMh+GGjK5GrcH+8wX4/KccHM4ohFpTe9u23NoKUWFeGN27Ax4L8YAtu5yIiMgEGG7IaPJu3cHnJ69i+8kcXC+p1C7v3bEdRvf2w4gevlA68AGURERkWgw39EDUGoHDF3/F1h+zceB8AX5rpIGboxx/jvDHmAg/BHk4SVskERG1KQw31CylFdXYkZyLzceycPXG3ccf9A9yw7i+HTG0mzfkNux2IiKilsdwQwa5eqMcm49dwfaTV1H225w0SntbPBfuh7/26YgunmylISIiaTHckF5Ssm/ik8OZ2HcuX9v11NnDEVMf7YTRj/jx8QdERNRqMNxQo4QQOH65CB8duITjmUXa5QOD3THt0U54LNiDt3ATEVGrw3BD9Qgh8MOFAqw6cAmncm4BAGytZRj9iB+mDeyEEC9naQskIiK6D4Yb0hJC4ODFX7F830X8fK0YAKCwscJf+3TE3x4Lgm87e4krJCIiahrDDQEATmQW4V/fXUBy9k0AgIPcGhP6BWDawE7wdLaTuDoiIiL9Mdy0cT/nFmPJd+dxJKMQQG1LzcT+AYgZ1BluTgqJqyMiIjIcw00bdb2kAku/u4AvT+VCCMDGSoa/9PHHK4OD4a1kSw0REZkvhps2pqJajQ1Hs7D6h0sor1IDAJ7t5YvXorqio5uDxNURERE9OIabNkIIgb2/5OPd/5eOa7dqZxTu5d8Ob44IQ++O7SWujoiIyHgYbtqAvFt38M+vfsH+8wUAAG8XO7wx7CH8sacv56khIiKLw3BjwdQagU9PZGPJ3vO4XaWGrbUMLw7qjJjHO8NBzj96IiKyTLzCWaiL10vxxpdntJPwhQe0x/ujH0YwJ+AjIiILx3BjYTQagY1JWViy9wKq1Bo4KWzw+lNd8XzfAHZBERFRm8BwY0Gul1Rg7hentXPWDHnIE++N6g4fJWcWJiKitoPhxkLsO5uP17
88g5vl1bCztcI/nwnDuD4dIZOxtYaIiNoWhhszV6PWYOl3F/Dx4UwAQDdfF6z8yyPo4ukkcWVERETSYLgxY4VllXj
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# cumulative revenue curve: share of total revenue generated by the top fraction of customers\n",
"# NOTE: total_amount contains negative values (see describe() above), so the curve can rise above 1 before ending at 1\n",
"\n",
"plt.figure()\n",
"# x-axis is the rank fraction k/n (k = number of customers considered). It is built from positions rather than\n",
"# from index labels, so the curve stays correct even if X_test_segment does not carry an ordered 0..n-1 index.\n",
"plt.plot(np.arange(1, len(X_test_segment) + 1) / len(X_test_segment),\n",
"         np.cumsum(X_test_segment[\"total_amount\"].sort_values(ascending=False)).values /\n",
"         np.sum(X_test_segment[\"total_amount\"]))\n",
"plt.xlabel(\"fraction of customers considered\")\n",
"plt.ylabel(\"cumulated revenue\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 198,
"id": "67981e78-d7a5-432e-b93b-9d0d189f4e5d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"96095"
]
},
"execution_count": 198,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test_segment.index.max()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}