2024-03-23 00:04:49 +01:00
{
"cells": [
{
"cell_type": "markdown",
"id": "84b6e27e-4bda-4d38-8689-ec7fc0da1848",
"metadata": {},
"source": [
"# Define segment and predict sales associated"
]
},
{
"cell_type": "markdown",
"id": "ec059482-45d3-4ae6-99bc-9b4ced115db3",
"metadata": {},
"source": [
"## Importations of packages "
]
},
{
"cell_type": "code",
2024-03-30 12:00:49 +01:00
"execution_count": 1,
2024-03-23 00:04:49 +01:00
"id": "9771bf29-d08e-4674-8c23-9a2672fbef8f",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
2024-03-23 10:18:43 +01:00
"from pandas import DataFrame\n",
2024-03-23 00:04:49 +01:00
"import numpy as np\n",
"import os\n",
"import s3fs\n",
"import re\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score\n",
"from sklearn.utils import class_weight\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler\n",
"from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score\n",
"from sklearn.exceptions import ConvergenceWarning, DataConversionWarning\n",
"from sklearn.naive_bayes import GaussianNB\n",
"from scipy.optimize import fsolve\n",
2024-03-24 10:42:44 +01:00
"import io\n",
2024-03-23 00:04:49 +01:00
"\n",
"import pickle\n",
"import warnings"
]
},
{
"cell_type": "markdown",
"id": "048fcd7c-800a-4a6b-b725-faf8410f924a",
"metadata": {},
"source": [
"## load databases"
]
},
{
"cell_type": "code",
2024-03-30 12:00:49 +01:00
"execution_count": 2,
2024-03-23 00:04:49 +01:00
"id": "539ccbdf-f29f-4f04-99c1-8c88d0efe514",
"metadata": {},
"outputs": [],
"source": [
"# Create filesystem object\n",
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})"
]
},
{
"cell_type": "code",
2024-04-03 12:36:47 +02:00
"execution_count": 4,
2024-03-27 18:58:30 +01:00
"id": "d6017ed0-6233-4888-85a7-05dec50a255b",
"metadata": {},
"outputs": [],
"source": [
2024-04-03 12:36:47 +02:00
"type_of_activity = \"sport\""
2024-03-27 18:58:30 +01:00
]
},
{
"cell_type": "code",
2024-04-03 12:36:47 +02:00
"execution_count": 7,
2024-03-23 00:04:49 +01:00
"id": "0c3a6ddc-9345-4a42-b6bf-a20a95de3028",
"metadata": {},
"outputs": [],
"source": [
2024-03-27 18:58:30 +01:00
"def load_train_test(type_of_activity):\n",
2024-03-30 12:00:49 +01:00
" # BUCKET = f\"projet-bdc2324-team1/Generalization/{type_of_activity}\"\n",
2024-04-03 12:36:47 +02:00
" BUCKET = f\"projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_activity}\"\n",
2024-03-23 00:04:49 +01:00
" File_path_train = BUCKET + \"/Train_set.csv\"\n",
" File_path_test = BUCKET + \"/Test_set.csv\"\n",
" \n",
" with fs.open( File_path_train, mode=\"rb\") as file_in:\n",
" dataset_train = pd.read_csv(file_in, sep=\",\")\n",
" # dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)\n",
"\n",
" with fs.open(File_path_test, mode=\"rb\") as file_in:\n",
" dataset_test = pd.read_csv(file_in, sep=\",\")\n",
" # dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)\n",
" \n",
" return dataset_train, dataset_test"
]
},
{
"cell_type": "code",
2024-04-03 12:36:47 +02:00
"execution_count": 8,
2024-03-23 00:04:49 +01:00
"id": "2831d546-b365-498b-8248-c618bd9c3057",
"metadata": {},
2024-03-23 10:18:43 +01:00
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
2024-04-03 12:36:47 +02:00
"/tmp/ipykernel_427/290017524.py:8: DtypeWarning: Columns (10,24) have mixed types. Specify dtype option on import or set low_memory=False.\n",
2024-03-30 12:00:49 +01:00
" dataset_train = pd.read_csv(file_in, sep=\",\")\n",
2024-04-03 12:36:47 +02:00
"/tmp/ipykernel_427/290017524.py:12: DtypeWarning: Columns (10,24) have mixed types. Specify dtype option on import or set low_memory=False.\n",
2024-03-30 12:00:49 +01:00
" dataset_test = pd.read_csv(file_in, sep=\",\")\n"
2024-03-23 10:18:43 +01:00
]
},
{
"data": {
"text/plain": [
2024-03-30 12:00:49 +01:00
"customer_id 0\n",
"street_id 0\n",
2024-04-03 12:36:47 +02:00
"structure_id 222819\n",
"mcp_contact_id 70845\n",
2024-03-30 12:00:49 +01:00
"fidelity 0\n",
" ... \n",
2024-04-03 12:36:47 +02:00
"purchases_8_2021 0\n",
2024-03-30 12:00:49 +01:00
"purchases_8_2022 0\n",
2024-04-03 12:36:47 +02:00
"purchases_9_2021 0\n",
2024-03-30 12:00:49 +01:00
"purchases_9_2022 0\n",
"y_has_purchased 0\n",
"Length: 87, dtype: int64"
2024-03-23 10:18:43 +01:00
]
},
2024-04-03 12:36:47 +02:00
"execution_count": 8,
2024-03-23 10:18:43 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
2024-03-23 00:04:49 +01:00
"source": [
2024-03-27 18:58:30 +01:00
"dataset_train, dataset_test = load_train_test(type_of_activity)\n",
2024-03-23 00:04:49 +01:00
"dataset_train.isna().sum()"
]
},
{
"cell_type": "code",
2024-04-03 12:36:47 +02:00
"execution_count": 9,
2024-03-23 00:04:49 +01:00
"id": "b8827f7b-b304-4f51-9814-c7a98ed88cf0",
"metadata": {},
"outputs": [],
"source": [
"def features_target_split(dataset_train, dataset_test):\n",
" \n",
2024-03-30 12:00:49 +01:00
" features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'purchase_date_min', 'purchase_date_max', \n",
" 'time_between_purchase', 'fidelity', 'is_email_true', 'opt_in', #'is_partner', 'nb_tickets_internet',, 'vente_internet_max'\n",
2024-03-23 00:04:49 +01:00
" 'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']\n",
"\n",
" # we suppress fidelity, time between purchase, and gender other (colinearity issue)\n",
" \"\"\"\n",
" features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', \n",
" 'purchase_date_min', 'purchase_date_max', 'nb_tickets_internet', 'is_email_true', \n",
" 'opt_in', 'gender_female', 'gender_male', 'nb_campaigns', 'nb_campaigns_opened']\n",
" \"\"\"\n",
" \n",
2024-03-30 12:00:49 +01:00
" X_train = dataset_train # [features_l]\n",
2024-03-23 00:04:49 +01:00
" y_train = dataset_train[['y_has_purchased']]\n",
"\n",
2024-03-30 12:00:49 +01:00
" X_test = dataset_test # [features_l]\n",
2024-03-23 00:04:49 +01:00
" y_test = dataset_test[['y_has_purchased']]\n",
" return X_train, X_test, y_train, y_test"
]
},
{
"cell_type": "code",
2024-04-03 12:36:47 +02:00
"execution_count": 10,
2024-03-23 00:04:49 +01:00
"id": "c18195fc-ed40-4e39-a59e-c9ecc5a8e6c3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-04-03 12:36:47 +02:00
"Shape train : (224213, 87)\n",
"Shape test : (96096, 87)\n"
2024-03-23 00:04:49 +01:00
]
}
],
"source": [
"X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)\n",
"print(\"Shape train : \", X_train.shape)\n",
"print(\"Shape test : \", X_test.shape)"
]
},
{
"cell_type": "markdown",
"id": "74eda066-5e01-43aa-b0cf-cc6d9bbf770e",
"metadata": {},
"source": [
"## get results from the logit cross validated model"
]
},
{
"cell_type": "code",
2024-04-03 12:36:47 +02:00
"execution_count": 11,
2024-03-23 00:04:49 +01:00
"id": "7c81390e-598c-4f02-bd56-dd03b00dcb33",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
2024-03-30 12:00:49 +01:00
" <th>customer_id</th>\n",
" <th>street_id</th>\n",
" <th>structure_id</th>\n",
" <th>mcp_contact_id</th>\n",
2024-03-23 00:04:49 +01:00
" <th>fidelity</th>\n",
2024-03-30 12:00:49 +01:00
" <th>tenant_id</th>\n",
" <th>is_partner</th>\n",
" <th>deleted_at</th>\n",
2024-03-23 00:04:49 +01:00
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
2024-03-30 12:00:49 +01:00
" <th>...</th>\n",
" <th>purchases_5_2022</th>\n",
" <th>purchases_6_2021</th>\n",
" <th>purchases_6_2022</th>\n",
" <th>purchases_7_2021</th>\n",
" <th>purchases_7_2022</th>\n",
" <th>purchases_8_2021</th>\n",
" <th>purchases_8_2022</th>\n",
" <th>purchases_9_2021</th>\n",
" <th>purchases_9_2022</th>\n",
" <th>y_has_purchased</th>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
2024-04-03 12:36:47 +02:00
" <td>5_4317407</td>\n",
" <td>969908</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
2024-04-03 12:36:47 +02:00
" <td>6156473.0</td>\n",
" <td>1</td>\n",
" <td>1771</td>\n",
2024-03-30 12:00:49 +01:00
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
2024-04-03 12:36:47 +02:00
" <td>5_477635</td>\n",
" <td>109121</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
2024-04-03 12:36:47 +02:00
" <td>6213652.0</td>\n",
" <td>2</td>\n",
" <td>1771</td>\n",
2024-03-30 12:00:49 +01:00
" <td>False</td>\n",
" <td>NaN</td>\n",
2024-03-23 00:04:49 +01:00
" <td>True</td>\n",
" <td>0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
2024-04-03 12:36:47 +02:00
" <td>5_411639</td>\n",
" <td>92929</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
2024-04-03 12:36:47 +02:00
" <td>6160271.0</td>\n",
" <td>4</td>\n",
" <td>1771</td>\n",
2024-03-30 12:00:49 +01:00
" <td>False</td>\n",
" <td>NaN</td>\n",
2024-03-23 00:04:49 +01:00
" <td>True</td>\n",
2024-04-03 12:36:47 +02:00
" <td>0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
2024-04-03 12:36:47 +02:00
" <td>5_326623</td>\n",
" <td>79862</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
2024-04-03 12:36:47 +02:00
" <td>6140109.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>1</td>\n",
2024-04-03 12:36:47 +02:00
" <td>1771</td>\n",
2024-03-28 11:37:23 +01:00
" <td>False</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
" <td>True</td>\n",
2024-03-23 00:04:49 +01:00
" <td>1</td>\n",
2024-03-30 12:00:49 +01:00
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>0.0</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-04-03 12:36:47 +02:00
" <td>1.0</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
2024-04-03 12:36:47 +02:00
" <td>5_383915</td>\n",
" <td>85421</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
2024-04-03 12:36:47 +02:00
" <td>6149409.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>2</td>\n",
2024-04-03 12:36:47 +02:00
" <td>1771</td>\n",
2024-03-28 11:37:23 +01:00
" <td>False</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
" <td>True</td>\n",
2024-04-03 12:36:47 +02:00
" <td>1</td>\n",
2024-03-30 12:00:49 +01:00
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>0.0</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
2024-03-30 12:00:49 +01:00
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
2024-04-03 12:36:47 +02:00
" <th>96091</th>\n",
" <td>9_91205</td>\n",
" <td>76215</td>\n",
" <td>NaN</td>\n",
" <td>47280.0</td>\n",
" <td>0</td>\n",
" <td>1490</td>\n",
2024-03-23 00:04:49 +01:00
" <td>False</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
" <td>True</td>\n",
2024-03-28 11:37:23 +01:00
" <td>1</td>\n",
2024-03-30 12:00:49 +01:00
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-04-03 12:36:47 +02:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
2024-04-03 12:36:47 +02:00
" <th>96092</th>\n",
" <td>9_369887</td>\n",
" <td>815891</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
2024-04-03 12:36:47 +02:00
" <td>30764537.0</td>\n",
" <td>4</td>\n",
" <td>1490</td>\n",
2024-03-30 12:00:49 +01:00
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
2024-04-03 12:36:47 +02:00
" <td>0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>...</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.0</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-04-03 12:36:47 +02:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
2024-04-03 12:36:47 +02:00
" <th>96093</th>\n",
" <td>9_1007562</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0</td>\n",
2024-04-03 12:36:47 +02:00
" <td>1490</td>\n",
2024-03-30 12:00:49 +01:00
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
2024-04-03 12:36:47 +02:00
" <td>0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>...</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-04-03 12:36:47 +02:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
2024-04-03 12:36:47 +02:00
" <th>96094</th>\n",
" <td>9_15037</td>\n",
" <td>12992</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
2024-04-03 12:36:47 +02:00
" <td>2213448.0</td>\n",
" <td>0</td>\n",
" <td>1490</td>\n",
2024-03-30 12:00:49 +01:00
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-04-03 12:36:47 +02:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
2024-04-03 12:36:47 +02:00
" <th>96095</th>\n",
" <td>9_135370</td>\n",
" <td>76215</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
2024-04-03 12:36:47 +02:00
" <td>2164740.0</td>\n",
" <td>0</td>\n",
" <td>1490</td>\n",
2024-03-30 12:00:49 +01:00
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
2024-04-03 12:36:47 +02:00
" <td>1</td>\n",
2024-03-30 12:00:49 +01:00
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-04-03 12:36:47 +02:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" </tbody>\n",
"</table>\n",
2024-04-03 12:36:47 +02:00
"<p>96096 rows × 87 columns</p>\n",
2024-03-23 00:04:49 +01:00
"</div>"
],
"text/plain": [
2024-04-03 12:36:47 +02:00
" customer_id street_id structure_id mcp_contact_id fidelity \\\n",
"0 5_4317407 969908 NaN 6156473.0 1 \n",
"1 5_477635 109121 NaN 6213652.0 2 \n",
"2 5_411639 92929 NaN 6160271.0 4 \n",
"3 5_326623 79862 NaN 6140109.0 1 \n",
"4 5_383915 85421 NaN 6149409.0 2 \n",
"... ... ... ... ... ... \n",
"96091 9_91205 76215 NaN 47280.0 0 \n",
"96092 9_369887 815891 NaN 30764537.0 4 \n",
"96093 9_1007562 1 NaN NaN 0 \n",
"96094 9_15037 12992 NaN 2213448.0 0 \n",
"96095 9_135370 76215 NaN 2164740.0 0 \n",
"\n",
" tenant_id is_partner deleted_at is_email_true opt_in ... \\\n",
"0 1771 False NaN True 0 ... \n",
"1 1771 False NaN True 0 ... \n",
"2 1771 False NaN True 0 ... \n",
"3 1771 False NaN True 1 ... \n",
"4 1771 False NaN True 1 ... \n",
"... ... ... ... ... ... ... \n",
"96091 1490 False NaN True 1 ... \n",
"96092 1490 False NaN True 0 ... \n",
"96093 1490 False NaN True 0 ... \n",
"96094 1490 False NaN True 1 ... \n",
"96095 1490 False NaN True 1 ... \n",
"\n",
" purchases_5_2022 purchases_6_2021 purchases_6_2022 purchases_7_2021 \\\n",
"0 0.0 0.0 0.0 0.0 \n",
"1 0.0 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 0.0 \n",
"4 0.0 0.0 0.0 0.0 \n",
"... ... ... ... ... \n",
"96091 0.0 0.0 0.0 0.0 \n",
"96092 0.0 0.0 0.0 0.0 \n",
"96093 0.0 0.0 0.0 0.0 \n",
"96094 0.0 0.0 0.0 0.0 \n",
"96095 0.0 0.0 0.0 0.0 \n",
"\n",
" purchases_7_2022 purchases_8_2021 purchases_8_2022 purchases_9_2021 \\\n",
"0 0.0 0.0 0.0 0.0 \n",
"1 0.0 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 1.0 \n",
"4 0.0 0.0 0.0 0.0 \n",
"... ... ... ... ... \n",
"96091 0.0 0.0 0.0 0.0 \n",
"96092 0.0 0.0 0.0 0.0 \n",
"96093 0.0 0.0 0.0 0.0 \n",
"96094 0.0 0.0 0.0 0.0 \n",
"96095 0.0 0.0 0.0 0.0 \n",
"\n",
" purchases_9_2022 y_has_purchased \n",
"0 0.0 0.0 \n",
"1 0.0 0.0 \n",
"2 0.0 0.0 \n",
"3 0.0 0.0 \n",
"4 0.0 0.0 \n",
"... ... ... \n",
"96091 0.0 0.0 \n",
"96092 0.0 1.0 \n",
"96093 0.0 0.0 \n",
"96094 0.0 0.0 \n",
"96095 0.0 0.0 \n",
"\n",
"[96096 rows x 87 columns]"
2024-03-23 00:04:49 +01:00
]
},
2024-04-03 12:36:47 +02:00
"execution_count": 11,
2024-03-23 00:04:49 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test"
]
},
{
"cell_type": "code",
2024-04-03 12:36:47 +02:00
"execution_count": 12,
2024-03-23 00:04:49 +01:00
"id": "c708f439-bb75-4688-bf4f-4c04e13deaae",
"metadata": {},
"outputs": [],
"source": [
"def load_model(type_of_activity, model):\n",
2024-03-30 12:00:49 +01:00
" # BUCKET = f\"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/\"\n",
" BUCKET = f\"projet-bdc2324-team1/basique/{type_of_activity}/{model}/\"\n",
2024-03-23 00:04:49 +01:00
" filename = model + '.pkl'\n",
" file_path = BUCKET + filename\n",
" with fs.open(file_path, mode=\"rb\") as f:\n",
" model_bytes = f.read()\n",
"\n",
" model = pickle.loads(model_bytes)\n",
" return model"
]
},
{
"cell_type": "code",
2024-04-03 12:36:47 +02:00
"execution_count": 13,
2024-03-23 00:04:49 +01:00
"id": "5261a803-05b8-41a0-968c-dc7bde48ddd3",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
2024-04-03 12:36:47 +02:00
"<style>#sk-container-id-1 {\n",
2024-03-23 00:04:49 +01:00
" /* Definition of color scheme common for light and dark mode */\n",
" --sklearn-color-text: black;\n",
" --sklearn-color-line: gray;\n",
" /* Definition of color scheme for unfitted estimators */\n",
" --sklearn-color-unfitted-level-0: #fff5e6;\n",
" --sklearn-color-unfitted-level-1: #f6e4d2;\n",
" --sklearn-color-unfitted-level-2: #ffe0b3;\n",
" --sklearn-color-unfitted-level-3: chocolate;\n",
" /* Definition of color scheme for fitted estimators */\n",
" --sklearn-color-fitted-level-0: #f0f8ff;\n",
" --sklearn-color-fitted-level-1: #d4ebff;\n",
" --sklearn-color-fitted-level-2: #b3dbfd;\n",
" --sklearn-color-fitted-level-3: cornflowerblue;\n",
"\n",
" /* Specific color for light theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-icon: #696969;\n",
"\n",
" @media (prefers-color-scheme: dark) {\n",
" /* Redefinition of color scheme for dark theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-icon: #878787;\n",
" }\n",
"}\n",
"\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 {\n",
2024-03-23 00:04:49 +01:00
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 pre {\n",
2024-03-23 00:04:49 +01:00
" padding: 0;\n",
"}\n",
"\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 input.sk-hidden--visually {\n",
2024-03-23 00:04:49 +01:00
" border: 0;\n",
" clip: rect(1px 1px 1px 1px);\n",
" clip: rect(1px, 1px, 1px, 1px);\n",
" height: 1px;\n",
" margin: -1px;\n",
" overflow: hidden;\n",
" padding: 0;\n",
" position: absolute;\n",
" width: 1px;\n",
"}\n",
"\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 div.sk-dashed-wrapped {\n",
2024-03-23 00:04:49 +01:00
" border: 1px dashed var(--sklearn-color-line);\n",
" margin: 0 0.4em 0.5em 0.4em;\n",
" box-sizing: border-box;\n",
" padding-bottom: 0.4em;\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 div.sk-container {\n",
2024-03-23 00:04:49 +01:00
" /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
" but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
" so we also need the `!important` here to be able to override the\n",
" default hidden behavior on the sphinx rendered scikit-learn.org.\n",
" See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
" display: inline-block !important;\n",
" position: relative;\n",
"}\n",
"\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 div.sk-text-repr-fallback {\n",
2024-03-23 00:04:49 +01:00
" display: none;\n",
"}\n",
"\n",
"div.sk-parallel-item,\n",
"div.sk-serial,\n",
"div.sk-item {\n",
" /* draw centered vertical line to link estimators */\n",
" background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
" background-size: 2px 100%;\n",
" background-repeat: no-repeat;\n",
" background-position: center center;\n",
"}\n",
"\n",
"/* Parallel-specific style estimator block */\n",
"\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 div.sk-parallel-item::after {\n",
2024-03-23 00:04:49 +01:00
" content: \"\";\n",
" width: 100%;\n",
" border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
" flex-grow: 1;\n",
"}\n",
"\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 div.sk-parallel {\n",
2024-03-23 00:04:49 +01:00
" display: flex;\n",
" align-items: stretch;\n",
" justify-content: center;\n",
" background-color: var(--sklearn-color-background);\n",
" position: relative;\n",
"}\n",
"\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 div.sk-parallel-item {\n",
2024-03-23 00:04:49 +01:00
" display: flex;\n",
" flex-direction: column;\n",
"}\n",
"\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 div.sk-parallel-item:first-child::after {\n",
2024-03-23 00:04:49 +01:00
" align-self: flex-end;\n",
" width: 50%;\n",
"}\n",
"\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 div.sk-parallel-item:last-child::after {\n",
2024-03-23 00:04:49 +01:00
" align-self: flex-start;\n",
" width: 50%;\n",
"}\n",
"\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 div.sk-parallel-item:only-child::after {\n",
2024-03-23 00:04:49 +01:00
" width: 0;\n",
"}\n",
"\n",
"/* Serial-specific style estimator block */\n",
"\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 div.sk-serial {\n",
2024-03-23 00:04:49 +01:00
" display: flex;\n",
" flex-direction: column;\n",
" align-items: center;\n",
" background-color: var(--sklearn-color-background);\n",
" padding-right: 1em;\n",
" padding-left: 1em;\n",
"}\n",
"\n",
"\n",
"/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
"clickable and can be expanded/collapsed.\n",
"- Pipeline and ColumnTransformer use this feature and define the default style\n",
"- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
"*/\n",
"\n",
"/* Pipeline and ColumnTransformer style (default) */\n",
"\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 div.sk-toggleable {\n",
2024-03-23 00:04:49 +01:00
" /* Default theme specific background. It is overwritten whether we have a\n",
" specific estimator or a Pipeline/ColumnTransformer */\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"/* Toggleable label */\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 label.sk-toggleable__label {\n",
2024-03-23 00:04:49 +01:00
" cursor: pointer;\n",
" display: block;\n",
" width: 100%;\n",
" margin-bottom: 0;\n",
" padding: 0.5em;\n",
" box-sizing: border-box;\n",
" text-align: center;\n",
"}\n",
"\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 label.sk-toggleable__label-arrow:before {\n",
2024-03-23 00:04:49 +01:00
" /* Arrow on the left of the label */\n",
" content: \"▸\";\n",
" float: left;\n",
" margin-right: 0.25em;\n",
" color: var(--sklearn-color-icon);\n",
"}\n",
"\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {\n",
2024-03-23 00:04:49 +01:00
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"/* Toggleable content - dropdown */\n",
"\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 div.sk-toggleable__content {\n",
2024-03-23 00:04:49 +01:00
" max-height: 0;\n",
" max-width: 0;\n",
" overflow: hidden;\n",
" text-align: left;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 div.sk-toggleable__content.fitted {\n",
2024-03-23 00:04:49 +01:00
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 div.sk-toggleable__content pre {\n",
2024-03-23 00:04:49 +01:00
" margin: 0.2em;\n",
" border-radius: 0.25em;\n",
" color: var(--sklearn-color-text);\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 div.sk-toggleable__content.fitted pre {\n",
2024-03-23 00:04:49 +01:00
" /* unfitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
2024-03-23 00:04:49 +01:00
" /* Expand drop-down */\n",
" max-height: 200px;\n",
" max-width: 100%;\n",
" overflow: auto;\n",
"}\n",
"\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
2024-03-23 00:04:49 +01:00
" content: \"▾\";\n",
"}\n",
"\n",
"/* Pipeline/ColumnTransformer-specific style */\n",
"\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
2024-03-23 00:04:49 +01:00
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
2024-03-23 00:04:49 +01:00
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator-specific style */\n",
"\n",
"/* Colorize estimator box */\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
2024-03-23 00:04:49 +01:00
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
2024-03-23 00:04:49 +01:00
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 div.sk-label label.sk-toggleable__label,\n",
"#sk-container-id-1 div.sk-label label {\n",
2024-03-23 00:04:49 +01:00
" /* The background is the default theme color */\n",
" color: var(--sklearn-color-text-on-default-background);\n",
"}\n",
"\n",
"/* On hover, darken the color of the background */\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label {\n",
2024-03-23 00:04:49 +01:00
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"/* Label box, darken color on hover, fitted */\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
2024-03-23 00:04:49 +01:00
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator label */\n",
"\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 div.sk-label label {\n",
2024-03-23 00:04:49 +01:00
" font-family: monospace;\n",
" font-weight: bold;\n",
" display: inline-block;\n",
" line-height: 1.2em;\n",
"}\n",
"\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 div.sk-label-container {\n",
2024-03-23 00:04:49 +01:00
" text-align: center;\n",
"}\n",
"\n",
"/* Estimator-specific */\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 div.sk-estimator {\n",
2024-03-23 00:04:49 +01:00
" font-family: monospace;\n",
" border: 1px dotted var(--sklearn-color-border-box);\n",
" border-radius: 0.25em;\n",
" box-sizing: border-box;\n",
" margin-bottom: 0.5em;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 div.sk-estimator.fitted {\n",
2024-03-23 00:04:49 +01:00
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"/* on hover */\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 div.sk-estimator:hover {\n",
2024-03-23 00:04:49 +01:00
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 div.sk-estimator.fitted:hover {\n",
2024-03-23 00:04:49 +01:00
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
"\n",
"/* Common style for \"i\" and \"?\" */\n",
"\n",
".sk-estimator-doc-link,\n",
"a:link.sk-estimator-doc-link,\n",
"a:visited.sk-estimator-doc-link {\n",
" float: right;\n",
" font-size: smaller;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1em;\n",
" height: 1em;\n",
" width: 1em;\n",
" text-decoration: none !important;\n",
" margin-left: 1ex;\n",
" /* unfitted */\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted,\n",
"a:link.sk-estimator-doc-link.fitted,\n",
"a:visited.sk-estimator-doc-link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"/* Span, style for the box shown on hovering the info icon */\n",
".sk-estimator-doc-link span {\n",
" display: none;\n",
" z-index: 9999;\n",
" position: relative;\n",
" font-weight: normal;\n",
" right: .2ex;\n",
" padding: .5ex;\n",
" margin: .5ex;\n",
" width: min-content;\n",
" min-width: 20ex;\n",
" max-width: 50ex;\n",
" color: var(--sklearn-color-text);\n",
" box-shadow: 2pt 2pt 4pt #999;\n",
" /* unfitted */\n",
" background: var(--sklearn-color-unfitted-level-0);\n",
" border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted span {\n",
" /* fitted */\n",
" background: var(--sklearn-color-fitted-level-0);\n",
" border: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link:hover span {\n",
" display: block;\n",
"}\n",
"\n",
"/* \"?\"-specific style due to the `<a>` HTML tag */\n",
"\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 a.estimator_doc_link {\n",
2024-03-23 00:04:49 +01:00
" float: right;\n",
" font-size: 1rem;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1rem;\n",
" height: 1rem;\n",
" width: 1rem;\n",
" text-decoration: none;\n",
" /* unfitted */\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
"}\n",
"\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 a.estimator_doc_link.fitted {\n",
2024-03-23 00:04:49 +01:00
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 a.estimator_doc_link:hover {\n",
2024-03-23 00:04:49 +01:00
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
2024-04-03 12:36:47 +02:00
"#sk-container-id-1 a.estimator_doc_link.fitted:hover {\n",
2024-03-23 00:04:49 +01:00
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
"}\n",
2024-04-03 12:36:47 +02:00
"</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>GridSearchCV(cv=3, error_score='raise',\n",
" estimator=Pipeline(steps=[('preprocessor',\n",
" ColumnTransformer(transformers=[('num',\n",
" Pipeline(steps=[('imputer',\n",
" SimpleImputer(fill_value=0,\n",
" strategy='constant')),\n",
" ('scaler',\n",
" StandardScaler())]),\n",
" ['nb_campaigns',\n",
" 'taux_ouverture_mail',\n",
" 'prop_purchases_internet',\n",
" 'nb_tickets',\n",
" 'nb_purchases',\n",
" 'total_amount',\n",
" 'nb_suppliers',\n",
" 'pu...\n",
" 1.562500e-02, 3.125000e-02, 6.250000e-02, 1.250000e-01,\n",
" 2.500000e-01, 5.000000e-01, 1.000000e+00, 2.000000e+00,\n",
" 4.000000e+00, 8.000000e+00, 1.600000e+01, 3.200000e+01,\n",
" 6.400000e+01]),\n",
" 'LogisticRegression_cv__class_weight': ['balanced',\n",
" {0.0: 0.5834990214856762,\n",
" 1.0: 3.49404706249026}],\n",
" 'LogisticRegression_cv__penalty': ['l1', 'l2']},\n",
" scoring=make_scorer(recall_score, response_method='predict'))</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" ><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\"> GridSearchCV<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.model_selection.GridSearchCV.html\">?<span>Documentation for GridSearchCV</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></label><div class=\"sk-toggleable__content fitted\"><pre>GridSearchCV(cv=3, error_score='raise',\n",
" estimator=Pipeline(steps=[('preprocessor',\n",
" ColumnTransformer(transformers=[('num',\n",
" Pipeline(steps=[('imputer',\n",
" SimpleImputer(fill_value=0,\n",
" strategy='constant')),\n",
" ('scaler',\n",
" StandardScaler())]),\n",
" ['nb_campaigns',\n",
" 'taux_ouverture_mail',\n",
" 'prop_purchases_internet',\n",
" 'nb_tickets',\n",
" 'nb_purchases',\n",
" 'total_amount',\n",
" 'nb_suppliers',\n",
" 'pu...\n",
" 1.562500e-02, 3.125000e-02, 6.250000e-02, 1.250000e-01,\n",
" 2.500000e-01, 5.000000e-01, 1.000000e+00, 2.000000e+00,\n",
" 4.000000e+00, 8.000000e+00, 1.600000e+01, 3.200000e+01,\n",
" 6.400000e+01]),\n",
" 'LogisticRegression_cv__class_weight': ['balanced',\n",
" {0.0: 0.5834990214856762,\n",
" 1.0: 3.49404706249026}],\n",
" 'LogisticRegression_cv__penalty': ['l1', 'l2']},\n",
" scoring=make_scorer(recall_score, response_method='predict'))</pre></div> </div></div><div class=\"sk-parallel\"><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-2\" type=\"checkbox\" ><label for=\"sk-estimator-id-2\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">estimator: Pipeline</label><div class=\"sk-toggleable__content fitted\"><pre>Pipeline(steps=[('preprocessor',\n",
2024-03-23 00:04:49 +01:00
" ColumnTransformer(transformers=[('num',\n",
2024-03-30 12:00:49 +01:00
" Pipeline(steps=[('imputer',\n",
" SimpleImputer(fill_value=0,\n",
" strategy='constant')),\n",
" ('scaler',\n",
2024-03-23 00:04:49 +01:00
" StandardScaler())]),\n",
2024-03-30 12:00:49 +01:00
" ['nb_campaigns',\n",
" 'taux_ouverture_mail',\n",
" 'prop_purchases_internet',\n",
" 'nb_tickets', 'nb_purchases',\n",
2024-03-23 00:04:49 +01:00
" 'total_amount',\n",
" 'nb_suppliers',\n",
2024-03-30 12:00:49 +01:00
" 'purchases_10_2021',\n",
" 'purchases_10_2022',\n",
" 'purchases_...\n",
2024-04-03 12:36:47 +02:00
" 'categorie_age_0_10',\n",
" 'categorie_age_10_20',\n",
" 'categorie_age_20_30',\n",
" 'categorie_age_30_40',\n",
2024-03-30 12:00:49 +01:00
" 'categorie_age_40_50',\n",
" 'categorie_age_50_60',\n",
" 'categorie_age_60_70',\n",
" 'categorie_age_70_80',\n",
" 'categorie_age_plus_80',\n",
" 'categorie_age_inconnue',\n",
" 'country_fr',\n",
" 'is_profession_known',\n",
" 'is_zipcode_known',\n",
" 'opt_in'])])),\n",
2024-04-03 12:36:47 +02:00
" ('LogisticRegression_cv',\n",
" LogisticRegression(max_iter=5000, solver='saga'))])</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-serial\"><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-3\" type=\"checkbox\" ><label for=\"sk-estimator-id-3\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\"> preprocessor: ColumnTransformer<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.compose.ColumnTransformer.html\">?<span>Documentation for preprocessor: ColumnTransformer</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>ColumnTransformer(transformers=[('num',\n",
2024-03-30 12:00:49 +01:00
" Pipeline(steps=[('imputer',\n",
" SimpleImputer(fill_value=0,\n",
" strategy='constant')),\n",
" ('scaler', StandardScaler())]),\n",
" ['nb_campaigns', 'taux_ouverture_mail',\n",
" 'prop_purchases_internet', 'nb_tickets',\n",
" 'nb_purchases', 'total_amount',\n",
" 'nb_suppliers', 'purchases_10_2021',\n",
" 'purchases_10_2022', 'purchases_11_2021',\n",
" 'purchases_12_2021', 'pu...\n",
" SimpleImputer(strategy='most_frequent'))]),\n",
" ['gender_female', 'gender_male',\n",
" 'achat_internet', 'categorie_age_0_10',\n",
" 'categorie_age_10_20', 'categorie_age_20_30',\n",
" 'categorie_age_30_40', 'categorie_age_40_50',\n",
" 'categorie_age_50_60', 'categorie_age_60_70',\n",
" 'categorie_age_70_80',\n",
" 'categorie_age_plus_80',\n",
" 'categorie_age_inconnue', 'country_fr',\n",
" 'is_profession_known', 'is_zipcode_known',\n",
2024-04-03 12:36:47 +02:00
" 'opt_in'])])</pre></div> </div></div><div class=\"sk-parallel\"><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-4\" type=\"checkbox\" ><label for=\"sk-estimator-id-4\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">num</label><div class=\"sk-toggleable__content fitted\"><pre>['nb_campaigns', 'taux_ouverture_mail', 'prop_purchases_internet', 'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'purchases_10_2021', 'purchases_10_2022', 'purchases_11_2021', 'purchases_12_2021', 'purchases_1_2022', 'purchases_2_2022', 'purchases_3_2022', 'purchases_4_2022', 'purchases_5_2021', 'purchases_5_2022', 'purchases_6_2021', 'purchases_6_2022', 'purchases_7_2021', 'purchases_7_2022', 'purchases_8_2021', 'purchases_8_2022', 'purchases_9_2021', 'purchases_9_2022', 'purchase_date_min', 'purchase_date_max', 'nb_targets', 'time_to_open']</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-5\" type=\"checkbox\" ><label for=\"sk-estimator-id-5\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\"> SimpleImputer<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.impute.SimpleImputer.html\">?<span>Documentation for SimpleImputer</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>SimpleImputer(fill_value=0, strategy='constant')</pre></div> </div></div><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-6\" type=\"checkbox\" ><label for=\"sk-estimator-id-6\" 
class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\"> StandardScaler<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.preprocessing.StandardScaler.html\">?<span>Documentation for StandardScaler</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>StandardScaler()</pre></div> </div></div></div></div></div></div></div><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-7\" type=\"checkbox\" ><label for=\"sk-estimator-id-7\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">bin</label><div class=\"sk-toggleable__content fitted\"><pre>['gender_female', 'gender_male', 'achat_internet', 'categorie_age_0_10', 'categorie_age_10_20', 'categorie_age_20_30', 'categorie_age_30_40', 'categorie_age_40_50', 'categorie_age_50_60', 'categorie_age_60_70', 'categorie_age_70_80', 'categorie_age_plus_80', 'categorie_age_inconnue', 'country_fr', 'is_profession_known', 'is_zipcode_known', 'opt_in']</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-8\" type=\"checkbox\" ><label for=\"sk-estimator-id-8\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nb
2024-03-23 00:04:49 +01:00
],
"text/plain": [
2024-04-03 12:36:47 +02:00
"GridSearchCV(cv=3, error_score='raise',\n",
" estimator=Pipeline(steps=[('preprocessor',\n",
" ColumnTransformer(transformers=[('num',\n",
" Pipeline(steps=[('imputer',\n",
" SimpleImputer(fill_value=0,\n",
" strategy='constant')),\n",
" ('scaler',\n",
" StandardScaler())]),\n",
" ['nb_campaigns',\n",
" 'taux_ouverture_mail',\n",
" 'prop_purchases_internet',\n",
" 'nb_tickets',\n",
" 'nb_purchases',\n",
" 'total_amount',\n",
" 'nb_suppliers',\n",
" 'pu...\n",
" 1.562500e-02, 3.125000e-02, 6.250000e-02, 1.250000e-01,\n",
" 2.500000e-01, 5.000000e-01, 1.000000e+00, 2.000000e+00,\n",
" 4.000000e+00, 8.000000e+00, 1.600000e+01, 3.200000e+01,\n",
" 6.400000e+01]),\n",
" 'LogisticRegression_cv__class_weight': ['balanced',\n",
" {0.0: 0.5834990214856762,\n",
" 1.0: 3.49404706249026}],\n",
" 'LogisticRegression_cv__penalty': ['l1', 'l2']},\n",
" scoring=make_scorer(recall_score, response_method='predict'))"
2024-03-23 00:04:49 +01:00
]
},
2024-04-03 12:36:47 +02:00
"execution_count": 13,
2024-03-23 00:04:49 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2024-04-03 12:36:47 +02:00
"# Model used by the segmentation cells below; the commented-out lines are alternative\n",
"# model names for load_model (presumably other saved models, confirm they exist).\n",
"#model = load_model(type_of_activity, \"LogisticRegression_Benchmark\")\n",
2024-03-28 11:37:23 +01:00
"# model = load_model(type_of_activity, \"randomF_cv\")\n",
2024-04-03 12:36:47 +02:00
"model = load_model(type_of_activity, \"LogisticRegression_cv\")\n",
2024-03-28 11:37:23 +01:00
"model"
2024-03-23 00:04:49 +01:00
]
},
{
"cell_type": "markdown",
"id": "006819e7-e9c5-48d9-85ee-aa43d5e4c9c2",
"metadata": {},
"source": [
"## Quartile clustering"
]
},
{
"cell_type": "code",
2024-04-03 12:36:47 +02:00
"execution_count": 14,
2024-03-23 00:04:49 +01:00
"id": "018d8ff4-3436-4eec-8507-d1a265cbabf1",
"metadata": {},
"outputs": [],
"source": [
2024-03-28 11:37:23 +01:00
"# Score = column 1 of predict_proba (assumed to be the positive class; confirm\n",
"# against model.classes_), followed by the hard class predictions on the test set.\n",
"y_pred_prob = model.predict_proba(X_test)[:, 1]\n",
"y_pred = model.predict(X_test)"
2024-03-23 00:04:49 +01:00
]
},
{
"cell_type": "code",
2024-04-03 12:36:47 +02:00
"execution_count": 15,
2024-03-23 00:04:49 +01:00
"id": "846f53b9-73c2-4a8b-9d9e-f11bf59ce9ba",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
2024-03-30 12:00:49 +01:00
" <th>customer_id</th>\n",
" <th>street_id</th>\n",
" <th>structure_id</th>\n",
" <th>mcp_contact_id</th>\n",
2024-03-23 00:04:49 +01:00
" <th>fidelity</th>\n",
2024-03-30 12:00:49 +01:00
" <th>tenant_id</th>\n",
" <th>is_partner</th>\n",
" <th>deleted_at</th>\n",
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
2024-03-23 00:04:49 +01:00
" <th>...</th>\n",
2024-04-03 12:36:47 +02:00
" <th>purchases_7_2022</th>\n",
2024-03-30 12:00:49 +01:00
" <th>purchases_8_2021</th>\n",
" <th>purchases_8_2022</th>\n",
" <th>purchases_9_2021</th>\n",
" <th>purchases_9_2022</th>\n",
" <th>y_has_purchased</th>\n",
2024-03-23 00:04:49 +01:00
" <th>has_purchased</th>\n",
" <th>has_purchased_estim</th>\n",
" <th>score</th>\n",
" <th>quartile</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
2024-04-03 12:36:47 +02:00
" <td>5_4317407</td>\n",
" <td>969908</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
2024-04-03 12:36:47 +02:00
" <td>6156473.0</td>\n",
" <td>1</td>\n",
" <td>1771</td>\n",
2024-03-30 12:00:49 +01:00
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>...</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>0.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-04-03 12:36:47 +02:00
" <td>0.0</td>\n",
" <td>0.445019</td>\n",
" <td>2</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
2024-04-03 12:36:47 +02:00
" <td>5_477635</td>\n",
" <td>109121</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
2024-04-03 12:36:47 +02:00
" <td>6213652.0</td>\n",
" <td>2</td>\n",
" <td>1771</td>\n",
2024-03-30 12:00:49 +01:00
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>...</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.0</td>\n",
2024-04-03 12:36:47 +02:00
" <td>0.0</td>\n",
" <td>0.382586</td>\n",
2024-03-30 12:00:49 +01:00
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
2024-04-03 12:36:47 +02:00
" <td>5_411639</td>\n",
" <td>92929</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
2024-04-03 12:36:47 +02:00
" <td>6160271.0</td>\n",
" <td>4</td>\n",
" <td>1771</td>\n",
2024-03-30 12:00:49 +01:00
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
2024-04-03 12:36:47 +02:00
" <td>0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-04-03 12:36:47 +02:00
" <td>1.0</td>\n",
" <td>0.916747</td>\n",
" <td>4</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
2024-04-03 12:36:47 +02:00
" <td>5_326623</td>\n",
" <td>79862</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
2024-04-03 12:36:47 +02:00
" <td>6140109.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>1</td>\n",
2024-04-03 12:36:47 +02:00
" <td>1771</td>\n",
2024-03-30 12:00:49 +01:00
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
2024-04-03 12:36:47 +02:00
" <td>1.0</td>\n",
" <td>0.0</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
2024-04-03 12:36:47 +02:00
" <td>0.090534</td>\n",
2024-03-27 18:58:30 +01:00
" <td>1</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
2024-04-03 12:36:47 +02:00
" <td>5_383915</td>\n",
" <td>85421</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
2024-04-03 12:36:47 +02:00
" <td>6149409.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>2</td>\n",
2024-04-03 12:36:47 +02:00
" <td>1771</td>\n",
2024-03-30 12:00:49 +01:00
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
2024-04-03 12:36:47 +02:00
" <td>1</td>\n",
2024-03-30 12:00:49 +01:00
" <td>...</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
2024-04-03 12:36:47 +02:00
" <td>0.0</td>\n",
" <td>0.346571</td>\n",
" <td>2</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
2024-04-03 12:36:47 +02:00
" <td>5_233172</td>\n",
" <td>141401</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
2024-04-03 12:36:47 +02:00
" <td>3324.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>1</td>\n",
2024-04-03 12:36:47 +02:00
" <td>1771</td>\n",
2024-03-30 12:00:49 +01:00
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
2024-04-03 12:36:47 +02:00
" <td>1</td>\n",
2024-03-30 12:00:49 +01:00
" <td>...</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-04-03 12:36:47 +02:00
" <td>1.0</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
2024-04-03 12:36:47 +02:00
" <td>1.0</td>\n",
" <td>0.924684</td>\n",
" <td>4</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
2024-04-03 12:36:47 +02:00
" <td>5_389999</td>\n",
" <td>95759</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
2024-04-03 12:36:47 +02:00
" <td>6151025.0</td>\n",
" <td>1</td>\n",
" <td>1771</td>\n",
2024-03-30 12:00:49 +01:00
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
2024-04-03 12:36:47 +02:00
" <td>0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>...</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-04-03 12:36:47 +02:00
" <td>1.0</td>\n",
" <td>0.569031</td>\n",
" <td>3</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
2024-04-03 12:36:47 +02:00
" <td>5_4292211</td>\n",
" <td>78897</td>\n",
" <td>NaN</td>\n",
" <td>4729841.0</td>\n",
" <td>1</td>\n",
" <td>1771</td>\n",
2024-03-30 12:00:49 +01:00
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-04-03 12:36:47 +02:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.125622</td>\n",
" <td>1</td>\n",
2024-03-30 12:00:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
2024-04-03 12:36:47 +02:00
" <td>5_353553</td>\n",
" <td>84189</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
2024-04-03 12:36:47 +02:00
" <td>6146995.0</td>\n",
" <td>1</td>\n",
" <td>1771</td>\n",
2024-03-30 12:00:49 +01:00
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
2024-03-28 11:37:23 +01:00
" <td>1</td>\n",
2024-03-30 12:00:49 +01:00
" <td>...</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-04-03 12:36:47 +02:00
" <td>0.0</td>\n",
" <td>0.229432</td>\n",
2024-03-30 12:00:49 +01:00
" <td>1</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
2024-04-03 12:36:47 +02:00
" <td>5_401296</td>\n",
" <td>3491</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
2024-04-03 12:36:47 +02:00
" <td>6155457.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>1</td>\n",
2024-04-03 12:36:47 +02:00
" <td>1771</td>\n",
2024-03-30 12:00:49 +01:00
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-23 00:04:49 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-04-03 12:36:47 +02:00
" <td>1.0</td>\n",
" <td>0.503987</td>\n",
" <td>3</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" </tbody>\n",
"</table>\n",
2024-04-03 12:36:47 +02:00
"<p>10 rows × 91 columns</p>\n",
2024-03-23 00:04:49 +01:00
"</div>"
],
"text/plain": [
2024-03-30 12:00:49 +01:00
" customer_id street_id structure_id mcp_contact_id fidelity tenant_id \\\n",
2024-04-03 12:36:47 +02:00
"0 5_4317407 969908 NaN 6156473.0 1 1771 \n",
"1 5_477635 109121 NaN 6213652.0 2 1771 \n",
"2 5_411639 92929 NaN 6160271.0 4 1771 \n",
"3 5_326623 79862 NaN 6140109.0 1 1771 \n",
"4 5_383915 85421 NaN 6149409.0 2 1771 \n",
"5 5_233172 141401 NaN 3324.0 1 1771 \n",
"6 5_389999 95759 NaN 6151025.0 1 1771 \n",
"7 5_4292211 78897 NaN 4729841.0 1 1771 \n",
"8 5_353553 84189 NaN 6146995.0 1 1771 \n",
"9 5_401296 3491 NaN 6155457.0 1 1771 \n",
"\n",
" is_partner deleted_at is_email_true opt_in ... purchases_7_2022 \\\n",
2024-03-30 12:00:49 +01:00
"0 False NaN True 0 ... 0.0 \n",
"1 False NaN True 0 ... 0.0 \n",
2024-04-03 12:36:47 +02:00
"2 False NaN True 0 ... 0.0 \n",
2024-03-30 12:00:49 +01:00
"3 False NaN True 1 ... 0.0 \n",
2024-04-03 12:36:47 +02:00
"4 False NaN True 1 ... 0.0 \n",
"5 False NaN True 1 ... 0.0 \n",
"6 False NaN True 0 ... 0.0 \n",
2024-03-30 12:00:49 +01:00
"7 False NaN True 1 ... 0.0 \n",
"8 False NaN True 1 ... 0.0 \n",
"9 False NaN True 0 ... 0.0 \n",
"\n",
2024-04-03 12:36:47 +02:00
" purchases_8_2021 purchases_8_2022 purchases_9_2021 purchases_9_2022 \\\n",
"0 0.0 0.0 0.0 0.0 \n",
"1 0.0 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 0.0 \n",
"3 0.0 0.0 1.0 0.0 \n",
"4 0.0 0.0 0.0 0.0 \n",
"5 0.0 0.0 0.0 1.0 \n",
"6 0.0 0.0 0.0 0.0 \n",
"7 0.0 0.0 0.0 0.0 \n",
"8 0.0 0.0 0.0 0.0 \n",
"9 0.0 0.0 0.0 0.0 \n",
"\n",
" y_has_purchased has_purchased has_purchased_estim score quartile \n",
"0 0.0 0.0 0.0 0.445019 2 \n",
"1 0.0 0.0 0.0 0.382586 2 \n",
"2 0.0 0.0 1.0 0.916747 4 \n",
"3 0.0 0.0 0.0 0.090534 1 \n",
"4 0.0 0.0 0.0 0.346571 2 \n",
"5 0.0 0.0 1.0 0.924684 4 \n",
"6 0.0 0.0 1.0 0.569031 3 \n",
"7 0.0 0.0 0.0 0.125622 1 \n",
"8 0.0 0.0 0.0 0.229432 1 \n",
"9 0.0 0.0 1.0 0.503987 3 \n",
"\n",
"[10 rows x 91 columns]"
2024-03-23 00:04:49 +01:00
]
},
2024-04-03 12:36:47 +02:00
"execution_count": 15,
2024-03-23 00:04:49 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# NOTE: plain assignment, so X_test_segment and X_test are the same object;\n",
"# every column added below is therefore added to X_test as well.\n",
"X_test_segment = X_test\n",
"\n",
"X_test_segment[\"has_purchased\"] = y_test\n",
"X_test_segment[\"has_purchased_estim\"] = y_pred\n",
"X_test_segment[\"score\"] = y_pred_prob\n",
"\n",
"# Fixed score bands: '1' below 0.25, '2' below 0.5, '3' below 0.75, '4' otherwise.\n",
"# np.select keeps the first matching condition, like the nested np.where it replaces.\n",
"X_test_segment[\"quartile\"] = np.select(\n",
"    [\n",
"        X_test_segment['score'] < 0.25,\n",
"        X_test_segment['score'] < 0.5,\n",
"        X_test_segment['score'] < 0.75,\n",
"    ],\n",
"    ['1', '2', '3'],\n",
"    default='4',\n",
")\n",
"X_test_segment.head(10)"
]
},
2024-03-27 18:58:30 +01:00
{
"cell_type": "code",
2024-04-03 12:36:47 +02:00
"execution_count": 16,
2024-03-27 18:58:30 +01:00
"id": "fb592fe3-ea40-4e83-8fe9-c52b9ee42f2a",
"metadata": {},
"outputs": [],
"source": [
"def df_segment(df, y, model) :\n",
"\n",
" y_pred = model.predict(df)\n",
" y_pred_prob = model.predict_proba(df)[:, 1]\n",
"\n",
" df_segment = df\n",
"\n",
" df_segment[\"has_purchased\"] = y\n",
" df_segment[\"has_purchased_estim\"] = y_pred\n",
" df_segment[\"score\"] = y_pred_prob\n",
" df_segment[\"quartile\"] = np.where(df_segment['score']<0.25, '1',\n",
" np.where(df_segment['score']<0.5, '2',\n",
" np.where(df_segment['score']<0.75, '3', '4')))\n",
"\n",
" return df_segment"
]
},
{
"cell_type": "code",
2024-03-28 11:37:23 +01:00
"execution_count": 88,
2024-03-27 18:58:30 +01:00
"id": "968645d5-58cc-485a-bd8b-99f4cfc26fec",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_1080/2624515794.py:8: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_segment[\"has_purchased\"] = y\n",
"/tmp/ipykernel_1080/2624515794.py:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_segment[\"has_purchased_estim\"] = y_pred\n",
"/tmp/ipykernel_1080/2624515794.py:10: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_segment[\"score\"] = y_pred_prob\n",
"/tmp/ipykernel_1080/2624515794.py:11: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_segment[\"quartile\"] = np.where(df_segment['score']<0.25, '1',\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_tickets</th>\n",
" <th>nb_purchases</th>\n",
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>time_between_purchase</th>\n",
" <th>nb_tickets_internet</th>\n",
" <th>fidelity</th>\n",
" <th>...</th>\n",
2024-03-28 11:37:23 +01:00
" <th>opt_in</th>\n",
2024-03-27 18:58:30 +01:00
" <th>gender_female</th>\n",
" <th>gender_male</th>\n",
" <th>gender_other</th>\n",
" <th>nb_campaigns</th>\n",
" <th>nb_campaigns_opened</th>\n",
" <th>has_purchased</th>\n",
" <th>has_purchased_estim</th>\n",
" <th>score</th>\n",
" <th>quartile</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
2024-03-28 11:37:23 +01:00
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>100.00</td>\n",
" <td>1.0</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>5.177187</td>\n",
" <td>5.177187</td>\n",
" <td>0.000000</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>1</td>\n",
2024-03-27 18:58:30 +01:00
" <td>...</td>\n",
2024-03-28 11:37:23 +01:00
" <td>False</td>\n",
2024-03-27 18:58:30 +01:00
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>0.0</td>\n",
" <td>0.006066</td>\n",
" <td>1</td>\n",
2024-03-27 18:58:30 +01:00
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
2024-03-28 11:37:23 +01:00
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>55.00</td>\n",
" <td>1.0</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>426.265613</td>\n",
" <td>426.265613</td>\n",
" <td>0.000000</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>2</td>\n",
2024-03-27 18:58:30 +01:00
" <td>...</td>\n",
2024-03-28 11:37:23 +01:00
" <td>True</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>1</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.288847</td>\n",
2024-03-27 18:58:30 +01:00
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
2024-03-28 11:37:23 +01:00
" <td>17.0</td>\n",
2024-03-27 18:58:30 +01:00
" <td>1.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>80.00</td>\n",
2024-03-27 18:58:30 +01:00
" <td>1.0</td>\n",
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>436.033437</td>\n",
" <td>436.033437</td>\n",
" <td>0.000000</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>2</td>\n",
2024-03-27 18:58:30 +01:00
" <td>...</td>\n",
2024-03-28 11:37:23 +01:00
" <td>True</td>\n",
2024-03-27 18:58:30 +01:00
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.103264</td>\n",
" <td>1</td>\n",
2024-03-27 18:58:30 +01:00
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
2024-03-28 11:37:23 +01:00
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>120.00</td>\n",
" <td>1.0</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>5.196412</td>\n",
" <td>5.196412</td>\n",
" <td>0.000000</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>1</td>\n",
2024-03-27 18:58:30 +01:00
" <td>...</td>\n",
2024-03-28 11:37:23 +01:00
" <td>False</td>\n",
" <td>1</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>0.0</td>\n",
" <td>0.008928</td>\n",
2024-03-27 18:58:30 +01:00
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
2024-03-28 11:37:23 +01:00
" <td>34.0</td>\n",
" <td>2.0</td>\n",
" <td>416.00</td>\n",
" <td>1.0</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>478.693148</td>\n",
" <td>115.631470</td>\n",
" <td>363.061678</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>4</td>\n",
2024-03-27 18:58:30 +01:00
" <td>...</td>\n",
2024-03-28 11:37:23 +01:00
" <td>False</td>\n",
2024-03-27 18:58:30 +01:00
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.992809</td>\n",
" <td>4</td>\n",
2024-03-27 18:58:30 +01:00
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
2024-03-28 11:37:23 +01:00
" <th>96091</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>67.31</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>278.442257</td>\n",
" <td>278.442257</td>\n",
" <td>0.000000</td>\n",
" <td>1.0</td>\n",
" <td>2</td>\n",
2024-03-27 18:58:30 +01:00
" <td>...</td>\n",
2024-03-28 11:37:23 +01:00
" <td>False</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0</td>\n",
" <td>1</td>\n",
2024-03-28 11:37:23 +01:00
" <td>0</td>\n",
" <td>15.0</td>\n",
" <td>5.0</td>\n",
2024-03-27 18:58:30 +01:00
" <td>1.0</td>\n",
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>0.351762</td>\n",
2024-03-27 18:58:30 +01:00
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
2024-03-28 11:37:23 +01:00
" <th>96092</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>61.41</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>189.207373</td>\n",
" <td>189.207373</td>\n",
" <td>0.000000</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
2024-03-27 18:58:30 +01:00
" <td>...</td>\n",
2024-03-28 11:37:23 +01:00
" <td>False</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>12.0</td>\n",
" <td>9.0</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
" <td>1.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>0.567814</td>\n",
2024-03-27 18:58:30 +01:00
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
2024-03-28 11:37:23 +01:00
" <th>96093</th>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>0.00</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
2024-03-28 11:37:23 +01:00
" <td>-1.000000</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
2024-03-28 11:37:23 +01:00
" <td>True</td>\n",
2024-03-27 18:58:30 +01:00
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>29.0</td>\n",
" <td>3.0</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>0.0</td>\n",
" <td>0.004652</td>\n",
" <td>1</td>\n",
2024-03-27 18:58:30 +01:00
" </tr>\n",
" <tr>\n",
2024-03-28 11:37:23 +01:00
" <th>96094</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>79.43</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>279.312905</td>\n",
" <td>279.312905</td>\n",
" <td>0.000000</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
2024-03-27 18:58:30 +01:00
" <td>...</td>\n",
2024-03-28 11:37:23 +01:00
" <td>False</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>20.0</td>\n",
" <td>4.0</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>0.293042</td>\n",
" <td>2</td>\n",
2024-03-27 18:58:30 +01:00
" </tr>\n",
" <tr>\n",
2024-03-28 11:37:23 +01:00
" <th>96095</th>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>0.00</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
2024-03-28 11:37:23 +01:00
" <td>-1.000000</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>2</td>\n",
2024-03-27 18:58:30 +01:00
" <td>...</td>\n",
2024-03-28 11:37:23 +01:00
" <td>False</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>31.0</td>\n",
" <td>4.0</td>\n",
2024-03-27 18:58:30 +01:00
" <td>0.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>1.0</td>\n",
" <td>0.787852</td>\n",
" <td>4</td>\n",
2024-03-27 18:58:30 +01:00
" </tr>\n",
" </tbody>\n",
"</table>\n",
2024-03-28 11:37:23 +01:00
"<p>96096 rows × 21 columns</p>\n",
2024-03-27 18:58:30 +01:00
"</div>"
],
"text/plain": [
2024-03-28 11:37:23 +01:00
" nb_tickets nb_purchases total_amount nb_suppliers \\\n",
"0 4.0 1.0 100.00 1.0 \n",
"1 1.0 1.0 55.00 1.0 \n",
"2 17.0 1.0 80.00 1.0 \n",
"3 4.0 1.0 120.00 1.0 \n",
"4 34.0 2.0 416.00 1.0 \n",
"... ... ... ... ... \n",
"96091 1.0 1.0 67.31 1.0 \n",
"96092 1.0 1.0 61.41 1.0 \n",
"96093 0.0 0.0 0.00 0.0 \n",
"96094 1.0 1.0 79.43 1.0 \n",
"96095 0.0 0.0 0.00 0.0 \n",
"\n",
" vente_internet_max purchase_date_min purchase_date_max \\\n",
"0 0.0 5.177187 5.177187 \n",
"1 0.0 426.265613 426.265613 \n",
"2 0.0 436.033437 436.033437 \n",
"3 0.0 5.196412 5.196412 \n",
"4 0.0 478.693148 115.631470 \n",
"... ... ... ... \n",
"96091 1.0 278.442257 278.442257 \n",
"96092 1.0 189.207373 189.207373 \n",
"96093 0.0 550.000000 550.000000 \n",
"96094 1.0 279.312905 279.312905 \n",
"96095 0.0 550.000000 550.000000 \n",
"\n",
" time_between_purchase nb_tickets_internet fidelity ... opt_in \\\n",
"0 0.000000 0.0 1 ... False \n",
"1 0.000000 0.0 2 ... True \n",
"2 0.000000 0.0 2 ... True \n",
"3 0.000000 0.0 1 ... False \n",
"4 363.061678 0.0 4 ... False \n",
"... ... ... ... ... ... \n",
"96091 0.000000 1.0 2 ... False \n",
"96092 0.000000 1.0 1 ... False \n",
"96093 -1.000000 0.0 1 ... True \n",
"96094 0.000000 1.0 1 ... False \n",
"96095 -1.000000 0.0 2 ... False \n",
"\n",
" gender_female gender_male gender_other nb_campaigns \\\n",
"0 1 0 0 0.0 \n",
"1 0 1 0 0.0 \n",
"2 1 0 0 0.0 \n",
"3 1 0 0 0.0 \n",
"4 1 0 0 0.0 \n",
"... ... ... ... ... \n",
"96091 0 1 0 15.0 \n",
"96092 0 1 0 12.0 \n",
"96093 1 0 0 29.0 \n",
"96094 0 1 0 20.0 \n",
"96095 0 1 0 31.0 \n",
"\n",
" nb_campaigns_opened has_purchased has_purchased_estim score \\\n",
"0 0.0 0.0 0.0 0.006066 \n",
"1 0.0 1.0 0.0 0.288847 \n",
"2 0.0 0.0 0.0 0.103264 \n",
"3 0.0 0.0 0.0 0.008928 \n",
"4 0.0 1.0 1.0 0.992809 \n",
"... ... ... ... ... \n",
"96091 5.0 1.0 0.0 0.351762 \n",
"96092 9.0 0.0 1.0 0.567814 \n",
"96093 3.0 0.0 0.0 0.004652 \n",
"96094 4.0 0.0 0.0 0.293042 \n",
"96095 4.0 0.0 1.0 0.787852 \n",
"\n",
" quartile \n",
"0 1 \n",
"1 2 \n",
"2 1 \n",
"3 1 \n",
"4 4 \n",
"... ... \n",
"96091 2 \n",
"96092 3 \n",
"96093 1 \n",
"96094 2 \n",
"96095 4 \n",
"\n",
"[96096 rows x 21 columns]"
2024-03-27 18:58:30 +01:00
]
},
2024-03-28 11:37:23 +01:00
"execution_count": 88,
2024-03-27 18:58:30 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2024-03-28 11:37:23 +01:00
"df_segment(X_test, y_test, model)"
2024-03-27 18:58:30 +01:00
]
},
2024-03-23 00:04:49 +01:00
{
"cell_type": "markdown",
"id": "ad16b8ab-7e01-404b-971e-866e9b9d5aa4",
"metadata": {},
"source": [
"## definition of functions to compute the bias of scores and adjust it \n",
"\n",
"Le biais est calculé de la façon suivante. \n",
"En notant $\\hat{p(x_i)}$ le score calculé (estimé par la modélisation) et $p(x_i)$ le vrai score (sans biais), et $\\beta$ le logarithme du biais, on a : \\\n",
"$\\ln{\\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}}} = \\beta + \\ln{\\frac{p(x_i)}{1-p(x_i)}}$ \\\n",
"$ \\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}} = \\exp(\\beta) . \\frac{p(x_i)}{1-p(x_i)} $ , soit : \\\n",
"$p(x_i) = {\\frac{\\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}}}{B+\\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}}}}$ \\\n",
"Ce qu'on appelle biais et qu'on estime dans le code par la suite est : $B=\\exp(\\beta) $. Les probabilités ne sont donc pas biaisées si $B=1$. Il y a surestimation si $B>1$. \n",
"\n",
"On cherche le B qui permette d'ajuster les probabilités de telle sorte que la somme des scores soit égale à la somme des y_has_purchased. Cela revient à résoudre : \n",
"\n",
"\\begin{equation}\n",
"\\sum_{i}{\\frac{\\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}}}{B+\\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}}}} = \\sum_{i}{Y_i}\n",
"\\end{equation}\n",
"\n",
"C'est ce que fait la fonction find_bias. \n",
"\n",
"Note sur les notations : \\\n",
"$\\hat{p(x_i)}$ correspond à ce qu'on appelle le score et $p(x_i)$ à ce qu'on appellera le score adjusted"
]
},
{
"cell_type": "code",
2024-04-03 12:36:47 +02:00
"execution_count": 17,
2024-03-23 00:04:49 +01:00
"id": "f0379536-a6c5-4b16-bde5-d0319ec1b140",
"metadata": {},
"outputs": [],
"source": [
"# compute adjusted score from odd ratios (cf formula above)\n",
"def adjusted_score(odd_ratio, bias) :\n",
" adjusted_score = odd_ratio/(bias+odd_ratio)\n",
" return adjusted_score"
]
},
{
"cell_type": "code",
2024-04-03 12:36:47 +02:00
"execution_count": 18,
2024-03-23 00:04:49 +01:00
"id": "32a0dfd0-f49d-4785-a56f-706d381bfe41",
"metadata": {},
"outputs": [],
"source": [
"# when the score is 1 we cannot compute the odd ratio, so we adjust scores equal to 1\n",
"# we set the second best score instead\n",
"\n",
"def adjust_score_1(score) :\n",
" second_best_score = np.array([element for element in score if element !=1]).max()\n",
" new_score = np.array([element if element!=1 else second_best_score for element in score]) \n",
" return new_score"
]
},
{
"cell_type": "code",
2024-04-03 12:36:47 +02:00
"execution_count": 19,
2024-03-23 00:04:49 +01:00
"id": "2dff1def-02df-413e-afce-b4aeaf7752b6",
"metadata": {},
"outputs": [],
"source": [
"def odd_ratio(score) :\n",
" return score / (1 - score)"
]
},
{
"cell_type": "code",
2024-04-03 12:36:47 +02:00
"execution_count": 20,
2024-03-23 00:04:49 +01:00
"id": "683d71fc-7442-4028-869c-49c57592d6e9",
"metadata": {},
"outputs": [],
"source": [
"# definition of a function that automatically detects the bias\n",
"\n",
"def find_bias(odd_ratios, y_objective, initial_guess=6) :\n",
" \"\"\"\n",
" results = minimize(lambda bias : (sum([adjusted_score(element, bias) for element in list(odd_ratios)]) - y_objective)**2 ,\n",
" initial_guess , method = \"BFGS\")\n",
"\n",
" estimated_bias = results.x[0]\n",
" \"\"\"\n",
"\n",
" # faster method\n",
" bias_estimated = fsolve(lambda bias : sum([adjusted_score(element, bias) for element in list(odd_ratios)]) - y_objective, x0=6)\n",
" \n",
" return bias_estimated[0]"
]
},
{
"cell_type": "code",
2024-04-03 12:36:47 +02:00
"execution_count": 21,
2024-03-28 11:37:23 +01:00
"id": "f17dc6ca-7a48-441b-8c04-11c47b8b9741",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-04-03 12:36:47 +02:00
"0.31861289893787315 0.14317973692973693\n"
2024-03-28 11:37:23 +01:00
]
},
{
"data": {
"text/plain": [
2024-04-03 12:36:47 +02:00
"0.14310053386734936"
2024-03-28 11:37:23 +01:00
]
},
2024-04-03 12:36:47 +02:00
"execution_count": 21,
2024-03-28 11:37:23 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print(X_test_segment[\"score\"].mean(), y_test[\"y_has_purchased\"].mean())\n",
"y_train[\"y_has_purchased\"].mean()"
]
},
{
"cell_type": "code",
2024-04-03 12:36:47 +02:00
"execution_count": 22,
2024-03-23 00:04:49 +01:00
"id": "781b0d40-c954-4c54-830a-e709c8667328",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
2024-04-03 12:36:47 +02:00
"5.939748066330849"
2024-03-23 00:04:49 +01:00
]
},
2024-04-03 12:36:47 +02:00
"execution_count": 22,
2024-03-23 00:04:49 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# computation with the function defined\n",
"\n",
"bias_test_set = find_bias(odd_ratios = odd_ratio(adjust_score_1(X_test_segment[\"score\"])), \n",
" y_objective = y_test[\"y_has_purchased\"].sum(),\n",
" initial_guess=6)\n",
"bias_test_set"
]
},
{
"cell_type": "code",
2024-04-03 12:36:47 +02:00
"execution_count": 23,
2024-03-23 00:04:49 +01:00
"id": "248cb862-418e-4767-9933-70c4885ecf40",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
2024-04-03 12:36:47 +02:00
"6.01952986090399"
2024-03-23 00:04:49 +01:00
]
},
2024-04-03 12:36:47 +02:00
"execution_count": 23,
2024-03-23 00:04:49 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# comparison with bias of the train set\n",
2024-03-28 11:37:23 +01:00
"X_train_score = model.predict_proba(X_train)[:, 1]\n",
2024-03-23 00:04:49 +01:00
"\n",
"bias_train_set = find_bias(odd_ratios = odd_ratio(adjust_score_1(X_train_score)), \n",
" y_objective = y_train[\"y_has_purchased\"].sum(),\n",
2024-03-30 12:00:49 +01:00
" initial_guess=10)\n",
2024-03-23 00:04:49 +01:00
"bias_train_set"
]
},
{
"cell_type": "code",
2024-04-03 12:36:47 +02:00
"execution_count": 24,
2024-03-23 00:04:49 +01:00
"id": "fff6cbe6-7bb3-4732-9b81-b9ac5383bbcf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-04-03 12:36:47 +02:00
"betâ test - betâ train = -0.013342440676233564\n"
2024-03-23 00:04:49 +01:00
]
}
],
"source": [
"print(\"betâ test - betâ train = \",np.log(bias_test_set/bias_train_set))"
]
},
{
"cell_type": "code",
2024-04-03 12:36:47 +02:00
"execution_count": 25,
2024-03-23 00:04:49 +01:00
"id": "f506870d-4a8a-4b2c-8f0b-e0789080b20c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-04-03 12:36:47 +02:00
"mean absolute erreur 0.0009061459618344602\n"
2024-03-23 00:04:49 +01:00
]
}
],
"source": [
"# impact of considering a bias computed on train set instead of test set - totally neglectable\n",
"\n",
"score_adjusted_test = adjusted_score(odd_ratio(adjust_score_1(X_test_segment[\"score\"])), bias = bias_test_set)\n",
"score_adjusted_train = adjusted_score(odd_ratio(adjust_score_1(X_test_segment[\"score\"])), bias = bias_train_set)\n",
"\n",
"print(\"mean absolute erreur\",abs(score_adjusted_test-score_adjusted_train).mean())"
]
},
{
"cell_type": "code",
2024-04-03 12:36:47 +02:00
"execution_count": 26,
2024-03-23 00:04:49 +01:00
"id": "8213d0e4-063b-49fa-90b7-677fc34f4c01",
"metadata": {},
2024-03-30 12:00:49 +01:00
"outputs": [],
2024-03-23 17:23:59 +01:00
"source": [
"# adjust scores accordingly \n",
2024-03-23 00:04:49 +01:00
"\n",
2024-03-23 17:23:59 +01:00
"# X_test_segment[\"score_adjusted\"] = adjusted_score(odd_ratio(adjust_score_1(X_test_segment[\"score\"])), bias = bias_test_set)\n",
"\n",
"# actually, we are not supposed to have X_test, so the biais is estimated on X_train\n",
"# X_test_segment[\"score_adjusted\"] = adjusted_score(odd_ratio(adjust_score_1(X_test_segment[\"score\"])), bias = bias_train_set)\n",
"X_test_segment[\"score_adjusted\"] = score_adjusted_train"
2024-03-23 00:04:49 +01:00
]
},
{
2024-03-23 17:23:59 +01:00
"cell_type": "code",
2024-04-03 12:36:47 +02:00
"execution_count": 27,
2024-03-23 17:23:59 +01:00
"id": "834d3723-2e72-4c65-9c62-e2d595c69461",
2024-03-23 00:04:49 +01:00
"metadata": {},
2024-03-23 17:23:59 +01:00
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-04-03 12:36:47 +02:00
"MSE for score : 0.11809894130837426\n",
"MSE for ajusted score : 0.07434720017843571\n",
"sum of y_has_purchased : 13759.0\n",
"sum of adjusted scores : 13671.922997651252\n"
2024-03-23 17:23:59 +01:00
]
}
],
2024-03-23 00:04:49 +01:00
"source": [
2024-03-23 17:23:59 +01:00
"# check \n",
"\n",
"MSE_score = ((X_test_segment[\"score\"]-X_test_segment[\"has_purchased\"])**2).mean()\n",
"MSE_ajusted_score = ((X_test_segment[\"score_adjusted\"]-X_test_segment[\"has_purchased\"])**2).mean()\n",
"print(f\"MSE for score : {MSE_score}\")\n",
"print(f\"MSE for ajusted score : {MSE_ajusted_score}\")\n",
"\n",
"print(\"sum of y_has_purchased :\",y_test[\"y_has_purchased\"].sum())\n",
"print(\"sum of adjusted scores :\", X_test_segment[\"score_adjusted\"].sum())"
2024-03-23 00:04:49 +01:00
]
},
{
"cell_type": "code",
2024-04-03 12:36:47 +02:00
"execution_count": 28,
2024-03-23 17:23:59 +01:00
"id": "9f30a4dd-a9d8-405a-a7d5-5324ae88cf70",
2024-03-23 00:04:49 +01:00
"metadata": {},
"outputs": [
{
2024-03-23 17:23:59 +01:00
"name": "stdout",
"output_type": "stream",
"text": [
2024-04-03 12:36:47 +02:00
"MAE for score : 0.24742788848313355\n",
"MAE for adjusted score : 0.14205672428104504\n"
2024-03-23 17:23:59 +01:00
]
2024-03-23 00:04:49 +01:00
}
],
"source": [
2024-03-23 17:23:59 +01:00
"# mean absolute error - divided by 2 with out method\n",
"\n",
"MAE_score = abs(X_test_segment[\"score\"]-X_test_segment[\"has_purchased\"]).mean()\n",
"MAE_ajusted_score = abs(X_test_segment[\"score_adjusted\"]-X_test_segment[\"has_purchased\"]).mean()\n",
"print(f\"MAE for score : {MAE_score}\")\n",
"print(f\"MAE for adjusted score : {MAE_ajusted_score}\")"
2024-03-23 00:04:49 +01:00
]
},
{
"cell_type": "code",
2024-04-03 12:36:47 +02:00
"execution_count": 30,
2024-03-23 17:23:59 +01:00
"id": "6f9396db-e213-408c-a596-eaeec3bc79f3",
2024-03-23 00:04:49 +01:00
"metadata": {},
2024-03-24 11:05:28 +01:00
"outputs": [],
2024-03-23 00:04:49 +01:00
"source": [
2024-03-23 17:23:59 +01:00
"# visualization\n",
"\n",
"# histogramme des probas et des probas ajustées\n",
"\n",
2024-03-24 10:42:44 +01:00
"def plot_hist_scores(df, score, score_adjusted, type_of_activity) :\n",
2024-03-23 17:23:59 +01:00
"\n",
" plt.figure()\n",
" plt.hist(df[score], label = \"score\", alpha=0.6)\n",
" plt.hist(df[score_adjusted], label=\"adjusted score\", alpha=0.6)\n",
" plt.legend()\n",
" plt.xlabel(\"probability of a future purchase\")\n",
" plt.ylabel(\"count\")\n",
2024-03-24 10:42:44 +01:00
" plt.title(f\"Comparison between score and adjusted score for {type_of_activity} companies\")\n",
2024-03-24 11:44:22 +01:00
" # plt.show()"
2024-03-24 10:42:44 +01:00
]
},
{
"cell_type": "code",
2024-03-24 11:44:22 +01:00
"execution_count": 64,
"id": "def64c16-f4dd-493c-909c-d886d7f53947",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'projet-bdc2324-team1/Output_expected_CA/sport/hist_score_adjustedsport.png'"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"PATH + file_name + type_of_activity + \".png\""
]
},
{
"cell_type": "code",
2024-04-03 12:36:47 +02:00
"execution_count": 31,
2024-03-24 10:42:44 +01:00
"id": "b478d40d-9677-4204-87bd-16fb0bc1fe9a",
"metadata": {},
"outputs": [
{
"data": {
2024-04-03 12:36:47 +02:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAm8AAAHFCAYAAACkWR6dAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAABpc0lEQVR4nO3deVgVZfsH8O9h34/IKoqIG4LgholIiiu44JKZFoU7pqiI4pJvr0nmq7lrWmqWSy5h5fJmKoEbqSgiiiuuoWKCmLIoKiA8vz/8Ma8DiIDgcez7uS6uOs/cM3PPc+acuX1mOSohhAARERERKYKWphMgIiIiorJj8UZERESkICzeiIiIiBSExRsRERGRgrB4IyIiIlIQFm9ERERECsLijYiIiEhBWLwRERERKQiLNyIiIiIFeSOLt9OnT2PIkCFwdHSEgYEBTExM0KJFC8ydOxf37t3TdHpVbvDgwahTp46m0yjR2rVroVKpcPz48UpbZkxMDMLCwpCRkVFpyyRlO3DgAFQqFQ4cOPDK11PVn79NmzZh8eLFVbLsOnXqYPDgwVWy7NfRtWvX0KNHD1SvXh0qlQohISGaTumV2rVrF8LCwjSdhiK1b98e7du319j637jibdWqVXB3d0dcXBwmTZqEiIgIbNu2De+99x5WrFiBYcOGaTrFKjdt2jRs27ZN02m8MjExMfj8889ZvNFroao/f1VZvP3TjB8/HrGxsVi9ejWOHDmC8ePHazqlV2rXrl34/PPPNZ2GIn3zzTf45ptvNLZ+HY2tuQocOXIEo0aNQpcuXbB9+3bo6+tL07p06YLQ0FBERERoMMOq9fDhQxgZGaFevXqaToXeMEIIPH78GIaGhppO5bXHz1/Vy8/Px5MnT2Tf8RVx9uxZtGrVCn369Hmt8qpqhccKqjgXFxeNrv+NGnmbNWsWVCoVvv322xI/PHp6eujVq5f0uqCgAHPnzkWjRo2gr68Pa2trDBw4EDdv3pTN1759e7i6uuLIkSNo06YNDA0NUadOHaxZswYAsHPnTrRo0QJGRkZwc3MrViCGhYVBpVLh5MmT6Nu3L8zMzKBWq/HRRx/hzp07stjNmzfDx8cHNWrUgKGhIZydnfHJJ58gOztbFjd48GCYmJjgzJkz8PHxgampKTp16iRNK3ra5ueff4aHhwfUajWMjIxQt25dDB06VBZz48YNfPTRR7C2toa+vj6cnZ2xYMECFBQUSDHXrl2DSqXC/PnzsXDhQjg6OsLExASenp44evRoaW+PTHp6OoYMGYLq1avD2NgYPXv2xJ9//lksbs+ePejUqRPMzMxgZGQELy8v7N27V9a3kyZNAgA4OjpCpVJJp7EmTZoEtVqN/Px8KX7s2LFQqVSYN2+e1Hb37l1oaWlh6dKlUltWVhYmTpwIR0dH6OnpoWbNmggJCSn2Pggh8M0336BZs2YwNDSEubk5+vXrV2xbCvehuLg4tG3bVnoPvvzyS1n/Pk9Z3r+MjAyEhoaibt260v7cvXt3XLhwQYq5d+8egoKCULNmTejp6aFu3br49NNPkZOTI1uWSqXCmDFjsGLFCjg7O0NfXx/r1q0DAFy+fBn+/v6y/eTrr79+4TYAwNdff4127drB2toaxsbGcHNzw9y5c5GXl1fh/rpw4QK6du0KIyMjWFpaYuTIkbh//36Z8rly5QqGDBmCBg0awMjICDVr1kTPnj1x5syZYrFlXU/Rz1/hZ2bt2rXFYlUqley01Z07dzBixAjY29tDX18fVlZW8PLywp49e6R+2blzJ65fvy7t6yqVSpo/NzcXM2fOlL7TrKysMGTIkGLfM3l5eZg8eTJsbW1hZGSEt99+G8eOHStTnwHA8uXL0bRpU5iYmMDU1BSNGjXCv/71L1nMX3/9JW2Lnp4e7Ozs0K9fP9y+fVuKKc93zty5czFz5kw4OjpCX18f+/fvBwAcP34cvXr1QvXq1WFgYIDmzZvjp59+KjX/wt
PdV65cwe7du6V+vHbtWqXlVZIXfY4L89qwYQMmTJgAW1tbGBoawtvbGydPniy2vF9//RWenp4wMjKCqakpunTpgiNHjshiCo8/J06cQL9+/WBubo569eph8ODB0uf22X2psA+eJyIiAp06dZK2wdnZGbNnz65wXqdPn8Z7770HtVqN6tWrY8KECXjy5AkuXryIrl27wtTUFHXq1MHcuXNl85enr44fP473338fderUkY7fH3zwAa5fvy6LK7ysZ//+/Rg1ahQsLS1hYWGBvn374tatW7LYkk6blvXzt2/fPrRv3x4WFhYwNDRE7dq18e677+Lhw4el9r2MeEM8efJEGBkZCQ8PjzLPM2LECAFAjBkzRkRERIgVK1YIKysrYW9vL+7cuSPFeXt7CwsLC+Hk5CS+//578fvvvws/Pz8BQHz++efCzc1N/Pjjj2LXrl2idevWQl9fX/z111/S/NOnTxcAhIODg5g0aZL4/fffxcKFC4WxsbFo3ry5yM3NlWK/+OILsWjRIrFz505x4MABsWLFCuHo6Cg6dOggy33QoEFCV1dX1KlTR8yePVvs3btX/P7779I0BwcHKTYmJkaoVCrx/vvvi127dol9+/aJNWvWiICAACkmLS1N1KxZU1hZWYkVK1aIiIgIMWbMGAFAjBo1SopLSkoSAESdOnVE165dxfbt28X27duFm5ubMDc3FxkZGaX2+Zo1awQAYW9vL4YOHSp2794tvv32W2FtbS3s7e1Fenq6FLt+/XqhUqlEnz59xNatW8WOHTuEn5+f0NbWFnv27BFCCJGcnCzGjh0rAIitW7eKI0eOiCNHjojMzEwREREhAIiYmBhpmY0aNRKGhoaiS5cuUtvmzZsFAHH+/HkhhBDZ2dmiWbNmwtLSUixcuFDs2bNHLFmyRKjVatGxY0dRUFAgzRsYGCh0dXVFaGioiIiIEJs2bRKNGjUSNjY2IjU1tdg+1KBBA7FixQoRFRUlgoKCBACxbt26UvusLO9fVlaWaNy4sTA2NhYzZswQv//+u9iyZYsYN26c2LdvnxBCiEePHokmTZoIY2NjMX/+fBEZGSmmTZsmdHR0RPfu3WXrBCBq1qwpmjRpIjZt2iT27dsnzp49K86dOyfUarVwc3MTP/zwg4iMjBShoaFCS0tLhIWFlbodQggxfvx4sXz5chERESH27dsnFi1aJCwtLcWQIUNkcWXtr9TUVGFtbS1q1qwp1qxZI3bt2iU+/PBDUbt2bQFA7N+/v9R8oqOjRWhoqPjll19EdHS02LZtm+jTp48wNDQUFy5cqNB6in7+Cj8za9asKbZ+AGL69OnSa19fX2FlZSW+/fZbceDAAbF9+3bx2WefifDwcCGEEOfOnRNeXl7C1tZW2tePHDkihBAiPz9fdO3aVRgbG4vPP/9cREVFie+++07UrFlTuLi4iIcPH8pyVKlUYtKkSSIyMlIsXLhQ1KxZU5iZmYlBgwaV2mc//vijACDGjh0rIiMjxZ49e8SKFStEcHCwFHPz5k1Ro0YN2Wdo8+bNYujQoSIxMVEIUf7vnJo1a4oOHTqIX375RURGRoqkpCSxb98+oaenJ9q2bSs2b94sIiIixODBg5/b34UyMzPFkSNHhK2trfDy8pL68fHjx5WSV0nK8jnev3+/9P3Yu3dvsWPHDrFhwwZRv359YWZmJq5evSrFbty4UQAQPj4+Yvv27WLz5s3C3d1d6OnpiYMHD0pxzx5/pkyZIqKiosT27dvFlStXRL9+/QQA2b70+PHj5/bbd999J1QqlWjfvr3YtGmT2LNnj/jmm29EUFBQhfNycnISX3zxhYiKihKTJ0+WjsmNGjUSX331lYiKihJDhgwRAMSWLVsq1Fc///yz+Oyzz8S2bdtEdHS0CA8PF97e3sLKykp2rC88PtWtW1eMHTtW/P777+K7774T5ubmxY7B3t7ewtvbW3pd1s9fUlKSMDAwEF26dBHbt28XBw4cEBs3bhQBAQGy49
+LvDHFW2pqqgAg3n///TLFJyYmCgCynU4IIWJjYwUA8a9//Utq8/b2FgDE8ePHpba7d+8KbW1tYWhoKCvUEhISBAD
2024-03-24 10:42:44 +01:00
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
2024-03-24 11:44:22 +01:00
"plot_hist_scores(X_test_segment, score = \"score\", score_adjusted = \"score_adjusted\", type_of_activity = type_of_activity)"
2024-03-23 00:04:49 +01:00
]
},
{
2024-03-24 10:42:44 +01:00
"cell_type": "code",
2024-03-24 11:05:28 +01:00
"execution_count": 40,
2024-03-24 10:42:44 +01:00
"id": "add631d7-0757-45a5-bb5b-f7f4b4baa961",
2024-03-23 00:04:49 +01:00
"metadata": {},
2024-03-24 10:42:44 +01:00
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"projet-bdc2324-team1/Output_expected_CA/sport/\n"
]
}
],
2024-03-23 00:04:49 +01:00
"source": [
2024-03-24 10:42:44 +01:00
"# define path so save graphics\n",
"\n",
"# define type of activity \n",
"type_of_activity = \"sport\"\n",
"PATH = f\"projet-bdc2324-team1/Output_expected_CA/{type_of_activity}/\"\n",
"print(PATH)"
2024-03-23 00:04:49 +01:00
]
},
{
"cell_type": "code",
2024-03-24 11:44:22 +01:00
"execution_count": 68,
2024-03-24 10:42:44 +01:00
"id": "3a5b5bd9-e033-4436-8c56-bf5fb61df87f",
2024-03-23 00:04:49 +01:00
"metadata": {},
2024-03-24 11:44:22 +01:00
"outputs": [],
2024-03-23 17:23:59 +01:00
"source": [
2024-03-24 10:42:44 +01:00
"# export png \n",
"\n",
"# plot adjusted scores and save (to be tested)\n",
"plot_hist_scores(X_test_segment, score = \"score\", score_adjusted = \"score_adjusted\", type_of_activity = type_of_activity)\n",
"\n",
"image_buffer = io.BytesIO()\n",
"plt.savefig(image_buffer, format='png')\n",
"image_buffer.seek(0)\n",
2024-03-24 11:44:22 +01:00
"file_name = \"hist_score_adjusted_\"\n",
2024-03-24 10:42:44 +01:00
"FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + \".png\"\n",
"with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:\n",
" s3_file.write(image_buffer.read())\n",
"plt.close()"
2024-03-23 17:23:59 +01:00
]
},
2024-03-24 10:42:44 +01:00
{
"cell_type": "markdown",
"id": "e6fae260-fab8-4f51-90dc-9b6d7314c77b",
"metadata": {},
"source": [
"## Compute number of tickets and CA by segment with the recalibrated score"
]
},
{
"cell_type": "code",
2024-04-03 12:36:47 +02:00
"execution_count": 32,
2024-03-24 10:42:44 +01:00
"id": "90c4c2b5-0ede-4001-889f-749cfbd9df04",
2024-03-23 17:23:59 +01:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
2024-03-23 00:04:49 +01:00
"\n",
2024-03-23 17:23:59 +01:00
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
2024-03-23 00:04:49 +01:00
"\n",
2024-03-23 17:23:59 +01:00
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>quartile</th>\n",
2024-03-24 10:42:44 +01:00
" <th>score (%)</th>\n",
" <th>score adjusted (%)</th>\n",
" <th>has purchased (%)</th>\n",
2024-03-23 17:23:59 +01:00
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
2024-04-03 12:36:47 +02:00
" <td>10.20</td>\n",
" <td>1.94</td>\n",
" <td>1.19</td>\n",
2024-03-23 17:23:59 +01:00
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
2024-04-03 12:36:47 +02:00
" <td>37.08</td>\n",
" <td>9.12</td>\n",
" <td>10.62</td>\n",
2024-03-23 17:23:59 +01:00
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
2024-04-03 12:36:47 +02:00
" <td>62.07</td>\n",
" <td>22.00</td>\n",
" <td>28.67</td>\n",
2024-03-23 17:23:59 +01:00
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
2024-04-03 12:36:47 +02:00
" <td>90.35</td>\n",
" <td>67.16</td>\n",
" <td>63.09</td>\n",
2024-03-23 17:23:59 +01:00
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
2024-03-24 10:42:44 +01:00
" quartile score (%) score adjusted (%) has purchased (%)\n",
2024-04-03 12:36:47 +02:00
"0 1 10.20 1.94 1.19\n",
"1 2 37.08 9.12 10.62\n",
"2 3 62.07 22.00 28.67\n",
"3 4 90.35 67.16 63.09"
2024-03-23 00:04:49 +01:00
]
},
2024-04-03 12:36:47 +02:00
"execution_count": 32,
2024-03-23 00:04:49 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2024-03-24 10:42:44 +01:00
"X_test_table_adjusted_scores = (100 * X_test_segment.groupby(\"quartile\")[[\"score\",\"score_adjusted\", \"has_purchased\"]].mean()).round(2).reset_index()\n",
"X_test_table_adjusted_scores = X_test_table_adjusted_scores.rename(columns = {col : f\"{col.replace('_', ' ')} (%)\" for col in X_test_table_adjusted_scores.columns if col in [\"score\",\"score_adjusted\", \"has_purchased\"]})\n",
"X_test_table_adjusted_scores"
2024-03-23 00:04:49 +01:00
]
},
{
"cell_type": "code",
2024-03-24 10:42:44 +01:00
"execution_count": 162,
"id": "d0b8740c-cf48-4a3e-83cb-23d95059f62f",
2024-03-23 00:04:49 +01:00
"metadata": {},
"outputs": [
{
2024-03-23 17:23:59 +01:00
"data": {
"text/plain": [
2024-03-24 10:42:44 +01:00
"'\\\\begin{tabular}{lrrr}\\n\\\\toprule\\nquartile & score (%) & score adjusted (%) & has purchased (%) \\\\\\\\\\n\\\\midrule\\n1 & 13.250000 & 2.510000 & 1.570000 \\\\\\\\\\n2 & 33.890000 & 8.000000 & 9.850000 \\\\\\\\\\n3 & 63.060000 & 22.580000 & 21.470000 \\\\\\\\\\n4 & 90.520000 & 66.200000 & 65.010000 \\\\\\\\\\n\\\\bottomrule\\n\\\\end{tabular}\\n'"
2024-03-23 17:23:59 +01:00
]
},
2024-03-24 10:42:44 +01:00
"execution_count": 162,
2024-03-23 17:23:59 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2024-03-24 10:42:44 +01:00
"X_test_table_adjusted_scores.to_latex(index=False)"
]
},
{
"cell_type": "code",
2024-03-24 11:05:28 +01:00
"execution_count": 43,
2024-03-24 10:42:44 +01:00
"id": "d6a04d3e-c454-43e4-ae4c-0746e928575b",
"metadata": {},
2024-03-24 11:05:28 +01:00
"outputs": [],
2024-03-24 10:42:44 +01:00
"source": [
"# comparison between score and adjusted score - export csv associated\n",
"\n",
2024-03-26 12:20:03 +01:00
"file_name = \"table_adjusted_score_\"\n",
2024-03-24 10:42:44 +01:00
"FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + \".csv\"\n",
"with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n",
" X_test_table_adjusted_scores.to_csv(file_out, index = False)"
2024-03-23 17:23:59 +01:00
]
},
{
"cell_type": "code",
2024-04-03 12:36:47 +02:00
"execution_count": 129,
2024-03-23 17:23:59 +01:00
"id": "a974589f-7952-4db2-bebf-7b69c6b09372",
"metadata": {},
"outputs": [],
"source": [
2024-03-26 12:20:03 +01:00
"def project_tickets_CA (df, nb_purchases, nb_tickets, total_amount, score_adjusted, duration_ref, duration_projection) :\n",
2024-03-23 17:23:59 +01:00
" \n",
" duration_ratio = duration_ref/duration_projection\n",
"\n",
" df_output = df\n",
2024-04-03 12:36:47 +02:00
" \n",
" # project number of tickets : at least 1 ticket purchased if the customer purchased\n",
" df_output.loc[:,\"nb_tickets_projected\"] = df_output.loc[:,nb_tickets].apply(lambda x : max(1, x /duration_ratio))\n",
"\n",
" # project amount : if the customer buys a ticket, we expect the amount to be at least the average price of tickets \n",
" # for customers purchasing exactly one ticket\n",
" if df_output.loc[df_output[nb_tickets]==1].shape[0] > 0 :\n",
" avg_price = df_output.loc[df_output[nb_tickets]==1][total_amount].mean()\n",
" else :\n",
" avg_price = df_output[total_amount].mean()\n",
" # df_output.loc[:,\"total_amount_projected\"] = df_output.loc[:,total_amount] / duration_ratio\n",
" # df_output.loc[:,\"total_amount_projected\"] = df_output.loc[:,total_amount].apply(lambda x : max(avg_ticket_price, x/duration_ratio))\n",
2024-03-23 17:23:59 +01:00
"\n",
2024-04-03 12:36:47 +02:00
" # we compute the avg price of ticket for each customer\n",
" df_output[\"avg_ticket_price\"] = df_output[total_amount]/df_output[nb_tickets]\n",
"\n",
" # correct negatives total amounts\n",
" df_output.loc[:,\"total_amount_corrected\"] = np.where(df_output[total_amount] < 0, \n",
" avg_price * df_output[nb_tickets],\n",
" df_output[total_amount])\n",
" \n",
" df_output.loc[:,\"total_amount_projected\"] = np.where(\n",
" # if no ticket bought in the past, we take the average price\n",
" df_output[nb_tickets]==0, avg_price,\n",
" # if avg prices of tickets are negative, we recompute the expected amount based on the avg price of a ticket\n",
" # observed on the whole population\n",
" np.where(X_test_segment[\"avg_ticket_price\"] < 0, avg_price * df_output.loc[:,\"nb_tickets_projected\"],\n",
" # else, the amount projected is the average price of tickets bought by the customer * nb tickets projected\n",
" df_output[\"avg_ticket_price\"] * df_output.loc[:,\"nb_tickets_projected\"])\n",
" )\n",
2024-03-23 17:23:59 +01:00
" \n",
" df_output.loc[:,\"nb_tickets_expected\"] = df_output.loc[:,score_adjusted] * df_output.loc[:,\"nb_tickets_projected\"]\n",
" df_output.loc[:,\"total_amount_expected\"] = df_output.loc[:,score_adjusted] * df_output.loc[:,\"total_amount_projected\"]\n",
"\n",
2024-03-26 12:20:03 +01:00
" df_output.loc[:,\"pace_purchase\"] = (duration_ref/df_output.loc[:,nb_purchases]).apply(lambda x : np.nan if x==np.inf else x)\n",
" \n",
2024-03-23 17:23:59 +01:00
" return df_output\n"
]
},
{
"cell_type": "code",
2024-04-03 12:36:47 +02:00
"execution_count": 120,
"id": "87fb8e1c-3567-46df-9e98-197b7ca3becd",
2024-03-23 17:23:59 +01:00
"metadata": {},
"outputs": [
2024-03-23 00:04:49 +01:00
{
"data": {
2024-04-03 12:36:47 +02:00
"text/plain": [
"array([25., 92., 45., ..., 0., 0., 0.])"
]
},
"execution_count": 120,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.where(X_test_segment[\"total_amount\"] < 0, avg_price * X_test_segment[\"nb_tickets\"],\n",
" X_test_segment[\"total_amount\"]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 121,
"id": "dc0cdf9c-c55c-4085-80a6-c2131bb22ad4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 25.00\n",
"1 92.00\n",
"2 45.00\n",
"3 10.00\n",
"4 127.00\n",
" ... \n",
"96091 0.00\n",
"96092 100.89\n",
"96093 0.00\n",
"96094 0.00\n",
"96095 0.00\n",
"Name: total_amount, Length: 96096, dtype: float64"
]
},
"execution_count": 121,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
" X_test_segment[\"total_amount\"]"
]
},
{
"cell_type": "code",
"execution_count": 105,
"id": "51455654-e6de-4608-8fbe-594d7fcd5b53",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(0, 98)"
]
},
"execution_count": 105,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test_segment.loc[X_test_segment[\"nb_tickets\"]==-1].shape[0°"
]
},
{
"cell_type": "code",
"execution_count": 71,
"id": "a0d08a46-93d0-425a-9a56-28cf8bfd93e9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 4.410500e+04\n",
"mean 4.640310e+02\n",
"std 1.049793e+04\n",
"min -2.064700e+04\n",
"25% 3.000000e+01\n",
"50% 6.900000e+01\n",
"75% 1.339900e+02\n",
"max 1.209751e+06\n",
"Name: total_amount, dtype: float64"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"duration_ratio = 17/12\n",
"X_test_segment.loc[X_test_segment[\"nb_tickets\"]>0][\"total_amount\"].describe()"
]
},
{
"cell_type": "code",
"execution_count": 74,
"id": "dc7de319-6d22-44f0-9e58-492088b0dd5f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 96096.000000\n",
"mean 183.851977\n",
"std 5021.379770\n",
"min 48.713098\n",
"25% 48.713098\n",
"50% 48.713098\n",
"75% 48.713098\n",
"max 853942.164706\n",
"Name: total_amount, dtype: float64"
]
},
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"avg_price = X_test_segment.loc[X_test_segment[\"nb_tickets\"]==1][\"total_amount\"].mean()\n",
"X_test_segment[\"total_amount\"].apply(lambda x : max(avg_price, x/duration_ratio)).describe()"
]
},
{
"cell_type": "code",
"execution_count": 76,
"id": "8aa50962-067b-493a-8766-258547da8bcd",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 96096.000000\n",
"mean 150.335598\n",
"std 5022.896337\n",
"min -14574.352941\n",
"25% 0.000000\n",
"50% 0.000000\n",
"75% 42.352941\n",
"max 853942.164706\n",
"Name: total_amount, dtype: float64"
]
},
"execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test_segment[\"total_amount\"].apply(lambda x : x/duration_ratio).describe()"
]
},
{
"cell_type": "code",
"execution_count": 57,
"id": "f2f04205-7b8b-4978-9b4f-1c83034628fe",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 1.411765\n",
"1 1.411765\n",
"2 2.117647\n",
"3 0.705882\n",
"4 5.647059\n",
" ... \n",
"96091 0.000000\n",
"96092 1.411765\n",
"96093 0.000000\n",
"96094 0.000000\n",
"96095 0.000000\n",
"Name: nb_tickets, Length: 96096, dtype: float64"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test_segment.loc[:,\"nb_tickets\"]/duration_ratio"
]
},
{
"cell_type": "code",
"execution_count": 81,
"id": "140e09b9-f6b8-4075-b380-86851e1596f1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 96096.000000\n",
"mean 176.690937\n",
"std 5022.166115\n",
"min -14574.352941\n",
"25% 48.713098\n",
"50% 48.713098\n",
"75% 48.713098\n",
"max 853942.164706\n",
"dtype: float64"
]
},
"execution_count": 81,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.Series(np.where(X_test_segment[\"nb_tickets\"]==0, avg_price, X_test_segment[\"nb_tickets_projected\"] * X_test_segment[\"total_amount\"]/X_test_segment[\"nb_tickets\"])).describe()"
]
},
{
"cell_type": "code",
"execution_count": 87,
"id": "b2c8c7dd-9cd2-40b8-945f-0daf27b3b66b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 162.000000\n",
"mean 51.283951\n",
"std 135.183724\n",
"min 1.000000\n",
"25% 2.000000\n",
"50% 6.000000\n",
"75% 31.500000\n",
"max 1038.000000\n",
"Name: nb_tickets, dtype: float64"
]
},
"execution_count": 87,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test_segment[X_test_segment[\"total_amount\"]<0][\"nb_tickets\"].describe()"
]
},
{
"cell_type": "code",
"execution_count": 89,
"id": "44ce62e3-fae6-4192-b8dd-386fd84fed22",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 44105.000000\n",
"mean 35.661188\n",
"std 71.477667\n",
"min -216.368182\n",
"25% 10.000000\n",
"50% 25.000000\n",
"75% 48.720000\n",
"max 4000.000000\n",
"Name: avg_ticket_price, dtype: float64"
]
},
"execution_count": 89,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# code pr projet revenue\n",
"\n",
"X_test_segment[\"avg_ticket_price\"] = X_test_segment[\"total_amount\"]/X_test_segment[\"nb_tickets\"]\n",
"X_test_segment[\"avg_ticket_price\"].describe()"
]
},
{
"cell_type": "code",
"execution_count": 97,
"id": "e1c0671a-2b5f-48bf-b964-6bee8a4223ac",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 96096.000000\n",
"mean 180.394197\n",
"std 5025.591726\n",
"min 0.000000\n",
"25% 48.713098\n",
"50% 48.713098\n",
"75% 48.713098\n",
"max 853942.164706\n",
"dtype: float64"
]
},
"execution_count": 97,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.Series(\n",
" np.where(X_test_segment[\"nb_tickets\"]==0, avg_price,\n",
" \n",
" np.where(X_test_segment[\"avg_ticket_price\"] < 0, avg_price * X_test_segment[\"nb_tickets\"] / duration_ratio,\n",
" X_test_segment[\"avg_ticket_price\"] * X_test_segment[\"nb_tickets\"] / duration_ratio)\n",
" )\n",
").describe()"
]
},
{
"cell_type": "code",
"execution_count": 100,
"id": "6c1e0649-3be1-4754-a86c-24b46a12d523",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 5058.000000\n",
"mean 13.671807\n",
"std 155.341970\n",
"min 1.000000\n",
"25% 1.000000\n",
"50% 2.000000\n",
"75% 4.000000\n",
"max 8250.000000\n",
"Name: nb_tickets, dtype: float64"
]
},
"execution_count": 100,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test_segment[X_test_segment[\"avg_ticket_price\"] == 0][\"nb_tickets\"].describe()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2a4d1b0a-fe16-49e7-9b61-d822d2ed062a",
"metadata": {},
"outputs": [],
"source": [
"df['colonne2'] = np.where(df['colonne1'] > seuil2, df['colonne2'] * 2, # Si colonne1 > seuil2\n",
" np.where(df['colonne1'] > seuil1, df['colonne2'] + 1, df['colonne2'])) "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fa87726a-dee2-4b15-af2d-b22583a9eb53",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 132,
"id": "dd8a52e1-d06e-4790-8687-8e58e3e6b84e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>customer_id</th>\n",
" <th>street_id</th>\n",
" <th>structure_id</th>\n",
" <th>mcp_contact_id</th>\n",
" <th>fidelity</th>\n",
" <th>tenant_id</th>\n",
" <th>is_partner</th>\n",
" <th>deleted_at</th>\n",
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
" <th>...</th>\n",
" <th>score</th>\n",
" <th>quartile</th>\n",
" <th>score_adjusted</th>\n",
" <th>nb_tickets_projected</th>\n",
" <th>total_amount_projected</th>\n",
" <th>nb_tickets_expected</th>\n",
" <th>total_amount_expected</th>\n",
" <th>pace_purchase</th>\n",
" <th>avg_ticket_price</th>\n",
" <th>total_amount_corrected</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>5_4317407</td>\n",
" <td>969908</td>\n",
" <td>NaN</td>\n",
" <td>6156473.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>1</td>\n",
2024-04-03 12:36:47 +02:00
" <td>1771</td>\n",
" <td>False</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
2024-04-03 12:36:47 +02:00
" <td>True</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.445019</td>\n",
" <td>2</td>\n",
" <td>0.117551</td>\n",
" <td>1.411765</td>\n",
" <td>17.647059</td>\n",
" <td>0.165955</td>\n",
" <td>2.074432</td>\n",
" <td>17.000000</td>\n",
" <td>12.500</td>\n",
" <td>25.00</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
2024-04-03 12:36:47 +02:00
" <th>1</th>\n",
" <td>5_477635</td>\n",
" <td>109121</td>\n",
" <td>NaN</td>\n",
" <td>6213652.0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>2</td>\n",
2024-04-03 12:36:47 +02:00
" <td>1771</td>\n",
" <td>False</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
2024-04-03 12:36:47 +02:00
" <td>True</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0</td>\n",
2024-04-03 12:36:47 +02:00
" <td>...</td>\n",
" <td>0.382586</td>\n",
" <td>2</td>\n",
" <td>0.093333</td>\n",
" <td>1.411765</td>\n",
" <td>64.941176</td>\n",
" <td>0.131765</td>\n",
" <td>6.061181</td>\n",
" <td>8.500000</td>\n",
" <td>46.000</td>\n",
" <td>92.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>5_411639</td>\n",
" <td>92929</td>\n",
" <td>NaN</td>\n",
" <td>6160271.0</td>\n",
" <td>4</td>\n",
" <td>1771</td>\n",
2024-03-30 12:00:49 +01:00
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
2024-04-03 12:36:47 +02:00
" <td>0</td>\n",
2024-03-23 00:04:49 +01:00
" <td>...</td>\n",
2024-04-03 12:36:47 +02:00
" <td>0.916747</td>\n",
" <td>4</td>\n",
" <td>0.646556</td>\n",
" <td>2.117647</td>\n",
" <td>31.764706</td>\n",
" <td>1.369178</td>\n",
" <td>20.537670</td>\n",
" <td>5.666667</td>\n",
" <td>15.000</td>\n",
" <td>45.00</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
2024-04-03 12:36:47 +02:00
" <td>5_326623</td>\n",
" <td>79862</td>\n",
" <td>NaN</td>\n",
" <td>6140109.0</td>\n",
" <td>1</td>\n",
" <td>1771</td>\n",
2024-03-30 12:00:49 +01:00
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
2024-03-28 11:37:23 +01:00
" <td>1</td>\n",
2024-03-23 00:04:49 +01:00
" <td>...</td>\n",
2024-04-03 12:36:47 +02:00
" <td>0.090534</td>\n",
2024-03-27 18:58:30 +01:00
" <td>1</td>\n",
2024-04-03 12:36:47 +02:00
" <td>0.016268</td>\n",
" <td>1.000000</td>\n",
" <td>10.000000</td>\n",
" <td>0.016268</td>\n",
" <td>0.162683</td>\n",
" <td>17.000000</td>\n",
" <td>10.000</td>\n",
" <td>10.00</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
2024-04-03 12:36:47 +02:00
" <td>5_383915</td>\n",
" <td>85421</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
2024-04-03 12:36:47 +02:00
" <td>6149409.0</td>\n",
" <td>2</td>\n",
" <td>1771</td>\n",
2024-03-30 12:00:49 +01:00
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
2024-04-03 12:36:47 +02:00
" <td>0.346571</td>\n",
" <td>2</td>\n",
" <td>0.080976</td>\n",
" <td>5.647059</td>\n",
" <td>89.647059</td>\n",
" <td>0.457279</td>\n",
" <td>7.259298</td>\n",
" <td>8.500000</td>\n",
" <td>15.875</td>\n",
" <td>127.00</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
2024-04-03 12:36:47 +02:00
" <th>96091</th>\n",
" <td>9_91205</td>\n",
" <td>76215</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
2024-04-03 12:36:47 +02:00
" <td>47280.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0</td>\n",
2024-04-03 12:36:47 +02:00
" <td>1490</td>\n",
2024-03-30 12:00:49 +01:00
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>1</td>\n",
2024-03-27 18:58:30 +01:00
" <td>...</td>\n",
2024-04-03 12:36:47 +02:00
" <td>0.014966</td>\n",
" <td>1</td>\n",
" <td>0.002518</td>\n",
" <td>1.000000</td>\n",
" <td>48.713098</td>\n",
" <td>0.002518</td>\n",
" <td>0.122642</td>\n",
" <td>NaN</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
2024-04-03 12:36:47 +02:00
" <td>0.00</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
2024-04-03 12:36:47 +02:00
" <th>96092</th>\n",
" <td>9_369887</td>\n",
" <td>815891</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
2024-04-03 12:36:47 +02:00
" <td>30764537.0</td>\n",
" <td>4</td>\n",
" <td>1490</td>\n",
2024-03-30 12:00:49 +01:00
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
2024-03-23 00:04:49 +01:00
" <td>...</td>\n",
2024-04-03 12:36:47 +02:00
" <td>0.834257</td>\n",
" <td>4</td>\n",
" <td>0.455392</td>\n",
" <td>1.411765</td>\n",
" <td>71.216471</td>\n",
" <td>0.642906</td>\n",
" <td>32.431379</td>\n",
" <td>8.500000</td>\n",
" <td>50.445</td>\n",
" <td>100.89</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
2024-04-03 12:36:47 +02:00
" <th>96093</th>\n",
" <td>9_1007562</td>\n",
" <td>1</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
2024-04-03 12:36:47 +02:00
" <td>0</td>\n",
" <td>1490</td>\n",
2024-03-30 12:00:49 +01:00
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
2024-03-28 11:37:23 +01:00
" <td>...</td>\n",
2024-04-03 12:36:47 +02:00
" <td>0.062886</td>\n",
" <td>1</td>\n",
" <td>0.011025</td>\n",
" <td>1.000000</td>\n",
" <td>48.713098</td>\n",
" <td>0.011025</td>\n",
" <td>0.537071</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.00</td>\n",
2024-03-28 11:37:23 +01:00
" </tr>\n",
" <tr>\n",
2024-04-03 12:36:47 +02:00
" <th>96094</th>\n",
" <td>9_15037</td>\n",
" <td>12992</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
2024-04-03 12:36:47 +02:00
" <td>2213448.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0</td>\n",
2024-04-03 12:36:47 +02:00
" <td>1490</td>\n",
2024-03-30 12:00:49 +01:00
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
2024-04-03 12:36:47 +02:00
" <td>0.068998</td>\n",
" <td>1</td>\n",
" <td>0.012162</td>\n",
" <td>1.000000</td>\n",
" <td>48.713098</td>\n",
" <td>0.012162</td>\n",
" <td>0.592451</td>\n",
" <td>NaN</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
2024-04-03 12:36:47 +02:00
" <td>0.00</td>\n",
2024-03-30 12:00:49 +01:00
" </tr>\n",
" <tr>\n",
2024-04-03 12:36:47 +02:00
" <th>96095</th>\n",
" <td>9_135370</td>\n",
" <td>76215</td>\n",
2024-03-30 12:00:49 +01:00
" <td>NaN</td>\n",
2024-04-03 12:36:47 +02:00
" <td>2164740.0</td>\n",
2024-03-30 12:00:49 +01:00
" <td>0</td>\n",
2024-04-03 12:36:47 +02:00
" <td>1490</td>\n",
2024-03-30 12:00:49 +01:00
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>True</td>\n",
" <td>1</td>\n",
2024-03-23 00:04:49 +01:00
" <td>...</td>\n",
2024-04-03 12:36:47 +02:00
" <td>0.018486</td>\n",
" <td>1</td>\n",
" <td>0.003119</td>\n",
" <td>1.000000</td>\n",
" <td>48.713098</td>\n",
" <td>0.003119</td>\n",
" <td>0.151938</td>\n",
2024-03-26 12:20:03 +01:00
" <td>NaN</td>\n",
2024-04-03 12:36:47 +02:00
" <td>NaN</td>\n",
" <td>0.00</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" </tbody>\n",
"</table>\n",
2024-04-03 12:36:47 +02:00
"<p>96096 rows × 99 columns</p>\n",
2024-03-23 00:04:49 +01:00
"</div>"
],
"text/plain": [
2024-04-03 12:36:47 +02:00
" customer_id street_id structure_id mcp_contact_id fidelity \\\n",
"0 5_4317407 969908 NaN 6156473.0 1 \n",
"1 5_477635 109121 NaN 6213652.0 2 \n",
"2 5_411639 92929 NaN 6160271.0 4 \n",
"3 5_326623 79862 NaN 6140109.0 1 \n",
"4 5_383915 85421 NaN 6149409.0 2 \n",
"... ... ... ... ... ... \n",
"96091 9_91205 76215 NaN 47280.0 0 \n",
"96092 9_369887 815891 NaN 30764537.0 4 \n",
"96093 9_1007562 1 NaN NaN 0 \n",
"96094 9_15037 12992 NaN 2213448.0 0 \n",
"96095 9_135370 76215 NaN 2164740.0 0 \n",
"\n",
" tenant_id is_partner deleted_at is_email_true opt_in ... \\\n",
"0 1771 False NaN True 0 ... \n",
"1 1771 False NaN True 0 ... \n",
"2 1771 False NaN True 0 ... \n",
"3 1771 False NaN True 1 ... \n",
"4 1771 False NaN True 1 ... \n",
"... ... ... ... ... ... ... \n",
"96091 1490 False NaN True 1 ... \n",
"96092 1490 False NaN True 0 ... \n",
"96093 1490 False NaN True 0 ... \n",
"96094 1490 False NaN True 1 ... \n",
"96095 1490 False NaN True 1 ... \n",
"\n",
" score quartile score_adjusted nb_tickets_projected \\\n",
"0 0.445019 2 0.117551 1.411765 \n",
"1 0.382586 2 0.093333 1.411765 \n",
"2 0.916747 4 0.646556 2.117647 \n",
"3 0.090534 1 0.016268 1.000000 \n",
"4 0.346571 2 0.080976 5.647059 \n",
"... ... ... ... ... \n",
"96091 0.014966 1 0.002518 1.000000 \n",
"96092 0.834257 4 0.455392 1.411765 \n",
"96093 0.062886 1 0.011025 1.000000 \n",
"96094 0.068998 1 0.012162 1.000000 \n",
"96095 0.018486 1 0.003119 1.000000 \n",
"\n",
" total_amount_projected nb_tickets_expected total_amount_expected \\\n",
"0 17.647059 0.165955 2.074432 \n",
"1 64.941176 0.131765 6.061181 \n",
"2 31.764706 1.369178 20.537670 \n",
"3 10.000000 0.016268 0.162683 \n",
"4 89.647059 0.457279 7.259298 \n",
"... ... ... ... \n",
"96091 48.713098 0.002518 0.122642 \n",
"96092 71.216471 0.642906 32.431379 \n",
"96093 48.713098 0.011025 0.537071 \n",
"96094 48.713098 0.012162 0.592451 \n",
"96095 48.713098 0.003119 0.151938 \n",
"\n",
" pace_purchase avg_ticket_price total_amount_corrected \n",
"0 17.000000 12.500 25.00 \n",
"1 8.500000 46.000 92.00 \n",
"2 5.666667 15.000 45.00 \n",
"3 17.000000 10.000 10.00 \n",
"4 8.500000 15.875 127.00 \n",
"... ... ... ... \n",
"96091 NaN NaN 0.00 \n",
"96092 8.500000 50.445 100.89 \n",
"96093 NaN NaN 0.00 \n",
"96094 NaN NaN 0.00 \n",
"96095 NaN NaN 0.00 \n",
"\n",
"[96096 rows x 99 columns]"
2024-03-23 00:04:49 +01:00
]
},
2024-04-03 12:36:47 +02:00
"execution_count": 132,
2024-03-23 00:04:49 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2024-04-03 12:36:47 +02:00
"# project nb tickets and CA\n",
"\n",
2024-03-26 12:20:03 +01:00
"X_test_segment = project_tickets_CA (X_test_segment, \"nb_purchases\", \"nb_tickets\", \"total_amount\", \"score_adjusted\", \n",
" duration_ref=17, duration_projection=12)\n",
2024-03-23 00:04:49 +01:00
"X_test_segment"
]
},
{
"cell_type": "code",
2024-04-03 12:36:47 +02:00
"execution_count": 124,
"id": "22222709-218e-43b5-815f-714dfb776230",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 9.609600e+04\n",
"mean 2.182217e+02\n",
"std 7.120650e+03\n",
"min 0.000000e+00\n",
"25% 0.000000e+00\n",
"50% 0.000000e+00\n",
"75% 6.100000e+01\n",
"max 1.209751e+06\n",
"Name: total_amount_corrected, dtype: float64"
]
},
"execution_count": 124,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test_segment[\"total_amount_corrected\"].describe()"
]
},
{
"cell_type": "code",
"execution_count": 111,
"id": "73404bdd-e2f2-40e0-8bde-224c460426c5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 44105.000000\n",
"mean 35.661188\n",
"std 71.477667\n",
"min -216.368182\n",
"25% 10.000000\n",
"50% 25.000000\n",
"75% 48.720000\n",
"max 4000.000000\n",
"Name: avg_ticket_price, dtype: float64"
]
},
"execution_count": 111,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test_segment[\"avg_ticket_price\"].describe()"
]
},
{
"cell_type": "code",
"execution_count": 113,
"id": "f96536d3-fff7-4ccf-be3d-34e671852cd8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.052634865134865136"
]
},
"execution_count": 113,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(X_test_segment[\"total_amount_projected\"]==0).mean()"
]
},
{
"cell_type": "code",
"execution_count": 115,
"id": "884416e8-edec-4f6b-a40f-1a7c5d653160",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 96096.000000\n",
"mean 4.442483\n",
"std 64.952589\n",
"min 1.000000\n",
"25% 1.000000\n",
"50% 1.000000\n",
"75% 1.411765\n",
"max 11472.000000\n",
"Name: nb_tickets_projected, dtype: float64"
]
},
"execution_count": 115,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test_segment[\"nb_tickets_projected\"].describe()"
]
},
{
"cell_type": "code",
"execution_count": 35,
2024-03-26 12:20:03 +01:00
"id": "cb66a8ea-65f7-460f-b3fc-ba76a3b91faa",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"quartile\n",
2024-04-03 12:36:47 +02:00
"1 15.330011\n",
"2 15.314322\n",
"3 14.031588\n",
"4 8.562546\n",
2024-03-26 12:20:03 +01:00
"Name: pace_purchase, dtype: float64"
]
},
2024-04-03 12:36:47 +02:00
"execution_count": 35,
2024-03-26 12:20:03 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test_segment.groupby(\"quartile\")[\"pace_purchase\"].mean()"
]
},
{
"cell_type": "code",
2024-04-03 12:36:47 +02:00
"execution_count": 128,
"id": "8a4eec5c-8a4d-4a2b-9afb-1d49c77f78ea",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 162.000000\n",
"mean 3112.018089\n",
"std 8392.717823\n",
"min 51.843098\n",
"25% 161.889295\n",
"50% 395.635139\n",
"75% 2141.696184\n",
"max 69988.895986\n",
"dtype: float64"
]
},
"execution_count": 128,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(X_test[((X_test[\"total_amount_corrected\"] - X_test[\"total_amount\"])>0)][\"total_amount_corrected\"]\n",
" -X_test[((X_test[\"total_amount_corrected\"] - X_test[\"total_amount\"])>0)][\"total_amount\"]) .describe()"
]
},
{
"cell_type": "code",
"execution_count": 118,
2024-03-23 17:23:59 +01:00
"id": "f58f9151-2f91-45df-abb7-1ddcf0652adc",
"metadata": {},
"outputs": [],
"source": [
"# generalization with a function\n",
"\n",
2024-03-26 12:20:03 +01:00
"def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, total_amount, pace_purchase,\n",
2024-03-27 19:59:05 +01:00
" duration_ref=17, duration_projection=12) :\n",
2024-03-23 17:23:59 +01:00
" \n",
" # compute nb tickets estimated and total amount expected\n",
" df_expected_CA = df.groupby(segment)[[nb_tickets_expected, total_amount_expected]].sum().reset_index()\n",
" \n",
" # number of customers by segment\n",
" df_expected_CA.insert(1, \"size\", df.groupby(segment).size().values)\n",
" \n",
" # size in percent of all customers\n",
" df_expected_CA.insert(2, \"size_perct\", 100 * df_expected_CA[\"size\"]/df_expected_CA[\"size\"].sum())\n",
" \n",
" # compute share of CA recovered\n",
" duration_ratio=duration_ref/duration_projection\n",
" \n",
2024-03-24 10:42:44 +01:00
" df_expected_CA[\"revenue_recovered_perct\"] = 100 * duration_ratio * df_expected_CA[total_amount_expected] / \\\n",
2024-03-23 17:23:59 +01:00
" df.groupby(segment)[total_amount].sum().values\n",
2024-03-26 12:20:03 +01:00
"\n",
2024-04-03 12:36:47 +02:00
" df_expected_CA[\"share_future_revenue_perct\"] = 100 * duration_ratio * df_expected_CA[total_amount_expected] / \\\n",
" df[total_amount].sum()\n",
"\n",
2024-03-26 12:20:03 +01:00
" df_drop_null_pace = df.dropna(subset=[pace_purchase])\n",
" df_expected_CA[\"pace_purchase\"] = df_drop_null_pace.groupby(segment)[pace_purchase].mean().values\n",
2024-03-23 17:23:59 +01:00
" \n",
" return df_expected_CA"
]
},
{
"cell_type": "code",
2024-04-03 12:36:47 +02:00
"execution_count": 133,
2024-03-23 17:23:59 +01:00
"id": "c8df6c80-43e8-4f00-9cd3-eb9022744313",
2024-03-23 00:04:49 +01:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>quartile</th>\n",
2024-03-23 10:18:43 +01:00
" <th>size</th>\n",
" <th>size_perct</th>\n",
2024-03-23 00:04:49 +01:00
" <th>nb_tickets_expected</th>\n",
" <th>total_amount_expected</th>\n",
2024-03-24 10:42:44 +01:00
" <th>revenue_recovered_perct</th>\n",
2024-04-03 12:36:47 +02:00
" <th>share_future_revenue_perct</th>\n",
2024-03-26 12:20:03 +01:00
" <th>pace_purchase</th>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
2024-04-03 12:36:47 +02:00
" <td>54123</td>\n",
" <td>56.32</td>\n",
" <td>1480.36</td>\n",
" <td>55345.21</td>\n",
" <td>11.99</td>\n",
" <td>0.37</td>\n",
" <td>15.33</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
2024-04-03 12:36:47 +02:00
" <td>18181</td>\n",
" <td>18.92</td>\n",
" <td>4381.84</td>\n",
" <td>130503.26</td>\n",
" <td>11.65</td>\n",
" <td>0.88</td>\n",
" <td>15.31</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
2024-04-03 12:36:47 +02:00
" <td>11111</td>\n",
" <td>11.56</td>\n",
" <td>8827.97</td>\n",
" <td>285945.50</td>\n",
" <td>24.00</td>\n",
" <td>1.93</td>\n",
" <td>14.03</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
2024-04-03 12:36:47 +02:00
" <td>12681</td>\n",
" <td>13.20</td>\n",
" <td>239758.61</td>\n",
" <td>10313321.91</td>\n",
" <td>85.74</td>\n",
" <td>69.67</td>\n",
" <td>8.56</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" quartile size size_perct nb_tickets_expected total_amount_expected \\\n",
2024-04-03 12:36:47 +02:00
"0 1 54123 56.32 1480.36 55345.21 \n",
"1 2 18181 18.92 4381.84 130503.26 \n",
"2 3 11111 11.56 8827.97 285945.50 \n",
"3 4 12681 13.20 239758.61 10313321.91 \n",
"\n",
" revenue_recovered_perct share_future_revenue_perct pace_purchase \n",
"0 11.99 0.37 15.33 \n",
"1 11.65 0.88 15.31 \n",
"2 24.00 1.93 14.03 \n",
"3 85.74 69.67 8.56 "
2024-03-23 00:04:49 +01:00
]
},
2024-04-03 12:36:47 +02:00
"execution_count": 133,
2024-03-23 00:04:49 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2024-04-03 12:36:47 +02:00
"\"\"\"\n",
2024-03-26 12:20:03 +01:00
"X_test_expected_CA = round(summary_expected_CA(df=X_test_segment, segment=\"quartile\", \n",
" nb_tickets_expected=\"nb_tickets_expected\", total_amount_expected=\"total_amount_expected\", \n",
" total_amount=\"total_amount\", pace_purchase=\"pace_purchase\"),2)\n",
2024-04-03 12:36:47 +02:00
" \"\"\"\n",
"X_test_expected_CA = round(summary_expected_CA(df=X_test_segment, segment=\"quartile\", \n",
" nb_tickets_expected=\"nb_tickets_expected\", total_amount_expected=\"total_amount_expected\", \n",
" total_amount=\"total_amount_corrected\", pace_purchase=\"pace_purchase\"),2)\n",
2024-03-23 00:04:49 +01:00
"X_test_expected_CA"
]
},
2024-04-03 12:36:47 +02:00
{
"cell_type": "code",
"execution_count": null,
"id": "dd25c898-9991-4cc4-8e69-160b61fea0c4",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 116,
"id": "63369c2a-a842-4b03-aa11-230287cb3b69",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"count 96096.000000\n",
"mean 4.442483\n",
"std 64.952589\n",
"min 1.000000\n",
"25% 1.000000\n",
"50% 1.000000\n",
"75% 1.411765\n",
"max 11472.000000\n",
"Name: nb_tickets_projected, dtype: float64\n"
]
},
{
"data": {
"text/plain": [
"count 96096.000000\n",
"mean 2.647860\n",
"std 59.108910\n",
"min 0.001335\n",
"25% 0.015281\n",
"50% 0.044399\n",
"75% 0.230742\n",
"max 11450.589975\n",
"Name: nb_tickets_expected, dtype: float64"
]
},
"execution_count": 116,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print(X_test_segment[\"nb_tickets_projected\"].describe())\n",
"X_test_segment[\"nb_tickets_expected\"].describe()\n"
]
},
{
"cell_type": "code",
"execution_count": 117,
"id": "72af97dc-8558-4591-adcf-ad404c9cb3f2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"quartile\n",
"1 0.029070\n",
"2 0.074526\n",
"3 0.078737\n",
"4 0.817668\n",
"Name: total_amount, dtype: float64"
]
},
"execution_count": 117,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# we can recover share future revenue by multipling the share of amount by quartile * revenue recovered\n",
"X_test_segment.groupby(\"quartile\")[\"total_amount\"].sum()/X_test_segment[\"total_amount\"].sum()"
]
},
2024-03-23 10:18:43 +01:00
{
"cell_type": "code",
2024-03-27 18:58:30 +01:00
"execution_count": 64,
2024-03-23 17:23:59 +01:00
"id": "ac706ed7-defa-4df1-82e1-06f12fc1b6ad",
2024-03-23 10:18:43 +01:00
"metadata": {},
2024-03-23 17:23:59 +01:00
"outputs": [
{
"data": {
"text/plain": [
2024-03-27 18:58:30 +01:00
"'\\\\begin{tabular}{lrrrrrr}\\n\\\\toprule\\nquartile & size & size (%) & nb tickets expected & total amount expected & revenue recovered (%) & pace purchase \\\\\\\\\\n\\\\midrule\\n1 & 53626 & 35.310000 & 398.260000 & 13949.330000 & 2.350000 & 16.480000 \\\\\\\\\\n2 & 55974 & 36.860000 & 3113.770000 & 101639.450000 & 6.240000 & 16.470000 \\\\\\\\\\n3 & 30435 & 20.040000 & 6214.350000 & 208267.220000 & 14.270000 & 15.710000 \\\\\\\\\\n4 & 11839 & 7.800000 & 72929.460000 & 1835702.430000 & 75.380000 & 11.480000 \\\\\\\\\\n\\\\bottomrule\\n\\\\end{tabular}\\n'"
2024-03-23 17:23:59 +01:00
]
},
2024-03-27 18:58:30 +01:00
"execution_count": 64,
2024-03-23 17:23:59 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
2024-03-23 10:18:43 +01:00
"source": [
2024-03-24 10:42:44 +01:00
"# Création du dictionnaire de mapping pour les noms de colonnes\n",
"mapping_dict = {col: col.replace(\"perct\", \"(%)\").replace(\"_\", \" \") for col in X_test_expected_CA.columns}\n",
"\n",
"X_test_expected_CA.rename(columns=mapping_dict).to_latex(index=False)"
]
},
{
"cell_type": "code",
2024-03-26 12:20:03 +01:00
"execution_count": 122,
2024-03-24 10:42:44 +01:00
"id": "771da0cf-c49f-4e7e-b52f-ebcfb0fb2df3",
"metadata": {},
"outputs": [],
"source": [
"# export summary table to the MinIO storage\n",
"\n",
2024-03-26 12:20:03 +01:00
"file_name = \"table_expected_CA_\"\n",
2024-03-24 10:42:44 +01:00
"FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + \".csv\"\n",
"with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n",
" X_test_expected_CA.to_csv(file_out, index = False)"
2024-03-23 10:18:43 +01:00
]
},
2024-03-27 18:58:30 +01:00
{
"cell_type": "code",
"execution_count": 53,
"id": "c805dc10-4d07-4f7d-a677-5461a92845d7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'projet-bdc2324-team1/Output_expected_CA/musique/table_expected_CA_musique.csv'"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"PATH = f\"projet-bdc2324-team1/Output_expected_CA/{type_of_activity}/\"\n",
"file_name = \"table_expected_CA_\"\n",
"FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + \".csv\"\n",
"FILE_PATH_OUT_S3"
]
},
{
"cell_type": "markdown",
"id": "e35ccfff-1845-41f0-9bde-f09b09b67877",
"metadata": {},
"source": [
"## Test : vizu tables saved"
]
},
{
"cell_type": "code",
"execution_count": 66,
"id": "4e9e88e4-ea10-41f4-9bf1-20b55269a20d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>quartile</th>\n",
" <th>score (%)</th>\n",
" <th>score adjusted (%)</th>\n",
" <th>has purchased (%)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>13.25</td>\n",
" <td>2.51</td>\n",
" <td>1.57</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>33.89</td>\n",
" <td>8.00</td>\n",
" <td>9.85</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>63.06</td>\n",
" <td>22.58</td>\n",
" <td>21.47</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>90.52</td>\n",
" <td>66.20</td>\n",
" <td>65.01</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" quartile score (%) score adjusted (%) has purchased (%)\n",
"0 1 13.25 2.51 1.57\n",
"1 2 33.89 8.00 9.85\n",
"2 3 63.06 22.58 21.47\n",
"3 4 90.52 66.20 65.01"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"path = 'projet-bdc2324-team1/Output_expected_CA/sport/table_adjusted_scoresport.csv'\n",
"\n",
"with fs.open( path, mode=\"rb\") as file_in:\n",
" df = pd.read_csv(file_in, sep=\",\")\n",
"df"
]
},
2024-03-23 00:04:49 +01:00
{
"cell_type": "markdown",
"id": "9c471bdd-25c2-420a-a8a1-3add9f003cbc",
"metadata": {},
"source": [
"## Just to try, same computation with score instead of score adjusted\n",
"\n",
"seems overestimated : if only 14% of customers come back, how can we recover 22% of the revenue from the segment that is least likely to buy ?? ..."
]
},
{
"cell_type": "code",
2024-03-23 17:23:59 +01:00
"execution_count": 80,
2024-03-23 00:04:49 +01:00
"id": "53684a24-1809-465f-8e21-b9295e34582a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>quartile</th>\n",
" <th>size</th>\n",
" <th>size_perct</th>\n",
" <th>nb_tickets_expected</th>\n",
" <th>total_amount_expected</th>\n",
" <th>perct_revenue_recovered</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>37410</td>\n",
2024-03-23 17:23:59 +01:00
" <td>38.93</td>\n",
" <td>419.76</td>\n",
" <td>9245.08</td>\n",
" <td>21.71</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>29517</td>\n",
2024-03-23 17:23:59 +01:00
" <td>30.72</td>\n",
" <td>11549.06</td>\n",
" <td>296522.02</td>\n",
" <td>39.24</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>20137</td>\n",
2024-03-23 17:23:59 +01:00
" <td>20.96</td>\n",
" <td>29997.85</td>\n",
" <td>954751.91</td>\n",
" <td>63.34</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>9032</td>\n",
2024-03-23 17:23:59 +01:00
" <td>9.40</td>\n",
" <td>244655.82</td>\n",
" <td>10736011.95</td>\n",
" <td>97.72</td>\n",
2024-03-23 00:04:49 +01:00
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" quartile size size_perct nb_tickets_expected total_amount_expected \\\n",
2024-03-23 17:23:59 +01:00
"0 1 37410 38.93 419.76 9245.08 \n",
"1 2 29517 30.72 11549.06 296522.02 \n",
"2 3 20137 20.96 29997.85 954751.91 \n",
"3 4 9032 9.40 244655.82 10736011.95 \n",
2024-03-23 00:04:49 +01:00
"\n",
" perct_revenue_recovered \n",
2024-03-23 17:23:59 +01:00
"0 21.71 \n",
"1 39.24 \n",
"2 63.34 \n",
"3 97.72 "
2024-03-23 00:04:49 +01:00
]
},
2024-03-23 17:23:59 +01:00
"execution_count": 80,
2024-03-23 00:04:49 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test_segment_bis = project_tickets_CA (X_test_segment, \"nb_tickets\", \"total_amount\", \"score\", duration_ref=1.5, duration_projection=1)\n",
"\n",
2024-03-23 17:23:59 +01:00
"X_test_expected_CA_bis = round(summary_expected_CA(df=X_test_segment_bis, segment=\"quartile\", nb_tickets_expected=\"nb_tickets_expected\", \n",
" total_amount_expected=\"total_amount_expected\", total_amount=\"total_amount\"),2)\n",
2024-03-23 00:04:49 +01:00
"\n",
"X_test_expected_CA_bis"
]
},
{
"cell_type": "code",
2024-03-23 17:23:59 +01:00
"execution_count": 81,
2024-03-23 00:04:49 +01:00
"id": "7dc66d1e-da03-4513-96e4-d9a43ac0a2c8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"overall share of revenue recovered : 90.26 %\n"
]
}
],
"source": [
"print(\"overall share of revenue recovered : \", round(100 * duration_ratio * X_test_expected_CA_bis[\"total_amount_expected\"].sum() / \\\n",
"X_test_segment_bis[\"total_amount\"].sum(),2), \"%\")"
]
},
{
"cell_type": "markdown",
"id": "673f2969-7b9a-44c1-abf5-5679fca877ce",
"metadata": {},
"source": [
"## Last pieces of analysis"
]
},
{
"cell_type": "code",
"execution_count": 161,
"id": "2365bb13-0f3f-49d5-bf91-52c92abebcee",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"overall share of revenue recovered : 77.64%\n"
]
}
],
"source": [
"# global revenue recovered\n",
"global_revenue_recovered = round(100 * duration_ratio * X_test_expected_CA[\"total_amount_expected\"].sum() / \\\n",
"X_test_segment[\"total_amount\"].sum(),2)\n",
"print(f\"overall share of revenue recovered : {global_revenue_recovered}%\")"
]
},
{
"cell_type": "code",
"execution_count": 163,
"id": "16b17f35-57dd-459a-8989-129143dc0952",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 0.018093\n",
"1 0.721519\n",
"2 3.336101\n",
"3 95.924287\n",
"Name: total_amount_expected, dtype: float64"
]
},
"execution_count": 163,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"100 * X_test_expected_CA[\"total_amount_expected\"]/X_test_expected_CA[\"total_amount_expected\"].sum()"
]
},
{
"cell_type": "code",
"execution_count": 166,
"id": "dee4a200-eefe-4377-8e80-59ad33edd3c0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"quartile\n",
"1 0.320407\n",
"2 5.685020\n",
"3 11.339715\n",
"4 82.654858\n",
"Name: total_amount, dtype: float64"
]
},
"execution_count": 166,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# le segment 4 représente 83% du CA actuel et 96% du CA lié aux anciens clients pour l'année prochaine\n",
"100 * X_test_segment.groupby(\"quartile\")[\"total_amount\"].sum()/X_test_segment[\"total_amount\"].sum()"
]
},
{
"cell_type": "code",
"execution_count": 177,
"id": "c1e6f020-ef18-40b4-bfc1-19f98cb2796e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 96096.000000\n",
"mean 207.475735\n",
"std 4720.046248\n",
"min -48831.800000\n",
"25% 0.000000\n",
"50% 0.000000\n",
"75% 60.000000\n",
"max 624890.000000\n",
"Name: total_amount, dtype: float64"
]
},
"execution_count": 177,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test_segment[\"total_amount\"].describe() # total amount négatif ???\n"
]
},
{
"cell_type": "code",
"execution_count": 184,
"id": "d301a50e-7c68-40f0-9245-a4eea64c387b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 -4.883180e+04\n",
"1 -6.483180e+04\n",
"2 -7.683860e+04\n",
"3 -8.683860e+04\n",
"4 -9.683860e+04\n",
" ... \n",
"96091 1.802247e+07\n",
"96092 1.839238e+07\n",
"96093 1.877219e+07\n",
"96094 1.931270e+07\n",
"96095 1.993759e+07\n",
"Name: total_amount, Length: 96096, dtype: float64"
]
},
"execution_count": 184,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.cumsum(X_test_segment[\"total_amount\"].sort_values()).reset_index()[\"total_amount\"]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}