BDC-team-1/Sport/Modelization/3_model_cv_sport+CA.ipynb

18752 lines
1.3 MiB
Plaintext
Raw Normal View History

2024-03-22 10:15:59 +01:00
{
"cells": [
{
"cell_type": "markdown",
"id": "ff8cc602-e733-4a31-bf46-a31087511fe0",
"metadata": {},
"source": [
"# Predict sales - sports companies"
]
},
{
"cell_type": "markdown",
"id": "415e466a-1a71-4150-bff7-2f8904766df4",
"metadata": {},
"source": [
"## Importations"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "b5aaf421-850a-4a86-8e99-2c1f0723bd6c",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import os\n",
"import s3fs\n",
"import re\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score\n",
"from sklearn.utils import class_weight\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler\n",
"from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score\n",
"from sklearn.exceptions import ConvergenceWarning, DataConversionWarning\n",
"from sklearn.naive_bayes import GaussianNB\n",
"\n",
"import pickle\n",
"import warnings"
]
},
{
"cell_type": "markdown",
"id": "c2f44070-451e-4109-9a08-3b80011d610f",
"metadata": {},
"source": [
"## Load data "
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "b5f8135f-b6e7-4d6d-b8e1-da185b944aff",
"metadata": {},
"outputs": [],
"source": [
"# Create filesystem object\n",
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "2668a243-4ff8-40c6-9de2-5c9c07bcf714",
"metadata": {},
"outputs": [],
"source": [
"def load_train_test():\n",
" BUCKET = \"projet-bdc2324-team1/Generalization/sport\"\n",
" File_path_train = BUCKET + \"/Train_set.csv\"\n",
" File_path_test = BUCKET + \"/Test_set.csv\"\n",
" \n",
" with fs.open( File_path_train, mode=\"rb\") as file_in:\n",
" dataset_train = pd.read_csv(file_in, sep=\",\")\n",
" # dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)\n",
"\n",
" with fs.open(File_path_test, mode=\"rb\") as file_in:\n",
" dataset_test = pd.read_csv(file_in, sep=\",\")\n",
" # dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)\n",
" \n",
" return dataset_train, dataset_test"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "13eba3e1-3ea5-435b-8b05-6d7d5744cbe2",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_1481/2459610029.py:7: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" dataset_train = pd.read_csv(file_in, sep=\",\")\n"
]
},
{
"data": {
"text/plain": [
"customer_id 0\n",
"nb_tickets 0\n",
"nb_purchases 0\n",
"total_amount 0\n",
"nb_suppliers 0\n",
"vente_internet_max 0\n",
"purchase_date_min 0\n",
"purchase_date_max 0\n",
"time_between_purchase 0\n",
"nb_tickets_internet 0\n",
"street_id 0\n",
"structure_id 222825\n",
"mcp_contact_id 70874\n",
"fidelity 0\n",
"tenant_id 0\n",
"is_partner 0\n",
"deleted_at 224213\n",
"gender 0\n",
"is_email_true 0\n",
"opt_in 0\n",
"last_buying_date 66139\n",
"max_price 66139\n",
"ticket_sum 0\n",
"average_price 66023\n",
"average_purchase_delay 66139\n",
"average_price_basket 66139\n",
"average_ticket_basket 66139\n",
"total_price 116\n",
"purchase_count 0\n",
"first_buying_date 66139\n",
"country 23159\n",
"gender_label 0\n",
"gender_female 0\n",
"gender_male 0\n",
"gender_other 0\n",
"country_fr 23159\n",
"nb_campaigns 0\n",
"nb_campaigns_opened 0\n",
"time_to_open 123159\n",
"y_has_purchased 0\n",
"dtype: int64"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset_train, dataset_test = load_train_test()\n",
"dataset_train.isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "e46622e7-0fc1-43f8-a7e7-34a5e90068b2",
"metadata": {},
"outputs": [],
"source": [
"def features_target_split(dataset_train, dataset_test):\n",
" \"\"\"\n",
" features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max', \n",
" 'time_between_purchase', 'nb_tickets_internet', 'fidelity', 'is_email_true', 'opt_in', #'is_partner',\n",
" 'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']\n",
" \"\"\"\n",
"\n",
" # we suppress fidelity, time between purchase, and gender other (colinearity issue)\n",
" features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', \n",
" 'purchase_date_min', 'purchase_date_max', 'nb_tickets_internet', 'is_email_true', \n",
" 'opt_in', 'gender_female', 'gender_male', 'nb_campaigns', 'nb_campaigns_opened']\n",
" \n",
" X_train = dataset_train[features_l]\n",
" y_train = dataset_train[['y_has_purchased']]\n",
"\n",
" X_test = dataset_test[features_l]\n",
" y_test = dataset_test[['y_has_purchased']]\n",
" return X_train, X_test, y_train, y_test"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "cec4f386-e643-4bd8-b8cd-8917d2c1b3d0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Shape train : (224213, 14)\n",
"Shape test : (96096, 14)\n"
]
}
],
"source": [
"X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)\n",
"print(\"Shape train : \", X_train.shape)\n",
"print(\"Shape test : \", X_test.shape)"
]
},
{
"cell_type": "markdown",
"id": "c9e8edbd-7ff6-42f9-a8eb-10d27ca19c8a",
"metadata": {},
"source": [
"## Logistic"
]
},
{
"cell_type": "code",
"execution_count": 96,
"id": "639b432a-c39c-4bf8-8ee2-e136d156e0dd",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{0.0: 0.5837086520288036, 1.0: 3.486549107420539}"
]
},
"execution_count": 96,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Compute Weights\n",
"weights = class_weight.compute_class_weight(class_weight = 'balanced', classes = np.unique(y_train['y_has_purchased']),\n",
" y = y_train['y_has_purchased'])\n",
"\n",
"weight_dict = {np.unique(y_train['y_has_purchased'])[i]: weights[i] for i in range(len(np.unique(y_train['y_has_purchased'])))}\n",
"weight_dict"
]
},
{
"cell_type": "code",
"execution_count": 97,
"id": "34644a00-85a5-41c9-98df-41178cb3ac69",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_tickets</th>\n",
" <th>nb_purchases</th>\n",
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>nb_tickets_internet</th>\n",
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
" <th>gender_female</th>\n",
" <th>gender_male</th>\n",
" <th>nb_campaigns</th>\n",
" <th>nb_campaigns_opened</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>60.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>355.268981</td>\n",
" <td>355.268981</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>8.0</td>\n",
" <td>3.0</td>\n",
" <td>140.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>373.540289</td>\n",
" <td>219.262269</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>50.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.202442</td>\n",
" <td>5.202442</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3.0</td>\n",
" <td>1.0</td>\n",
" <td>90.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.178958</td>\n",
" <td>5.178958</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>78.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.174039</td>\n",
" <td>5.174039</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>224208</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>34.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>224209</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>20.00</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>392.501030</td>\n",
" <td>392.501030</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>23.0</td>\n",
" <td>6.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>224210</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>8.0</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>224211</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>97.11</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>172.334074</td>\n",
" <td>172.334074</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>13.0</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>224212</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>4.0</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>224213 rows × 14 columns</p>\n",
"</div>"
],
"text/plain": [
" nb_tickets nb_purchases total_amount nb_suppliers \\\n",
"0 2.0 1.0 60.00 1.0 \n",
"1 8.0 3.0 140.00 1.0 \n",
"2 2.0 1.0 50.00 1.0 \n",
"3 3.0 1.0 90.00 1.0 \n",
"4 2.0 1.0 78.00 1.0 \n",
"... ... ... ... ... \n",
"224208 0.0 0.0 0.00 0.0 \n",
"224209 1.0 1.0 20.00 1.0 \n",
"224210 0.0 0.0 0.00 0.0 \n",
"224211 1.0 1.0 97.11 1.0 \n",
"224212 0.0 0.0 0.00 0.0 \n",
"\n",
" vente_internet_max purchase_date_min purchase_date_max \\\n",
"0 0.0 355.268981 355.268981 \n",
"1 0.0 373.540289 219.262269 \n",
"2 0.0 5.202442 5.202442 \n",
"3 0.0 5.178958 5.178958 \n",
"4 0.0 5.174039 5.174039 \n",
"... ... ... ... \n",
"224208 0.0 550.000000 550.000000 \n",
"224209 1.0 392.501030 392.501030 \n",
"224210 0.0 550.000000 550.000000 \n",
"224211 1.0 172.334074 172.334074 \n",
"224212 0.0 550.000000 550.000000 \n",
"\n",
" nb_tickets_internet is_email_true opt_in gender_female \\\n",
"0 0.0 True False 0 \n",
"1 0.0 True False 0 \n",
"2 0.0 True False 0 \n",
"3 0.0 True False 0 \n",
"4 0.0 True False 1 \n",
"... ... ... ... ... \n",
"224208 0.0 True False 0 \n",
"224209 1.0 True False 0 \n",
"224210 0.0 True True 0 \n",
"224211 1.0 True False 0 \n",
"224212 0.0 True False 0 \n",
"\n",
" gender_male nb_campaigns nb_campaigns_opened \n",
"0 1 0.0 0.0 \n",
"1 1 0.0 0.0 \n",
"2 1 0.0 0.0 \n",
"3 1 0.0 0.0 \n",
"4 0 0.0 0.0 \n",
"... ... ... ... \n",
"224208 1 34.0 3.0 \n",
"224209 1 23.0 6.0 \n",
"224210 1 8.0 4.0 \n",
"224211 1 13.0 5.0 \n",
"224212 1 4.0 4.0 \n",
"\n",
"[224213 rows x 14 columns]"
]
},
"execution_count": 97,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_train"
]
},
{
"cell_type": "code",
"execution_count": 98,
"id": "295676df-36ac-43d8-8b31-49ff08efd6e7",
"metadata": {},
"outputs": [],
"source": [
"# preprocess data \n",
"# numeric features - standardize\n",
"# categorical features - encode\n",
"# encoded features - do nothing\n",
"\n",
"numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', \n",
" 'purchase_date_min', 'purchase_date_max', 'nb_tickets_internet', 'nb_campaigns', \n",
" 'nb_campaigns_opened' # , 'gender_male', 'gender_female'\n",
" ]\n",
"\n",
"numeric_transformer = Pipeline(steps=[\n",
" #(\"imputer\", SimpleImputer(strategy=\"mean\")), \n",
" (\"scaler\", StandardScaler()) \n",
"])\n",
"\n",
"categorical_features = ['opt_in', 'is_email_true'] \n",
"\n",
"# Transformer for the categorical features\n",
"categorical_transformer = Pipeline(steps=[\n",
" #(\"imputer\", SimpleImputer(strategy=\"most_frequent\")), # Impute missing values with the most frequent\n",
" (\"onehot\", OneHotEncoder(handle_unknown='ignore', sparse_output=False))\n",
"])\n",
"\n",
"preproc = ColumnTransformer(\n",
" transformers=[\n",
" (\"num\", numeric_transformer, numeric_features),\n",
" (\"cat\", categorical_transformer, categorical_features)\n",
" ]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 99,
"id": "f46fb56e-c908-40b4-868f-9684d1ae01c2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"nb_tickets 0\n",
"nb_purchases 0\n",
"total_amount 0\n",
"nb_suppliers 0\n",
"vente_internet_max 0\n",
"purchase_date_min 0\n",
"purchase_date_max 0\n",
"nb_tickets_internet 0\n",
"nb_campaigns 0\n",
"nb_campaigns_opened 0\n",
"dtype: int64"
]
},
"execution_count": 99,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_train[numeric_features].isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 100,
"id": "e729781b-4d65-42c5-bdc5-82b4d653aaf0",
"metadata": {},
"outputs": [],
"source": [
"# Set loss\n",
"balanced_scorer = make_scorer(balanced_accuracy_score)\n",
"recall_scorer = make_scorer(recall_score)\n",
"f1_scorer = make_scorer(f1_score)"
]
},
{
"cell_type": "code",
"execution_count": 101,
"id": "a7ebbe6f-70ba-4276-be18-f10e7bfd7423",
"metadata": {},
"outputs": [],
"source": [
"def draw_confusion_matrix(y_test, y_pred):\n",
" conf_matrix = confusion_matrix(y_test, y_pred)\n",
" sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])\n",
" plt.xlabel('Predicted')\n",
" plt.ylabel('Actual')\n",
" plt.title('Confusion Matrix')\n",
" plt.show()\n",
"\n",
"\n",
"def draw_roc_curve(X_test, y_test):\n",
" y_pred_prob = pipeline.predict_proba(X_test)[:, 1]\n",
"\n",
" # Calcul des taux de faux positifs (FPR) et de vrais positifs (TPR)\n",
" fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)\n",
" \n",
" # Calcul de l'aire sous la courbe ROC (AUC)\n",
" roc_auc = auc(fpr, tpr)\n",
" \n",
" plt.figure(figsize = (14, 8))\n",
" plt.plot(fpr, tpr, label=\"ROC curve(area = %0.3f)\" % roc_auc)\n",
" plt.plot([0, 1], [0, 1], color=\"red\",label=\"Random Baseline\", linestyle=\"--\")\n",
" plt.grid(color='gray', linestyle='--', linewidth=0.5)\n",
" plt.xlabel('Taux de faux positifs (FPR)')\n",
" plt.ylabel('Taux de vrais positifs (TPR)')\n",
" plt.title('Courbe ROC : modèle logistique')\n",
" plt.legend(loc=\"lower right\")\n",
" plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 102,
"id": "2334eb51-e6ea-4fd0-89ce-f54cd474d332",
"metadata": {},
"outputs": [],
"source": [
"def draw_features_importance(pipeline, model):\n",
" coefficients = pipeline.named_steps['logreg'].coef_[0]\n",
" feature_names = pipeline.named_steps['logreg'].feature_names_in_\n",
" \n",
" # Tracer l'importance des caractéristiques\n",
" plt.figure(figsize=(10, 6))\n",
" plt.barh(feature_names, coefficients, color='skyblue')\n",
" plt.xlabel('Importance des caractéristiques')\n",
" plt.ylabel('Caractéristiques')\n",
" plt.title('Importance des caractéristiques dans le modèle de régression logistique')\n",
" plt.grid(True)\n",
" plt.show()\n",
"\n",
"def draw_prob_distribution(X_test):\n",
" y_pred_prob = pipeline.predict_proba(X_test)[:, 1]\n",
" plt.figure(figsize=(8, 6))\n",
" plt.hist(y_pred_prob, bins=10, range=(0, 1), color='blue', alpha=0.7)\n",
" \n",
" plt.xlim(0, 1)\n",
" plt.ylim(0, None)\n",
" \n",
" plt.title('Histogramme des probabilités pour la classe 1')\n",
" plt.xlabel('Probabilité')\n",
" plt.ylabel('Fréquence')\n",
" plt.grid(True)\n",
" plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 103,
"id": "83917b97-4d9b-4e3c-ba27-1e546ce885d3",
"metadata": {},
"outputs": [],
"source": [
"# Hyperparameter\n",
"\n",
"param_c = np.logspace(-10, 4, 15, base=2)\n",
"# param_penalty_type = ['l1', 'l2', 'elasticnet']\n",
"param_penalty_type = ['l1']\n",
"param_grid = {'logreg__C': param_c,\n",
" 'logreg__penalty': param_penalty_type} "
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "3ae25049-920c-4a6d-a59d-c26e3b45dec6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1024"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"2 ** 10"
]
},
{
"cell_type": "code",
"execution_count": 104,
"id": "ba4cde9f-a614-4a43-81b9-e16e78aa6c4c",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>#sk-container-id-5 {\n",
" /* Definition of color scheme common for light and dark mode */\n",
" --sklearn-color-text: black;\n",
" --sklearn-color-line: gray;\n",
" /* Definition of color scheme for unfitted estimators */\n",
" --sklearn-color-unfitted-level-0: #fff5e6;\n",
" --sklearn-color-unfitted-level-1: #f6e4d2;\n",
" --sklearn-color-unfitted-level-2: #ffe0b3;\n",
" --sklearn-color-unfitted-level-3: chocolate;\n",
" /* Definition of color scheme for fitted estimators */\n",
" --sklearn-color-fitted-level-0: #f0f8ff;\n",
" --sklearn-color-fitted-level-1: #d4ebff;\n",
" --sklearn-color-fitted-level-2: #b3dbfd;\n",
" --sklearn-color-fitted-level-3: cornflowerblue;\n",
"\n",
" /* Specific color for light theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-icon: #696969;\n",
"\n",
" @media (prefers-color-scheme: dark) {\n",
" /* Redefinition of color scheme for dark theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-icon: #878787;\n",
" }\n",
"}\n",
"\n",
"#sk-container-id-5 {\n",
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"#sk-container-id-5 pre {\n",
" padding: 0;\n",
"}\n",
"\n",
"#sk-container-id-5 input.sk-hidden--visually {\n",
" border: 0;\n",
" clip: rect(1px 1px 1px 1px);\n",
" clip: rect(1px, 1px, 1px, 1px);\n",
" height: 1px;\n",
" margin: -1px;\n",
" overflow: hidden;\n",
" padding: 0;\n",
" position: absolute;\n",
" width: 1px;\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-dashed-wrapped {\n",
" border: 1px dashed var(--sklearn-color-line);\n",
" margin: 0 0.4em 0.5em 0.4em;\n",
" box-sizing: border-box;\n",
" padding-bottom: 0.4em;\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-container {\n",
" /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
" but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
" so we also need the `!important` here to be able to override the\n",
" default hidden behavior on the sphinx rendered scikit-learn.org.\n",
" See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
" display: inline-block !important;\n",
" position: relative;\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-text-repr-fallback {\n",
" display: none;\n",
"}\n",
"\n",
"div.sk-parallel-item,\n",
"div.sk-serial,\n",
"div.sk-item {\n",
" /* draw centered vertical line to link estimators */\n",
" background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
" background-size: 2px 100%;\n",
" background-repeat: no-repeat;\n",
" background-position: center center;\n",
"}\n",
"\n",
"/* Parallel-specific style estimator block */\n",
"\n",
"#sk-container-id-5 div.sk-parallel-item::after {\n",
" content: \"\";\n",
" width: 100%;\n",
" border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
" flex-grow: 1;\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-parallel {\n",
" display: flex;\n",
" align-items: stretch;\n",
" justify-content: center;\n",
" background-color: var(--sklearn-color-background);\n",
" position: relative;\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-parallel-item {\n",
" display: flex;\n",
" flex-direction: column;\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-parallel-item:first-child::after {\n",
" align-self: flex-end;\n",
" width: 50%;\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-parallel-item:last-child::after {\n",
" align-self: flex-start;\n",
" width: 50%;\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-parallel-item:only-child::after {\n",
" width: 0;\n",
"}\n",
"\n",
"/* Serial-specific style estimator block */\n",
"\n",
"#sk-container-id-5 div.sk-serial {\n",
" display: flex;\n",
" flex-direction: column;\n",
" align-items: center;\n",
" background-color: var(--sklearn-color-background);\n",
" padding-right: 1em;\n",
" padding-left: 1em;\n",
"}\n",
"\n",
"\n",
"/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
"clickable and can be expanded/collapsed.\n",
"- Pipeline and ColumnTransformer use this feature and define the default style\n",
"- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
"*/\n",
"\n",
"/* Pipeline and ColumnTransformer style (default) */\n",
"\n",
"#sk-container-id-5 div.sk-toggleable {\n",
" /* Default theme specific background. It is overwritten whether we have a\n",
" specific estimator or a Pipeline/ColumnTransformer */\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"/* Toggleable label */\n",
"#sk-container-id-5 label.sk-toggleable__label {\n",
" cursor: pointer;\n",
" display: block;\n",
" width: 100%;\n",
" margin-bottom: 0;\n",
" padding: 0.5em;\n",
" box-sizing: border-box;\n",
" text-align: center;\n",
"}\n",
"\n",
"#sk-container-id-5 label.sk-toggleable__label-arrow:before {\n",
" /* Arrow on the left of the label */\n",
" content: \"▸\";\n",
" float: left;\n",
" margin-right: 0.25em;\n",
" color: var(--sklearn-color-icon);\n",
"}\n",
"\n",
"#sk-container-id-5 label.sk-toggleable__label-arrow:hover:before {\n",
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"/* Toggleable content - dropdown */\n",
"\n",
"#sk-container-id-5 div.sk-toggleable__content {\n",
" max-height: 0;\n",
" max-width: 0;\n",
" overflow: hidden;\n",
" text-align: left;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-toggleable__content.fitted {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-toggleable__content pre {\n",
" margin: 0.2em;\n",
" border-radius: 0.25em;\n",
" color: var(--sklearn-color-text);\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-toggleable__content.fitted pre {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-5 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
" /* Expand drop-down */\n",
" max-height: 200px;\n",
" max-width: 100%;\n",
" overflow: auto;\n",
"}\n",
"\n",
"#sk-container-id-5 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
" content: \"▾\";\n",
"}\n",
"\n",
"/* Pipeline/ColumnTransformer-specific style */\n",
"\n",
"#sk-container-id-5 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator-specific style */\n",
"\n",
"/* Colorize estimator box */\n",
"#sk-container-id-5 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-label label.sk-toggleable__label,\n",
"#sk-container-id-5 div.sk-label label {\n",
" /* The background is the default theme color */\n",
" color: var(--sklearn-color-text-on-default-background);\n",
"}\n",
"\n",
"/* On hover, darken the color of the background */\n",
"#sk-container-id-5 div.sk-label:hover label.sk-toggleable__label {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"/* Label box, darken color on hover, fitted */\n",
"#sk-container-id-5 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator label */\n",
"\n",
"#sk-container-id-5 div.sk-label label {\n",
" font-family: monospace;\n",
" font-weight: bold;\n",
" display: inline-block;\n",
" line-height: 1.2em;\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-label-container {\n",
" text-align: center;\n",
"}\n",
"\n",
"/* Estimator-specific */\n",
"#sk-container-id-5 div.sk-estimator {\n",
" font-family: monospace;\n",
" border: 1px dotted var(--sklearn-color-border-box);\n",
" border-radius: 0.25em;\n",
" box-sizing: border-box;\n",
" margin-bottom: 0.5em;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-estimator.fitted {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"/* on hover */\n",
"#sk-container-id-5 div.sk-estimator:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-estimator.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
"\n",
"/* Common style for \"i\" and \"?\" */\n",
"\n",
".sk-estimator-doc-link,\n",
"a:link.sk-estimator-doc-link,\n",
"a:visited.sk-estimator-doc-link {\n",
" float: right;\n",
" font-size: smaller;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1em;\n",
" height: 1em;\n",
" width: 1em;\n",
" text-decoration: none !important;\n",
" margin-left: 1ex;\n",
" /* unfitted */\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted,\n",
"a:link.sk-estimator-doc-link.fitted,\n",
"a:visited.sk-estimator-doc-link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"/* Span, style for the box shown on hovering the info icon */\n",
".sk-estimator-doc-link span {\n",
" display: none;\n",
" z-index: 9999;\n",
" position: relative;\n",
" font-weight: normal;\n",
" right: .2ex;\n",
" padding: .5ex;\n",
" margin: .5ex;\n",
" width: min-content;\n",
" min-width: 20ex;\n",
" max-width: 50ex;\n",
" color: var(--sklearn-color-text);\n",
" box-shadow: 2pt 2pt 4pt #999;\n",
" /* unfitted */\n",
" background: var(--sklearn-color-unfitted-level-0);\n",
" border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted span {\n",
" /* fitted */\n",
" background: var(--sklearn-color-fitted-level-0);\n",
" border: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link:hover span {\n",
" display: block;\n",
"}\n",
"\n",
"/* \"?\"-specific style due to the `<a>` HTML tag */\n",
"\n",
"#sk-container-id-5 a.estimator_doc_link {\n",
" float: right;\n",
" font-size: 1rem;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1rem;\n",
" height: 1rem;\n",
" width: 1rem;\n",
" text-decoration: none;\n",
" /* unfitted */\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
"}\n",
"\n",
"#sk-container-id-5 a.estimator_doc_link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"#sk-container-id-5 a.estimator_doc_link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"#sk-container-id-5 a.estimator_doc_link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"</style><div id=\"sk-container-id-5\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>Pipeline(steps=[(&#x27;preprocessor&#x27;,\n",
" ColumnTransformer(transformers=[(&#x27;num&#x27;,\n",
" Pipeline(steps=[(&#x27;scaler&#x27;,\n",
" StandardScaler())]),\n",
" [&#x27;nb_tickets&#x27;, &#x27;nb_purchases&#x27;,\n",
" &#x27;total_amount&#x27;,\n",
" &#x27;nb_suppliers&#x27;,\n",
" &#x27;vente_internet_max&#x27;,\n",
" &#x27;purchase_date_min&#x27;,\n",
" &#x27;purchase_date_max&#x27;,\n",
" &#x27;nb_tickets_internet&#x27;,\n",
" &#x27;nb_campaigns&#x27;,\n",
" &#x27;nb_campaigns_opened&#x27;]),\n",
" (&#x27;cat&#x27;,\n",
" Pipeline(steps=[(&#x27;onehot&#x27;,\n",
" OneHotEncoder(handle_unknown=&#x27;ignore&#x27;,\n",
" sparse_output=False))]),\n",
" [&#x27;opt_in&#x27;,\n",
" &#x27;is_email_true&#x27;])])),\n",
" (&#x27;logreg&#x27;,\n",
" LogisticRegression(class_weight={0.0: 0.5837086520288036,\n",
" 1.0: 3.486549107420539},\n",
" max_iter=5000, solver=&#x27;saga&#x27;))])</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-5\" type=\"checkbox\" ><label for=\"sk-estimator-id-5\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">&nbsp;&nbsp;Pipeline<a class=\"sk-estimator-doc-link \" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.pipeline.Pipeline.html\">?<span>Documentation for Pipeline</span></a><span class=\"sk-estimator-doc-link \">i<span>Not fitted</span></span></label><div class=\"sk-toggleable__content \"><pre>Pipeline(steps=[(&#x27;preprocessor&#x27;,\n",
" ColumnTransformer(transformers=[(&#x27;num&#x27;,\n",
" Pipeline(steps=[(&#x27;scaler&#x27;,\n",
" StandardScaler())]),\n",
" [&#x27;nb_tickets&#x27;, &#x27;nb_purchases&#x27;,\n",
" &#x27;total_amount&#x27;,\n",
" &#x27;nb_suppliers&#x27;,\n",
" &#x27;vente_internet_max&#x27;,\n",
" &#x27;purchase_date_min&#x27;,\n",
" &#x27;purchase_date_max&#x27;,\n",
" &#x27;nb_tickets_internet&#x27;,\n",
" &#x27;nb_campaigns&#x27;,\n",
" &#x27;nb_campaigns_opened&#x27;]),\n",
" (&#x27;cat&#x27;,\n",
" Pipeline(steps=[(&#x27;onehot&#x27;,\n",
" OneHotEncoder(handle_unknown=&#x27;ignore&#x27;,\n",
" sparse_output=False))]),\n",
" [&#x27;opt_in&#x27;,\n",
" &#x27;is_email_true&#x27;])])),\n",
" (&#x27;logreg&#x27;,\n",
" LogisticRegression(class_weight={0.0: 0.5837086520288036,\n",
" 1.0: 3.486549107420539},\n",
" max_iter=5000, solver=&#x27;saga&#x27;))])</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-6\" type=\"checkbox\" ><label for=\"sk-estimator-id-6\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">&nbsp;preprocessor: ColumnTransformer<a class=\"sk-estimator-doc-link \" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.compose.ColumnTransformer.html\">?<span>Documentation for preprocessor: ColumnTransformer</span></a></label><div class=\"sk-toggleable__content \"><pre>ColumnTransformer(transformers=[(&#x27;num&#x27;,\n",
" Pipeline(steps=[(&#x27;scaler&#x27;, StandardScaler())]),\n",
" [&#x27;nb_tickets&#x27;, &#x27;nb_purchases&#x27;, &#x27;total_amount&#x27;,\n",
" &#x27;nb_suppliers&#x27;, &#x27;vente_internet_max&#x27;,\n",
" &#x27;purchase_date_min&#x27;, &#x27;purchase_date_max&#x27;,\n",
" &#x27;nb_tickets_internet&#x27;, &#x27;nb_campaigns&#x27;,\n",
" &#x27;nb_campaigns_opened&#x27;]),\n",
" (&#x27;cat&#x27;,\n",
" Pipeline(steps=[(&#x27;onehot&#x27;,\n",
" OneHotEncoder(handle_unknown=&#x27;ignore&#x27;,\n",
" sparse_output=False))]),\n",
" [&#x27;opt_in&#x27;, &#x27;is_email_true&#x27;])])</pre></div> </div></div><div class=\"sk-parallel\"><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-7\" type=\"checkbox\" ><label for=\"sk-estimator-id-7\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">num</label><div class=\"sk-toggleable__content \"><pre>[&#x27;nb_tickets&#x27;, &#x27;nb_purchases&#x27;, &#x27;total_amount&#x27;, &#x27;nb_suppliers&#x27;, &#x27;vente_internet_max&#x27;, &#x27;purchase_date_min&#x27;, &#x27;purchase_date_max&#x27;, &#x27;nb_tickets_internet&#x27;, &#x27;nb_campaigns&#x27;, &#x27;nb_campaigns_opened&#x27;]</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-8\" type=\"checkbox\" ><label for=\"sk-estimator-id-8\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">&nbsp;StandardScaler<a class=\"sk-estimator-doc-link \" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.preprocessing.StandardScaler.html\">?<span>Documentation for StandardScaler</span></a></label><div class=\"sk-toggleable__content \"><pre>StandardScaler()</pre></div> </div></div></div></div></div></div></div><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-9\" type=\"checkbox\" ><label for=\"sk-estimator-id-9\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">cat</label><div class=\"sk-toggleable__content \"><pre>[&#x27;opt_in&#x27;, &#x27;is_email_true&#x27;]</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-10\" type=\"checkbox\" ><label for=\"sk-estimator-id-10\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">&nbsp;OneHotEncoder<a class=\"sk-estimator-doc-link \" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.preprocessing.OneHotEncoder.html\">?<span>Documentation for OneHotEncoder</span></a></label><div class=\"sk-toggleable__content \"><pre>OneHotEncoder(handle_unknown=&#x27;ignore&#x27;, sparse_output=False)</pre></div> </div></div></div></div></div></div></div></div></div><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-11\" type=\"checkbox\" ><label for=\"sk-estimator-id-11\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">&nbsp;LogisticRegression<a class=\"sk-estimator-doc-link \" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.linear_model.LogisticRegression.html\">?<span>Documentation for LogisticRegression</span></a></label><div class=\"sk-toggleable__content \"><pre>LogisticRegression(class_weight={0.0: 0.5837086520288036,\n",
" 1.0: 3.486549107420539},\n",
" max_iter=5000, solver=&#x27;saga&#x27;)</pre></div> </div></div></div></div></div></div>"
],
"text/plain": [
"Pipeline(steps=[('preprocessor',\n",
" ColumnTransformer(transformers=[('num',\n",
" Pipeline(steps=[('scaler',\n",
" StandardScaler())]),\n",
" ['nb_tickets', 'nb_purchases',\n",
" 'total_amount',\n",
" 'nb_suppliers',\n",
" 'vente_internet_max',\n",
" 'purchase_date_min',\n",
" 'purchase_date_max',\n",
" 'nb_tickets_internet',\n",
" 'nb_campaigns',\n",
" 'nb_campaigns_opened']),\n",
" ('cat',\n",
" Pipeline(steps=[('onehot',\n",
" OneHotEncoder(handle_unknown='ignore',\n",
" sparse_output=False))]),\n",
" ['opt_in',\n",
" 'is_email_true'])])),\n",
" ('logreg',\n",
" LogisticRegression(class_weight={0.0: 0.5837086520288036,\n",
" 1.0: 3.486549107420539},\n",
" max_iter=5000, solver='saga'))])"
]
},
"execution_count": 104,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Pipeline\n",
"pipeline = Pipeline(steps=[\n",
" ('preprocessor', preproc),\n",
" ('logreg', LogisticRegression(solver='saga', class_weight = weight_dict,\n",
" max_iter=5000)) \n",
"])\n",
"\n",
"pipeline.set_output(transform=\"pandas\")"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "1e4c1be5-176d-4222-9b3c-fe27225afe36",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_tickets</th>\n",
" <th>nb_purchases</th>\n",
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>nb_tickets_internet</th>\n",
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
" <th>gender_female</th>\n",
" <th>gender_male</th>\n",
" <th>nb_campaigns</th>\n",
" <th>nb_campaigns_opened</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>39626</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>9.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>158560</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>20.0</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>170411</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>62.11</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>350.010093</td>\n",
" <td>350.010093</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>40.0</td>\n",
" <td>23.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220692</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>84.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.158787</td>\n",
" <td>5.158787</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>182741</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>19.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>194275</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>38.0</td>\n",
" <td>19.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>142915</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>26.0</td>\n",
" <td>8.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>95021</th>\n",
" <td>7.0</td>\n",
" <td>2.0</td>\n",
" <td>250.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>382.280455</td>\n",
" <td>382.279877</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>197603</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>21.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>88679</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>5.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10000 rows × 14 columns</p>\n",
"</div>"
],
"text/plain": [
" nb_tickets nb_purchases total_amount nb_suppliers \\\n",
"39626 0.0 0.0 0.00 0.0 \n",
"158560 0.0 0.0 0.00 0.0 \n",
"170411 1.0 1.0 62.11 1.0 \n",
"220692 1.0 1.0 84.00 1.0 \n",
"182741 0.0 0.0 0.00 0.0 \n",
"... ... ... ... ... \n",
"194275 0.0 0.0 0.00 0.0 \n",
"142915 0.0 0.0 0.00 0.0 \n",
"95021 7.0 2.0 250.00 1.0 \n",
"197603 0.0 0.0 0.00 0.0 \n",
"88679 0.0 0.0 0.00 0.0 \n",
"\n",
" vente_internet_max purchase_date_min purchase_date_max \\\n",
"39626 0.0 550.000000 550.000000 \n",
"158560 0.0 550.000000 550.000000 \n",
"170411 1.0 350.010093 350.010093 \n",
"220692 0.0 5.158787 5.158787 \n",
"182741 0.0 550.000000 550.000000 \n",
"... ... ... ... \n",
"194275 0.0 550.000000 550.000000 \n",
"142915 0.0 550.000000 550.000000 \n",
"95021 0.0 382.280455 382.279877 \n",
"197603 0.0 550.000000 550.000000 \n",
"88679 0.0 550.000000 550.000000 \n",
"\n",
" nb_tickets_internet is_email_true opt_in gender_female \\\n",
"39626 0.0 True True 0 \n",
"158560 0.0 True True 0 \n",
"170411 1.0 True False 0 \n",
"220692 0.0 True False 0 \n",
"182741 0.0 True True 0 \n",
"... ... ... ... ... \n",
"194275 0.0 True False 1 \n",
"142915 0.0 True True 0 \n",
"95021 0.0 True True 0 \n",
"197603 0.0 True True 0 \n",
"88679 0.0 True False 0 \n",
"\n",
" gender_male nb_campaigns nb_campaigns_opened \n",
"39626 0 9.0 0.0 \n",
"158560 0 20.0 5.0 \n",
"170411 1 40.0 23.0 \n",
"220692 1 0.0 0.0 \n",
"182741 1 19.0 1.0 \n",
"... ... ... ... \n",
"194275 0 38.0 19.0 \n",
"142915 1 26.0 8.0 \n",
"95021 0 0.0 0.0 \n",
"197603 1 21.0 0.0 \n",
"88679 1 5.0 0.0 \n",
"\n",
"[10000 rows x 14 columns]"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# reduce X_train to reduce the training time\n",
"\n",
"X_train_subsample = X_train.sample(n=10000, random_state=43)\n",
"y_train_subsample = y_train.loc[X_train_subsample.index]\n",
"X_train_subsample"
]
},
{
"cell_type": "code",
"execution_count": 108,
"id": "2b09c2cd-fd5c-49b3-be66-cec6c5ec1351",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>y_has_purchased</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>43000</th>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>183923</th>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>97373</th>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>66956</th>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>116487</th>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>140473</th>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>153768</th>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>110886</th>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>115390</th>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24919</th>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1000 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" y_has_purchased\n",
"43000 0.0\n",
"183923 0.0\n",
"97373 0.0\n",
"66956 1.0\n",
"116487 0.0\n",
"... ...\n",
"140473 0.0\n",
"153768 0.0\n",
"110886 1.0\n",
"115390 0.0\n",
"24919 0.0\n",
"\n",
"[1000 rows x 1 columns]"
]
},
"execution_count": 108,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_train_subsample"
]
},
{
"cell_type": "code",
"execution_count": 109,
"id": "6c33fcd8-17d8-4390-b836-faec9ada9acd",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>#sk-container-id-6 {\n",
" /* Definition of color scheme common for light and dark mode */\n",
" --sklearn-color-text: black;\n",
" --sklearn-color-line: gray;\n",
" /* Definition of color scheme for unfitted estimators */\n",
" --sklearn-color-unfitted-level-0: #fff5e6;\n",
" --sklearn-color-unfitted-level-1: #f6e4d2;\n",
" --sklearn-color-unfitted-level-2: #ffe0b3;\n",
" --sklearn-color-unfitted-level-3: chocolate;\n",
" /* Definition of color scheme for fitted estimators */\n",
" --sklearn-color-fitted-level-0: #f0f8ff;\n",
" --sklearn-color-fitted-level-1: #d4ebff;\n",
" --sklearn-color-fitted-level-2: #b3dbfd;\n",
" --sklearn-color-fitted-level-3: cornflowerblue;\n",
"\n",
" /* Specific color for light theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-icon: #696969;\n",
"\n",
" @media (prefers-color-scheme: dark) {\n",
" /* Redefinition of color scheme for dark theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-icon: #878787;\n",
" }\n",
"}\n",
"\n",
"#sk-container-id-6 {\n",
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"#sk-container-id-6 pre {\n",
" padding: 0;\n",
"}\n",
"\n",
"#sk-container-id-6 input.sk-hidden--visually {\n",
" border: 0;\n",
" clip: rect(1px 1px 1px 1px);\n",
" clip: rect(1px, 1px, 1px, 1px);\n",
" height: 1px;\n",
" margin: -1px;\n",
" overflow: hidden;\n",
" padding: 0;\n",
" position: absolute;\n",
" width: 1px;\n",
"}\n",
"\n",
"#sk-container-id-6 div.sk-dashed-wrapped {\n",
" border: 1px dashed var(--sklearn-color-line);\n",
" margin: 0 0.4em 0.5em 0.4em;\n",
" box-sizing: border-box;\n",
" padding-bottom: 0.4em;\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"#sk-container-id-6 div.sk-container {\n",
" /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
" but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
" so we also need the `!important` here to be able to override the\n",
" default hidden behavior on the sphinx rendered scikit-learn.org.\n",
" See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
" display: inline-block !important;\n",
" position: relative;\n",
"}\n",
"\n",
"#sk-container-id-6 div.sk-text-repr-fallback {\n",
" display: none;\n",
"}\n",
"\n",
"div.sk-parallel-item,\n",
"div.sk-serial,\n",
"div.sk-item {\n",
" /* draw centered vertical line to link estimators */\n",
" background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
" background-size: 2px 100%;\n",
" background-repeat: no-repeat;\n",
" background-position: center center;\n",
"}\n",
"\n",
"/* Parallel-specific style estimator block */\n",
"\n",
"#sk-container-id-6 div.sk-parallel-item::after {\n",
" content: \"\";\n",
" width: 100%;\n",
" border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
" flex-grow: 1;\n",
"}\n",
"\n",
"#sk-container-id-6 div.sk-parallel {\n",
" display: flex;\n",
" align-items: stretch;\n",
" justify-content: center;\n",
" background-color: var(--sklearn-color-background);\n",
" position: relative;\n",
"}\n",
"\n",
"#sk-container-id-6 div.sk-parallel-item {\n",
" display: flex;\n",
" flex-direction: column;\n",
"}\n",
"\n",
"#sk-container-id-6 div.sk-parallel-item:first-child::after {\n",
" align-self: flex-end;\n",
" width: 50%;\n",
"}\n",
"\n",
"#sk-container-id-6 div.sk-parallel-item:last-child::after {\n",
" align-self: flex-start;\n",
" width: 50%;\n",
"}\n",
"\n",
"#sk-container-id-6 div.sk-parallel-item:only-child::after {\n",
" width: 0;\n",
"}\n",
"\n",
"/* Serial-specific style estimator block */\n",
"\n",
"#sk-container-id-6 div.sk-serial {\n",
" display: flex;\n",
" flex-direction: column;\n",
" align-items: center;\n",
" background-color: var(--sklearn-color-background);\n",
" padding-right: 1em;\n",
" padding-left: 1em;\n",
"}\n",
"\n",
"\n",
"/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
"clickable and can be expanded/collapsed.\n",
"- Pipeline and ColumnTransformer use this feature and define the default style\n",
"- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
"*/\n",
"\n",
"/* Pipeline and ColumnTransformer style (default) */\n",
"\n",
"#sk-container-id-6 div.sk-toggleable {\n",
" /* Default theme specific background. It is overwritten whether we have a\n",
" specific estimator or a Pipeline/ColumnTransformer */\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"/* Toggleable label */\n",
"#sk-container-id-6 label.sk-toggleable__label {\n",
" cursor: pointer;\n",
" display: block;\n",
" width: 100%;\n",
" margin-bottom: 0;\n",
" padding: 0.5em;\n",
" box-sizing: border-box;\n",
" text-align: center;\n",
"}\n",
"\n",
"#sk-container-id-6 label.sk-toggleable__label-arrow:before {\n",
" /* Arrow on the left of the label */\n",
" content: \"▸\";\n",
" float: left;\n",
" margin-right: 0.25em;\n",
" color: var(--sklearn-color-icon);\n",
"}\n",
"\n",
"#sk-container-id-6 label.sk-toggleable__label-arrow:hover:before {\n",
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"/* Toggleable content - dropdown */\n",
"\n",
"#sk-container-id-6 div.sk-toggleable__content {\n",
" max-height: 0;\n",
" max-width: 0;\n",
" overflow: hidden;\n",
" text-align: left;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-6 div.sk-toggleable__content.fitted {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-6 div.sk-toggleable__content pre {\n",
" margin: 0.2em;\n",
" border-radius: 0.25em;\n",
" color: var(--sklearn-color-text);\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-6 div.sk-toggleable__content.fitted pre {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-6 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
" /* Expand drop-down */\n",
" max-height: 200px;\n",
" max-width: 100%;\n",
" overflow: auto;\n",
"}\n",
"\n",
"#sk-container-id-6 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
" content: \"▾\";\n",
"}\n",
"\n",
"/* Pipeline/ColumnTransformer-specific style */\n",
"\n",
"#sk-container-id-6 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-6 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator-specific style */\n",
"\n",
"/* Colorize estimator box */\n",
"#sk-container-id-6 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-6 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-6 div.sk-label label.sk-toggleable__label,\n",
"#sk-container-id-6 div.sk-label label {\n",
" /* The background is the default theme color */\n",
" color: var(--sklearn-color-text-on-default-background);\n",
"}\n",
"\n",
"/* On hover, darken the color of the background */\n",
"#sk-container-id-6 div.sk-label:hover label.sk-toggleable__label {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"/* Label box, darken color on hover, fitted */\n",
"#sk-container-id-6 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator label */\n",
"\n",
"#sk-container-id-6 div.sk-label label {\n",
" font-family: monospace;\n",
" font-weight: bold;\n",
" display: inline-block;\n",
" line-height: 1.2em;\n",
"}\n",
"\n",
"#sk-container-id-6 div.sk-label-container {\n",
" text-align: center;\n",
"}\n",
"\n",
"/* Estimator-specific */\n",
"#sk-container-id-6 div.sk-estimator {\n",
" font-family: monospace;\n",
" border: 1px dotted var(--sklearn-color-border-box);\n",
" border-radius: 0.25em;\n",
" box-sizing: border-box;\n",
" margin-bottom: 0.5em;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-6 div.sk-estimator.fitted {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"/* on hover */\n",
"#sk-container-id-6 div.sk-estimator:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-6 div.sk-estimator.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
"\n",
"/* Common style for \"i\" and \"?\" */\n",
"\n",
".sk-estimator-doc-link,\n",
"a:link.sk-estimator-doc-link,\n",
"a:visited.sk-estimator-doc-link {\n",
" float: right;\n",
" font-size: smaller;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1em;\n",
" height: 1em;\n",
" width: 1em;\n",
" text-decoration: none !important;\n",
" margin-left: 1ex;\n",
" /* unfitted */\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted,\n",
"a:link.sk-estimator-doc-link.fitted,\n",
"a:visited.sk-estimator-doc-link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"/* Span, style for the box shown on hovering the info icon */\n",
".sk-estimator-doc-link span {\n",
" display: none;\n",
" z-index: 9999;\n",
" position: relative;\n",
" font-weight: normal;\n",
" right: .2ex;\n",
" padding: .5ex;\n",
" margin: .5ex;\n",
" width: min-content;\n",
" min-width: 20ex;\n",
" max-width: 50ex;\n",
" color: var(--sklearn-color-text);\n",
" box-shadow: 2pt 2pt 4pt #999;\n",
" /* unfitted */\n",
" background: var(--sklearn-color-unfitted-level-0);\n",
" border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted span {\n",
" /* fitted */\n",
" background: var(--sklearn-color-fitted-level-0);\n",
" border: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link:hover span {\n",
" display: block;\n",
"}\n",
"\n",
"/* \"?\"-specific style due to the `<a>` HTML tag */\n",
"\n",
"#sk-container-id-6 a.estimator_doc_link {\n",
" float: right;\n",
" font-size: 1rem;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1rem;\n",
" height: 1rem;\n",
" width: 1rem;\n",
" text-decoration: none;\n",
" /* unfitted */\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
"}\n",
"\n",
"#sk-container-id-6 a.estimator_doc_link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"#sk-container-id-6 a.estimator_doc_link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"#sk-container-id-6 a.estimator_doc_link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"</style><div id=\"sk-container-id-6\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>Pipeline(steps=[(&#x27;preprocessor&#x27;,\n",
" ColumnTransformer(transformers=[(&#x27;num&#x27;,\n",
" Pipeline(steps=[(&#x27;scaler&#x27;,\n",
" StandardScaler())]),\n",
" [&#x27;nb_tickets&#x27;, &#x27;nb_purchases&#x27;,\n",
" &#x27;total_amount&#x27;,\n",
" &#x27;nb_suppliers&#x27;,\n",
" &#x27;vente_internet_max&#x27;,\n",
" &#x27;purchase_date_min&#x27;,\n",
" &#x27;purchase_date_max&#x27;,\n",
" &#x27;nb_tickets_internet&#x27;,\n",
" &#x27;nb_campaigns&#x27;,\n",
" &#x27;nb_campaigns_opened&#x27;]),\n",
" (&#x27;cat&#x27;,\n",
" Pipeline(steps=[(&#x27;onehot&#x27;,\n",
" OneHotEncoder(handle_unknown=&#x27;ignore&#x27;,\n",
" sparse_output=False))]),\n",
" [&#x27;opt_in&#x27;,\n",
" &#x27;is_email_true&#x27;])])),\n",
" (&#x27;logreg&#x27;,\n",
" LogisticRegression(class_weight={0.0: 0.5837086520288036,\n",
" 1.0: 3.486549107420539},\n",
" max_iter=5000, solver=&#x27;saga&#x27;))])</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-12\" type=\"checkbox\" ><label for=\"sk-estimator-id-12\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">&nbsp;&nbsp;Pipeline<a class=\"sk-estimator-doc-link \" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.pipeline.Pipeline.html\">?<span>Documentation for Pipeline</span></a><span class=\"sk-estimator-doc-link \">i<span>Not fitted</span></span></label><div class=\"sk-toggleable__content \"><pre>Pipeline(steps=[(&#x27;preprocessor&#x27;,\n",
" ColumnTransformer(transformers=[(&#x27;num&#x27;,\n",
" Pipeline(steps=[(&#x27;scaler&#x27;,\n",
" StandardScaler())]),\n",
" [&#x27;nb_tickets&#x27;, &#x27;nb_purchases&#x27;,\n",
" &#x27;total_amount&#x27;,\n",
" &#x27;nb_suppliers&#x27;,\n",
" &#x27;vente_internet_max&#x27;,\n",
" &#x27;purchase_date_min&#x27;,\n",
" &#x27;purchase_date_max&#x27;,\n",
" &#x27;nb_tickets_internet&#x27;,\n",
" &#x27;nb_campaigns&#x27;,\n",
" &#x27;nb_campaigns_opened&#x27;]),\n",
" (&#x27;cat&#x27;,\n",
" Pipeline(steps=[(&#x27;onehot&#x27;,\n",
" OneHotEncoder(handle_unknown=&#x27;ignore&#x27;,\n",
" sparse_output=False))]),\n",
" [&#x27;opt_in&#x27;,\n",
" &#x27;is_email_true&#x27;])])),\n",
" (&#x27;logreg&#x27;,\n",
" LogisticRegression(class_weight={0.0: 0.5837086520288036,\n",
" 1.0: 3.486549107420539},\n",
" max_iter=5000, solver=&#x27;saga&#x27;))])</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-13\" type=\"checkbox\" ><label for=\"sk-estimator-id-13\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">&nbsp;preprocessor: ColumnTransformer<a class=\"sk-estimator-doc-link \" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.compose.ColumnTransformer.html\">?<span>Documentation for preprocessor: ColumnTransformer</span></a></label><div class=\"sk-toggleable__content \"><pre>ColumnTransformer(transformers=[(&#x27;num&#x27;,\n",
" Pipeline(steps=[(&#x27;scaler&#x27;, StandardScaler())]),\n",
" [&#x27;nb_tickets&#x27;, &#x27;nb_purchases&#x27;, &#x27;total_amount&#x27;,\n",
" &#x27;nb_suppliers&#x27;, &#x27;vente_internet_max&#x27;,\n",
" &#x27;purchase_date_min&#x27;, &#x27;purchase_date_max&#x27;,\n",
" &#x27;nb_tickets_internet&#x27;, &#x27;nb_campaigns&#x27;,\n",
" &#x27;nb_campaigns_opened&#x27;]),\n",
" (&#x27;cat&#x27;,\n",
" Pipeline(steps=[(&#x27;onehot&#x27;,\n",
" OneHotEncoder(handle_unknown=&#x27;ignore&#x27;,\n",
" sparse_output=False))]),\n",
" [&#x27;opt_in&#x27;, &#x27;is_email_true&#x27;])])</pre></div> </div></div><div class=\"sk-parallel\"><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-14\" type=\"checkbox\" ><label for=\"sk-estimator-id-14\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">num</label><div class=\"sk-toggleable__content \"><pre>[&#x27;nb_tickets&#x27;, &#x27;nb_purchases&#x27;, &#x27;total_amount&#x27;, &#x27;nb_suppliers&#x27;, &#x27;vente_internet_max&#x27;, &#x27;purchase_date_min&#x27;, &#x27;purchase_date_max&#x27;, &#x27;nb_tickets_internet&#x27;, &#x27;nb_campaigns&#x27;, &#x27;nb_campaigns_opened&#x27;]</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-15\" type=\"checkbox\" ><label for=\"sk-estimator-id-15\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">&nbsp;StandardScaler<a class=\"sk-estimator-doc-link \" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.preprocessing.StandardScaler.html\">?<span>Documentation for StandardScaler</span></a></label><div class=\"sk-toggleable__content \"><pre>StandardScaler()</pre></div> </div></div></div></div></div></div></div><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-16\" type=\"checkbox\" ><label for=\"sk-estimator-id-16\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">cat</label><div class=\"sk-toggleable__content \"><pre>[&#x27;opt_in&#x27;, &#x27;is_email_true&#x27;]</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-17\" type=\"checkbox\" ><label for=\"sk-estimator-id-17\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">&nbsp;OneHotEncoder<a class=\"sk-estimator-doc-link \" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.preprocessing.OneHotEncoder.html\">?<span>Documentation for OneHotEncoder</span></a></label><div class=\"sk-toggleable__content \"><pre>OneHotEncoder(handle_unknown=&#x27;ignore&#x27;, sparse_output=False)</pre></div> </div></div></div></div></div></div></div></div></div><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-18\" type=\"checkbox\" ><label for=\"sk-estimator-id-18\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">&nbsp;LogisticRegression<a class=\"sk-estimator-doc-link \" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.linear_model.LogisticRegression.html\">?<span>Documentation for LogisticRegression</span></a></label><div class=\"sk-toggleable__content \"><pre>LogisticRegression(class_weight={0.0: 0.5837086520288036,\n",
" 1.0: 3.486549107420539},\n",
" max_iter=5000, solver=&#x27;saga&#x27;)</pre></div> </div></div></div></div></div></div>"
],
"text/plain": [
"Pipeline(steps=[('preprocessor',\n",
" ColumnTransformer(transformers=[('num',\n",
" Pipeline(steps=[('scaler',\n",
" StandardScaler())]),\n",
" ['nb_tickets', 'nb_purchases',\n",
" 'total_amount',\n",
" 'nb_suppliers',\n",
" 'vente_internet_max',\n",
" 'purchase_date_min',\n",
" 'purchase_date_max',\n",
" 'nb_tickets_internet',\n",
" 'nb_campaigns',\n",
" 'nb_campaigns_opened']),\n",
" ('cat',\n",
" Pipeline(steps=[('onehot',\n",
" OneHotEncoder(handle_unknown='ignore',\n",
" sparse_output=False))]),\n",
" ['opt_in',\n",
" 'is_email_true'])])),\n",
" ('logreg',\n",
" LogisticRegression(class_weight={0.0: 0.5837086520288036,\n",
" 1.0: 3.486549107420539},\n",
" max_iter=5000, solver='saga'))])"
]
},
"execution_count": 109,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pipeline"
]
},
{
"cell_type": "code",
"execution_count": 110,
"id": "710ccccc-50c9-4aba-8cf1-11483dbbdd1c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'logreg__C': array([9.765625e-04, 1.953125e-03, 3.906250e-03, 7.812500e-03,\n",
" 1.562500e-02, 3.125000e-02, 6.250000e-02, 1.250000e-01,\n",
" 2.500000e-01, 5.000000e-01, 1.000000e+00, 2.000000e+00,\n",
" 4.000000e+00, 8.000000e+00, 1.600000e+01]),\n",
" 'logreg__penalty': ['l1']}"
]
},
"execution_count": 110,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"param_grid"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "ab078cf8-0d4c-4b23-9f33-2483cf605b06",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"make_scorer(f1_score, response_method='predict')"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"f1_scorer"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "8062169e-8305-42b0-aeff-8f714117da40",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_tickets</th>\n",
" <th>nb_purchases</th>\n",
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>nb_tickets_internet</th>\n",
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
" <th>gender_female</th>\n",
" <th>gender_male</th>\n",
" <th>nb_campaigns</th>\n",
" <th>nb_campaigns_opened</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>39626</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>9.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>158560</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>20.0</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>170411</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>62.11</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>350.010093</td>\n",
" <td>350.010093</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>40.0</td>\n",
" <td>23.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220692</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>84.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.158787</td>\n",
" <td>5.158787</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>182741</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>19.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>194275</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>38.0</td>\n",
" <td>19.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>142915</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>26.0</td>\n",
" <td>8.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>95021</th>\n",
" <td>7.0</td>\n",
" <td>2.0</td>\n",
" <td>250.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>382.280455</td>\n",
" <td>382.279877</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>197603</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>21.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>88679</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>5.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10000 rows × 14 columns</p>\n",
"</div>"
],
"text/plain": [
" nb_tickets nb_purchases total_amount nb_suppliers \\\n",
"39626 0.0 0.0 0.00 0.0 \n",
"158560 0.0 0.0 0.00 0.0 \n",
"170411 1.0 1.0 62.11 1.0 \n",
"220692 1.0 1.0 84.00 1.0 \n",
"182741 0.0 0.0 0.00 0.0 \n",
"... ... ... ... ... \n",
"194275 0.0 0.0 0.00 0.0 \n",
"142915 0.0 0.0 0.00 0.0 \n",
"95021 7.0 2.0 250.00 1.0 \n",
"197603 0.0 0.0 0.00 0.0 \n",
"88679 0.0 0.0 0.00 0.0 \n",
"\n",
" vente_internet_max purchase_date_min purchase_date_max \\\n",
"39626 0.0 550.000000 550.000000 \n",
"158560 0.0 550.000000 550.000000 \n",
"170411 1.0 350.010093 350.010093 \n",
"220692 0.0 5.158787 5.158787 \n",
"182741 0.0 550.000000 550.000000 \n",
"... ... ... ... \n",
"194275 0.0 550.000000 550.000000 \n",
"142915 0.0 550.000000 550.000000 \n",
"95021 0.0 382.280455 382.279877 \n",
"197603 0.0 550.000000 550.000000 \n",
"88679 0.0 550.000000 550.000000 \n",
"\n",
" nb_tickets_internet is_email_true opt_in gender_female \\\n",
"39626 0.0 True True 0 \n",
"158560 0.0 True True 0 \n",
"170411 1.0 True False 0 \n",
"220692 0.0 True False 0 \n",
"182741 0.0 True True 0 \n",
"... ... ... ... ... \n",
"194275 0.0 True False 1 \n",
"142915 0.0 True True 0 \n",
"95021 0.0 True True 0 \n",
"197603 0.0 True True 0 \n",
"88679 0.0 True False 0 \n",
"\n",
" gender_male nb_campaigns nb_campaigns_opened \n",
"39626 0 9.0 0.0 \n",
"158560 0 20.0 5.0 \n",
"170411 1 40.0 23.0 \n",
"220692 1 0.0 0.0 \n",
"182741 1 19.0 1.0 \n",
"... ... ... ... \n",
"194275 0 38.0 19.0 \n",
"142915 1 26.0 8.0 \n",
"95021 0 0.0 0.0 \n",
"197603 1 21.0 0.0 \n",
"88679 1 5.0 0.0 \n",
"\n",
"[10000 rows x 14 columns]"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_train_subsample"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "0270013a-6523-4cf8-8de0-569c0d1c5db5",
"metadata": {},
"outputs": [],
"source": [
"warnings.filterwarnings('ignore')\n",
"warnings.filterwarnings(\"ignore\", category=ConvergenceWarning)\n",
"warnings.filterwarnings(\"ignore\", category=DataConversionWarning)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "7a49d78a-5a9b-44a9-95cf-3fca1b3febfa",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Returned hyperparameter: {'logreg__C': 0.0625, 'logreg__penalty': 'l1'}\n",
"Best classification F1 score in train is: 0.462769170101807\n",
"Classification F1 score on test is: 0.46474681703251214\n"
]
}
],
"source": [
"# run the pipeline on the subsample\n",
"\n",
"logit_grid = GridSearchCV(pipeline, param_grid, cv=3, scoring = f1_scorer #, error_score=\"raise\"\n",
" )\n",
"logit_grid.fit(X_train_subsample, y_train_subsample)\n",
"\n",
"# print results\n",
"print('Returned hyperparameter: {}'.format(logit_grid.best_params_))\n",
"print('Best classification F1 score in train is: {}'.format(logit_grid.best_score_))\n",
"print('Classification F1 score on test is: {}'.format(logit_grid.score(X_test, y_test)))"
]
},
{
"cell_type": "code",
"execution_count": 114,
"id": "b1d5e71d-1078-4370-86e8-52b1ae378898",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([9.765625e-04, 1.953125e-03, 3.906250e-03, 7.812500e-03,\n",
" 1.562500e-02, 3.125000e-02, 6.250000e-02, 1.250000e-01,\n",
" 2.500000e-01, 5.000000e-01, 1.000000e+00, 2.000000e+00,\n",
" 4.000000e+00, 8.000000e+00, 1.600000e+01])"
]
},
"execution_count": 114,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"param_c"
]
},
{
"cell_type": "code",
"execution_count": 96,
"id": "cfe04739-fe9c-4802-9d34-885a8cfce0dc",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>#sk-container-id-12 {\n",
" /* Definition of color scheme common for light and dark mode */\n",
" --sklearn-color-text: black;\n",
" --sklearn-color-line: gray;\n",
" /* Definition of color scheme for unfitted estimators */\n",
" --sklearn-color-unfitted-level-0: #fff5e6;\n",
" --sklearn-color-unfitted-level-1: #f6e4d2;\n",
" --sklearn-color-unfitted-level-2: #ffe0b3;\n",
" --sklearn-color-unfitted-level-3: chocolate;\n",
" /* Definition of color scheme for fitted estimators */\n",
" --sklearn-color-fitted-level-0: #f0f8ff;\n",
" --sklearn-color-fitted-level-1: #d4ebff;\n",
" --sklearn-color-fitted-level-2: #b3dbfd;\n",
" --sklearn-color-fitted-level-3: cornflowerblue;\n",
"\n",
" /* Specific color for light theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-icon: #696969;\n",
"\n",
" @media (prefers-color-scheme: dark) {\n",
" /* Redefinition of color scheme for dark theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-icon: #878787;\n",
" }\n",
"}\n",
"\n",
"#sk-container-id-12 {\n",
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"#sk-container-id-12 pre {\n",
" padding: 0;\n",
"}\n",
"\n",
"#sk-container-id-12 input.sk-hidden--visually {\n",
" border: 0;\n",
" clip: rect(1px 1px 1px 1px);\n",
" clip: rect(1px, 1px, 1px, 1px);\n",
" height: 1px;\n",
" margin: -1px;\n",
" overflow: hidden;\n",
" padding: 0;\n",
" position: absolute;\n",
" width: 1px;\n",
"}\n",
"\n",
"#sk-container-id-12 div.sk-dashed-wrapped {\n",
" border: 1px dashed var(--sklearn-color-line);\n",
" margin: 0 0.4em 0.5em 0.4em;\n",
" box-sizing: border-box;\n",
" padding-bottom: 0.4em;\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"#sk-container-id-12 div.sk-container {\n",
" /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
" but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
" so we also need the `!important` here to be able to override the\n",
" default hidden behavior on the sphinx rendered scikit-learn.org.\n",
" See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
" display: inline-block !important;\n",
" position: relative;\n",
"}\n",
"\n",
"#sk-container-id-12 div.sk-text-repr-fallback {\n",
" display: none;\n",
"}\n",
"\n",
"div.sk-parallel-item,\n",
"div.sk-serial,\n",
"div.sk-item {\n",
" /* draw centered vertical line to link estimators */\n",
" background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
" background-size: 2px 100%;\n",
" background-repeat: no-repeat;\n",
" background-position: center center;\n",
"}\n",
"\n",
"/* Parallel-specific style estimator block */\n",
"\n",
"#sk-container-id-12 div.sk-parallel-item::after {\n",
" content: \"\";\n",
" width: 100%;\n",
" border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
" flex-grow: 1;\n",
"}\n",
"\n",
"#sk-container-id-12 div.sk-parallel {\n",
" display: flex;\n",
" align-items: stretch;\n",
" justify-content: center;\n",
" background-color: var(--sklearn-color-background);\n",
" position: relative;\n",
"}\n",
"\n",
"#sk-container-id-12 div.sk-parallel-item {\n",
" display: flex;\n",
" flex-direction: column;\n",
"}\n",
"\n",
"#sk-container-id-12 div.sk-parallel-item:first-child::after {\n",
" align-self: flex-end;\n",
" width: 50%;\n",
"}\n",
"\n",
"#sk-container-id-12 div.sk-parallel-item:last-child::after {\n",
" align-self: flex-start;\n",
" width: 50%;\n",
"}\n",
"\n",
"#sk-container-id-12 div.sk-parallel-item:only-child::after {\n",
" width: 0;\n",
"}\n",
"\n",
"/* Serial-specific style estimator block */\n",
"\n",
"#sk-container-id-12 div.sk-serial {\n",
" display: flex;\n",
" flex-direction: column;\n",
" align-items: center;\n",
" background-color: var(--sklearn-color-background);\n",
" padding-right: 1em;\n",
" padding-left: 1em;\n",
"}\n",
"\n",
"\n",
"/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
"clickable and can be expanded/collapsed.\n",
"- Pipeline and ColumnTransformer use this feature and define the default style\n",
"- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
"*/\n",
"\n",
"/* Pipeline and ColumnTransformer style (default) */\n",
"\n",
"#sk-container-id-12 div.sk-toggleable {\n",
" /* Default theme specific background. It is overwritten whether we have a\n",
" specific estimator or a Pipeline/ColumnTransformer */\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"/* Toggleable label */\n",
"#sk-container-id-12 label.sk-toggleable__label {\n",
" cursor: pointer;\n",
" display: block;\n",
" width: 100%;\n",
" margin-bottom: 0;\n",
" padding: 0.5em;\n",
" box-sizing: border-box;\n",
" text-align: center;\n",
"}\n",
"\n",
"#sk-container-id-12 label.sk-toggleable__label-arrow:before {\n",
" /* Arrow on the left of the label */\n",
" content: \"▸\";\n",
" float: left;\n",
" margin-right: 0.25em;\n",
" color: var(--sklearn-color-icon);\n",
"}\n",
"\n",
"#sk-container-id-12 label.sk-toggleable__label-arrow:hover:before {\n",
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"/* Toggleable content - dropdown */\n",
"\n",
"#sk-container-id-12 div.sk-toggleable__content {\n",
" max-height: 0;\n",
" max-width: 0;\n",
" overflow: hidden;\n",
" text-align: left;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-12 div.sk-toggleable__content.fitted {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-12 div.sk-toggleable__content pre {\n",
" margin: 0.2em;\n",
" border-radius: 0.25em;\n",
" color: var(--sklearn-color-text);\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-12 div.sk-toggleable__content.fitted pre {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-12 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
" /* Expand drop-down */\n",
" max-height: 200px;\n",
" max-width: 100%;\n",
" overflow: auto;\n",
"}\n",
"\n",
"#sk-container-id-12 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
" content: \"▾\";\n",
"}\n",
"\n",
"/* Pipeline/ColumnTransformer-specific style */\n",
"\n",
"#sk-container-id-12 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-12 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator-specific style */\n",
"\n",
"/* Colorize estimator box */\n",
"#sk-container-id-12 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-12 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-12 div.sk-label label.sk-toggleable__label,\n",
"#sk-container-id-12 div.sk-label label {\n",
" /* The background is the default theme color */\n",
" color: var(--sklearn-color-text-on-default-background);\n",
"}\n",
"\n",
"/* On hover, darken the color of the background */\n",
"#sk-container-id-12 div.sk-label:hover label.sk-toggleable__label {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"/* Label box, darken color on hover, fitted */\n",
"#sk-container-id-12 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator label */\n",
"\n",
"#sk-container-id-12 div.sk-label label {\n",
" font-family: monospace;\n",
" font-weight: bold;\n",
" display: inline-block;\n",
" line-height: 1.2em;\n",
"}\n",
"\n",
"#sk-container-id-12 div.sk-label-container {\n",
" text-align: center;\n",
"}\n",
"\n",
"/* Estimator-specific */\n",
"#sk-container-id-12 div.sk-estimator {\n",
" font-family: monospace;\n",
" border: 1px dotted var(--sklearn-color-border-box);\n",
" border-radius: 0.25em;\n",
" box-sizing: border-box;\n",
" margin-bottom: 0.5em;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-12 div.sk-estimator.fitted {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"/* on hover */\n",
"#sk-container-id-12 div.sk-estimator:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-12 div.sk-estimator.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
"\n",
"/* Common style for \"i\" and \"?\" */\n",
"\n",
".sk-estimator-doc-link,\n",
"a:link.sk-estimator-doc-link,\n",
"a:visited.sk-estimator-doc-link {\n",
" float: right;\n",
" font-size: smaller;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1em;\n",
" height: 1em;\n",
" width: 1em;\n",
" text-decoration: none !important;\n",
" margin-left: 1ex;\n",
" /* unfitted */\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted,\n",
"a:link.sk-estimator-doc-link.fitted,\n",
"a:visited.sk-estimator-doc-link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"/* Span, style for the box shown on hovering the info icon */\n",
".sk-estimator-doc-link span {\n",
" display: none;\n",
" z-index: 9999;\n",
" position: relative;\n",
" font-weight: normal;\n",
" right: .2ex;\n",
" padding: .5ex;\n",
" margin: .5ex;\n",
" width: min-content;\n",
" min-width: 20ex;\n",
" max-width: 50ex;\n",
" color: var(--sklearn-color-text);\n",
" box-shadow: 2pt 2pt 4pt #999;\n",
" /* unfitted */\n",
" background: var(--sklearn-color-unfitted-level-0);\n",
" border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted span {\n",
" /* fitted */\n",
" background: var(--sklearn-color-fitted-level-0);\n",
" border: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link:hover span {\n",
" display: block;\n",
"}\n",
"\n",
"/* \"?\"-specific style due to the `<a>` HTML tag */\n",
"\n",
"#sk-container-id-12 a.estimator_doc_link {\n",
" float: right;\n",
" font-size: 1rem;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1rem;\n",
" height: 1rem;\n",
" width: 1rem;\n",
" text-decoration: none;\n",
" /* unfitted */\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
"}\n",
"\n",
"#sk-container-id-12 a.estimator_doc_link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"#sk-container-id-12 a.estimator_doc_link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"#sk-container-id-12 a.estimator_doc_link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"</style><div id=\"sk-container-id-12\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>GridSearchCV(cv=3,\n",
" estimator=Pipeline(steps=[(&#x27;preprocessor&#x27;,\n",
" ColumnTransformer(transformers=[(&#x27;num&#x27;,\n",
" Pipeline(steps=[(&#x27;scaler&#x27;,\n",
" StandardScaler())]),\n",
" [&#x27;nb_tickets&#x27;,\n",
" &#x27;nb_purchases&#x27;,\n",
" &#x27;total_amount&#x27;,\n",
" &#x27;nb_suppliers&#x27;,\n",
" &#x27;vente_internet_max&#x27;,\n",
" &#x27;purchase_date_min&#x27;,\n",
" &#x27;purchase_date_max&#x27;,\n",
" &#x27;nb_tickets_internet&#x27;,\n",
" &#x27;nb_campaigns&#x27;,\n",
" &#x27;nb_campaigns_opened&#x27;]),\n",
" (&#x27;cat&#x27;,\n",
" Pipeline(steps=[(...\n",
" 1.0: 3.486549107420539},\n",
" max_iter=5000,\n",
" solver=&#x27;saga&#x27;))]),\n",
" param_grid={&#x27;logreg__C&#x27;: array([9.765625e-04, 1.953125e-03, 3.906250e-03, 7.812500e-03,\n",
" 1.562500e-02, 3.125000e-02, 6.250000e-02, 1.250000e-01,\n",
" 2.500000e-01, 5.000000e-01, 1.000000e+00, 2.000000e+00,\n",
" 4.000000e+00, 8.000000e+00, 1.600000e+01]),\n",
" &#x27;logreg__penalty&#x27;: [&#x27;l1&#x27;]},\n",
" scoring=make_scorer(f1_score, response_method=&#x27;predict&#x27;))</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-75\" type=\"checkbox\" ><label for=\"sk-estimator-id-75\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">&nbsp;&nbsp;GridSearchCV<a class=\"sk-estimator-doc-link \" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.model_selection.GridSearchCV.html\">?<span>Documentation for GridSearchCV</span></a><span class=\"sk-estimator-doc-link \">i<span>Not fitted</span></span></label><div class=\"sk-toggleable__content \"><pre>GridSearchCV(cv=3,\n",
" estimator=Pipeline(steps=[(&#x27;preprocessor&#x27;,\n",
" ColumnTransformer(transformers=[(&#x27;num&#x27;,\n",
" Pipeline(steps=[(&#x27;scaler&#x27;,\n",
" StandardScaler())]),\n",
" [&#x27;nb_tickets&#x27;,\n",
" &#x27;nb_purchases&#x27;,\n",
" &#x27;total_amount&#x27;,\n",
" &#x27;nb_suppliers&#x27;,\n",
" &#x27;vente_internet_max&#x27;,\n",
" &#x27;purchase_date_min&#x27;,\n",
" &#x27;purchase_date_max&#x27;,\n",
" &#x27;nb_tickets_internet&#x27;,\n",
" &#x27;nb_campaigns&#x27;,\n",
" &#x27;nb_campaigns_opened&#x27;]),\n",
" (&#x27;cat&#x27;,\n",
" Pipeline(steps=[(...\n",
" 1.0: 3.486549107420539},\n",
" max_iter=5000,\n",
" solver=&#x27;saga&#x27;))]),\n",
" param_grid={&#x27;logreg__C&#x27;: array([9.765625e-04, 1.953125e-03, 3.906250e-03, 7.812500e-03,\n",
" 1.562500e-02, 3.125000e-02, 6.250000e-02, 1.250000e-01,\n",
" 2.500000e-01, 5.000000e-01, 1.000000e+00, 2.000000e+00,\n",
" 4.000000e+00, 8.000000e+00, 1.600000e+01]),\n",
" &#x27;logreg__penalty&#x27;: [&#x27;l1&#x27;]},\n",
" scoring=make_scorer(f1_score, response_method=&#x27;predict&#x27;))</pre></div> </div></div><div class=\"sk-parallel\"><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-76\" type=\"checkbox\" ><label for=\"sk-estimator-id-76\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">estimator: Pipeline</label><div class=\"sk-toggleable__content \"><pre>Pipeline(steps=[(&#x27;preprocessor&#x27;,\n",
" ColumnTransformer(transformers=[(&#x27;num&#x27;,\n",
" Pipeline(steps=[(&#x27;scaler&#x27;,\n",
" StandardScaler())]),\n",
" [&#x27;nb_tickets&#x27;, &#x27;nb_purchases&#x27;,\n",
" &#x27;total_amount&#x27;,\n",
" &#x27;nb_suppliers&#x27;,\n",
" &#x27;vente_internet_max&#x27;,\n",
" &#x27;purchase_date_min&#x27;,\n",
" &#x27;purchase_date_max&#x27;,\n",
" &#x27;nb_tickets_internet&#x27;,\n",
" &#x27;nb_campaigns&#x27;,\n",
" &#x27;nb_campaigns_opened&#x27;]),\n",
" (&#x27;cat&#x27;,\n",
" Pipeline(steps=[(&#x27;onehot&#x27;,\n",
" OneHotEncoder(handle_unknown=&#x27;ignore&#x27;,\n",
" sparse_output=False))]),\n",
" [&#x27;opt_in&#x27;,\n",
" &#x27;is_email_true&#x27;])])),\n",
" (&#x27;logreg&#x27;,\n",
" LogisticRegression(class_weight={0.0: 0.5837086520288036,\n",
" 1.0: 3.486549107420539},\n",
" max_iter=5000, solver=&#x27;saga&#x27;))])</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-serial\"><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-77\" type=\"checkbox\" ><label for=\"sk-estimator-id-77\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">&nbsp;preprocessor: ColumnTransformer<a class=\"sk-estimator-doc-link \" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.compose.ColumnTransformer.html\">?<span>Documentation for preprocessor: ColumnTransformer</span></a></label><div class=\"sk-toggleable__content \"><pre>ColumnTransformer(transformers=[(&#x27;num&#x27;,\n",
" Pipeline(steps=[(&#x27;scaler&#x27;, StandardScaler())]),\n",
" [&#x27;nb_tickets&#x27;, &#x27;nb_purchases&#x27;, &#x27;total_amount&#x27;,\n",
" &#x27;nb_suppliers&#x27;, &#x27;vente_internet_max&#x27;,\n",
" &#x27;purchase_date_min&#x27;, &#x27;purchase_date_max&#x27;,\n",
" &#x27;nb_tickets_internet&#x27;, &#x27;nb_campaigns&#x27;,\n",
" &#x27;nb_campaigns_opened&#x27;]),\n",
" (&#x27;cat&#x27;,\n",
" Pipeline(steps=[(&#x27;onehot&#x27;,\n",
" OneHotEncoder(handle_unknown=&#x27;ignore&#x27;,\n",
" sparse_output=False))]),\n",
" [&#x27;opt_in&#x27;, &#x27;is_email_true&#x27;])])</pre></div> </div></div><div class=\"sk-parallel\"><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-78\" type=\"checkbox\" ><label for=\"sk-estimator-id-78\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">num</label><div class=\"sk-toggleable__content \"><pre>[&#x27;nb_tickets&#x27;, &#x27;nb_purchases&#x27;, &#x27;total_amount&#x27;, &#x27;nb_suppliers&#x27;, &#x27;vente_internet_max&#x27;, &#x27;purchase_date_min&#x27;, &#x27;purchase_date_max&#x27;, &#x27;nb_tickets_internet&#x27;, &#x27;nb_campaigns&#x27;, &#x27;nb_campaigns_opened&#x27;]</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-79\" type=\"checkbox\" ><label for=\"sk-estimator-id-79\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">&nbsp;StandardScaler<a class=\"sk-estimator-doc-link \" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.preprocessing.StandardScaler.html\">?<span>Documentation for StandardScaler</span></a></label><div class=\"sk-toggleable__content \"><pre>StandardScaler()</pre></div> </div></div></div></div></div></div></div><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-80\" type=\"checkbox\" ><label for=\"sk-estimator-id-80\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">cat</label><div class=\"sk-toggleable__content \"><pre>[&#x27;opt_in&#x27;, &#x27;is_email_true&#x27;]</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-81\" type=\"checkbox\" ><label for=\"sk-estimator-id-81\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">&nbsp;OneHotEncoder<a class=\"sk-estimator-doc-link \" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.preprocessing.OneHotEncoder.html\">?<span>Documentation for OneHotEncoder</span></a></label><div class=\"sk-toggleable__content \"><pre>OneHotEncoder(handle_unknown=&#x27;ignore&#x27;, sparse_output=False)</pre></div> </div></div></div></div></div></div></div></div></div><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-82\" type=\"checkbox\" ><label for=\"sk-estimator-id-82\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">&nbsp;LogisticRegression<a class=\"sk-estimator-doc-link \" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.linear_model.LogisticRegression.html\">?<span>Documentation for LogisticRegression</span></a></label><div class=\"sk-toggleable__content \"><pre>LogisticRegression(class_weight={0.0: 0.5837086520288036,\n",
" 1.0: 3.486549107420539},\n",
" max_iter=5000, solver=&#x27;saga&#x27;)</pre></div> </div></div></div></div></div></div></div></div></div></div></div>"
],
"text/plain": [
"GridSearchCV(cv=3,\n",
" estimator=Pipeline(steps=[('preprocessor',\n",
" ColumnTransformer(transformers=[('num',\n",
" Pipeline(steps=[('scaler',\n",
" StandardScaler())]),\n",
" ['nb_tickets',\n",
" 'nb_purchases',\n",
" 'total_amount',\n",
" 'nb_suppliers',\n",
" 'vente_internet_max',\n",
" 'purchase_date_min',\n",
" 'purchase_date_max',\n",
" 'nb_tickets_internet',\n",
" 'nb_campaigns',\n",
" 'nb_campaigns_opened']),\n",
" ('cat',\n",
" Pipeline(steps=[(...\n",
" 1.0: 3.486549107420539},\n",
" max_iter=5000,\n",
" solver='saga'))]),\n",
" param_grid={'logreg__C': array([9.765625e-04, 1.953125e-03, 3.906250e-03, 7.812500e-03,\n",
" 1.562500e-02, 3.125000e-02, 6.250000e-02, 1.250000e-01,\n",
" 2.500000e-01, 5.000000e-01, 1.000000e+00, 2.000000e+00,\n",
" 4.000000e+00, 8.000000e+00, 1.600000e+01]),\n",
" 'logreg__penalty': ['l1']},\n",
" scoring=make_scorer(f1_score, response_method='predict'))"
]
},
"execution_count": 96,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"logit_grid = GridSearchCV(pipeline, param_grid, cv=3, scoring = f1_scorer #, error_score=\"raise\"\n",
" )\n",
"logit_grid"
]
},
{
"cell_type": "code",
"execution_count": 97,
"id": "6debc66c-a56d-41fa-8ef8-ba388e0e14fe",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'logreg__C': array([9.765625e-04, 1.953125e-03, 3.906250e-03, 7.812500e-03,\n",
" 1.562500e-02, 3.125000e-02, 6.250000e-02, 1.250000e-01,\n",
" 2.500000e-01, 5.000000e-01, 1.000000e+00, 2.000000e+00,\n",
" 4.000000e+00, 8.000000e+00, 1.600000e+01]),\n",
" 'logreg__penalty': ['l1']}"
]
},
"execution_count": 97,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"param_grid"
]
},
{
"cell_type": "code",
"execution_count": 98,
"id": "e394cc04-5d0b-4a64-9aa0-415dc8a3cbbc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Returned hyperparameter: {'logreg__C': 0.03125, 'logreg__penalty': 'l1'}\n",
"Best classification accuracy in train is: 0.42160313383818665\n",
"Classification accuracy on test is: 0.47078982841737305\n"
]
}
],
"source": [
"# run the pipeline on the full sample\n",
"\n",
"logit_grid = GridSearchCV(pipeline, param_grid, cv=3, scoring = f1_scorer #, error_score=\"raise\"\n",
" )\n",
"logit_grid.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 99,
"id": "8e6cf558-a4f4-4159-9835-364ee3bb1ed2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Returned hyperparameter: {'logreg__C': 0.03125, 'logreg__penalty': 'l1'}\n",
"Best classification F1 score in train is: 0.42160313383818665\n",
"Classification F1 score on test is: 0.47078982841737305\n"
]
}
],
"source": [
"# print results\n",
"print('Returned hyperparameter: {}'.format(logit_grid.best_params_))\n",
"print('Best classification F1 score in train is: {}'.format(logit_grid.best_score_))\n",
"print('Classification F1 score on test is: {}'.format(logit_grid.score(X_test, y_test)))"
]
},
{
"cell_type": "code",
"execution_count": 100,
"id": "e2ff26cb-f137-4a23-9add-bdb61bebdf9c",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>#sk-container-id-13 {\n",
" /* Definition of color scheme common for light and dark mode */\n",
" --sklearn-color-text: black;\n",
" --sklearn-color-line: gray;\n",
" /* Definition of color scheme for unfitted estimators */\n",
" --sklearn-color-unfitted-level-0: #fff5e6;\n",
" --sklearn-color-unfitted-level-1: #f6e4d2;\n",
" --sklearn-color-unfitted-level-2: #ffe0b3;\n",
" --sklearn-color-unfitted-level-3: chocolate;\n",
" /* Definition of color scheme for fitted estimators */\n",
" --sklearn-color-fitted-level-0: #f0f8ff;\n",
" --sklearn-color-fitted-level-1: #d4ebff;\n",
" --sklearn-color-fitted-level-2: #b3dbfd;\n",
" --sklearn-color-fitted-level-3: cornflowerblue;\n",
"\n",
" /* Specific color for light theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-icon: #696969;\n",
"\n",
" @media (prefers-color-scheme: dark) {\n",
" /* Redefinition of color scheme for dark theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-icon: #878787;\n",
" }\n",
"}\n",
"\n",
"#sk-container-id-13 {\n",
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"#sk-container-id-13 pre {\n",
" padding: 0;\n",
"}\n",
"\n",
"#sk-container-id-13 input.sk-hidden--visually {\n",
" border: 0;\n",
" clip: rect(1px 1px 1px 1px);\n",
" clip: rect(1px, 1px, 1px, 1px);\n",
" height: 1px;\n",
" margin: -1px;\n",
" overflow: hidden;\n",
" padding: 0;\n",
" position: absolute;\n",
" width: 1px;\n",
"}\n",
"\n",
"#sk-container-id-13 div.sk-dashed-wrapped {\n",
" border: 1px dashed var(--sklearn-color-line);\n",
" margin: 0 0.4em 0.5em 0.4em;\n",
" box-sizing: border-box;\n",
" padding-bottom: 0.4em;\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"#sk-container-id-13 div.sk-container {\n",
" /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
" but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
" so we also need the `!important` here to be able to override the\n",
" default hidden behavior on the sphinx rendered scikit-learn.org.\n",
" See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
" display: inline-block !important;\n",
" position: relative;\n",
"}\n",
"\n",
"#sk-container-id-13 div.sk-text-repr-fallback {\n",
" display: none;\n",
"}\n",
"\n",
"div.sk-parallel-item,\n",
"div.sk-serial,\n",
"div.sk-item {\n",
" /* draw centered vertical line to link estimators */\n",
" background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
" background-size: 2px 100%;\n",
" background-repeat: no-repeat;\n",
" background-position: center center;\n",
"}\n",
"\n",
"/* Parallel-specific style estimator block */\n",
"\n",
"#sk-container-id-13 div.sk-parallel-item::after {\n",
" content: \"\";\n",
" width: 100%;\n",
" border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
" flex-grow: 1;\n",
"}\n",
"\n",
"#sk-container-id-13 div.sk-parallel {\n",
" display: flex;\n",
" align-items: stretch;\n",
" justify-content: center;\n",
" background-color: var(--sklearn-color-background);\n",
" position: relative;\n",
"}\n",
"\n",
"#sk-container-id-13 div.sk-parallel-item {\n",
" display: flex;\n",
" flex-direction: column;\n",
"}\n",
"\n",
"#sk-container-id-13 div.sk-parallel-item:first-child::after {\n",
" align-self: flex-end;\n",
" width: 50%;\n",
"}\n",
"\n",
"#sk-container-id-13 div.sk-parallel-item:last-child::after {\n",
" align-self: flex-start;\n",
" width: 50%;\n",
"}\n",
"\n",
"#sk-container-id-13 div.sk-parallel-item:only-child::after {\n",
" width: 0;\n",
"}\n",
"\n",
"/* Serial-specific style estimator block */\n",
"\n",
"#sk-container-id-13 div.sk-serial {\n",
" display: flex;\n",
" flex-direction: column;\n",
" align-items: center;\n",
" background-color: var(--sklearn-color-background);\n",
" padding-right: 1em;\n",
" padding-left: 1em;\n",
"}\n",
"\n",
"\n",
"/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
"clickable and can be expanded/collapsed.\n",
"- Pipeline and ColumnTransformer use this feature and define the default style\n",
"- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
"*/\n",
"\n",
"/* Pipeline and ColumnTransformer style (default) */\n",
"\n",
"#sk-container-id-13 div.sk-toggleable {\n",
" /* Default theme specific background. It is overwritten whether we have a\n",
" specific estimator or a Pipeline/ColumnTransformer */\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"/* Toggleable label */\n",
"#sk-container-id-13 label.sk-toggleable__label {\n",
" cursor: pointer;\n",
" display: block;\n",
" width: 100%;\n",
" margin-bottom: 0;\n",
" padding: 0.5em;\n",
" box-sizing: border-box;\n",
" text-align: center;\n",
"}\n",
"\n",
"#sk-container-id-13 label.sk-toggleable__label-arrow:before {\n",
" /* Arrow on the left of the label */\n",
" content: \"▸\";\n",
" float: left;\n",
" margin-right: 0.25em;\n",
" color: var(--sklearn-color-icon);\n",
"}\n",
"\n",
"#sk-container-id-13 label.sk-toggleable__label-arrow:hover:before {\n",
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"/* Toggleable content - dropdown */\n",
"\n",
"#sk-container-id-13 div.sk-toggleable__content {\n",
" max-height: 0;\n",
" max-width: 0;\n",
" overflow: hidden;\n",
" text-align: left;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-13 div.sk-toggleable__content.fitted {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-13 div.sk-toggleable__content pre {\n",
" margin: 0.2em;\n",
" border-radius: 0.25em;\n",
" color: var(--sklearn-color-text);\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-13 div.sk-toggleable__content.fitted pre {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-13 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
" /* Expand drop-down */\n",
" max-height: 200px;\n",
" max-width: 100%;\n",
" overflow: auto;\n",
"}\n",
"\n",
"#sk-container-id-13 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
" content: \"▾\";\n",
"}\n",
"\n",
"/* Pipeline/ColumnTransformer-specific style */\n",
"\n",
"#sk-container-id-13 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-13 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator-specific style */\n",
"\n",
"/* Colorize estimator box */\n",
"#sk-container-id-13 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-13 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-13 div.sk-label label.sk-toggleable__label,\n",
"#sk-container-id-13 div.sk-label label {\n",
" /* The background is the default theme color */\n",
" color: var(--sklearn-color-text-on-default-background);\n",
"}\n",
"\n",
"/* On hover, darken the color of the background */\n",
"#sk-container-id-13 div.sk-label:hover label.sk-toggleable__label {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"/* Label box, darken color on hover, fitted */\n",
"#sk-container-id-13 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator label */\n",
"\n",
"#sk-container-id-13 div.sk-label label {\n",
" font-family: monospace;\n",
" font-weight: bold;\n",
" display: inline-block;\n",
" line-height: 1.2em;\n",
"}\n",
"\n",
"#sk-container-id-13 div.sk-label-container {\n",
" text-align: center;\n",
"}\n",
"\n",
"/* Estimator-specific */\n",
"#sk-container-id-13 div.sk-estimator {\n",
" font-family: monospace;\n",
" border: 1px dotted var(--sklearn-color-border-box);\n",
" border-radius: 0.25em;\n",
" box-sizing: border-box;\n",
" margin-bottom: 0.5em;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-13 div.sk-estimator.fitted {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"/* on hover */\n",
"#sk-container-id-13 div.sk-estimator:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-13 div.sk-estimator.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
"\n",
"/* Common style for \"i\" and \"?\" */\n",
"\n",
".sk-estimator-doc-link,\n",
"a:link.sk-estimator-doc-link,\n",
"a:visited.sk-estimator-doc-link {\n",
" float: right;\n",
" font-size: smaller;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1em;\n",
" height: 1em;\n",
" width: 1em;\n",
" text-decoration: none !important;\n",
" margin-left: 1ex;\n",
" /* unfitted */\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted,\n",
"a:link.sk-estimator-doc-link.fitted,\n",
"a:visited.sk-estimator-doc-link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"/* Span, style for the box shown on hovering the info icon */\n",
".sk-estimator-doc-link span {\n",
" display: none;\n",
" z-index: 9999;\n",
" position: relative;\n",
" font-weight: normal;\n",
" right: .2ex;\n",
" padding: .5ex;\n",
" margin: .5ex;\n",
" width: min-content;\n",
" min-width: 20ex;\n",
" max-width: 50ex;\n",
" color: var(--sklearn-color-text);\n",
" box-shadow: 2pt 2pt 4pt #999;\n",
" /* unfitted */\n",
" background: var(--sklearn-color-unfitted-level-0);\n",
" border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted span {\n",
" /* fitted */\n",
" background: var(--sklearn-color-fitted-level-0);\n",
" border: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link:hover span {\n",
" display: block;\n",
"}\n",
"\n",
"/* \"?\"-specific style due to the `<a>` HTML tag */\n",
"\n",
"#sk-container-id-13 a.estimator_doc_link {\n",
" float: right;\n",
" font-size: 1rem;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1rem;\n",
" height: 1rem;\n",
" width: 1rem;\n",
" text-decoration: none;\n",
" /* unfitted */\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
"}\n",
"\n",
"#sk-container-id-13 a.estimator_doc_link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"#sk-container-id-13 a.estimator_doc_link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"#sk-container-id-13 a.estimator_doc_link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"</style><div id=\"sk-container-id-13\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>GridSearchCV(cv=3,\n",
" estimator=Pipeline(steps=[(&#x27;preprocessor&#x27;,\n",
" ColumnTransformer(transformers=[(&#x27;num&#x27;,\n",
" Pipeline(steps=[(&#x27;scaler&#x27;,\n",
" StandardScaler())]),\n",
" [&#x27;nb_tickets&#x27;,\n",
" &#x27;nb_purchases&#x27;,\n",
" &#x27;total_amount&#x27;,\n",
" &#x27;nb_suppliers&#x27;,\n",
" &#x27;vente_internet_max&#x27;,\n",
" &#x27;purchase_date_min&#x27;,\n",
" &#x27;purchase_date_max&#x27;,\n",
" &#x27;nb_tickets_internet&#x27;,\n",
" &#x27;nb_campaigns&#x27;,\n",
" &#x27;nb_campaigns_opened&#x27;]),\n",
" (&#x27;cat&#x27;,\n",
" Pipeline(steps=[(...\n",
" 1.0: 3.486549107420539},\n",
" max_iter=5000,\n",
" solver=&#x27;saga&#x27;))]),\n",
" param_grid={&#x27;logreg__C&#x27;: array([9.765625e-04, 1.953125e-03, 3.906250e-03, 7.812500e-03,\n",
" 1.562500e-02, 3.125000e-02, 6.250000e-02, 1.250000e-01,\n",
" 2.500000e-01, 5.000000e-01, 1.000000e+00, 2.000000e+00,\n",
" 4.000000e+00, 8.000000e+00, 1.600000e+01]),\n",
" &#x27;logreg__penalty&#x27;: [&#x27;l1&#x27;]},\n",
" scoring=make_scorer(f1_score, response_method=&#x27;predict&#x27;))</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-83\" type=\"checkbox\" ><label for=\"sk-estimator-id-83\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;&nbsp;GridSearchCV<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.model_selection.GridSearchCV.html\">?<span>Documentation for GridSearchCV</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></label><div class=\"sk-toggleable__content fitted\"><pre>GridSearchCV(cv=3,\n",
" estimator=Pipeline(steps=[(&#x27;preprocessor&#x27;,\n",
" ColumnTransformer(transformers=[(&#x27;num&#x27;,\n",
" Pipeline(steps=[(&#x27;scaler&#x27;,\n",
" StandardScaler())]),\n",
" [&#x27;nb_tickets&#x27;,\n",
" &#x27;nb_purchases&#x27;,\n",
" &#x27;total_amount&#x27;,\n",
" &#x27;nb_suppliers&#x27;,\n",
" &#x27;vente_internet_max&#x27;,\n",
" &#x27;purchase_date_min&#x27;,\n",
" &#x27;purchase_date_max&#x27;,\n",
" &#x27;nb_tickets_internet&#x27;,\n",
" &#x27;nb_campaigns&#x27;,\n",
" &#x27;nb_campaigns_opened&#x27;]),\n",
" (&#x27;cat&#x27;,\n",
" Pipeline(steps=[(...\n",
" 1.0: 3.486549107420539},\n",
" max_iter=5000,\n",
" solver=&#x27;saga&#x27;))]),\n",
" param_grid={&#x27;logreg__C&#x27;: array([9.765625e-04, 1.953125e-03, 3.906250e-03, 7.812500e-03,\n",
" 1.562500e-02, 3.125000e-02, 6.250000e-02, 1.250000e-01,\n",
" 2.500000e-01, 5.000000e-01, 1.000000e+00, 2.000000e+00,\n",
" 4.000000e+00, 8.000000e+00, 1.600000e+01]),\n",
" &#x27;logreg__penalty&#x27;: [&#x27;l1&#x27;]},\n",
" scoring=make_scorer(f1_score, response_method=&#x27;predict&#x27;))</pre></div> </div></div><div class=\"sk-parallel\"><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-84\" type=\"checkbox\" ><label for=\"sk-estimator-id-84\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">estimator: Pipeline</label><div class=\"sk-toggleable__content fitted\"><pre>Pipeline(steps=[(&#x27;preprocessor&#x27;,\n",
" ColumnTransformer(transformers=[(&#x27;num&#x27;,\n",
" Pipeline(steps=[(&#x27;scaler&#x27;,\n",
" StandardScaler())]),\n",
" [&#x27;nb_tickets&#x27;, &#x27;nb_purchases&#x27;,\n",
" &#x27;total_amount&#x27;,\n",
" &#x27;nb_suppliers&#x27;,\n",
" &#x27;vente_internet_max&#x27;,\n",
" &#x27;purchase_date_min&#x27;,\n",
" &#x27;purchase_date_max&#x27;,\n",
" &#x27;nb_tickets_internet&#x27;,\n",
" &#x27;nb_campaigns&#x27;,\n",
" &#x27;nb_campaigns_opened&#x27;]),\n",
" (&#x27;cat&#x27;,\n",
" Pipeline(steps=[(&#x27;onehot&#x27;,\n",
" OneHotEncoder(handle_unknown=&#x27;ignore&#x27;,\n",
" sparse_output=False))]),\n",
" [&#x27;opt_in&#x27;,\n",
" &#x27;is_email_true&#x27;])])),\n",
" (&#x27;logreg&#x27;,\n",
" LogisticRegression(class_weight={0.0: 0.5837086520288036,\n",
" 1.0: 3.486549107420539},\n",
" max_iter=5000, solver=&#x27;saga&#x27;))])</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-serial\"><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-85\" type=\"checkbox\" ><label for=\"sk-estimator-id-85\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;preprocessor: ColumnTransformer<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.compose.ColumnTransformer.html\">?<span>Documentation for preprocessor: ColumnTransformer</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>ColumnTransformer(transformers=[(&#x27;num&#x27;,\n",
" Pipeline(steps=[(&#x27;scaler&#x27;, StandardScaler())]),\n",
" [&#x27;nb_tickets&#x27;, &#x27;nb_purchases&#x27;, &#x27;total_amount&#x27;,\n",
" &#x27;nb_suppliers&#x27;, &#x27;vente_internet_max&#x27;,\n",
" &#x27;purchase_date_min&#x27;, &#x27;purchase_date_max&#x27;,\n",
" &#x27;nb_tickets_internet&#x27;, &#x27;nb_campaigns&#x27;,\n",
" &#x27;nb_campaigns_opened&#x27;]),\n",
" (&#x27;cat&#x27;,\n",
" Pipeline(steps=[(&#x27;onehot&#x27;,\n",
" OneHotEncoder(handle_unknown=&#x27;ignore&#x27;,\n",
" sparse_output=False))]),\n",
" [&#x27;opt_in&#x27;, &#x27;is_email_true&#x27;])])</pre></div> </div></div><div class=\"sk-parallel\"><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-86\" type=\"checkbox\" ><label for=\"sk-estimator-id-86\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">num</label><div class=\"sk-toggleable__content fitted\"><pre>[&#x27;nb_tickets&#x27;, &#x27;nb_purchases&#x27;, &#x27;total_amount&#x27;, &#x27;nb_suppliers&#x27;, &#x27;vente_internet_max&#x27;, &#x27;purchase_date_min&#x27;, &#x27;purchase_date_max&#x27;, &#x27;nb_tickets_internet&#x27;, &#x27;nb_campaigns&#x27;, &#x27;nb_campaigns_opened&#x27;]</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-87\" type=\"checkbox\" ><label for=\"sk-estimator-id-87\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;StandardScaler<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.preprocessing.StandardScaler.html\">?<span>Documentation for StandardScaler</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>StandardScaler()</pre></div> </div></div></div></div></div></div></div><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-88\" type=\"checkbox\" ><label for=\"sk-estimator-id-88\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">cat</label><div class=\"sk-toggleable__content fitted\"><pre>[&#x27;opt_in&#x27;, &#x27;is_email_true&#x27;]</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-89\" type=\"checkbox\" ><label for=\"sk-estimator-id-89\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;OneHotEncoder<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.preprocessing.OneHotEncoder.html\">?<span>Documentation for OneHotEncoder</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>OneHotEncoder(handle_unknown=&#x27;ignore&#x27;, sparse_output=False)</pre></div> </div></div></div></div></div></div></div></div></div><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-90\" type=\"checkbox\" ><label for=\"sk-estimator-id-90\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;LogisticRegression<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.linear_model.LogisticRegression.html\">?<span>Documentation for LogisticRegression</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>LogisticRegression(class_weight={0.0: 0.5837086520288036,\n",
" 1.0: 3.486549107420539},\n",
" max_iter=5000, solver=&#x27;saga&#x27;)</pre></div> </div></div></div></div></div></div></div></div></div></div></div>"
],
"text/plain": [
"GridSearchCV(cv=3,\n",
" estimator=Pipeline(steps=[('preprocessor',\n",
" ColumnTransformer(transformers=[('num',\n",
" Pipeline(steps=[('scaler',\n",
" StandardScaler())]),\n",
" ['nb_tickets',\n",
" 'nb_purchases',\n",
" 'total_amount',\n",
" 'nb_suppliers',\n",
" 'vente_internet_max',\n",
" 'purchase_date_min',\n",
" 'purchase_date_max',\n",
" 'nb_tickets_internet',\n",
" 'nb_campaigns',\n",
" 'nb_campaigns_opened']),\n",
" ('cat',\n",
" Pipeline(steps=[(...\n",
" 1.0: 3.486549107420539},\n",
" max_iter=5000,\n",
" solver='saga'))]),\n",
" param_grid={'logreg__C': array([9.765625e-04, 1.953125e-03, 3.906250e-03, 7.812500e-03,\n",
" 1.562500e-02, 3.125000e-02, 6.250000e-02, 1.250000e-01,\n",
" 2.500000e-01, 5.000000e-01, 1.000000e+00, 2.000000e+00,\n",
" 4.000000e+00, 8.000000e+00, 1.600000e+01]),\n",
" 'logreg__penalty': ['l1']},\n",
" scoring=make_scorer(f1_score, response_method='predict'))"
]
},
"execution_count": 100,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"logit_grid"
]
},
{
"cell_type": "code",
"execution_count": 105,
"id": "5d553da2-5c2a-491a-b4d2-f31c30c201a6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'scoring': make_scorer(f1_score, response_method='predict'),\n",
" 'estimator': Pipeline(steps=[('preprocessor',\n",
" ColumnTransformer(transformers=[('num',\n",
" Pipeline(steps=[('scaler',\n",
" StandardScaler())]),\n",
" ['nb_tickets', 'nb_purchases',\n",
" 'total_amount',\n",
" 'nb_suppliers',\n",
" 'vente_internet_max',\n",
" 'purchase_date_min',\n",
" 'purchase_date_max',\n",
" 'nb_tickets_internet',\n",
" 'nb_campaigns',\n",
" 'nb_campaigns_opened']),\n",
" ('cat',\n",
" Pipeline(steps=[('onehot',\n",
" OneHotEncoder(handle_unknown='ignore',\n",
" sparse_output=False))]),\n",
" ['opt_in',\n",
" 'is_email_true'])])),\n",
" ('logreg',\n",
" LogisticRegression(class_weight={0.0: 0.5837086520288036,\n",
" 1.0: 3.486549107420539},\n",
" max_iter=5000, solver='saga'))]),\n",
" 'n_jobs': None,\n",
" 'refit': True,\n",
" 'cv': 3,\n",
" 'verbose': 0,\n",
" 'pre_dispatch': '2*n_jobs',\n",
" 'error_score': nan,\n",
" 'return_train_score': False,\n",
" 'param_grid': {'logreg__C': array([9.765625e-04, 1.953125e-03, 3.906250e-03, 7.812500e-03,\n",
" 1.562500e-02, 3.125000e-02, 6.250000e-02, 1.250000e-01,\n",
" 2.500000e-01, 5.000000e-01, 1.000000e+00, 2.000000e+00,\n",
" 4.000000e+00, 8.000000e+00, 1.600000e+01]),\n",
" 'logreg__penalty': ['l1']},\n",
" 'multimetric_': False,\n",
" 'best_index_': 5,\n",
" 'best_score_': 0.42160313383818665,\n",
" 'best_params_': {'logreg__C': 0.03125, 'logreg__penalty': 'l1'},\n",
" 'best_estimator_': Pipeline(steps=[('preprocessor',\n",
" ColumnTransformer(transformers=[('num',\n",
" Pipeline(steps=[('scaler',\n",
" StandardScaler())]),\n",
" ['nb_tickets', 'nb_purchases',\n",
" 'total_amount',\n",
" 'nb_suppliers',\n",
" 'vente_internet_max',\n",
" 'purchase_date_min',\n",
" 'purchase_date_max',\n",
" 'nb_tickets_internet',\n",
" 'nb_campaigns',\n",
" 'nb_campaigns_opened']),\n",
" ('cat',\n",
" Pipeline(steps=[('onehot',\n",
" OneHotEncoder(handle_unknown='ignore',\n",
" sparse_output=False))]),\n",
" ['opt_in',\n",
" 'is_email_true'])])),\n",
" ('logreg',\n",
" LogisticRegression(C=0.03125,\n",
" class_weight={0.0: 0.5837086520288036,\n",
" 1.0: 3.486549107420539},\n",
" max_iter=5000, penalty='l1',\n",
" solver='saga'))]),\n",
" 'refit_time_': 305.1356477737427,\n",
" 'feature_names_in_': array(['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',\n",
" 'vente_internet_max', 'purchase_date_min', 'purchase_date_max',\n",
" 'nb_tickets_internet', 'is_email_true', 'opt_in', 'gender_female',\n",
" 'gender_male', 'nb_campaigns', 'nb_campaigns_opened'], dtype=object),\n",
" 'scorer_': make_scorer(f1_score, response_method='predict'),\n",
" 'cv_results_': {'mean_fit_time': array([ 11.07076669, 13.15744201, 27.35094929, 40.0343461 ,\n",
" 94.58210254, 140.45846391, 159.83818332, 162.80178094,\n",
" 163.94260454, 171.08749111, 169.26621262, 166.36741408,\n",
" 167.91208776, 173.06720233, 170.93666704]),\n",
" 'std_fit_time': array([ 0.09462032, 1.51362591, 6.70859141, 22.68643753, 28.72690872,\n",
" 70.8434823 , 85.23159321, 79.71538593, 82.70486235, 84.79706797,\n",
" 86.79005212, 84.67956107, 83.94889047, 89.68716252, 89.41361431]),\n",
" 'mean_score_time': array([0.11632609, 0.10857773, 0.18140252, 0.1291213 , 0.11651532,\n",
" 0.07535577, 0.12481014, 0.16039928, 0.15685773, 0.07996233,\n",
" 0.12988146, 0.10067987, 0.1194102 , 0.09737802, 0.09390028]),\n",
" 'std_score_time': array([0.02131792, 0.03620144, 0.05853886, 0.06555575, 0.03228018,\n",
" 0.01433186, 0.03501336, 0.05466042, 0.06882891, 0.01002881,\n",
" 0.00495894, 0.00905774, 0.04075337, 0.03269379, 0.01990173]),\n",
" 'param_logreg__C': masked_array(data=[0.0009765625, 0.001953125, 0.00390625, 0.0078125,\n",
" 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0, 2.0,\n",
" 4.0, 8.0, 16.0],\n",
" mask=[False, False, False, False, False, False, False, False,\n",
" False, False, False, False, False, False, False],\n",
" fill_value='?',\n",
" dtype=object),\n",
" 'param_logreg__penalty': masked_array(data=['l1', 'l1', 'l1', 'l1', 'l1', 'l1', 'l1', 'l1', 'l1',\n",
" 'l1', 'l1', 'l1', 'l1', 'l1', 'l1'],\n",
" mask=[False, False, False, False, False, False, False, False,\n",
" False, False, False, False, False, False, False],\n",
" fill_value='?',\n",
" dtype=object),\n",
" 'params': [{'logreg__C': 0.0009765625, 'logreg__penalty': 'l1'},\n",
" {'logreg__C': 0.001953125, 'logreg__penalty': 'l1'},\n",
" {'logreg__C': 0.00390625, 'logreg__penalty': 'l1'},\n",
" {'logreg__C': 0.0078125, 'logreg__penalty': 'l1'},\n",
" {'logreg__C': 0.015625, 'logreg__penalty': 'l1'},\n",
" {'logreg__C': 0.03125, 'logreg__penalty': 'l1'},\n",
" {'logreg__C': 0.0625, 'logreg__penalty': 'l1'},\n",
" {'logreg__C': 0.125, 'logreg__penalty': 'l1'},\n",
" {'logreg__C': 0.25, 'logreg__penalty': 'l1'},\n",
" {'logreg__C': 0.5, 'logreg__penalty': 'l1'},\n",
" {'logreg__C': 1.0, 'logreg__penalty': 'l1'},\n",
" {'logreg__C': 2.0, 'logreg__penalty': 'l1'},\n",
" {'logreg__C': 4.0, 'logreg__penalty': 'l1'},\n",
" {'logreg__C': 8.0, 'logreg__penalty': 'l1'},\n",
" {'logreg__C': 16.0, 'logreg__penalty': 'l1'}],\n",
" 'split0_test_score': array([0.27289073, 0.2738913 , 0.27382853, 0.27409759, 0.27454764,\n",
" 0.27661894, 0.2766145 , 0.27584723, 0.27571682, 0.27576295,\n",
" 0.27580092, 0.27577943, 0.27581248, 0.27581909, 0.27581909]),\n",
" 'split1_test_score': array([0.4714244 , 0.47196015, 0.48362373, 0.48891733, 0.49066854,\n",
" 0.49091122, 0.49086284, 0.49065871, 0.49062783, 0.49049541,\n",
" 0.49048106, 0.49045238, 0.49043804, 0.49043804, 0.4904237 ]),\n",
" 'split2_test_score': array([0.50689906, 0.50092334, 0.4981377 , 0.49759178, 0.49725836,\n",
" 0.49727924, 0.49708801, 0.49738305, 0.49751781, 0.49738248,\n",
" 0.49738248, 0.49738248, 0.49738248, 0.49738248, 0.49738248]),\n",
" 'mean_test_score': array([0.4170714 , 0.4155916 , 0.41852999, 0.42020223, 0.42082484,\n",
" 0.42160313, 0.42152178, 0.42129633, 0.42128749, 0.42121361,\n",
" 0.42122149, 0.42120476, 0.421211 , 0.4212132 , 0.42120842]),\n",
" 'std_test_score': array([0.10297463, 0.1008925 , 0.10249081, 0.10337226, 0.10346859,\n",
" 0.10255226, 0.10249644, 0.10288467, 0.10297243, 0.10288758,\n",
" 0.10286646, 0.10287015, 0.10285136, 0.10284824, 0.10284503]),\n",
" 'rank_test_score': array([14, 15, 13, 12, 11, 1, 2, 3, 4, 6, 5, 10, 8, 7, 9],\n",
" dtype=int32)},\n",
" 'n_splits_': 3}"
]
},
"execution_count": 105,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"logit_grid.__dict__"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "3573f34e-25d5-4afb-82cc-52323e2f63c6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0.67553011, 0. , 0.14254288, 0.41574295, 0.03458744,\n",
" 0.64769185, -1.20510095, 0. , 0.01018587, 0.13959519,\n",
" 0.24222266, -0.68253886, 0. , 0. ]])"
]
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# coefficients trouvés pour le modèle optimal\n",
"logit_grid.best_estimator_.named_steps[\"logreg\"].coef_"
]
},
{
"cell_type": "code",
"execution_count": 57,
"id": "0332a814-61fb-4b71-836a-e8ace70b1a44",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'preprocessor': ColumnTransformer(transformers=[('num',\n",
" Pipeline(steps=[('scaler', StandardScaler())]),\n",
" ['nb_tickets', 'nb_purchases', 'total_amount',\n",
" 'nb_suppliers', 'vente_internet_max',\n",
" 'purchase_date_min', 'purchase_date_max',\n",
" 'nb_tickets_internet', 'nb_campaigns',\n",
" 'nb_campaigns_opened']),\n",
" ('cat',\n",
" Pipeline(steps=[('onehot',\n",
" OneHotEncoder(handle_unknown='ignore',\n",
" sparse_output=False))]),\n",
" ['opt_in', 'is_email_true'])]),\n",
" 'logreg': LogisticRegression(C=0.0625,\n",
" class_weight={0.0: 0.5837086520288036,\n",
" 1.0: 3.486549107420539},\n",
" max_iter=5000, penalty='l1', solver='saga')}"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"logit_grid.best_estimator_.named_steps"
]
},
{
"cell_type": "code",
"execution_count": 58,
"id": "287615b9-e062-4b84-be61-26b9364b2cf4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([-0.44041477])"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"logit_grid.best_estimator_.named_steps[\"logreg\"].intercept_"
]
},
{
"cell_type": "code",
"execution_count": 115,
"id": "4d50899d-cc0b-4a71-9406-f8b0a277c4a6",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_tickets</th>\n",
" <th>nb_purchases</th>\n",
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>nb_tickets_internet</th>\n",
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
" <th>gender_female</th>\n",
" <th>gender_male</th>\n",
" <th>nb_campaigns</th>\n",
" <th>nb_campaigns_opened</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>60.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>355.268981</td>\n",
" <td>355.268981</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>8.0</td>\n",
" <td>3.0</td>\n",
" <td>140.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>373.540289</td>\n",
" <td>219.262269</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>50.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.202442</td>\n",
" <td>5.202442</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3.0</td>\n",
" <td>1.0</td>\n",
" <td>90.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.178958</td>\n",
" <td>5.178958</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>78.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.174039</td>\n",
" <td>5.174039</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>224208</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>34.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>224209</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>20.00</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>392.501030</td>\n",
" <td>392.501030</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>23.0</td>\n",
" <td>6.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>224210</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>8.0</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>224211</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>97.11</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>172.334074</td>\n",
" <td>172.334074</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>13.0</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>224212</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>4.0</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>224213 rows × 14 columns</p>\n",
"</div>"
],
"text/plain": [
" nb_tickets nb_purchases total_amount nb_suppliers \\\n",
"0 2.0 1.0 60.00 1.0 \n",
"1 8.0 3.0 140.00 1.0 \n",
"2 2.0 1.0 50.00 1.0 \n",
"3 3.0 1.0 90.00 1.0 \n",
"4 2.0 1.0 78.00 1.0 \n",
"... ... ... ... ... \n",
"224208 0.0 0.0 0.00 0.0 \n",
"224209 1.0 1.0 20.00 1.0 \n",
"224210 0.0 0.0 0.00 0.0 \n",
"224211 1.0 1.0 97.11 1.0 \n",
"224212 0.0 0.0 0.00 0.0 \n",
"\n",
" vente_internet_max purchase_date_min purchase_date_max \\\n",
"0 0.0 355.268981 355.268981 \n",
"1 0.0 373.540289 219.262269 \n",
"2 0.0 5.202442 5.202442 \n",
"3 0.0 5.178958 5.178958 \n",
"4 0.0 5.174039 5.174039 \n",
"... ... ... ... \n",
"224208 0.0 550.000000 550.000000 \n",
"224209 1.0 392.501030 392.501030 \n",
"224210 0.0 550.000000 550.000000 \n",
"224211 1.0 172.334074 172.334074 \n",
"224212 0.0 550.000000 550.000000 \n",
"\n",
" nb_tickets_internet is_email_true opt_in gender_female \\\n",
"0 0.0 True False 0 \n",
"1 0.0 True False 0 \n",
"2 0.0 True False 0 \n",
"3 0.0 True False 0 \n",
"4 0.0 True False 1 \n",
"... ... ... ... ... \n",
"224208 0.0 True False 0 \n",
"224209 1.0 True False 0 \n",
"224210 0.0 True True 0 \n",
"224211 1.0 True False 0 \n",
"224212 0.0 True False 0 \n",
"\n",
" gender_male nb_campaigns nb_campaigns_opened \n",
"0 1 0.0 0.0 \n",
"1 1 0.0 0.0 \n",
"2 1 0.0 0.0 \n",
"3 1 0.0 0.0 \n",
"4 0 0.0 0.0 \n",
"... ... ... ... \n",
"224208 1 34.0 3.0 \n",
"224209 1 23.0 6.0 \n",
"224210 1 8.0 4.0 \n",
"224211 1 13.0 5.0 \n",
"224212 1 4.0 4.0 \n",
"\n",
"[224213 rows x 14 columns]"
]
},
"execution_count": 115,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# c'est la 2ème variable nb_purchases qui a été supprimée par le LASSO\n",
"X_train"
]
},
{
"cell_type": "code",
"execution_count": 59,
"id": "e53b1f79-762d-4f1f-8505-91de1088af42",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"16.0"
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# best param : alpha = 32 (alpha =1/4 sur le petit subsample)\n",
"1/logit_grid.best_params_[\"logreg__C\"]"
]
},
{
"cell_type": "code",
"execution_count": 55,
"id": "41bcaaf6-ab58-4004-a3c5-586d77e872d1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy Score: 0.7510718448218449\n",
"F1 Score: 0.46474681703251214\n",
"Recall Score: 0.7585829072315559\n"
]
}
],
"source": [
"# print results for the best model\n",
"\n",
"y_pred = logit_grid.predict(X_test)\n",
"\n",
"# Calculate the F1 score\n",
"acc = accuracy_score(y_test, y_pred)\n",
"print(f\"Accuracy Score: {acc}\")\n",
"\n",
"f1 = f1_score(y_test, y_pred)\n",
"print(f\"F1 Score: {f1}\")\n",
"\n",
"recall = recall_score(y_test, y_pred)\n",
"print(f\"Recall Score: {recall}\")"
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "a454bb57-76eb-4a22-9950-0733d39e449f",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAi0AAAHFCAYAAAA+FskAAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAABYo0lEQVR4nO3de1zN9x8H8NfR5eh6dFHHIYQ0qY1lEoYNuZRmNyyLhvi5rsllMWqbyW2Y5X6fy2KS2dByW1sjWkTRXLOYjlxSJKdW398f5rsdheJ7nE5ez9/j+3is7+d9vt/P9/zWvL0/l69MEAQBRERERFVcDX13gIiIiKgimLQQERGRQWDSQkRERAaBSQsREREZBCYtREREZBCYtBAREZFBYNJCREREBoFJCxERERkEJi1ERERkEJi0ULV2/PhxfPDBB3B2dkbNmjVhaWmJl19+GbNmzcKNGzd0eu+jR4+iY8eOUCgUkMlkmD9/vuT3kMlkiIiIkPy6j7NmzRrIZDLIZDL8/PPPZdoFQUCTJk0gk8nQqVOnJ7rHokWLsGbNmkp95ueff35on4jI8BnruwNEurJ8+XKMGDECrq6uGD9+PNzc3FBcXIzff/8dS5YswcGDBxEbG6uz+w8aNAgFBQWIjo6GjY0NGjZsKPk9Dh48iHr16kl+3YqysrLCypUryyQmCQkJOHfuHKysrJ742osWLYK9vT2CgoIq/JmXX34ZBw8ehJub2xPfl4iqLiYtVC0dPHgQw4cPR9euXbFt2zbI5XKxrWvXrggNDUVcXJxO+5Ceno7g4GD06NFDZ/do06aNzq5dEX379sWGDRuwcOFCWFtbi+dXrlwJb29v5OfnP5N+FBcXQyaTwdraWu/fCRHpDoeHqFqaPn06ZDIZli1bppWw3Gdqagp/f3/x59LSUsyaNQsvvPAC5HI5HBwcMGDAAFy6dEnrc506dYK7uzuSk5Px6quvwtzcHI0aNcKMGTNQWloK4N+hk7///huLFy8Wh1EAICIiQvzn/7r/mQsXLojn9u3bh06dOsHOzg5mZmaoX78+3n77bdy5c0eMKW94KD09HW+88QZsbGxQs2ZNtGjRAmvXrtWKuT+M8u2332Ly5MlQqVSwtrZGly5dcOrUqYp9yQDee+89AMC3334rnsvLy0NMTAwGDRpU7mc+/fRTeHl5wdbWFtbW1nj55ZexcuVK/PfdrQ0bNsSJEyeQkJAgfn/3K1X3+75u3TqEhoaibt26kMvlOHv2bJnhoWvXrsHJyQlt27ZFcXGxeP2TJ0/CwsICgYGBFX5WItI/Ji1U7ZSUlGDfvn3w9PSEk5NThT4zfPhwTJw4EV27dsX27dvx+eefIy4uDm3btsW1a9e0YtVqNfr374/3338f27dvR48ePRAWFob169cDAHx9fXHw4EEAwDvvvIODBw+KP1fUhQsX4OvrC1NTU6xatQpxcXGYMWMGLCwsUFRU9NDPnTp1Cm3btsWJEyewYMECbN26FW5ubggKCsKsWbPKxE+aNAl//vknVqxYgWXLluHMmTPo1asXSkpKKtRPa2trvPPOO1i1apV47ttvv0WNGjXQt2/fhz7bsGHDsHnzZmzduhVvvfUWRo8ejc8//1yMiY2NRaNGjdCyZUvx+3twKC8sLAxZWVlYsmQJfvjhBzg4OJS5l729PaKjo5GcnIyJEycCAO7cuYN3330X9evXx5IlSyr0nERURQhE1YxarRYACP369atQfEZGhgBAGDFihNb5Q4cOCQCESZMmiec6duwoABAOHTqkFevm5iZ069ZN6xwAYeTIkVrnwsPDhfJ+7VavXi0AEDIzMwVBEIQtW7YIAITU1NRH9h2AEB4eLv7cr18/QS6XC1lZWVpxPXr0EMzNzYWbN28KgiAI+/fvFwAIPXv21IrbvHmzAEA4ePDgI+97v7/JycnitdLT0wVBEIRXXnlFCAoKEgRBEJo3by507NjxodcpKSkRiouLhc8++0yws7MTSktLxbaHffb+/Tp06PDQtv3792udnzlzpgBAiI2NFQYOHCiYmZkJx48ff+QzElHVw0oLPff2798PAGUmfLZu3RrNmjXD3r17tc4rlUq0bt1a69yLL76IP//8U7I+tWjRAqamphg6dCjWrl2L8+fPV+hz+/btQ+fOnctUmIKCgnDnzp0yFZ//DpEB954DQKWepWPHjmjcuDFWrVqFtLQ0JCcnP3Ro6H4fu3TpAoVCASMjI5iYmGDq1Km4fv06cnJyKnzft99+u8Kx48ePh6+vL9577z2sXbsWX3/9NTw8PCr8eSKqGpi0ULVjb28Pc3NzZGZmVij++vXrAIA6deqUaVOpVGL7fXZ2dmXi5HI5CgsLn6C35WvcuDH27NkDBwcHjBw5Eo0bN0bjxo3x1VdfPfJz169ff+hz3G//rwef5f78n8o8i0wmwwcffID169djyZIlaNq0KV599dVyYw8fPgwfHx8A91Z3/fbbb0hOTsbkyZMrfd/ynvNRfQwKCsLdu3ehVCo5l4XIQDFpoWrHyMgInTt3RkpKSpmJtOW5/wd3dnZ2mbbLly/D3t5esr7VrFkTAKDRaLTOPzhvBgBeffVV/PDDD8jLy0NSUhK8vb0REhKC6Ojoh17fzs7uoc8BQNJn+a+goCBcu3YNS5YswQcffPDQuOjoaJiYmODHH39Enz590LZtW7Rq1eqJ7lnehOaHyc7OxsiRI9GiRQtcv34d48aNe6J7EpF+MWmhaiksLAyCICA4OLjciavFxcX44YcfAACvv/46AIgTae9LTk5GRkYGOnfuLFm/7q+AOX78uNb5+30pj5GREby8vLBw4UIAwJEjRx4a27lzZ+zbt09MUu775ptvYG5urrPlwHXr1sX48ePRq1cvDBw48KFxMpkMxsbGMDIyEs8VFhZi3bp1ZWKlql6VlJTgvffeg0wmw65duxAZGYmvv/4aW7dufeprE9GzxX1aqFry9vbG4sWLMWLECHh6emL48OFo3rw5iouLcfToUSxbtgzu7u7o1asXXF1dMXToUHz99deoUaMGevTogQsXLmDKlClwcnLCRx99JFm/evbsCVtbWwwePBifffYZjI2NsWbNGly8eFErbsmSJdi3bx98fX1Rv3593L17V1yh06VLl4dePzw8HD/++CNee+01TJ06Fba2ttiwYQN27NiBWbNmQaFQSPYsD5oxY8ZjY3x9fTF37lwEBARg6NChuH79OubMmVPusnQPDw9ER0dj06ZNaNSoEWrWrPlE81DCw8Px66+/Ij4+HkqlEqGhoUhISMDgwYPRsmVLODs7V/qaRKQfTFqo2goODkbr1q0xb948zJw5E2q1GiYmJmjatCkCAgIwatQoMXbx4sVo3LgxVq5ciYULF0KhUKB79+6IjIwsdw7Lk7K2tkZcXBxCQkLw/vvvo1atWhgyZAh69OiBIUOGiHEtWrRAfHw8wsPDoVarYWlpCXd3d2zfvl2cE1IeV1dXHDhwAJMmTcLIkSNRWFiIZs2aYfXq1ZXaWVZXXn/9daxatQozZ85Er169ULduXQQHB8PBwQGDBw/Wiv3000+RnZ2N4OBg3Lp1Cw0aNNDax6Yidu/ejcjISEyZMkWrYrZmzRq0bNkSffv2RWJiIkxNTaV4PCLSMZkg/GdHJyIiIqIqinNaiIiIyCAwaSEiIiKDwKSFiIiIDAKTFiIiIjIITFqIiIjIIDBpISIiIoPApIWIiIgMQrXcXM6s5ajHBxE9h7ZtCNd3F4iqnG5utXV+D6n+XCo8GiXJdQwVKy1ERERkEKplpYWIiKhKkbFGIAUmLURERLomk+m7B9UCkxYiIiJdY6VFEvwWiYiIyCCw0kJERKRrHB6SBCstREREuiarIc1RSX/99Rfef/992NnZwdzcHC1atEBKSorYLggCIiIioFKpYGZmhk6dOuHEiRNa19BoNBg9ejTs7e1hYWEBf39/XLp0SSsmNzcXgYGBUCgUUCgUCAwMxM2bN7VisrKy0KtXL1hYWMDe3h5jxoxBUVFRpZ6HSQsREVE1lJubi3bt2sHExAS7du3CyZMn8eWXX6JWrVpizKxZszB37lx
"text/plain": [
"<Figure size 640x480 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# confusion matrix \n",
"\n",
"draw_confusion_matrix(y_test, y_pred)"
]
},
{
"cell_type": "code",
"execution_count": 61,
"id": "25ec1701-ade5-4419-8b46-8a1bb109cf84",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABIQAAAK8CAYAAACeK2TMAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3gUVfvG8e+mkwoBQgm9N+kCAekCUhQUEHkVUcHG+7OAFRsiKogFRV97b4gIgiIiICAiSJcqvXdCSSV15/fHwMKwARNJMrvZ+3NducicnU2eBO6d8OTMOQ7DMAxERERERERERMRn+NldgIiIiIiIiIiIFC41hEREREREREREfIwaQiIiIiIiIiIiPkYNIRERERERERERH6OGkIiIiIiIiIiIj1FDSERERERERETEx6ghJCIiIiIiIiLiY9QQEhERERERERHxMWoIiYiIiIiIiIj4GDWERERERLzc6NGjKVmyJHv27LG7FBEREfESagiJiIgUoHXr1nH77bdTtWpVQkJCCA8Pp2nTpowfP54TJ07YUtOnn36Kw+Fg5cqVBfp5du/ejcPhcL35+flRokQJOnfuzJw5cy76vNmzZ9OzZ09Kly5NcHAwFStWZPDgwWzatOmiz/n999+58cYbiY2NJSgoiKioKFq3bs0777xDSkpKQXx5tjj7d7d7927L+KhRo7j++uvp378/GRkZOT732WefxeFw5FstCxcuxOFwsHDhwnz7mDmpUqUKt912W56es2TJEp599llOnTrl9liHDh3o0KFDvtQmIiLizdQQEhERKSAffPABzZo1Y8WKFTzyyCPMnj2b77//nv79+/Puu+8yZMgQu0ssFPfddx9Lly7l999/55VXXmHbtm306NGDRYsWuZ376KOP0r17d5xOJ2+//TZz585l1KhRrFixgqZNmzJt2jS354waNYp27dpx4MABxowZw9y5c/nmm2/o3Lkzzz77LE899VRhfJm2e/fddyldujTDhw+3u5R89f333/P000/n6TlLlixh9OjROTaE3n77bd5+++18qk5ERMR7BdhdgIiISFG0dOlS7r33Xrp06cL06dMJDg52PdalSxceeughZs+eXag1ZWZm5usMkdyqVKkSrVq1AqBNmzbUrFmT9u3b89FHH9GuXTvXeZMmTeLll1/m3nvvtfyHvV27dgwcOJD27dszaNAgGjduTLVq1QCYMmUKzz33HEOGDOGDDz6wfH3du3fn0UcfZenSpYX0ldorICCAn376ye4y8l2TJk3y9ePVq1cvXz+eiIiIt9IMIRERkQLw4osv4nA4eP/99y3NoLOCgoK47rrrXMdOp5Px48dTp04dgoODiYmJ4dZbb2X//v2W513s9pkLb4M5ezvPF198wUMPPURsbCzBwcFs377ddc7Jkye5/fbbiY6OJiwsjGuvvZadO3e6fex58+bRuXNnIiMjCQ0NpU2bNvz666//4rtiat68OQBHjhyxjL/wwguUKFGCV155xe05YWFhvPnmm6SmpjJhwgTX+HPPPUeJEiWYOHFijs2uiIgIunbt+q9rvVCHDh1o0KABS5cupXXr1hQrVowqVarwySefAPDTTz/RtGlTQkNDueKKK3Js+i1evJjOnTsTERFBaGgorVu3zrGR8+eff9KmTRtCQkIoX748I0eOJDMzM8e6Jk+eTFxcHGFhYYSHh9O1a1dWrVqVq6/pwud269aNNWvW5OG7YvXDDz8QFxdHaGgoERERdOnSJcem3IwZM2jYsCHBwcFUq1aNN954I8fb2i78N+90Onn++eepXbs2xYoVo3jx4jRs2JA33ngDMG+Ne+SRRwCoWrWq65bFs7e25XTL2MGDB7nxxhuJiIggKiqKAQMG8Oeff+JwOPj0009d513sdrPbbruNKlWqWMYyMjJ4/vnnXZkuXbo0t99+O8eOHcvdN1JERKSAqSEkIiKSz7Kzs5k/fz7NmjWjYsWKuXrOvffey2OPPUaXLl344YcfGDNmDLNnz6Z169bEx8f/61pGjhzJ3r17effdd/nxxx+JiYlxPTZkyBD8/Pz4+uuvef3111m+fDkdOnSw3Gbz5Zdf0rVrVyIjI/nss8/49ttviY6Oplu3bv+6KbRr1y4AatWq5Ro7dOgQGzdupGvXroSGhub4vLi4OGJiYpg7d67rORs2bLjkc3LjbPPs2WefzdX5hw8f5vbbb2fo0KHMmDGDK664gjvuuIPnnnuOkSNH8uijjzJ16lTCw8Pp06cPBw8edD33t99+o1OnTiQkJPDRRx8xadIkIiIiuPbaa5k8ebLrvE2bNtG5c2dOnTrFp59+yrvvvsuaNWt4/vnn3ep58cUXGThwIPXq1ePbb7/l888/JzExkbZt27Jhw4ZLfi0XPveLL74gKSmJtm3bXnLNpov5+uuv6d27N5GRkUyaNImPPvqIkydP0qFDBxYvXuw6b/bs2dxwww2ULFmSyZMnM378eCZNmsRnn332j59j/PjxPPvsswwcOJCffvqJyZMnM2TIENe/26FDh3LfffcBMG3aNJYuXcrSpUtp2rRpjh/v9OnTXH311cyZM4exY8cyZcoUypYty4ABA/L89Z/ldDrp3bs348aN4z//+Q8//fQT48aNY+7cuXTo0IHTp0//648tIiKSbwwRERHJV4cPHzYA46abbsrV+X///bcBGMOGDbOML1u2zACMJ554wjVWuXJlY/DgwW4fo3379kb79u1dxwsWLDAAo127dm7nfvLJJwZgXH/99ZbxP/74wwCM559/3jAMw0hJSTGio6ONa6+91nJedna20ahRI6NFixaX/Lp27dplAMZLL71kZGZmGmlpacZff/1lxMXFGeXKlTN27drlOvfPP/80AOPxxx+/5Mds2bKlUaxYsTw9558sXLjQ8Pf3N0aPHv2P57Zv394AjJUrV7rGjh8/bvj7+xvFihUzDhw44Br/66+/DMCYOHGia6xVq1ZGTEyMkZSU5BrLysoyGjRoYFSoUMFwOp2GYRjGgAEDjGLFihmHDx+2nFenTh0DcH3v9u7dawQEBBj//e9/LXUmJiYaMTExRr9+/Vxjo0aNMs7/0e/sc++77z7Lc5OSkoyyZcsaN9544yW/F2f/jS1YsMAwDPPfRfny5Y0rrrjCyM7Otny8mJgYo3Xr1q6xK6+80qhYsaKRnp5uOa9kyZLGhT+eXvhvvlevXkbjxo0vWdvLL79s+T6d78KsvPPOOwZgzJgxw3LenXfeaQDGJ598ctHnnjV48GCjcuXKruNJkyYZgDF16lTLeStWrDAA4+23375k/SIiIoVBM4RERERstmDBAgC3W8FatGhB3bp1L+v2rL59+170sZtvvtly3Lp1aypXruyqZ8mSJZw4cYLBgweTlZXlenM6nVxzzTWsWLEiVzt4PfbYYwQGBhISEkLjxo3ZsGEDP/74o9stNrlhGEa+r4PUvn17srKyeOaZZ3J1frly5WjWrJnrODo6mpiYGBo3bkz58uVd43Xr1gVwbQWfkpLCsmXL6NevH+Hh4a7z/P39GTRoEPv372fLli2A+W+ic+fOlClTxnLehbNWfvnlF7Kysrjjjjss4xEREXTs2JHffvvtol/H2efeeuutlr/fkJAQ2rdvn+fdw7Zs2cLBgwcZNGgQfn7nfsQMDw+nb9++/Pnnn6SmppKSksLKlSvp06cPQUFBlvOuvfbaf/w8LVq0YO3atQwbNoxffvmFxMTEPNV5oQULFhAREWG5hRPgP//5z7/+mDNnzqR48eJce+21lu9t48aNKVu2bIHvzCYiIpIbWlRaREQkn5UqVYrQ0FDXrVH/5Pjx44DZaLhQ+fLlXQ2FfyOnj3lW2bJlcxw7W8/ZNX769et30Y9x4sQJwsLCLlnDAw88wC233EJ6ejp//vknTz31FL1792bt2rWULFkSMBeeBv7xe7Znzx7XbXi5fU5+i46OdhsLCgpyGz/b7EhLSwPMNZsMw7jo3zOc+7dw/Pjxi/79nO/s31Hr1q3dzj3bvLuYs8+98sorc3z8/KZObvzTv2On0+n6HhiGYWl2nZXT2IVGjhxJWFgYX375Je+
"text/plain": [
"<Figure size 1400x800 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# ROC curve\n",
"\n",
"# Calcul des taux de faux positifs (FPR) et de vrais positifs (TPR)\n",
"y_pred_prob = logit_grid.predict_proba(X_test)[:, 1]\n",
"\n",
"fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)\n",
"\n",
"# Calcul de l'aire sous la courbe ROC (AUC)\n",
"roc_auc = auc(fpr, tpr)\n",
"\n",
"plt.figure(figsize = (14, 8))\n",
"plt.plot(fpr, tpr, label=\"ROC curve(area = %0.3f)\" % roc_auc)\n",
"plt.plot([0, 1], [0, 1], color=\"red\",label=\"Random Baseline\", linestyle=\"--\")\n",
"plt.grid(color='gray', linestyle='--', linewidth=0.5)\n",
"plt.xlabel('Taux de faux positifs (FPR)')\n",
"plt.ylabel('Taux de vrais positifs (TPR)')\n",
"plt.title('Courbe ROC : modèle logistique')\n",
"plt.legend(loc=\"lower right\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "3b5c9485-511b-4f6b-b667-154f4f519682",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjcAAAHFCAYAAAAOmtghAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAABtqklEQVR4nO3deVhU1f8H8PedgRl2EJBVRHDFXSEVzH3BJU2z1K+Wa6aZuWuZvzLNsizNtDTL3bTMtSxz33fBNcEdRVlEQBbZmTm/P5CpCdQZnGFgeL+eh0fnzL133nMF5uO5554jCSEEiIiIiMyEzNQBiIiIiAyJxQ0RERGZFRY3REREZFZY3BAREZFZYXFDREREZoXFDREREZkVFjdERERkVljcEBERkVlhcUNERERmhcUNUQV08eJFDB06FH5+frCysoKdnR2aNm2KuXPnIjk52SSZVq1aBUmSEBYWZpLXP3jwICRJwsGDBw1yvNu3b0OSJHz11VdP3a5atWoYMmSIVtu5c+fQpk0bODo6QpIkLFiwADt27MDHH39skGxE5s7C1AGIqHT9+OOPGD16NGrXro0pU6agbt26yMvLQ1hYGL7//nucOHECW7duNXXMCmPr1q1wcHDQahs2bBgyMjLwyy+/oFKlSqhWrRpmz56N7777jgUOkQ5Y3BBVICdOnMDbb7+NTp06Ydu2bVAqlZrnOnXqhEmTJmHnzp2lmikvLw+SJJXqa5YlTZo0KdL2999/Y8SIEejatasJEhGVf7wsRVSBfPbZZ5AkCT/88INWYVNIoVCgZ8+emsdqtRpz585FnTp1oFQq4ebmhkGDBuHevXta+xV3aQUA2rZti7Zt22oeF176Wbt2LSZNmgRvb28olUrcuHFDs83Dhw8xdOhQODs7w9bWFj169MCtW7eKHHvv3r3o0KEDHBwcYGNjg5YtW2Lfvn06nYcrV66gS5cusLGxgaurK0aNGoX09PRit32e19HFv89d4aW5/Px8LFmyBJIkQZIkDBkyBN999x0AaNokScLt27cNloPInLC4IaogVCoV9u/fj8DAQPj4+Oi0z9tvv4333nsPnTp1wu+//45PPvkEO3fuREhICBITE0ucZdq0aYiOjsb333+P7du3w83NTfPc8OHDIZPJsH79eixYsACnT59G27ZtkZKSotnmp59+QufOneHg4IDVq1fj119/hbOzM0JDQ59ZeNy/fx9t2rTB33//jcWLF2Pt2rV49OgRxowZU2Tb53mdkujevTtOnDgBAHj11Vdx4sQJnDhxAh9++CFeffVVANC0nThxAp6engbPQGQWBBFVCPHx8QKA6N+/v07bR0ZGCgBi9OjRWu2nTp0SAMQHH3ygafP19RWDBw8ucow2bdqINm3aaB4fOHBAABCtW7cusu3KlSsFANG7d2+t9mPHjgkAYvbs2UIIITIyMoSzs7Po0aOH1nYqlUo0atRINGvW7Knv67333hOSJInz589rtXfq1EkAEAcOHDDI60RFRQkA4ssvv3zqdsWdOwDinXfe0Wp75513BH9lE+mGPTdEVKwDBw4AQJHLTc2aNUNAQMBz9Vz06dPnic8NHDhQ63FISAh8fX01eY4fP47k5GQMHjwY+fn5mi+1Wo0uXbrgzJkzyMjIeOLxDxw4gHr16qFRo0Za7QMGDNB6/LyvQ0SmwwHFRBWEq6srbGxsEBUVpdP2SUlJAFDspQ8vLy/cuXOnxFmedjnFw8Oj2LbCPPfv3wcAzWWa4iQnJ8PW1rbY55KSkuDn5/fM133e1yEi02FxQ1RByOVydOjQAX/99Rfu3buHKlWqPHV7FxcXAEBcXFyRbWNjY+Hq6qp5bGVlhZycnCLHSExM1Nqu0NPujoqPjy+2rUaNGgCgOd6iRYvQokWLYo/h7u7+xOO7uLg88TX+7Xlfh4hMh5eliCqQadOmQQiBESNGIDc3t8jzeXl52L59OwCgffv2AAoG1f7bmTNnEBkZiQ4dOmjaqlWrhosXL2ptd+3aNVy9elXvjOvWrdN6fPz4cdy5c0dz11XLli3h5OSEiIgIBAUFFfulUCieePx27drh8uXLuHDhglb7+vXrtR4/7+sYWuHdbVlZWaX2mkTlFXtuiCqQ4OBgLFmyBKNHj0ZgYCDefvtt1KtXD3l5eTh37hx++OEH1K9fHz169EDt2rXx1ltvYdGiRZDJZOjatStu376NDz/8ED4+PpgwYYLmuG+88QZef/11jB49Gn369MGdO3cwd+5cVK5cWe+MYWFhePPNN/Haa6/h7t27mD59Ory9vTF69GgAgJ2dHRYtWoTBgwcjOTkZr776Ktzc3PDgwQNcuHABDx48wJIlS554/PHjx2PFihXo3r07Zs+eDXd3d6xbtw5XrlzR2u55X6fQpUuXsGnTpiLtL7zwAnx9fXU+Lw0aNAAAfPHFF+jatSvkcjkaNmxYqgUWUblh6hHNRFT6zp8/LwYPHiyqVq0qFAqFsLW1FU2aNBEfffSRSEhI0GynUqnEF198IWrVqiUsLS2Fq6ureP3118Xdu3e1jqdWq8XcuXOFv7+/sLKyEkFBQWL//v1PvFtq48aNRTIV3i21e/du8cYbbwgnJydhbW0tunXrJq5fv15k+0OHDonu3bsLZ2dnYWlpKby9vUX37t2LPfZ/RUREiE6dOgkrKyvh7Owshg8fLn777Tetu6We93UK75Z60tfKlSuFELrfLZWTkyPefPNNUblyZSFJkgAgoqKinvleiSoiSQghTFNWERERERkex9wQERGRWWFxQ0RERGaFxQ0RERGZFRY3REREZFZY3BAREZFZYXFDREREZqXCTeKnVqsRGxsLe3v7p04BT0RERGWHEALp6enw8vKCTPb0vpkKV9zExsbCx8fH1DGIiIioBO7evfvMtfEqXHFjb28PoODkODg4mDgNERER6SItLQ0+Pj6az/GnqXDFTeGlKAcHBxY3RERE5YwuQ0o4oJiIiIjMCosbIiIiMissboiIiMisVLgxN7pSqVTIy8szdQwqRywtLSGXy00dg4iowmNx8x9CCMTHxyMlJcXUUagccnJygoeHB+dQIiIyIRY3/1FY2Li5ucHGxoYfUqQTIQQyMzORkJAAAPD09DRxIiKiiovFzb+oVCpNYePi4mLqOFTOWFtbAwASEhLg5ubGS1RERCbCAcX/UjjGxsbGxsRJqLwq/N7heC0iItNhcVMMXoqikuL3DhGR6bG4ISIiIrNi0uLm8OHD6NGjB7y8vCBJErZt2/bMfQ4dOoTAwEBYWVnB398f33//vfGDkk50/Td8XgcPHoQkSc91R9vHH3+Mxo0bax4PGTIEvXr10jwWQuCtt96Cs7MzJEnC+fPnS/xaRERUukxa3GRkZKBRo0b49ttvddo+KioK3bp1Q6tWrXDu3Dl88MEHGDt2LDZv3mzkpOVDfHw83n33Xfj7+0OpVMLHxwc9evTAvn37TB2tzPvmm2+watUqzeOdO3di1apV+OOPPxAXF4f69euXWvFGRETPx6R3S3Xt2hVdu3bVefvvv/8eVatWxYIFCwAAAQEBCAsLw1dffYU+ffoYKWX5cPv2bbRs2RJOTk6YO3cuGjZsiLy8POzatQvvvPMOrly5YrTXzs3NhUKhMNrxS4Ojo6PW45s3b8LT0xMhISEmSkREVP7k5qvxMDMXWbkqVHO1NVmOcnUr+IkTJ9C5c2etttDQUCxfvhx5eXmwtLQssk9OTg5ycnI0j9PS0oye0xRGjx4NSZJw+vRp2Nr+8w1Vr149DBs2TPM4Ojoa7777Lvbt2weZTIYuXbpg0aJFcHd3B1BweSYlJUWrh2L8+PE4f/48Dh48CABo27Yt6tevD4VCgTVr1qBevXo4dOgQACAuLg5du3bFwYMH4eHhgblz5+K1117THCsmJgYTJ07E7t27IZPJ8OKLL+Kbb75BtWrVnvjeduzYgfHjx+Pu3bto0aIFBg8eXGSb48eP4/3338eZM2fg6uqK3r17Y86cOVrn4mn+/b6HDBmC1atXAyi41Obr66vZrnfv3gAAX19f3L59W6djExGVB0II5KrUyMhRIU+lRm6+Grk
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# utilisation d'une métrique plus adaptée aux modèles de marketing : courbe de lift\n",
"\n",
"# Tri des prédictions de probabilités et des vraies valeurs\n",
"sorted_indices = np.argsort(y_pred_prob)[::-1]\n",
"y_pred_prob_sorted = y_pred_prob[sorted_indices]\n",
"y_test_sorted = y_test.iloc[sorted_indices]\n",
"\n",
"# Calcul du gain cumulatif\n",
"cumulative_gain = np.cumsum(y_test_sorted) / np.sum(y_test_sorted)\n",
"\n",
"# Tracé de la courbe de lift\n",
"plt.plot(np.linspace(0, 1, len(cumulative_gain)), cumulative_gain, label='Courbe de lift')\n",
"plt.xlabel('Part de clients identifiés sans modèle ')\n",
"plt.ylabel('Part de clients identifiés avec modèle')\n",
"plt.title('Courbe de Lift')\n",
"plt.legend()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "6e7cfb6c-8049-4bd1-8d82-61a2e97b257d",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjoAAAGdCAYAAAAbudkLAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAmeElEQVR4nO3df1BVd37/8dcNv0QGTkECl7uyxnSU1cVNW9zww93VJAa0InWTjrZ07mjHxWSMEio01bXTmM5GsjFqunVjrWNj1mBwusZsOrgsZJIQWUUNldkQretutMIExB94AWMvhJzvHzueb64Y4yVckQ/Px8yZ4Zzzvue+D5/B+/Jzz7nXZdu2LQAAAAPdNdwNAAAAhApBBwAAGIugAwAAjEXQAQAAxiLoAAAAYxF0AACAsQg6AADAWAQdAABgrPDhbmA4ffbZZ/r4448VGxsrl8s13O0AAIBbYNu2uru75fF4dNddN5+zGdVB5+OPP1ZqaupwtwEAAAahpaVF48ePv2nNqA46sbGxkv7wi4qLixvmbgAAwK3o6upSamqq8zp+M6M66Fx7uyouLo6gAwDACHMrl51wMTIAADAWQQcAABiLoAMAAIxF0AEAAMYi6AAAAGMRdAAAgLEIOgAAwFgEHQAAYCyCDgAAMBZBBwAAGIugAwAAjEXQAQAAxiLoAAAAYxF0AACAscKHuwGT3bO6KiTHPfPcvJAcFwAA0zCjAwAAjEXQAQAAxiLoAAAAYxF0AACAsQg6AADAWAQdAABgLIIOAAAwVlBBp7y8XN/+9rcVGxurpKQkLViwQCdPngyoWbJkiVwuV8CSlZUVUOP3+7Vy5UolJiYqJiZGBQUFam1tDajp7OyU1+uVZVmyLEter1eXL18OqDl79qzmz5+vmJgYJSYmqri4WL29vcGcEgAAMFhQQaeurk5PPPGEGhoaVFtbq08//VS5ubm6cuVKQN2cOXPU1tbmLPv37w/YX1JSon379qmyslL19fXq6elRfn6++vv7nZrCwkI1NTWpurpa1dXVampqktfrdfb39/dr3rx5unLliurr61VZWam9e/eqtLR0ML8HAABgoKA+Gbm6ujpg/eWXX1ZSUpIaGxv1ve99z9keFRUlt9t9w2P4fD7t2LFDu3bt0uzZsyVJr776qlJTU/XWW28pLy9PJ06cUHV1tRoaGpSZmSlJ2r59u7Kzs3Xy5EmlpaWppqZGx48fV0tLizwejyRp48aNWrJkiZ599lnFxcUFc2oAAMBAX+kaHZ/PJ0lKSEgI2P7uu+8qKSlJkydPVlFRkTo6Opx9jY2N6uvrU25urrPN4/EoPT1dBw8elCQdOnRIlmU5IUeSsrKyZFlWQE16eroTciQpLy9Pfr9fjY2NN+zX7/erq6srYAEAAOYadNCxbVurVq3Sd77zHaWnpzvb586dq4qKCr399tvauHGjjh49qgcffFB+v1+S1N7ersjISMXHxwccLzk5We3t7U5NUlLSgOdMSkoKqElOTg7YHx8fr8jISKfmeuXl5c41P5ZlKTU1dbCnDwAARoBBf6nnihUr9Jvf/Eb19fUB2xctWuT8nJ6erunTp2vChAmqqqrSI4888oXHs21bLpfLWf/8z1+l5vPWrFmjVatWOetdXV2EHQAADDaoGZ2VK1fqzTff1DvvvKPx48fftDYlJUUTJkzQqVOnJElut1u9vb3q7OwMqOvo6HBmaNxut86dOzfgWOfPnw+ouX7mprOzU319fQNmeq6JiopSXFxcwAIAAMwVVNCxbVsrVqzQ66+/rrffflsTJ0780sdcvHhRLS0tSklJkSRlZGQoIiJCtbW1Tk1bW5uam5uVk5MjScrOzpbP59ORI0ecmsOHD8vn8wXUNDc3q62tzampqalRVFSUMjIygjktAABgqKDeunriiSe0e/du/eIXv1BsbKwzo2JZlqKjo9XT06N169bp0UcfVUpKis6cOaMf/vCHSkxM1Pe//32ndunSpSotLdW4ceOUkJCgsrIyTZs2zbkLa8qUKZozZ46Kioq0bds2SdKyZcuUn5+vtLQ0SVJubq6mTp0qr9erDRs26NKlSyorK1NRUREzNQAAQFKQMzpbt26Vz+fTrFmzlJKS4ix79uyRJIWFhemDDz7QX/zFX2jy5MlavHixJk+erEOHDik2NtY5zubNm7VgwQItXLhQM2bM0NixY/Vf//VfCgsLc2oqKio0bdo05ebmKjc3V9/61re0a9cuZ39YWJiqqqo0ZswYzZgxQwsXLtSCBQv0wgsvfNXfCQAAMITLtm17uJsYLl1dXbIsSz6fLySzQPesrhryY0rSmefmheS4AACMBMG8fvNdVwAAwFgEHQAAYCyCDgAAMBZBBwAAGIugAwAAjEXQAQAAxiLoAAAAYxF0AACAsQg6AADAWAQdAABgLIIOAAAwFkEHAAAYi6ADAACMRdABAADGIugAAABjEXQAAICxCDoAAMBYBB0AAGAsgg4AADAWQQcAABiLoAMAAIxF0AEAAMYi6AAAAGMRdAAAgLEIOgAAwFgEHQAAYCyCDgAAMBZBBwAAGIugAwAAjEXQAQAAxiLoAAAAYxF0AACAsQg6AADAWAQdAABgLIIOAAAwFkEHAAAYi6ADAACMRdABAADGIugAAABjEXQAAICxCDoAAMBYBB0AAGAsgg4AADAWQQcAABiLoAMAAIxF0AEAAMYi6AAAAGMRdAAAgLEIOgAAwFgEHQAAYCyCDgAAMBZBBwAAGIugAwAAjEXQAQAAxiLoAAAAYxF0AACAsQg6AADAWEEFnfLycn37299WbGyskpKStGDBAp08eTKgxrZtrVu3Th6PR9HR0Zo1a5Y+/PDDgBq/36+VK1cqMTFRMTExKigoUGtra0BNZ2envF6vLMuSZVnyer26fPlyQM3Zs2c1f/58xcTEKDExUcXFxert7Q3mlAAAgMGCCjp1dXV64okn1NDQoNraWn366afKzc3VlStXnJrnn39emzZt0pYtW3T06FG53W49/PDD6u7udmpKSkq0b98+VVZWqr6+Xj09PcrPz1d/f79TU1hYqKamJlVXV6u6ulpNTU3yer3O/v7+fs2bN09XrlxRfX29KisrtXfvXpWWln6V3wcAADCIy7Zte7APPn/+vJKSklRXV6fvfe97sm1bHo9HJSUl+od/+AdJf5i9SU5O1o9//GM99thj8vl8uvvuu7Vr1y4tWrRIkvTxxx8rNTVV+/fvV15enk6cOKGpU6eqoaFBmZmZkqSGhgZlZ2frf/7nf5SWlqZf/vKXys/PV0tLizwejySpsrJSS5YsUUdHh+Li4r60/66uLlmWJZ/Pd0v1wbpnddWQH1OSzjw3LyTHBQBgJAjm9fsrXaPj8/kkSQkJCZKk06dPq729Xbm5uU5NVFSUZs6cqYMHD0qSGhsb1dfXF1Dj8XiUnp7u1Bw6dEiWZTkhR5KysrJkWVZATXp6uhNyJCkvL09+v1+NjY037Nfv96urqytgAQAA5hp00LFtW6tWrdJ3vvMdpaenS5La29slScnJyQG1ycnJzr729nZFRkYqPj7+pjVJSUkDnjMpKSmg5vrniY+PV2RkpFNzvfLycueaH8uylJqaGuxpAwCAEWTQQWfFihX6zW9+o9dee23APpfLFbBu2/aAbde7vuZG9YOp+bw1a9bI5/M5S0tLy017AgAAI9uggs7KlSv15ptv6p133tH48eOd7W63W5IGzKh0dHQ4sy9ut1u9vb3q7Oy8ac25c+cGPO/58+cDaq5/ns7OTvX19Q2Y6bkmKipKcXFxAQsAADBXUEHHtm2tWLFCr7/+ut5++21NnDgxYP/EiRPldrtVW1vrbOvt7VVdXZ1ycnIkSRkZGYqIiAioaWtrU3Nzs1OTnZ0tn8+nI0eOODWHDx+Wz+cLqGlublZbW5tTU1NTo6ioKGVkZARzWgAAwFDhwRQ/8cQT2r17t37xi18oNjbWmVGxLEvR0dFyuVwqKSnR+vXrNWnSJE2aNEnr16/X2LFjVVhY6NQuXbpUpaWlGjdunBISElRWVqZp06Zp9uzZkqQpU6Zozpw5Kioq0rZt2yRJy5Y
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# look at the distribution of the score \n",
"\n",
"plt.hist(y_pred_prob, bins=20)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 64,
"id": "99f7f70e-c3bb-445e-8889-e7547f6ebd1e",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAq0AAAHFCAYAAADYE8ABAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAABpx0lEQVR4nO3dd3hTZf8G8DttmnSX7j1ZLatAC1iGyBYQQUGQPUQZIiLqK8hPpsoLKi+CDFFAQcDKEAFZlY2AMsresy3dLd07eX5/lERDW2ih7UnS+3NdvaBPTpLvyUnOufvkOc+RCSEEiIiIiIj0mInUBRARERERPQlDKxERERHpPYZWIiIiItJ7DK1EREREpPcYWomIiIhI7zG0EhEREZHeY2glIiIiIr3H0EpEREREeo+hlYiIiIj03lOF1vPnz2PkyJHw9/eHubk5rK2t0bx5c8yfPx+pqana5V544QXIZDLIZDKYmJjAxsYGderUwWuvvYZNmzZBrVaXeGw/Pz/tfR79ycrKevo1NWKRkZFo37497OzsIJPJsHDhwlKXy8nJwcyZM3Hw4MESt82cORMymQzJyclVW6weuHv3LmQyGX744Qdtm2b9K2r9+vVlvt7Pys/PDyNGjKiSx35afn5+eOmllyr1MWUyGSZMmPDE5Q4ePAiZTKbz/i1tu73wwgt44YUXtL8/7n1fXcr7GdVHP/zwA2QyGU6dOiV1KRW2Zs0aODs7IzMzU+pSntr169ehUChw5syZct8nPDwcDRs2hIWFBWQyGc6ePVt1Beqhzz//HFu3bpW6DKOk2R/cvXtXkueXV/QO3333HcaPH4/69evjww8/RIMGDVBYWIhTp05h+fLlOH78OH799Vft8gEBAVi3bh0AIDs7G3fu3MHWrVvx2muvoV27dti+fTvs7Ox0nqNNmzb48ssvSzy3paVlRcutEUaNGoXs7Gz8/PPPsLe3h5+fX6nL5eTkYNasWQCgc1AnYPTo0XjxxRcrfL/169fj4sWLmDRpUuUXRTqaN2+O48ePo0GDBo9dbunSpTq/68P7vryfUao8OTk5+Pjjj/HRRx/BxsZG6nKeWr169TB48GC89957OHTo0BOXT0pKwtChQ/Hiiy9i6dKlUCqVqFevXjVUqj8+//xz9OvXD3369JG6FKPTs2dPHD9+HO7u7pI8f4VC6/HjxzFu3Dh06dIFW7duhVKp1N7WpUsXvP/++9i9e7fOfSwsLPDcc8/ptI0ePRqrV6/GqFGj8NZbbyE8PFzn9lq1apW4jz7IycnRy+B88eJFvPnmm+jevbvUpVS53NxcWFhYVPrjenl5wcvLq9IfV9/p63u6NLa2tuXaLzwp1EpB3z6jQgjk5eVVyWdJX/z4449ISUnB6NGjpS5Fx9O89hMmTEBoaCiOHTuG1q1bP3bZ69evo7CwEEOGDEH79u2ftVwAhrOfqKrjA/3D2dkZzs7Okj1/hYYHfP7555DJZFixYoVOYNVQKBR4+eWXy/VYI0eORI8ePbBx40bcu3evImWUKS0tDe+//z4CAgKgVCrh4uKCHj164OrVqwBK/3oRKP3r4hEjRsDa2hoXLlxA165dYWNjg06dOmHSpEmwsrJCRkZGiecfMGAAXF1dUVhYqG0LDw9HWFgYrKysYG1tjW7duiEyMrJc63Px4kX07t0b9vb2MDc3R9OmTfHjjz9qb9d00xcVFWHZsmXaYRSluXv3rvaNNmvWLO2yj379nJCQgIEDB8LOzg6urq4YNWoU0tPTdZYRQmDp0qVo2rQpLCwsYG9vj379+uH27dtPXCfN17mRkZF49dVXYWtrCzs7OwwZMgRJSUk6y2q+it6yZQuaNWsGc3NzbY9ZfHw8xowZAy8vLygUCvj7+2PWrFkoKirSeYzY2Fj0798fNjY2sLOzw4ABAxAfH19mXY9av349wsLCYG1tDWtrazRt2hQrV64EUNxr9/vvv+PevXs6w1g0CgoK8OmnnyIwMBBKpRLOzs4YOXJkifUsLCzEf/7zH7i5ucHS0hJt27bF33///cTXEvjnvTt//nx89tln8PHxgbm5OUJDQ7Fv375S1/HMmTPo168f7O3tUbt2bQBAXl4epk6dCn9/fygUCnh6euLtt99GWlpaqc/766+/okmTJjA3N0dAQAAWLVqkc3teXh7ef/99NG3aFHZ2dnBwcEBYWBh+++23Mtfl22+/Rb169aBUKtGgQQP8/PPPOreX9fl91L+HBzzufX/kyBHIZDJs2LChxGOsWbMGMpkMJ0+efOxzVeZnVFNvebfniBEjSu2xLe29rBmCsXz5cgQFBUGpVGrrvHr1KgYOHAhXV1colUr4+Phg2LBhyM/P13mMzMxMjBs3Dk5OTnB0dMSrr76K2NhYnWXCw8PRtWtXuLu7w8LCAkFBQZgyZQqys7N1lrt9+zZef/11eHh4QKlUwtXVFZ06dSrxNfaz7D+XLVuGXr16oVatWjrtGzduRKtWrWBnZwdLS0sEBARg1KhROss86VgCAKmpqRg/fjw8PT2hUCgQEBCAadOmlXjdHvfa37hxA4MGDYKLiwuUSiWCgoKwZMmSEusSEhKCoKAgLF++/LHrPGLECLRt2xZA8fFIJpPpfLuwbds2hIWFwdLSEjY2NujSpQuOHz+u8xiP20+UJicnBx988IF2uKCDgwNCQ0N1Plea4+mlS5fQqVMnWFlZwdnZGRMmTEBOTo7O45V3X1TW8UEmkyE7Oxs//vij9vOmeQ3KU2tZ7t+/j7feegve3t5QKBTw8PBAv379kJCQoF0mKioKQ4YM0dmeX331lc5QSM1n/IsvvsC8efPg5+cHCwsLvPDCC9o/OKZMmQIPDw/Y2dnhlVdeQWJiYqnrXpn7Yc37dO3atQgKCoKlpSWCg4OxY8cOneXKGh7wxx9/oFOnTrC1tYWlpSXatGlTYp+VlJSkfQ01x8Q2bdrgjz/+eOLrryXKqaioSFhaWopWrVqV9y6iffv2omHDhmXevnz5cgFArF27Vtvm6+srevToIQoLC3V+VCrVY58rIyNDNGzYUFhZWYnZs2eLPXv2iM2bN4t3331X7N+/XwghxIEDBwQAceDAAZ373rlzRwAQq1ev1rYNHz5cmJmZCT8/PzF37lyxb98+sWfPHnHu3DkBQHz33Xc6j/HgwQOhVCrF5MmTtW2fffaZkMlkYtSoUWLHjh1iy5YtIiwsTFhZWYlLly49dn2uXr0qbGxsRO3atcWaNWvE77//LgYOHCgAiHnz5gkhhEhMTBTHjx8XAES/fv3E8ePHxfHjx0t9vLy8PLF7924BQLzxxhvaZW/evCmEEGLGjBkCgKhfv76YPn26iIiIEAsWLBBKpVKMHDlS57HefPNNYWZmJt5//32xe/dusX79ehEYGChcXV1FfHz8Y9dL8zy+vr7iww8/FHv27BELFiwQVlZWolmzZqKgoEC7rK+vr3B3dxcBAQFi1apV4sCBA+Lvv/8WcXFxwtvbW/j6+opvv/1W/PHHH2LOnDlCqVSKESNGaO+fk5MjgoKChJ2dnVi8eLHYs2ePmDhxovDx8SmxvTV1/dsnn3wiAIhXX31VbNy4Uezdu1csWLBAfPLJJ0IIIS5duiTatGkj3NzctK+n5vVXqVTixRdfFFZWVmLWrFkiIiJCfP/998LT01M0aNBA5OTkaJ9n+PDhQiaTiQ8//FD7HJ6ensLW1lYMHz78sa+n5r3r7e0t2rZtKzZv3iw2btwoWrRoIczMzMSxY8dKfe0/+ugjERERIbZu3SrUarXo1q2bkMvl4pNPPhF79+4VX375pXab5OXl6WwTT09P4ePjI1atWiV27twpBg8eLACIL774QrtcWlqaGDFihFi7dq3Yv3+/2L17t/jggw+EiYmJ+PHHH3XWQVN/gwYNxIYNG8S2bdvEiy++KACIjRs3apcr7fNb2nZr3769aN++vRDiye/7Zs2aiTZt2pR4XVu0aCFatGjx2Ne+sj+jQlRsew4fPlz4+vqWeIzSXhMAwtPTUzRp0kSsX79e7N+/X1y8eFGcPXtWWFtbCz8/P7F8+XKxb98+8dNPP4n+/fuLjIw
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# number of observations\n",
"N = len(y_pred_prob)\n",
"\n",
"# sort the data in ascending order \n",
"y_pred_prob_sorted = np.sort(y_pred_prob) \n",
"\n",
"# get the cdf values of y \n",
"steps = np.arange(N) / N\n",
" \n",
"# plotting \n",
"plt.xlabel('X') \n",
"plt.ylabel('P(score<=X)') \n",
" \n",
"plt.title('CDF curve of the predicted probability of purchase (score) for sports companies') \n",
" \n",
"plt.plot(y_pred_prob_sorted, steps) \n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "bcb94066-9387-4a5f-af3a-ab86d534c885",
"metadata": {},
"source": [
"### K-means clustering"
]
},
{
"cell_type": "code",
"execution_count": 66,
"id": "dd7a4a9c-d7e3-4747-ae59-b2a5a0b77260",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>#sk-container-id-4 {\n",
" /* Definition of color scheme common for light and dark mode */\n",
" --sklearn-color-text: black;\n",
" --sklearn-color-line: gray;\n",
" /* Definition of color scheme for unfitted estimators */\n",
" --sklearn-color-unfitted-level-0: #fff5e6;\n",
" --sklearn-color-unfitted-level-1: #f6e4d2;\n",
" --sklearn-color-unfitted-level-2: #ffe0b3;\n",
" --sklearn-color-unfitted-level-3: chocolate;\n",
" /* Definition of color scheme for fitted estimators */\n",
" --sklearn-color-fitted-level-0: #f0f8ff;\n",
" --sklearn-color-fitted-level-1: #d4ebff;\n",
" --sklearn-color-fitted-level-2: #b3dbfd;\n",
" --sklearn-color-fitted-level-3: cornflowerblue;\n",
"\n",
" /* Specific color for light theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-icon: #696969;\n",
"\n",
" @media (prefers-color-scheme: dark) {\n",
" /* Redefinition of color scheme for dark theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-icon: #878787;\n",
" }\n",
"}\n",
"\n",
"#sk-container-id-4 {\n",
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"#sk-container-id-4 pre {\n",
" padding: 0;\n",
"}\n",
"\n",
"#sk-container-id-4 input.sk-hidden--visually {\n",
" border: 0;\n",
" clip: rect(1px 1px 1px 1px);\n",
" clip: rect(1px, 1px, 1px, 1px);\n",
" height: 1px;\n",
" margin: -1px;\n",
" overflow: hidden;\n",
" padding: 0;\n",
" position: absolute;\n",
" width: 1px;\n",
"}\n",
"\n",
"#sk-container-id-4 div.sk-dashed-wrapped {\n",
" border: 1px dashed var(--sklearn-color-line);\n",
" margin: 0 0.4em 0.5em 0.4em;\n",
" box-sizing: border-box;\n",
" padding-bottom: 0.4em;\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"#sk-container-id-4 div.sk-container {\n",
" /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
" but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
" so we also need the `!important` here to be able to override the\n",
" default hidden behavior on the sphinx rendered scikit-learn.org.\n",
" See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
" display: inline-block !important;\n",
" position: relative;\n",
"}\n",
"\n",
"#sk-container-id-4 div.sk-text-repr-fallback {\n",
" display: none;\n",
"}\n",
"\n",
"div.sk-parallel-item,\n",
"div.sk-serial,\n",
"div.sk-item {\n",
" /* draw centered vertical line to link estimators */\n",
" background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
" background-size: 2px 100%;\n",
" background-repeat: no-repeat;\n",
" background-position: center center;\n",
"}\n",
"\n",
"/* Parallel-specific style estimator block */\n",
"\n",
"#sk-container-id-4 div.sk-parallel-item::after {\n",
" content: \"\";\n",
" width: 100%;\n",
" border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
" flex-grow: 1;\n",
"}\n",
"\n",
"#sk-container-id-4 div.sk-parallel {\n",
" display: flex;\n",
" align-items: stretch;\n",
" justify-content: center;\n",
" background-color: var(--sklearn-color-background);\n",
" position: relative;\n",
"}\n",
"\n",
"#sk-container-id-4 div.sk-parallel-item {\n",
" display: flex;\n",
" flex-direction: column;\n",
"}\n",
"\n",
"#sk-container-id-4 div.sk-parallel-item:first-child::after {\n",
" align-self: flex-end;\n",
" width: 50%;\n",
"}\n",
"\n",
"#sk-container-id-4 div.sk-parallel-item:last-child::after {\n",
" align-self: flex-start;\n",
" width: 50%;\n",
"}\n",
"\n",
"#sk-container-id-4 div.sk-parallel-item:only-child::after {\n",
" width: 0;\n",
"}\n",
"\n",
"/* Serial-specific style estimator block */\n",
"\n",
"#sk-container-id-4 div.sk-serial {\n",
" display: flex;\n",
" flex-direction: column;\n",
" align-items: center;\n",
" background-color: var(--sklearn-color-background);\n",
" padding-right: 1em;\n",
" padding-left: 1em;\n",
"}\n",
"\n",
"\n",
"/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
"clickable and can be expanded/collapsed.\n",
"- Pipeline and ColumnTransformer use this feature and define the default style\n",
"- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
"*/\n",
"\n",
"/* Pipeline and ColumnTransformer style (default) */\n",
"\n",
"#sk-container-id-4 div.sk-toggleable {\n",
" /* Default theme specific background. It is overwritten whether we have a\n",
" specific estimator or a Pipeline/ColumnTransformer */\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"/* Toggleable label */\n",
"#sk-container-id-4 label.sk-toggleable__label {\n",
" cursor: pointer;\n",
" display: block;\n",
" width: 100%;\n",
" margin-bottom: 0;\n",
" padding: 0.5em;\n",
" box-sizing: border-box;\n",
" text-align: center;\n",
"}\n",
"\n",
"#sk-container-id-4 label.sk-toggleable__label-arrow:before {\n",
" /* Arrow on the left of the label */\n",
" content: \"▸\";\n",
" float: left;\n",
" margin-right: 0.25em;\n",
" color: var(--sklearn-color-icon);\n",
"}\n",
"\n",
"#sk-container-id-4 label.sk-toggleable__label-arrow:hover:before {\n",
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"/* Toggleable content - dropdown */\n",
"\n",
"#sk-container-id-4 div.sk-toggleable__content {\n",
" max-height: 0;\n",
" max-width: 0;\n",
" overflow: hidden;\n",
" text-align: left;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-4 div.sk-toggleable__content.fitted {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-4 div.sk-toggleable__content pre {\n",
" margin: 0.2em;\n",
" border-radius: 0.25em;\n",
" color: var(--sklearn-color-text);\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-4 div.sk-toggleable__content.fitted pre {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-4 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
" /* Expand drop-down */\n",
" max-height: 200px;\n",
" max-width: 100%;\n",
" overflow: auto;\n",
"}\n",
"\n",
"#sk-container-id-4 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
" content: \"▾\";\n",
"}\n",
"\n",
"/* Pipeline/ColumnTransformer-specific style */\n",
"\n",
"#sk-container-id-4 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-4 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator-specific style */\n",
"\n",
"/* Colorize estimator box */\n",
"#sk-container-id-4 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-4 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-4 div.sk-label label.sk-toggleable__label,\n",
"#sk-container-id-4 div.sk-label label {\n",
" /* The background is the default theme color */\n",
" color: var(--sklearn-color-text-on-default-background);\n",
"}\n",
"\n",
"/* On hover, darken the color of the background */\n",
"#sk-container-id-4 div.sk-label:hover label.sk-toggleable__label {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"/* Label box, darken color on hover, fitted */\n",
"#sk-container-id-4 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator label */\n",
"\n",
"#sk-container-id-4 div.sk-label label {\n",
" font-family: monospace;\n",
" font-weight: bold;\n",
" display: inline-block;\n",
" line-height: 1.2em;\n",
"}\n",
"\n",
"#sk-container-id-4 div.sk-label-container {\n",
" text-align: center;\n",
"}\n",
"\n",
"/* Estimator-specific */\n",
"#sk-container-id-4 div.sk-estimator {\n",
" font-family: monospace;\n",
" border: 1px dotted var(--sklearn-color-border-box);\n",
" border-radius: 0.25em;\n",
" box-sizing: border-box;\n",
" margin-bottom: 0.5em;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-4 div.sk-estimator.fitted {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"/* on hover */\n",
"#sk-container-id-4 div.sk-estimator:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-4 div.sk-estimator.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
"\n",
"/* Common style for \"i\" and \"?\" */\n",
"\n",
".sk-estimator-doc-link,\n",
"a:link.sk-estimator-doc-link,\n",
"a:visited.sk-estimator-doc-link {\n",
" float: right;\n",
" font-size: smaller;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1em;\n",
" height: 1em;\n",
" width: 1em;\n",
" text-decoration: none !important;\n",
" margin-left: 1ex;\n",
" /* unfitted */\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted,\n",
"a:link.sk-estimator-doc-link.fitted,\n",
"a:visited.sk-estimator-doc-link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"/* Span, style for the box shown on hovering the info icon */\n",
".sk-estimator-doc-link span {\n",
" display: none;\n",
" z-index: 9999;\n",
" position: relative;\n",
" font-weight: normal;\n",
" right: .2ex;\n",
" padding: .5ex;\n",
" margin: .5ex;\n",
" width: min-content;\n",
" min-width: 20ex;\n",
" max-width: 50ex;\n",
" color: var(--sklearn-color-text);\n",
" box-shadow: 2pt 2pt 4pt #999;\n",
" /* unfitted */\n",
" background: var(--sklearn-color-unfitted-level-0);\n",
" border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted span {\n",
" /* fitted */\n",
" background: var(--sklearn-color-fitted-level-0);\n",
" border: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link:hover span {\n",
" display: block;\n",
"}\n",
"\n",
"/* \"?\"-specific style due to the `<a>` HTML tag */\n",
"\n",
"#sk-container-id-4 a.estimator_doc_link {\n",
" float: right;\n",
" font-size: 1rem;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1rem;\n",
" height: 1rem;\n",
" width: 1rem;\n",
" text-decoration: none;\n",
" /* unfitted */\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
"}\n",
"\n",
"#sk-container-id-4 a.estimator_doc_link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"#sk-container-id-4 a.estimator_doc_link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"#sk-container-id-4 a.estimator_doc_link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"</style><div id=\"sk-container-id-4\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>KMeans(n_clusters=3, random_state=0)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-23\" type=\"checkbox\" checked><label for=\"sk-estimator-id-23\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;&nbsp;KMeans<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.cluster.KMeans.html\">?<span>Documentation for KMeans</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></label><div class=\"sk-toggleable__content fitted\"><pre>KMeans(n_clusters=3, random_state=0)</pre></div> </div></div></div></div>"
],
"text/plain": [
"KMeans(n_clusters=3, random_state=0)"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# K-means clustering \n",
"\n",
"from sklearn.cluster import KMeans\n",
"\n",
"kmeans = KMeans(n_clusters=3, random_state=0)\n",
"\n",
"kmeans.fit(y_pred_prob.reshape(-1,1))"
]
},
{
"cell_type": "code",
"execution_count": 67,
"id": "10b6ece7-adcf-41c0-884b-a4aef42af378",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([1, 0, 0, ..., 0, 1, 0], dtype=int32)"
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_clusters = kmeans.predict(y_pred_prob.reshape(-1,1))\n",
"y_clusters"
]
},
{
"cell_type": "code",
"execution_count": 75,
"id": "e4b3b16e-03b8-4883-9788-cb7296fe56cd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"seuil cluster 0 : 0.38635624748849917 (60.14%)\n",
"seuil cluster 1 : 0.7395110401019087 (30.69%)\n",
"seuil cluster 2 : 1.0 (9.16%)\n"
]
}
],
"source": [
"# seuils des clusters et part de clients dans chacun d'eux\n",
"\n",
"print(f\"seuil cluster 0 : {y_pred_prob[y_clusters==0].max()} ({round(100 * (y_clusters==0).mean(), 2)}%)\")\n",
"print(f\"seuil cluster 1 : {y_pred_prob[y_clusters==1].max()} ({round(100 * (y_clusters==1).mean(), 2)}%)\")\n",
"print(f\"seuil cluster 2 : {y_pred_prob[y_clusters==2].max()} ({round(100* (y_clusters==2).mean(), 2)}%)\")"
]
},
{
"cell_type": "code",
"execution_count": 76,
"id": "3e404a5e-6734-4d98-8853-48b09c96e7e0",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_tickets</th>\n",
" <th>nb_purchases</th>\n",
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>nb_tickets_internet</th>\n",
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
" <th>gender_female</th>\n",
" <th>gender_male</th>\n",
" <th>nb_campaigns</th>\n",
" <th>nb_campaigns_opened</th>\n",
" <th>cluster</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>100.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.177187</td>\n",
" <td>5.177187</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>55.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>426.265613</td>\n",
" <td>426.265613</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>17.0</td>\n",
" <td>1.0</td>\n",
" <td>80.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>436.033437</td>\n",
" <td>436.033437</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>120.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.196412</td>\n",
" <td>5.196412</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>34.0</td>\n",
" <td>2.0</td>\n",
" <td>416.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>478.693148</td>\n",
" <td>115.631470</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" nb_tickets nb_purchases total_amount nb_suppliers vente_internet_max \\\n",
"0 4.0 1.0 100.0 1.0 0.0 \n",
"1 1.0 1.0 55.0 1.0 0.0 \n",
"2 17.0 1.0 80.0 1.0 0.0 \n",
"3 4.0 1.0 120.0 1.0 0.0 \n",
"4 34.0 2.0 416.0 1.0 0.0 \n",
"\n",
" purchase_date_min purchase_date_max nb_tickets_internet is_email_true \\\n",
"0 5.177187 5.177187 0.0 True \n",
"1 426.265613 426.265613 0.0 True \n",
"2 436.033437 436.033437 0.0 True \n",
"3 5.196412 5.196412 0.0 True \n",
"4 478.693148 115.631470 0.0 True \n",
"\n",
" opt_in gender_female gender_male nb_campaigns nb_campaigns_opened \\\n",
"0 False 1 0 0.0 0.0 \n",
"1 True 0 1 0.0 0.0 \n",
"2 True 1 0 0.0 0.0 \n",
"3 False 1 0 0.0 0.0 \n",
"4 False 1 0 0.0 0.0 \n",
"\n",
" cluster \n",
"0 1 \n",
"1 0 \n",
"2 0 \n",
"3 1 \n",
"4 2 "
]
},
"execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# les individus des clusters sont-ils semblables ? def des marketing personae\n",
"\n",
"X_test_clustered = X_test.assign(cluster = y_clusters)\n",
"X_test_clustered.head()"
]
},
{
"cell_type": "code",
"execution_count": 79,
"id": "b6f4638d-23c4-427a-88a4-b09528b3f91b",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_tickets</th>\n",
" <th>nb_purchases</th>\n",
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>nb_tickets_internet</th>\n",
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
" <th>gender_female</th>\n",
" <th>gender_male</th>\n",
" <th>nb_campaigns</th>\n",
" <th>nb_campaigns_opened</th>\n",
" </tr>\n",
" <tr>\n",
" <th>cluster</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>7.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>60.00</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>222.437500</td>\n",
" <td>214.639152</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>3.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>13.0</td>\n",
" <td>4.0</td>\n",
" <td>209.26</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>418.270723</td>\n",
" <td>56.167392</td>\n",
" <td>3.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>18.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" nb_tickets nb_purchases total_amount nb_suppliers \\\n",
"cluster \n",
"0 0.0 0.0 0.00 0.0 \n",
"1 2.0 1.0 60.00 1.0 \n",
"2 13.0 4.0 209.26 1.0 \n",
"\n",
" vente_internet_max purchase_date_min purchase_date_max \\\n",
"cluster \n",
"0 0.0 550.000000 550.000000 \n",
"1 1.0 222.437500 214.639152 \n",
"2 1.0 418.270723 56.167392 \n",
"\n",
" nb_tickets_internet is_email_true opt_in gender_female \\\n",
"cluster \n",
"0 0.0 1.0 1.0 0.0 \n",
"1 1.0 1.0 0.0 0.0 \n",
"2 3.0 1.0 0.0 0.0 \n",
"\n",
" gender_male nb_campaigns nb_campaigns_opened \n",
"cluster \n",
"0 0.0 7.0 0.0 \n",
"1 1.0 3.0 0.0 \n",
"2 1.0 18.0 1.0 "
]
},
"execution_count": 79,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test_clustered.groupby(\"cluster\").median().iloc[[0,1,2], :]"
]
},
{
"cell_type": "code",
"execution_count": 80,
"id": "f80474be-c897-47f9-8fdd-f2fb8d724ee2",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_tickets</th>\n",
" <th>nb_purchases</th>\n",
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>nb_tickets_internet</th>\n",
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
" <th>gender_female</th>\n",
" <th>gender_male</th>\n",
" <th>nb_campaigns</th>\n",
" <th>nb_campaigns_opened</th>\n",
" </tr>\n",
" <tr>\n",
" <th>cluster</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.311325</td>\n",
" <td>0.114404</td>\n",
" <td>6.707697</td>\n",
" <td>0.102898</td>\n",
" <td>0.048741</td>\n",
" <td>527.762945</td>\n",
" <td>527.621410</td>\n",
" <td>0.137313</td>\n",
" <td>1.000000</td>\n",
" <td>0.561640</td>\n",
" <td>0.239934</td>\n",
" <td>0.450610</td>\n",
" <td>12.881201</td>\n",
" <td>2.163647</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2.926055</td>\n",
" <td>1.395389</td>\n",
" <td>82.976104</td>\n",
" <td>1.000136</td>\n",
" <td>0.681539</td>\n",
" <td>228.303268</td>\n",
" <td>217.641649</td>\n",
" <td>1.736769</td>\n",
" <td>0.990202</td>\n",
" <td>0.145618</td>\n",
" <td>0.260553</td>\n",
" <td>0.536871</td>\n",
" <td>9.821800</td>\n",
" <td>2.811663</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>44.841472</td>\n",
" <td>11.576993</td>\n",
" <td>1942.145881</td>\n",
" <td>1.493641</td>\n",
" <td>0.742562</td>\n",
" <td>382.346041</td>\n",
" <td>87.811798</td>\n",
" <td>12.613786</td>\n",
" <td>0.971724</td>\n",
" <td>0.132637</td>\n",
" <td>0.199182</td>\n",
" <td>0.621735</td>\n",
" <td>20.781399</td>\n",
" <td>8.329548</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" nb_tickets nb_purchases total_amount nb_suppliers \\\n",
"cluster \n",
"0 0.311325 0.114404 6.707697 0.102898 \n",
"1 2.926055 1.395389 82.976104 1.000136 \n",
"2 44.841472 11.576993 1942.145881 1.493641 \n",
"\n",
" vente_internet_max purchase_date_min purchase_date_max \\\n",
"cluster \n",
"0 0.048741 527.762945 527.621410 \n",
"1 0.681539 228.303268 217.641649 \n",
"2 0.742562 382.346041 87.811798 \n",
"\n",
" nb_tickets_internet is_email_true opt_in gender_female \\\n",
"cluster \n",
"0 0.137313 1.000000 0.561640 0.239934 \n",
"1 1.736769 0.990202 0.145618 0.260553 \n",
"2 12.613786 0.971724 0.132637 0.199182 \n",
"\n",
" gender_male nb_campaigns nb_campaigns_opened \n",
"cluster \n",
"0 0.450610 12.881201 2.163647 \n",
"1 0.536871 9.821800 2.811663 \n",
"2 0.621735 20.781399 8.329548 "
]
},
"execution_count": 80,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test_clustered.groupby(\"cluster\").mean().iloc[[0,1,2], :]"
]
},
{
"cell_type": "markdown",
"id": "d0af77f8-ae66-43a5-bf04-b26667f911f6",
"metadata": {},
"source": [
"### Quartile clustering"
]
},
{
"cell_type": "code",
"execution_count": 82,
"id": "2396ec51-4411-4fe3-9d41-449c4ffa75a0",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_tickets</th>\n",
" <th>nb_purchases</th>\n",
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>nb_tickets_internet</th>\n",
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
" <th>gender_female</th>\n",
" <th>gender_male</th>\n",
" <th>nb_campaigns</th>\n",
" <th>nb_campaigns_opened</th>\n",
" <th>score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>100.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.177187</td>\n",
" <td>5.177187</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.695913</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>55.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>426.265613</td>\n",
" <td>426.265613</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.244205</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>17.0</td>\n",
" <td>1.0</td>\n",
" <td>80.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>436.033437</td>\n",
" <td>436.033437</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.279592</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>120.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.196412</td>\n",
" <td>5.196412</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.696135</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>34.0</td>\n",
" <td>2.0</td>\n",
" <td>416.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>478.693148</td>\n",
" <td>115.631470</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.911844</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" nb_tickets nb_purchases total_amount nb_suppliers vente_internet_max \\\n",
"0 4.0 1.0 100.0 1.0 0.0 \n",
"1 1.0 1.0 55.0 1.0 0.0 \n",
"2 17.0 1.0 80.0 1.0 0.0 \n",
"3 4.0 1.0 120.0 1.0 0.0 \n",
"4 34.0 2.0 416.0 1.0 0.0 \n",
"\n",
" purchase_date_min purchase_date_max nb_tickets_internet is_email_true \\\n",
"0 5.177187 5.177187 0.0 True \n",
"1 426.265613 426.265613 0.0 True \n",
"2 436.033437 436.033437 0.0 True \n",
"3 5.196412 5.196412 0.0 True \n",
"4 478.693148 115.631470 0.0 True \n",
"\n",
" opt_in gender_female gender_male nb_campaigns nb_campaigns_opened \\\n",
"0 False 1 0 0.0 0.0 \n",
"1 True 0 1 0.0 0.0 \n",
"2 True 1 0 0.0 0.0 \n",
"3 False 1 0 0.0 0.0 \n",
"4 False 1 0 0.0 0.0 \n",
"\n",
" score \n",
"0 0.695913 \n",
"1 0.244205 \n",
"2 0.279592 \n",
"3 0.696135 \n",
"4 0.911844 "
]
},
"execution_count": 82,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# quartile clustering\n",
"\n",
"X_test[\"score\"] = y_pred_prob\n",
"X_test.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bccddbd1-9d63-4d22-a3b3-daa6d83e90de",
"metadata": {},
"outputs": [],
"source": [
"df['new_column'] = np.where(df['col2']<9, 'value1',\n",
" np.where(df['col2']<12, 'value2',\n",
" np.where(df['col2']<15, 'value3', 'value4')))"
]
},
{
"cell_type": "code",
"execution_count": 83,
"id": "f6334f99-725e-4e94-af86-60f161dd93a8",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_tickets</th>\n",
" <th>nb_purchases</th>\n",
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>nb_tickets_internet</th>\n",
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
" <th>gender_female</th>\n",
" <th>gender_male</th>\n",
" <th>nb_campaigns</th>\n",
" <th>nb_campaigns_opened</th>\n",
" <th>score</th>\n",
" <th>quartile</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>100.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.177187</td>\n",
" <td>5.177187</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.695913</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>55.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>426.265613</td>\n",
" <td>426.265613</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.244205</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>17.0</td>\n",
" <td>1.0</td>\n",
" <td>80.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>436.033437</td>\n",
" <td>436.033437</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.279592</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>120.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.196412</td>\n",
" <td>5.196412</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.696135</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>34.0</td>\n",
" <td>2.0</td>\n",
" <td>416.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>478.693148</td>\n",
" <td>115.631470</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.911844</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>60.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.140069</td>\n",
" <td>5.140069</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.690015</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>5.0</td>\n",
" <td>1.0</td>\n",
" <td>61.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>105.053773</td>\n",
" <td>105.053773</td>\n",
" <td>5.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.663391</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>80.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>63.206030</td>\n",
" <td>63.206030</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.441604</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>10.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>44.698090</td>\n",
" <td>44.698090</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.441933</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>3.0</td>\n",
" <td>3.0</td>\n",
" <td>165.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>266.012106</td>\n",
" <td>258.012106</td>\n",
" <td>3.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.581348</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" nb_tickets nb_purchases total_amount nb_suppliers vente_internet_max \\\n",
"0 4.0 1.0 100.0 1.0 0.0 \n",
"1 1.0 1.0 55.0 1.0 0.0 \n",
"2 17.0 1.0 80.0 1.0 0.0 \n",
"3 4.0 1.0 120.0 1.0 0.0 \n",
"4 34.0 2.0 416.0 1.0 0.0 \n",
"5 2.0 1.0 60.0 1.0 0.0 \n",
"6 5.0 1.0 61.0 1.0 1.0 \n",
"7 4.0 1.0 80.0 1.0 0.0 \n",
"8 1.0 1.0 10.0 1.0 0.0 \n",
"9 3.0 3.0 165.0 1.0 1.0 \n",
"\n",
" purchase_date_min purchase_date_max nb_tickets_internet is_email_true \\\n",
"0 5.177187 5.177187 0.0 True \n",
"1 426.265613 426.265613 0.0 True \n",
"2 436.033437 436.033437 0.0 True \n",
"3 5.196412 5.196412 0.0 True \n",
"4 478.693148 115.631470 0.0 True \n",
"5 5.140069 5.140069 0.0 True \n",
"6 105.053773 105.053773 5.0 True \n",
"7 63.206030 63.206030 0.0 True \n",
"8 44.698090 44.698090 0.0 True \n",
"9 266.012106 258.012106 3.0 True \n",
"\n",
" opt_in gender_female gender_male nb_campaigns nb_campaigns_opened \\\n",
"0 False 1 0 0.0 0.0 \n",
"1 True 0 1 0.0 0.0 \n",
"2 True 1 0 0.0 0.0 \n",
"3 False 1 0 0.0 0.0 \n",
"4 False 1 0 0.0 0.0 \n",
"5 False 0 1 0.0 0.0 \n",
"6 False 0 0 0.0 0.0 \n",
"7 True 0 1 0.0 0.0 \n",
"8 True 0 0 0.0 0.0 \n",
"9 False 0 0 0.0 0.0 \n",
"\n",
" score quartile \n",
"0 0.695913 3 \n",
"1 0.244205 1 \n",
"2 0.279592 2 \n",
"3 0.696135 3 \n",
"4 0.911844 4 \n",
"5 0.690015 3 \n",
"6 0.663391 3 \n",
"7 0.441604 2 \n",
"8 0.441933 2 \n",
"9 0.581348 3 "
]
},
"execution_count": 83,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test[\"quartile\"] = np.where(X_test['score']<0.25, '1',\n",
" np.where(X_test['score']<0.5, '2',\n",
" np.where(X_test['score']<0.75, '3', '4')))\n",
"X_test.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 84,
"id": "0abec0ed-098b-4ecc-b6c3-6b25110c1493",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"quartile\n",
"1 47871\n",
"2 17224\n",
"3 22481\n",
"4 8520\n",
"dtype: int64"
]
},
"execution_count": 84,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# size of each segment\n",
"\n",
"X_test.groupby(\"quartile\").size()"
]
},
{
"cell_type": "code",
"execution_count": 87,
"id": "008a0040-8a27-4fd8-8dfa-46d39d6b88d9",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_tickets</th>\n",
" <th>nb_purchases</th>\n",
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>nb_tickets_internet</th>\n",
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
" <th>gender_female</th>\n",
" <th>gender_male</th>\n",
" <th>nb_campaigns</th>\n",
" <th>nb_campaigns_opened</th>\n",
" <th>score</th>\n",
" <th>has_purchased</th>\n",
" </tr>\n",
" <tr>\n",
" <th>quartile</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.017380</td>\n",
" <td>0.008586</td>\n",
" <td>0.475141</td>\n",
" <td>0.008439</td>\n",
" <td>0.001358</td>\n",
" <td>549.044552</td>\n",
" <td>549.044465</td>\n",
" <td>0.003071</td>\n",
" <td>1.000000</td>\n",
" <td>0.562157</td>\n",
" <td>0.232416</td>\n",
" <td>0.416536</td>\n",
" <td>11.860521</td>\n",
" <td>1.648430</td>\n",
" <td>0.169233</td>\n",
" <td>0.026780</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2.085810</td>\n",
" <td>0.880283</td>\n",
" <td>49.701732</td>\n",
" <td>0.742336</td>\n",
" <td>0.420866</td>\n",
" <td>381.428495</td>\n",
" <td>379.188470</td>\n",
" <td>1.044473</td>\n",
" <td>0.998374</td>\n",
" <td>0.507083</td>\n",
" <td>0.264515</td>\n",
" <td>0.596435</td>\n",
" <td>14.593184</td>\n",
" <td>3.725732</td>\n",
" <td>0.360811</td>\n",
" <td>0.117452</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3.118100</td>\n",
" <td>1.478893</td>\n",
" <td>88.811284</td>\n",
" <td>1.003292</td>\n",
" <td>0.703349</td>\n",
" <td>198.284116</td>\n",
" <td>184.197970</td>\n",
" <td>1.879098</td>\n",
" <td>0.988123</td>\n",
" <td>0.051777</td>\n",
" <td>0.264001</td>\n",
" <td>0.526534</td>\n",
" <td>9.773898</td>\n",
" <td>2.978115</td>\n",
" <td>0.626785</td>\n",
" <td>0.209332</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>46.046362</td>\n",
" <td>11.842254</td>\n",
" <td>2002.607230</td>\n",
" <td>1.508685</td>\n",
" <td>0.743192</td>\n",
" <td>386.401662</td>\n",
" <td>85.808238</td>\n",
" <td>12.894131</td>\n",
" <td>0.971479</td>\n",
" <td>0.130751</td>\n",
" <td>0.198239</td>\n",
" <td>0.622418</td>\n",
" <td>20.928286</td>\n",
" <td>8.367723</td>\n",
" <td>0.902055</td>\n",
" <td>0.666549</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" nb_tickets nb_purchases total_amount nb_suppliers \\\n",
"quartile \n",
"1 0.017380 0.008586 0.475141 0.008439 \n",
"2 2.085810 0.880283 49.701732 0.742336 \n",
"3 3.118100 1.478893 88.811284 1.003292 \n",
"4 46.046362 11.842254 2002.607230 1.508685 \n",
"\n",
" vente_internet_max purchase_date_min purchase_date_max \\\n",
"quartile \n",
"1 0.001358 549.044552 549.044465 \n",
"2 0.420866 381.428495 379.188470 \n",
"3 0.703349 198.284116 184.197970 \n",
"4 0.743192 386.401662 85.808238 \n",
"\n",
" nb_tickets_internet is_email_true opt_in gender_female \\\n",
"quartile \n",
"1 0.003071 1.000000 0.562157 0.232416 \n",
"2 1.044473 0.998374 0.507083 0.264515 \n",
"3 1.879098 0.988123 0.051777 0.264001 \n",
"4 12.894131 0.971479 0.130751 0.198239 \n",
"\n",
" gender_male nb_campaigns nb_campaigns_opened score \\\n",
"quartile \n",
"1 0.416536 11.860521 1.648430 0.169233 \n",
"2 0.596435 14.593184 3.725732 0.360811 \n",
"3 0.526534 9.773898 2.978115 0.626785 \n",
"4 0.622418 20.928286 8.367723 0.902055 \n",
"\n",
" has_purchased \n",
"quartile \n",
"1 0.026780 \n",
"2 0.117452 \n",
"3 0.209332 \n",
"4 0.666549 "
]
},
"execution_count": 87,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# check consistency of quartiles (we have an upward bias, which is explained by the fact that we want a decent recall)\n",
"\n",
"X_test[\"has_purchased\"] = y_test\n",
"X_test.groupby(\"quartile\").mean()"
]
},
{
"cell_type": "markdown",
"id": "e6bcaff3-0f47-46da-8873-a321d3382e63",
"metadata": {},
"source": [
"Méthode \\\n",
"On étudie le rythme d'achat des clients et on suppose qu'il sera le même dans le futur"
]
},
{
"cell_type": "code",
"execution_count": 106,
"id": "04218519-bffa-4340-87dc-e11332977067",
"metadata": {},
"outputs": [],
"source": [
"# purchasing pace by segment\n",
"\n",
"X_test[\"consumption_lifetime\"] = X_test[\"purchase_date_min\"] - X_test[\"purchase_date_max\"]"
]
},
{
"cell_type": "code",
"execution_count": 113,
"id": "4ac3610d-8a22-4135-a127-328812c5198c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 96096.000000\n",
"mean 30.347912\n",
"std 95.435372\n",
"min 0.000000\n",
"25% 0.000000\n",
"50% 0.000000\n",
"75% 0.000000\n",
"max 547.122986\n",
"Name: consumption_lifetime, dtype: float64"
]
},
"execution_count": 113,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test[\"consumption_lifetime\"].describe()"
]
},
{
"cell_type": "code",
"execution_count": 149,
"id": "ee86cfb4-e2c4-4485-b27a-ecaec159a0b9",
"metadata": {},
"outputs": [],
"source": [
"X_test[\"avg_purchase_delay\"] = (X_test[\"consumption_lifetime\"]/X_test[\"nb_purchases\"]).replace([np.inf, -np.inf], 0)"
]
},
{
"cell_type": "raw",
"id": "a2de6e96-4c92-42b2-8569-1c0f920e7a8c",
"metadata": {},
"source": []
},
{
"cell_type": "code",
"execution_count": 150,
"id": "256a684d-0117-4daa-ba38-ff48ac946798",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_tickets</th>\n",
" <th>nb_purchases</th>\n",
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>nb_tickets_internet</th>\n",
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
" <th>gender_female</th>\n",
" <th>gender_male</th>\n",
" <th>nb_campaigns</th>\n",
" <th>nb_campaigns_opened</th>\n",
" <th>score</th>\n",
" <th>quartile</th>\n",
" <th>has_purchased</th>\n",
" <th>consumption_lifetime</th>\n",
" <th>avg_purchase_delay</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>34.0</td>\n",
" <td>2.0</td>\n",
" <td>416.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>478.693148</td>\n",
" <td>115.631470</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.911844</td>\n",
" <td>4</td>\n",
" <td>1.0</td>\n",
" <td>363.061678</td>\n",
" <td>181.530839</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>3.0</td>\n",
" <td>3.0</td>\n",
" <td>165.00</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>266.012106</td>\n",
" <td>258.012106</td>\n",
" <td>3.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.581348</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>8.000000</td>\n",
" <td>2.666667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>23.0</td>\n",
" <td>14.0</td>\n",
" <td>600.00</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>453.423519</td>\n",
" <td>15.225949</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.970052</td>\n",
" <td>4</td>\n",
" <td>1.0</td>\n",
" <td>438.197569</td>\n",
" <td>31.299826</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>21.0</td>\n",
" <td>3.0</td>\n",
" <td>1075.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>431.817072</td>\n",
" <td>230.432350</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.599176</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>201.384722</td>\n",
" <td>67.128241</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>4.0</td>\n",
" <td>2.0</td>\n",
" <td>140.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>277.254745</td>\n",
" <td>12.438877</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.707939</td>\n",
" <td>3</td>\n",
" <td>1.0</td>\n",
" <td>264.815868</td>\n",
" <td>132.407934</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>95943</th>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>0.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>545.673137</td>\n",
" <td>362.284745</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>46.0</td>\n",
" <td>9.0</td>\n",
" <td>0.707163</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>183.388391</td>\n",
" <td>91.694196</td>\n",
" </tr>\n",
" <tr>\n",
" <th>95989</th>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>77.31</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>271.676632</td>\n",
" <td>6.289577</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>21.0</td>\n",
" <td>6.0</td>\n",
" <td>0.937049</td>\n",
" <td>4</td>\n",
" <td>0.0</td>\n",
" <td>265.387055</td>\n",
" <td>132.693527</td>\n",
" </tr>\n",
" <tr>\n",
" <th>95996</th>\n",
" <td>3.0</td>\n",
" <td>3.0</td>\n",
" <td>273.76</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>202.691389</td>\n",
" <td>7.142274</td>\n",
" <td>2.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>25.0</td>\n",
" <td>3.0</td>\n",
" <td>0.916446</td>\n",
" <td>4</td>\n",
" <td>1.0</td>\n",
" <td>195.549115</td>\n",
" <td>65.183038</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96043</th>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>136.42</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>363.119815</td>\n",
" <td>173.225752</td>\n",
" <td>2.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>35.0</td>\n",
" <td>29.0</td>\n",
" <td>0.866743</td>\n",
" <td>4</td>\n",
" <td>1.0</td>\n",
" <td>189.894062</td>\n",
" <td>94.947031</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96079</th>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>0.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>355.131933</td>\n",
" <td>355.128542</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>50.0</td>\n",
" <td>42.0</td>\n",
" <td>0.720674</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>0.003391</td>\n",
" <td>0.001696</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>14734 rows × 19 columns</p>\n",
"</div>"
],
"text/plain": [
" nb_tickets nb_purchases total_amount nb_suppliers \\\n",
"4 34.0 2.0 416.00 1.0 \n",
"9 3.0 3.0 165.00 1.0 \n",
"16 23.0 14.0 600.00 2.0 \n",
"21 21.0 3.0 1075.00 1.0 \n",
"22 4.0 2.0 140.00 1.0 \n",
"... ... ... ... ... \n",
"95943 2.0 2.0 0.00 1.0 \n",
"95989 2.0 2.0 77.31 2.0 \n",
"95996 3.0 3.0 273.76 2.0 \n",
"96043 2.0 2.0 136.42 1.0 \n",
"96079 2.0 2.0 0.00 1.0 \n",
"\n",
" vente_internet_max purchase_date_min purchase_date_max \\\n",
"4 0.0 478.693148 115.631470 \n",
"9 1.0 266.012106 258.012106 \n",
"16 1.0 453.423519 15.225949 \n",
"21 0.0 431.817072 230.432350 \n",
"22 0.0 277.254745 12.438877 \n",
"... ... ... ... \n",
"95943 0.0 545.673137 362.284745 \n",
"95989 1.0 271.676632 6.289577 \n",
"95996 1.0 202.691389 7.142274 \n",
"96043 1.0 363.119815 173.225752 \n",
"96079 0.0 355.131933 355.128542 \n",
"\n",
" nb_tickets_internet is_email_true opt_in gender_female gender_male \\\n",
"4 0.0 True False 1 0 \n",
"9 3.0 True False 0 0 \n",
"16 1.0 True False 0 1 \n",
"21 0.0 True True 0 1 \n",
"22 0.0 True True 0 1 \n",
"... ... ... ... ... ... \n",
"95943 0.0 True False 1 0 \n",
"95989 1.0 True False 1 0 \n",
"95996 2.0 True False 0 1 \n",
"96043 2.0 True False 0 1 \n",
"96079 0.0 True False 1 0 \n",
"\n",
" nb_campaigns nb_campaigns_opened score quartile has_purchased \\\n",
"4 0.0 0.0 0.911844 4 1.0 \n",
"9 0.0 0.0 0.581348 3 0.0 \n",
"16 0.0 0.0 0.970052 4 1.0 \n",
"21 0.0 0.0 0.599176 3 0.0 \n",
"22 0.0 0.0 0.707939 3 1.0 \n",
"... ... ... ... ... ... \n",
"95943 46.0 9.0 0.707163 3 0.0 \n",
"95989 21.0 6.0 0.937049 4 0.0 \n",
"95996 25.0 3.0 0.916446 4 1.0 \n",
"96043 35.0 29.0 0.866743 4 1.0 \n",
"96079 50.0 42.0 0.720674 3 0.0 \n",
"\n",
" consumption_lifetime avg_purchase_delay \n",
"4 363.061678 181.530839 \n",
"9 8.000000 2.666667 \n",
"16 438.197569 31.299826 \n",
"21 201.384722 67.128241 \n",
"22 264.815868 132.407934 \n",
"... ... ... \n",
"95943 183.388391 91.694196 \n",
"95989 265.387055 132.693527 \n",
"95996 195.549115 65.183038 \n",
"96043 189.894062 94.947031 \n",
"96079 0.003391 0.001696 \n",
"\n",
"[14734 rows x 19 columns]"
]
},
"execution_count": 150,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test[X_test[\"avg_purchase_delay\"]>0]"
]
},
{
"cell_type": "code",
"execution_count": 157,
"id": "55db2f02-37af-4809-a048-2528b7163f31",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_purchases</th>\n",
" <th>consumption_lifetime</th>\n",
" <th>avg_purchase_delay</th>\n",
" </tr>\n",
" <tr>\n",
" <th>quartile</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2.000000</td>\n",
" <td>0.597093</td>\n",
" <td>0.298547</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2.592668</td>\n",
" <td>26.192927</td>\n",
" <td>11.435486</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3.203764</td>\n",
" <td>64.785322</td>\n",
" <td>25.490483</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>12.041836</td>\n",
" <td>306.126700</td>\n",
" <td>68.659817</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" nb_purchases consumption_lifetime avg_purchase_delay\n",
"quartile \n",
"1 2.000000 0.597093 0.298547\n",
"2 2.592668 26.192927 11.435486\n",
"3 3.203764 64.785322 25.490483\n",
"4 12.041836 306.126700 68.659817"
]
},
"execution_count": 157,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test[X_test[\"avg_purchase_delay\"]>0].groupby(\"quartile\")[[\"nb_purchases\", \"consumption_lifetime\", \"avg_purchase_delay\"]].mean()"
]
},
{
"cell_type": "code",
"execution_count": 156,
"id": "36c1d35d-3b51-4ddc-bcb7-a3ee2896167c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"nb_tickets 0\n",
"nb_purchases 0\n",
"total_amount 0\n",
"nb_suppliers 0\n",
"vente_internet_max 0\n",
"purchase_date_min 0\n",
"purchase_date_max 0\n",
"nb_tickets_internet 0\n",
"is_email_true 0\n",
"opt_in 0\n",
"gender_female 0\n",
"gender_male 0\n",
"nb_campaigns 0\n",
"nb_campaigns_opened 0\n",
"score 0\n",
"quartile 0\n",
"has_purchased 0\n",
"consumption_lifetime 0\n",
"avg_purchase_delay 0\n",
"dtype: int64"
]
},
"execution_count": 156,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test[X_test[\"avg_purchase_delay\"]>0].isna().sum()"
]
},
{
"cell_type": "markdown",
"id": "1336c25b-1cf2-4041-b741-7c8c841fe1d2",
"metadata": {},
"source": [
"Etude du biais de surestimation"
]
},
{
"cell_type": "code",
"execution_count": 250,
"id": "9242f53b-1786-4a94-9d93-cb46d70d5fa6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 96096.000000\n",
"mean 3.362075\n",
"std 2.425080\n",
"min 1.000000\n",
"25% 1.000000\n",
"50% 2.000000\n",
"75% 5.000000\n",
"max 10.000000\n",
"Name: score, dtype: float64"
]
},
"execution_count": 250,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"((10 * X_test[\"score\"]).astype(int)).describe()"
]
},
{
"cell_type": "code",
"execution_count": 255,
"id": "22c2b1f6-0506-429e-af8c-3b1b5e05ff80",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"quartile\n",
"1 6.319295\n",
"2 3.071979\n",
"3 2.994212\n",
"4 1.353321\n",
"dtype: float64"
]
},
"execution_count": 255,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# le biais de surestimation décroit avec le score \n",
"X_test.groupby(\"quartile\")[\"score\"].mean() / X_test.groupby(\"quartile\")[\"has_purchased\"].mean()"
]
},
{
"cell_type": "code",
"execution_count": 275,
"id": "ba363bf9-3169-4c89-a383-c2703436ff49",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_tickets</th>\n",
" <th>nb_purchases</th>\n",
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>nb_tickets_internet</th>\n",
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
" <th>...</th>\n",
" <th>nb_campaigns_opened</th>\n",
" <th>score</th>\n",
" <th>quartile</th>\n",
" <th>has_purchased</th>\n",
" <th>consumption_lifetime</th>\n",
" <th>avg_purchase_delay</th>\n",
" <th>avg_purchase_delay_all</th>\n",
" <th>avg_tickets_delay</th>\n",
" <th>avg_tickets_delay_all</th>\n",
" <th>decile</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>100.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.177187</td>\n",
" <td>5.177187</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.695913</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>5.177187</td>\n",
" <td>0.000000</td>\n",
" <td>1.294297</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>55.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>426.265613</td>\n",
" <td>426.265613</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.244205</td>\n",
" <td>1</td>\n",
" <td>1.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>426.265613</td>\n",
" <td>0.000000</td>\n",
" <td>426.265613</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>17.0</td>\n",
" <td>1.0</td>\n",
" <td>80.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>436.033437</td>\n",
" <td>436.033437</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.279592</td>\n",
" <td>2</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>436.033437</td>\n",
" <td>0.000000</td>\n",
" <td>25.649026</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>120.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.196412</td>\n",
" <td>5.196412</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.696135</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>5.196412</td>\n",
" <td>0.000000</td>\n",
" <td>1.299103</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>34.0</td>\n",
" <td>2.0</td>\n",
" <td>416.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>478.693148</td>\n",
" <td>115.631470</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.911844</td>\n",
" <td>4</td>\n",
" <td>1.0</td>\n",
" <td>363.061678</td>\n",
" <td>181.530839</td>\n",
" <td>239.346574</td>\n",
" <td>10.678285</td>\n",
" <td>14.079210</td>\n",
" <td>9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96091</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>67.31</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>278.442257</td>\n",
" <td>278.442257</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>5.0</td>\n",
" <td>0.584680</td>\n",
" <td>3</td>\n",
" <td>1.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>278.442257</td>\n",
" <td>0.000000</td>\n",
" <td>278.442257</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96092</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>61.41</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>189.207373</td>\n",
" <td>189.207373</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>9.0</td>\n",
" <td>0.654520</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>189.207373</td>\n",
" <td>0.000000</td>\n",
" <td>189.207373</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96093</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>3.0</td>\n",
" <td>0.116503</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96094</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>79.43</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>279.312905</td>\n",
" <td>279.312905</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>4.0</td>\n",
" <td>0.579827</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>279.312905</td>\n",
" <td>0.000000</td>\n",
" <td>279.312905</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96095</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>4.0</td>\n",
" <td>0.254002</td>\n",
" <td>2</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>96096 rows × 23 columns</p>\n",
"</div>"
],
"text/plain": [
" nb_tickets nb_purchases total_amount nb_suppliers \\\n",
"0 4.0 1.0 100.00 1.0 \n",
"1 1.0 1.0 55.00 1.0 \n",
"2 17.0 1.0 80.00 1.0 \n",
"3 4.0 1.0 120.00 1.0 \n",
"4 34.0 2.0 416.00 1.0 \n",
"... ... ... ... ... \n",
"96091 1.0 1.0 67.31 1.0 \n",
"96092 1.0 1.0 61.41 1.0 \n",
"96093 0.0 0.0 0.00 0.0 \n",
"96094 1.0 1.0 79.43 1.0 \n",
"96095 0.0 0.0 0.00 0.0 \n",
"\n",
" vente_internet_max purchase_date_min purchase_date_max \\\n",
"0 0.0 5.177187 5.177187 \n",
"1 0.0 426.265613 426.265613 \n",
"2 0.0 436.033437 436.033437 \n",
"3 0.0 5.196412 5.196412 \n",
"4 0.0 478.693148 115.631470 \n",
"... ... ... ... \n",
"96091 1.0 278.442257 278.442257 \n",
"96092 1.0 189.207373 189.207373 \n",
"96093 0.0 550.000000 550.000000 \n",
"96094 1.0 279.312905 279.312905 \n",
"96095 0.0 550.000000 550.000000 \n",
"\n",
" nb_tickets_internet is_email_true opt_in ... nb_campaigns_opened \\\n",
"0 0.0 True False ... 0.0 \n",
"1 0.0 True True ... 0.0 \n",
"2 0.0 True True ... 0.0 \n",
"3 0.0 True False ... 0.0 \n",
"4 0.0 True False ... 0.0 \n",
"... ... ... ... ... ... \n",
"96091 1.0 True False ... 5.0 \n",
"96092 1.0 True False ... 9.0 \n",
"96093 0.0 True True ... 3.0 \n",
"96094 1.0 True False ... 4.0 \n",
"96095 0.0 True False ... 4.0 \n",
"\n",
" score quartile has_purchased consumption_lifetime \\\n",
"0 0.695913 3 0.0 0.000000 \n",
"1 0.244205 1 1.0 0.000000 \n",
"2 0.279592 2 0.0 0.000000 \n",
"3 0.696135 3 0.0 0.000000 \n",
"4 0.911844 4 1.0 363.061678 \n",
"... ... ... ... ... \n",
"96091 0.584680 3 1.0 0.000000 \n",
"96092 0.654520 3 0.0 0.000000 \n",
"96093 0.116503 1 0.0 0.000000 \n",
"96094 0.579827 3 0.0 0.000000 \n",
"96095 0.254002 2 0.0 0.000000 \n",
"\n",
" avg_purchase_delay avg_purchase_delay_all avg_tickets_delay \\\n",
"0 0.000000 5.177187 0.000000 \n",
"1 0.000000 426.265613 0.000000 \n",
"2 0.000000 436.033437 0.000000 \n",
"3 0.000000 5.196412 0.000000 \n",
"4 181.530839 239.346574 10.678285 \n",
"... ... ... ... \n",
"96091 0.000000 278.442257 0.000000 \n",
"96092 0.000000 189.207373 0.000000 \n",
"96093 NaN 0.000000 NaN \n",
"96094 0.000000 279.312905 0.000000 \n",
"96095 NaN 0.000000 NaN \n",
"\n",
" avg_tickets_delay_all decile \n",
"0 1.294297 6 \n",
"1 426.265613 2 \n",
"2 25.649026 2 \n",
"3 1.299103 6 \n",
"4 14.079210 9 \n",
"... ... ... \n",
"96091 278.442257 5 \n",
"96092 189.207373 6 \n",
"96093 0.000000 1 \n",
"96094 279.312905 5 \n",
"96095 0.000000 2 \n",
"\n",
"[96096 rows x 23 columns]"
]
},
"execution_count": 275,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# on prend un decoupage plus fin : deciles\n",
"\n",
"X_test[\"decile\"] = (10 * X_test[\"score\"]).astype(int)\n",
"X_test[\"decile\"] = X_test[\"decile\"].apply(lambda x : x-1 if x==10 else x)\n",
"X_test"
]
},
{
"cell_type": "code",
"execution_count": 276,
"id": "b8db5044-74b1-423b-b12f-798606674bfe",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"decile\n",
"1 17.863019\n",
"2 3.826401\n",
"3 3.179880\n",
"4 3.392496\n",
"5 3.260982\n",
"6 3.294104\n",
"7 1.850487\n",
"8 1.489675\n",
"9 1.268598\n",
"dtype: float64"
]
},
"execution_count": 276,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test.groupby(\"decile\")[\"score\"].mean() / X_test.groupby(\"decile\")[\"has_purchased\"].mean()"
]
},
{
"cell_type": "code",
"execution_count": 277,
"id": "48a5b42e-fabf-44ae-ac88-fcb5a04d5d4f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.006422122322541649"
]
},
"execution_count": 277,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# pour les scores entre 0.1 et 0.2, la proba d'achat est de 0.6% elle est largement surestimée ici\n",
"X_test[X_test[\"decile\"]==1][\"has_purchased\"].mean()"
]
},
{
"cell_type": "code",
"execution_count": 284,
"id": "1091028b-0d07-4cfd-9081-696e289c29de",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_tickets</th>\n",
" <th>nb_purchases</th>\n",
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>nb_tickets_internet</th>\n",
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
" <th>...</th>\n",
" <th>score</th>\n",
" <th>quartile</th>\n",
" <th>has_purchased</th>\n",
" <th>consumption_lifetime</th>\n",
" <th>avg_purchase_delay</th>\n",
" <th>avg_purchase_delay_all</th>\n",
" <th>avg_tickets_delay</th>\n",
" <th>avg_tickets_delay_all</th>\n",
" <th>decile</th>\n",
" <th>overshoot_coeff</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>100.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.177187</td>\n",
" <td>5.177187</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.695913</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>5.177187</td>\n",
" <td>0.000000</td>\n",
" <td>1.294297</td>\n",
" <td>6</td>\n",
" <td>3.294104</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>55.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>426.265613</td>\n",
" <td>426.265613</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>0.244205</td>\n",
" <td>1</td>\n",
" <td>1.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>426.265613</td>\n",
" <td>0.000000</td>\n",
" <td>426.265613</td>\n",
" <td>2</td>\n",
" <td>3.826401</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>17.0</td>\n",
" <td>1.0</td>\n",
" <td>80.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>436.033437</td>\n",
" <td>436.033437</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>0.279592</td>\n",
" <td>2</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>436.033437</td>\n",
" <td>0.000000</td>\n",
" <td>25.649026</td>\n",
" <td>2</td>\n",
" <td>3.826401</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>120.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.196412</td>\n",
" <td>5.196412</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.696135</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>5.196412</td>\n",
" <td>0.000000</td>\n",
" <td>1.299103</td>\n",
" <td>6</td>\n",
" <td>3.294104</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>34.0</td>\n",
" <td>2.0</td>\n",
" <td>416.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>478.693148</td>\n",
" <td>115.631470</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.911844</td>\n",
" <td>4</td>\n",
" <td>1.0</td>\n",
" <td>363.061678</td>\n",
" <td>181.530839</td>\n",
" <td>239.346574</td>\n",
" <td>10.678285</td>\n",
" <td>14.079210</td>\n",
" <td>9</td>\n",
" <td>1.268598</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96091</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>67.31</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>278.442257</td>\n",
" <td>278.442257</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.584680</td>\n",
" <td>3</td>\n",
" <td>1.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>278.442257</td>\n",
" <td>0.000000</td>\n",
" <td>278.442257</td>\n",
" <td>5</td>\n",
" <td>3.260982</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96092</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>61.41</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>189.207373</td>\n",
" <td>189.207373</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.654520</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>189.207373</td>\n",
" <td>0.000000</td>\n",
" <td>189.207373</td>\n",
" <td>6</td>\n",
" <td>3.294104</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96093</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>0.116503</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>1</td>\n",
" <td>17.863019</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96094</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>79.43</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>279.312905</td>\n",
" <td>279.312905</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.579827</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>279.312905</td>\n",
" <td>0.000000</td>\n",
" <td>279.312905</td>\n",
" <td>5</td>\n",
" <td>3.260982</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96095</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.254002</td>\n",
" <td>2</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>2</td>\n",
" <td>3.826401</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>96096 rows × 24 columns</p>\n",
"</div>"
],
"text/plain": [
" nb_tickets nb_purchases total_amount nb_suppliers \\\n",
"0 4.0 1.0 100.00 1.0 \n",
"1 1.0 1.0 55.00 1.0 \n",
"2 17.0 1.0 80.00 1.0 \n",
"3 4.0 1.0 120.00 1.0 \n",
"4 34.0 2.0 416.00 1.0 \n",
"... ... ... ... ... \n",
"96091 1.0 1.0 67.31 1.0 \n",
"96092 1.0 1.0 61.41 1.0 \n",
"96093 0.0 0.0 0.00 0.0 \n",
"96094 1.0 1.0 79.43 1.0 \n",
"96095 0.0 0.0 0.00 0.0 \n",
"\n",
" vente_internet_max purchase_date_min purchase_date_max \\\n",
"0 0.0 5.177187 5.177187 \n",
"1 0.0 426.265613 426.265613 \n",
"2 0.0 436.033437 436.033437 \n",
"3 0.0 5.196412 5.196412 \n",
"4 0.0 478.693148 115.631470 \n",
"... ... ... ... \n",
"96091 1.0 278.442257 278.442257 \n",
"96092 1.0 189.207373 189.207373 \n",
"96093 0.0 550.000000 550.000000 \n",
"96094 1.0 279.312905 279.312905 \n",
"96095 0.0 550.000000 550.000000 \n",
"\n",
" nb_tickets_internet is_email_true opt_in ... score quartile \\\n",
"0 0.0 True False ... 0.695913 3 \n",
"1 0.0 True True ... 0.244205 1 \n",
"2 0.0 True True ... 0.279592 2 \n",
"3 0.0 True False ... 0.696135 3 \n",
"4 0.0 True False ... 0.911844 4 \n",
"... ... ... ... ... ... ... \n",
"96091 1.0 True False ... 0.584680 3 \n",
"96092 1.0 True False ... 0.654520 3 \n",
"96093 0.0 True True ... 0.116503 1 \n",
"96094 1.0 True False ... 0.579827 3 \n",
"96095 0.0 True False ... 0.254002 2 \n",
"\n",
" has_purchased consumption_lifetime avg_purchase_delay \\\n",
"0 0.0 0.000000 0.000000 \n",
"1 1.0 0.000000 0.000000 \n",
"2 0.0 0.000000 0.000000 \n",
"3 0.0 0.000000 0.000000 \n",
"4 1.0 363.061678 181.530839 \n",
"... ... ... ... \n",
"96091 1.0 0.000000 0.000000 \n",
"96092 0.0 0.000000 0.000000 \n",
"96093 0.0 0.000000 NaN \n",
"96094 0.0 0.000000 0.000000 \n",
"96095 0.0 0.000000 NaN \n",
"\n",
" avg_purchase_delay_all avg_tickets_delay avg_tickets_delay_all \\\n",
"0 5.177187 0.000000 1.294297 \n",
"1 426.265613 0.000000 426.265613 \n",
"2 436.033437 0.000000 25.649026 \n",
"3 5.196412 0.000000 1.299103 \n",
"4 239.346574 10.678285 14.079210 \n",
"... ... ... ... \n",
"96091 278.442257 0.000000 278.442257 \n",
"96092 189.207373 0.000000 189.207373 \n",
"96093 0.000000 NaN 0.000000 \n",
"96094 279.312905 0.000000 279.312905 \n",
"96095 0.000000 NaN 0.000000 \n",
"\n",
" decile overshoot_coeff \n",
"0 6 3.294104 \n",
"1 2 3.826401 \n",
"2 2 3.826401 \n",
"3 6 3.294104 \n",
"4 9 1.268598 \n",
"... ... ... \n",
"96091 5 3.260982 \n",
"96092 6 3.294104 \n",
"96093 1 17.863019 \n",
"96094 5 3.260982 \n",
"96095 2 3.826401 \n",
"\n",
"[96096 rows x 24 columns]"
]
},
"execution_count": 284,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# create a variable to approximate the overestimation by decile\n",
"\n",
"# dictionnary mapping decile of the score and average overestimation\n",
"mapping_score_overshoot = dict(X_test.groupby(\"decile\")[\"score\"].mean() / X_test.groupby(\"decile\")[\"has_purchased\"].mean())\n",
"X_test[\"overshoot_coeff\"] = X_test[\"decile\"].map(mapping_score_overshoot)\n",
"X_test"
]
},
{
"cell_type": "code",
"execution_count": 285,
"id": "4892d585-c80e-472c-b2bc-dc441255a36d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_tickets</th>\n",
" <th>nb_purchases</th>\n",
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>nb_tickets_internet</th>\n",
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
" <th>...</th>\n",
" <th>quartile</th>\n",
" <th>has_purchased</th>\n",
" <th>consumption_lifetime</th>\n",
" <th>avg_purchase_delay</th>\n",
" <th>avg_purchase_delay_all</th>\n",
" <th>avg_tickets_delay</th>\n",
" <th>avg_tickets_delay_all</th>\n",
" <th>decile</th>\n",
" <th>overshoot_coeff</th>\n",
" <th>ajusted_score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>100.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.177187</td>\n",
" <td>5.177187</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>5.177187</td>\n",
" <td>0.000000</td>\n",
" <td>1.294297</td>\n",
" <td>6</td>\n",
" <td>3.294104</td>\n",
" <td>0.211260</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>55.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>426.265613</td>\n",
" <td>426.265613</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>1.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>426.265613</td>\n",
" <td>0.000000</td>\n",
" <td>426.265613</td>\n",
" <td>2</td>\n",
" <td>3.826401</td>\n",
" <td>0.063821</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>17.0</td>\n",
" <td>1.0</td>\n",
" <td>80.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>436.033437</td>\n",
" <td>436.033437</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>2</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>436.033437</td>\n",
" <td>0.000000</td>\n",
" <td>25.649026</td>\n",
" <td>2</td>\n",
" <td>3.826401</td>\n",
" <td>0.073069</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>120.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.196412</td>\n",
" <td>5.196412</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>5.196412</td>\n",
" <td>0.000000</td>\n",
" <td>1.299103</td>\n",
" <td>6</td>\n",
" <td>3.294104</td>\n",
" <td>0.211328</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>34.0</td>\n",
" <td>2.0</td>\n",
" <td>416.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>478.693148</td>\n",
" <td>115.631470</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>4</td>\n",
" <td>1.0</td>\n",
" <td>363.061678</td>\n",
" <td>181.530839</td>\n",
" <td>239.346574</td>\n",
" <td>10.678285</td>\n",
" <td>14.079210</td>\n",
" <td>9</td>\n",
" <td>1.268598</td>\n",
" <td>0.718781</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96091</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>67.31</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>278.442257</td>\n",
" <td>278.442257</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>3</td>\n",
" <td>1.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>278.442257</td>\n",
" <td>0.000000</td>\n",
" <td>278.442257</td>\n",
" <td>5</td>\n",
" <td>3.260982</td>\n",
" <td>0.179296</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96092</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>61.41</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>189.207373</td>\n",
" <td>189.207373</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>189.207373</td>\n",
" <td>0.000000</td>\n",
" <td>189.207373</td>\n",
" <td>6</td>\n",
" <td>3.294104</td>\n",
" <td>0.198694</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96093</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>1</td>\n",
" <td>17.863019</td>\n",
" <td>0.006522</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96094</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>79.43</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>279.312905</td>\n",
" <td>279.312905</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>279.312905</td>\n",
" <td>0.000000</td>\n",
" <td>279.312905</td>\n",
" <td>5</td>\n",
" <td>3.260982</td>\n",
" <td>0.177808</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96095</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>2</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>2</td>\n",
" <td>3.826401</td>\n",
" <td>0.066382</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>96096 rows × 25 columns</p>\n",
"</div>"
],
"text/plain": [
" nb_tickets nb_purchases total_amount nb_suppliers \\\n",
"0 4.0 1.0 100.00 1.0 \n",
"1 1.0 1.0 55.00 1.0 \n",
"2 17.0 1.0 80.00 1.0 \n",
"3 4.0 1.0 120.00 1.0 \n",
"4 34.0 2.0 416.00 1.0 \n",
"... ... ... ... ... \n",
"96091 1.0 1.0 67.31 1.0 \n",
"96092 1.0 1.0 61.41 1.0 \n",
"96093 0.0 0.0 0.00 0.0 \n",
"96094 1.0 1.0 79.43 1.0 \n",
"96095 0.0 0.0 0.00 0.0 \n",
"\n",
" vente_internet_max purchase_date_min purchase_date_max \\\n",
"0 0.0 5.177187 5.177187 \n",
"1 0.0 426.265613 426.265613 \n",
"2 0.0 436.033437 436.033437 \n",
"3 0.0 5.196412 5.196412 \n",
"4 0.0 478.693148 115.631470 \n",
"... ... ... ... \n",
"96091 1.0 278.442257 278.442257 \n",
"96092 1.0 189.207373 189.207373 \n",
"96093 0.0 550.000000 550.000000 \n",
"96094 1.0 279.312905 279.312905 \n",
"96095 0.0 550.000000 550.000000 \n",
"\n",
" nb_tickets_internet is_email_true opt_in ... quartile \\\n",
"0 0.0 True False ... 3 \n",
"1 0.0 True True ... 1 \n",
"2 0.0 True True ... 2 \n",
"3 0.0 True False ... 3 \n",
"4 0.0 True False ... 4 \n",
"... ... ... ... ... ... \n",
"96091 1.0 True False ... 3 \n",
"96092 1.0 True False ... 3 \n",
"96093 0.0 True True ... 1 \n",
"96094 1.0 True False ... 3 \n",
"96095 0.0 True False ... 2 \n",
"\n",
" has_purchased consumption_lifetime avg_purchase_delay \\\n",
"0 0.0 0.000000 0.000000 \n",
"1 1.0 0.000000 0.000000 \n",
"2 0.0 0.000000 0.000000 \n",
"3 0.0 0.000000 0.000000 \n",
"4 1.0 363.061678 181.530839 \n",
"... ... ... ... \n",
"96091 1.0 0.000000 0.000000 \n",
"96092 0.0 0.000000 0.000000 \n",
"96093 0.0 0.000000 NaN \n",
"96094 0.0 0.000000 0.000000 \n",
"96095 0.0 0.000000 NaN \n",
"\n",
" avg_purchase_delay_all avg_tickets_delay avg_tickets_delay_all \\\n",
"0 5.177187 0.000000 1.294297 \n",
"1 426.265613 0.000000 426.265613 \n",
"2 436.033437 0.000000 25.649026 \n",
"3 5.196412 0.000000 1.299103 \n",
"4 239.346574 10.678285 14.079210 \n",
"... ... ... ... \n",
"96091 278.442257 0.000000 278.442257 \n",
"96092 189.207373 0.000000 189.207373 \n",
"96093 0.000000 NaN 0.000000 \n",
"96094 279.312905 0.000000 279.312905 \n",
"96095 0.000000 NaN 0.000000 \n",
"\n",
" decile overshoot_coeff ajusted_score \n",
"0 6 3.294104 0.211260 \n",
"1 2 3.826401 0.063821 \n",
"2 2 3.826401 0.073069 \n",
"3 6 3.294104 0.211328 \n",
"4 9 1.268598 0.718781 \n",
"... ... ... ... \n",
"96091 5 3.260982 0.179296 \n",
"96092 6 3.294104 0.198694 \n",
"96093 1 17.863019 0.006522 \n",
"96094 5 3.260982 0.177808 \n",
"96095 2 3.826401 0.066382 \n",
"\n",
"[96096 rows x 25 columns]"
]
},
"execution_count": 285,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test[\"ajusted_score\"] = X_test[\"score\"]/X_test[\"overshoot_coeff\"]\n",
"X_test"
]
},
{
"cell_type": "code",
"execution_count": 788,
"id": "8332e5c3-32ee-4492-91ee-0e49a15f94a1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MSE for score : 0.15637498623391197\n",
"MSE for adjusted score : 0.08877832832116543\n"
]
}
],
"source": [
"# difference between proba estimated and y has purchased\n",
"# the calibration allows to half the MSE\n",
"\n",
"MSE_score = ((X_test[\"score\"]-X_test[\"has_purchased\"])**2).mean()\n",
"MSE_ajusted_score = ((X_test[\"score_adjusted\"]-X_test[\"has_purchased\"])**2).mean()\n",
"print(f\"MSE for score : {MSE_score}\")\n",
"print(f\"MSE for adjusted score : {MSE_ajusted_score}\")"
]
},
{
"cell_type": "code",
"execution_count": 787,
"id": "89b41b80-c12a-46be-a7d1-59f4f63482e3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MAE for score : 0.32574831037767815\n",
"MAE for adjusted score : 0.17556035724742763\n"
]
}
],
"source": [
"# mean absolute error - divided by 2 with out method\n",
"\n",
"MAE_score = abs(X_test[\"score\"]-X_test[\"has_purchased\"]).mean()\n",
"MAE_ajusted_score = abs(X_test[\"score_adjusted\"]-X_test[\"has_purchased\"]).mean()\n",
"print(f\"MAE for score : {MAE_score}\")\n",
"print(f\"MAE for adjusted score : {MAE_ajusted_score}\")"
]
},
{
"cell_type": "markdown",
"id": "15f49d36-da8c-4c08-977e-8de4e438ed61",
"metadata": {},
"source": [
"New method to adjust - best way to fit the logit model"
]
},
{
"cell_type": "code",
"execution_count": 317,
"id": "9e2e1f4c-d9dc-495a-9604-4009f1e4c53f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"somme des scores : 36092.22480054577\n",
"nombre d'achats : y_has_purchased 13690.0\n",
"dtype: float64\n"
]
}
],
"source": [
"# au global, la prbabilité d'achat est largement surestimée, il ft corriger\n",
"print(\"somme des scores :\", X_test[\"score\"].sum())\n",
"print(\"nombre d'achats : \", y_test.sum())"
]
},
{
"cell_type": "code",
"execution_count": 311,
"id": "1573b9fd-c1be-4f9e-94a5-471ad6cb0726",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"36092.22480054577"
]
},
"execution_count": 311,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 1. calcul du biais\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 375,
"id": "5d6d5101-95ce-4137-8349-0e3c6321bc84",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_tickets</th>\n",
" <th>nb_purchases</th>\n",
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>nb_tickets_internet</th>\n",
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
" <th>...</th>\n",
" <th>consumption_lifetime</th>\n",
" <th>avg_purchase_delay</th>\n",
" <th>avg_purchase_delay_all</th>\n",
" <th>avg_tickets_delay</th>\n",
" <th>avg_tickets_delay_all</th>\n",
" <th>decile</th>\n",
" <th>overshoot_coeff</th>\n",
" <th>ajusted_score</th>\n",
" <th>odd_ratio</th>\n",
" <th>test_adjusted_score_2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>100.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.177187</td>\n",
" <td>5.177187</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>5.177187</td>\n",
" <td>0.000000</td>\n",
" <td>1.294297</td>\n",
" <td>6</td>\n",
" <td>3.294104</td>\n",
" <td>0.211260</td>\n",
" <td>2.288530</td>\n",
" <td>0.533640</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>55.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>426.265613</td>\n",
" <td>426.265613</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>426.265613</td>\n",
" <td>0.000000</td>\n",
" <td>426.265613</td>\n",
" <td>2</td>\n",
" <td>3.826401</td>\n",
" <td>0.063821</td>\n",
" <td>0.323109</td>\n",
" <td>0.139085</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>17.0</td>\n",
" <td>1.0</td>\n",
" <td>80.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>436.033437</td>\n",
" <td>436.033437</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>436.033437</td>\n",
" <td>0.000000</td>\n",
" <td>25.649026</td>\n",
" <td>2</td>\n",
" <td>3.826401</td>\n",
" <td>0.073069</td>\n",
" <td>0.388102</td>\n",
" <td>0.162515</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>120.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.196412</td>\n",
" <td>5.196412</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>5.196412</td>\n",
" <td>0.000000</td>\n",
" <td>1.299103</td>\n",
" <td>6</td>\n",
" <td>3.294104</td>\n",
" <td>0.211328</td>\n",
" <td>2.290940</td>\n",
" <td>0.533902</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>34.0</td>\n",
" <td>2.0</td>\n",
" <td>416.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>478.693148</td>\n",
" <td>115.631470</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>363.061678</td>\n",
" <td>181.530839</td>\n",
" <td>239.346574</td>\n",
" <td>10.678285</td>\n",
" <td>14.079210</td>\n",
" <td>9</td>\n",
" <td>1.268598</td>\n",
" <td>0.718781</td>\n",
" <td>10.343538</td>\n",
" <td>0.837972</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96091</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>67.31</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>278.442257</td>\n",
" <td>278.442257</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>278.442257</td>\n",
" <td>0.000000</td>\n",
" <td>278.442257</td>\n",
" <td>5</td>\n",
" <td>3.260982</td>\n",
" <td>0.179296</td>\n",
" <td>1.407779</td>\n",
" <td>0.413108</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96092</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>61.41</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>189.207373</td>\n",
" <td>189.207373</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>189.207373</td>\n",
" <td>0.000000</td>\n",
" <td>189.207373</td>\n",
" <td>6</td>\n",
" <td>3.294104</td>\n",
" <td>0.198694</td>\n",
" <td>1.894523</td>\n",
" <td>0.486458</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96093</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>1</td>\n",
" <td>17.863019</td>\n",
" <td>0.006522</td>\n",
" <td>0.131865</td>\n",
" <td>0.061854</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96094</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>79.43</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>279.312905</td>\n",
" <td>279.312905</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>279.312905</td>\n",
" <td>0.000000</td>\n",
" <td>279.312905</td>\n",
" <td>5</td>\n",
" <td>3.260982</td>\n",
" <td>0.177808</td>\n",
" <td>1.379973</td>\n",
" <td>0.408279</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96095</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>2</td>\n",
" <td>3.826401</td>\n",
" <td>0.066382</td>\n",
" <td>0.340487</td>\n",
" <td>0.145477</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>96096 rows × 27 columns</p>\n",
"</div>"
],
"text/plain": [
" nb_tickets nb_purchases total_amount nb_suppliers \\\n",
"0 4.0 1.0 100.00 1.0 \n",
"1 1.0 1.0 55.00 1.0 \n",
"2 17.0 1.0 80.00 1.0 \n",
"3 4.0 1.0 120.00 1.0 \n",
"4 34.0 2.0 416.00 1.0 \n",
"... ... ... ... ... \n",
"96091 1.0 1.0 67.31 1.0 \n",
"96092 1.0 1.0 61.41 1.0 \n",
"96093 0.0 0.0 0.00 0.0 \n",
"96094 1.0 1.0 79.43 1.0 \n",
"96095 0.0 0.0 0.00 0.0 \n",
"\n",
" vente_internet_max purchase_date_min purchase_date_max \\\n",
"0 0.0 5.177187 5.177187 \n",
"1 0.0 426.265613 426.265613 \n",
"2 0.0 436.033437 436.033437 \n",
"3 0.0 5.196412 5.196412 \n",
"4 0.0 478.693148 115.631470 \n",
"... ... ... ... \n",
"96091 1.0 278.442257 278.442257 \n",
"96092 1.0 189.207373 189.207373 \n",
"96093 0.0 550.000000 550.000000 \n",
"96094 1.0 279.312905 279.312905 \n",
"96095 0.0 550.000000 550.000000 \n",
"\n",
" nb_tickets_internet is_email_true opt_in ... consumption_lifetime \\\n",
"0 0.0 True False ... 0.000000 \n",
"1 0.0 True True ... 0.000000 \n",
"2 0.0 True True ... 0.000000 \n",
"3 0.0 True False ... 0.000000 \n",
"4 0.0 True False ... 363.061678 \n",
"... ... ... ... ... ... \n",
"96091 1.0 True False ... 0.000000 \n",
"96092 1.0 True False ... 0.000000 \n",
"96093 0.0 True True ... 0.000000 \n",
"96094 1.0 True False ... 0.000000 \n",
"96095 0.0 True False ... 0.000000 \n",
"\n",
" avg_purchase_delay avg_purchase_delay_all avg_tickets_delay \\\n",
"0 0.000000 5.177187 0.000000 \n",
"1 0.000000 426.265613 0.000000 \n",
"2 0.000000 436.033437 0.000000 \n",
"3 0.000000 5.196412 0.000000 \n",
"4 181.530839 239.346574 10.678285 \n",
"... ... ... ... \n",
"96091 0.000000 278.442257 0.000000 \n",
"96092 0.000000 189.207373 0.000000 \n",
"96093 NaN 0.000000 NaN \n",
"96094 0.000000 279.312905 0.000000 \n",
"96095 NaN 0.000000 NaN \n",
"\n",
" avg_tickets_delay_all decile overshoot_coeff ajusted_score \\\n",
"0 1.294297 6 3.294104 0.211260 \n",
"1 426.265613 2 3.826401 0.063821 \n",
"2 25.649026 2 3.826401 0.073069 \n",
"3 1.299103 6 3.294104 0.211328 \n",
"4 14.079210 9 1.268598 0.718781 \n",
"... ... ... ... ... \n",
"96091 278.442257 5 3.260982 0.179296 \n",
"96092 189.207373 6 3.294104 0.198694 \n",
"96093 0.000000 1 17.863019 0.006522 \n",
"96094 279.312905 5 3.260982 0.177808 \n",
"96095 0.000000 2 3.826401 0.066382 \n",
"\n",
" odd_ratio test_adjusted_score_2 \n",
"0 2.288530 0.533640 \n",
"1 0.323109 0.139085 \n",
"2 0.388102 0.162515 \n",
"3 2.290940 0.533902 \n",
"4 10.343538 0.837972 \n",
"... ... ... \n",
"96091 1.407779 0.413108 \n",
"96092 1.894523 0.486458 \n",
"96093 0.131865 0.061854 \n",
"96094 1.379973 0.408279 \n",
"96095 0.340487 0.145477 \n",
"\n",
"[96096 rows x 27 columns]"
]
},
"execution_count": 375,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# premier problème : certains scores valent 1, ce qui empeche de calculer un odd ratio \n",
"# on remplace les scores de 1 par 0.999\n",
"\n",
"X_test[\"score\"] = X_test[\"score\"].apply(lambda x : 0.9999999999999996 if x==1 else x)\n",
"X_test"
]
},
{
"cell_type": "code",
"execution_count": 377,
"id": "8a29f835-8e4f-45e9-9c91-e019f56fee5e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_tickets</th>\n",
" <th>nb_purchases</th>\n",
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>nb_tickets_internet</th>\n",
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
" <th>...</th>\n",
" <th>consumption_lifetime</th>\n",
" <th>avg_purchase_delay</th>\n",
" <th>avg_purchase_delay_all</th>\n",
" <th>avg_tickets_delay</th>\n",
" <th>avg_tickets_delay_all</th>\n",
" <th>decile</th>\n",
" <th>overshoot_coeff</th>\n",
" <th>ajusted_score</th>\n",
" <th>odd_ratio</th>\n",
" <th>test_adjusted_score_2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>100.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.177187</td>\n",
" <td>5.177187</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>5.177187</td>\n",
" <td>0.000000</td>\n",
" <td>1.294297</td>\n",
" <td>6</td>\n",
" <td>3.294104</td>\n",
" <td>0.211260</td>\n",
" <td>2.288530</td>\n",
" <td>0.533640</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>55.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>426.265613</td>\n",
" <td>426.265613</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>426.265613</td>\n",
" <td>0.000000</td>\n",
" <td>426.265613</td>\n",
" <td>2</td>\n",
" <td>3.826401</td>\n",
" <td>0.063821</td>\n",
" <td>0.323109</td>\n",
" <td>0.139085</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>17.0</td>\n",
" <td>1.0</td>\n",
" <td>80.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>436.033437</td>\n",
" <td>436.033437</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>436.033437</td>\n",
" <td>0.000000</td>\n",
" <td>25.649026</td>\n",
" <td>2</td>\n",
" <td>3.826401</td>\n",
" <td>0.073069</td>\n",
" <td>0.388102</td>\n",
" <td>0.162515</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>120.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.196412</td>\n",
" <td>5.196412</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>5.196412</td>\n",
" <td>0.000000</td>\n",
" <td>1.299103</td>\n",
" <td>6</td>\n",
" <td>3.294104</td>\n",
" <td>0.211328</td>\n",
" <td>2.290940</td>\n",
" <td>0.533902</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>34.0</td>\n",
" <td>2.0</td>\n",
" <td>416.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>478.693148</td>\n",
" <td>115.631470</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>363.061678</td>\n",
" <td>181.530839</td>\n",
" <td>239.346574</td>\n",
" <td>10.678285</td>\n",
" <td>14.079210</td>\n",
" <td>9</td>\n",
" <td>1.268598</td>\n",
" <td>0.718781</td>\n",
" <td>10.343538</td>\n",
" <td>0.837972</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96091</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>67.31</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>278.442257</td>\n",
" <td>278.442257</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>278.442257</td>\n",
" <td>0.000000</td>\n",
" <td>278.442257</td>\n",
" <td>5</td>\n",
" <td>3.260982</td>\n",
" <td>0.179296</td>\n",
" <td>1.407779</td>\n",
" <td>0.413108</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96092</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>61.41</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>189.207373</td>\n",
" <td>189.207373</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>189.207373</td>\n",
" <td>0.000000</td>\n",
" <td>189.207373</td>\n",
" <td>6</td>\n",
" <td>3.294104</td>\n",
" <td>0.198694</td>\n",
" <td>1.894523</td>\n",
" <td>0.486458</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96093</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>1</td>\n",
" <td>17.863019</td>\n",
" <td>0.006522</td>\n",
" <td>0.131865</td>\n",
" <td>0.061854</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96094</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>79.43</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>279.312905</td>\n",
" <td>279.312905</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>279.312905</td>\n",
" <td>0.000000</td>\n",
" <td>279.312905</td>\n",
" <td>5</td>\n",
" <td>3.260982</td>\n",
" <td>0.177808</td>\n",
" <td>1.379973</td>\n",
" <td>0.408279</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96095</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>2</td>\n",
" <td>3.826401</td>\n",
" <td>0.066382</td>\n",
" <td>0.340487</td>\n",
" <td>0.145477</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>96096 rows × 27 columns</p>\n",
"</div>"
],
"text/plain": [
" nb_tickets nb_purchases total_amount nb_suppliers \\\n",
"0 4.0 1.0 100.00 1.0 \n",
"1 1.0 1.0 55.00 1.0 \n",
"2 17.0 1.0 80.00 1.0 \n",
"3 4.0 1.0 120.00 1.0 \n",
"4 34.0 2.0 416.00 1.0 \n",
"... ... ... ... ... \n",
"96091 1.0 1.0 67.31 1.0 \n",
"96092 1.0 1.0 61.41 1.0 \n",
"96093 0.0 0.0 0.00 0.0 \n",
"96094 1.0 1.0 79.43 1.0 \n",
"96095 0.0 0.0 0.00 0.0 \n",
"\n",
" vente_internet_max purchase_date_min purchase_date_max \\\n",
"0 0.0 5.177187 5.177187 \n",
"1 0.0 426.265613 426.265613 \n",
"2 0.0 436.033437 436.033437 \n",
"3 0.0 5.196412 5.196412 \n",
"4 0.0 478.693148 115.631470 \n",
"... ... ... ... \n",
"96091 1.0 278.442257 278.442257 \n",
"96092 1.0 189.207373 189.207373 \n",
"96093 0.0 550.000000 550.000000 \n",
"96094 1.0 279.312905 279.312905 \n",
"96095 0.0 550.000000 550.000000 \n",
"\n",
" nb_tickets_internet is_email_true opt_in ... consumption_lifetime \\\n",
"0 0.0 True False ... 0.000000 \n",
"1 0.0 True True ... 0.000000 \n",
"2 0.0 True True ... 0.000000 \n",
"3 0.0 True False ... 0.000000 \n",
"4 0.0 True False ... 363.061678 \n",
"... ... ... ... ... ... \n",
"96091 1.0 True False ... 0.000000 \n",
"96092 1.0 True False ... 0.000000 \n",
"96093 0.0 True True ... 0.000000 \n",
"96094 1.0 True False ... 0.000000 \n",
"96095 0.0 True False ... 0.000000 \n",
"\n",
" avg_purchase_delay avg_purchase_delay_all avg_tickets_delay \\\n",
"0 0.000000 5.177187 0.000000 \n",
"1 0.000000 426.265613 0.000000 \n",
"2 0.000000 436.033437 0.000000 \n",
"3 0.000000 5.196412 0.000000 \n",
"4 181.530839 239.346574 10.678285 \n",
"... ... ... ... \n",
"96091 0.000000 278.442257 0.000000 \n",
"96092 0.000000 189.207373 0.000000 \n",
"96093 NaN 0.000000 NaN \n",
"96094 0.000000 279.312905 0.000000 \n",
"96095 NaN 0.000000 NaN \n",
"\n",
" avg_tickets_delay_all decile overshoot_coeff ajusted_score \\\n",
"0 1.294297 6 3.294104 0.211260 \n",
"1 426.265613 2 3.826401 0.063821 \n",
"2 25.649026 2 3.826401 0.073069 \n",
"3 1.299103 6 3.294104 0.211328 \n",
"4 14.079210 9 1.268598 0.718781 \n",
"... ... ... ... ... \n",
"96091 278.442257 5 3.260982 0.179296 \n",
"96092 189.207373 6 3.294104 0.198694 \n",
"96093 0.000000 1 17.863019 0.006522 \n",
"96094 279.312905 5 3.260982 0.177808 \n",
"96095 0.000000 2 3.826401 0.066382 \n",
"\n",
" odd_ratio test_adjusted_score_2 \n",
"0 2.288530 0.533640 \n",
"1 0.323109 0.139085 \n",
"2 0.388102 0.162515 \n",
"3 2.290940 0.533902 \n",
"4 10.343538 0.837972 \n",
"... ... ... \n",
"96091 1.407779 0.413108 \n",
"96092 1.894523 0.486458 \n",
"96093 0.131865 0.061854 \n",
"96094 1.379973 0.408279 \n",
"96095 0.340487 0.145477 \n",
"\n",
"[96096 rows x 27 columns]"
]
},
"execution_count": 377,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test[\"odd_ratio\"] = X_test[\"score\"]/(1-X_test[\"score\"])\n",
"X_test"
]
},
{
"cell_type": "code",
"execution_count": 378,
"id": "b5971afb-a6ef-4433-9cee-13ea978b22c8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 9.609600e+04\n",
"mean 2.117164e+11\n",
"std 2.179173e+13\n",
"min 1.207494e-01\n",
"25% 1.476621e-01\n",
"50% 3.337214e-01\n",
"75% 1.430245e+00\n",
"max 2.251800e+15\n",
"Name: odd_ratio, dtype: float64"
]
},
"execution_count": 378,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test[\"odd_ratio\"].describe()"
]
},
{
"cell_type": "code",
"execution_count": 381,
"id": "e878a711-5d7d-455f-9e0f-da50961568d9",
"metadata": {},
"outputs": [],
"source": [
"def adjusted_score(odd_ratio, bias) :\n",
" adjusted_score = odd_ratio/(bias+odd_ratio)\n",
" return adjusted_score"
]
},
{
"cell_type": "code",
"execution_count": 424,
"id": "bff25885-1191-432a-976c-4b466dbc0ac7",
"metadata": {},
"outputs": [],
"source": [
"def obj_function(bias) :\n",
" obj = sum([adjusted_score(element, bias) for element in X_test[\"odd_ratio\"]]) # - y_test.sum()[\"y_has_purchased\"]\n",
" return obj"
]
},
{
"cell_type": "code",
"execution_count": 380,
"id": "a9df55fc-e1c6-4462-9fa5-248d47f4957f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"13690.0"
]
},
"execution_count": 380,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_test.sum()[\"y_has_purchased\"]"
]
},
{
"cell_type": "code",
"execution_count": 396,
"id": "ecae3be2-ddf4-4a76-940d-403a176fa8f5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"13749.42306555955"
]
},
"execution_count": 396,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# le biais optimal se trouve aux alentours de 6\n",
"sum([adjusted_score(element, 6) for element in X_test[\"odd_ratio\"]])"
]
},
{
"cell_type": "code",
"execution_count": 411,
"id": "5698b75b-759a-4cc5-8466-c513d2ae2aa2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"36092.2248005385"
]
},
"execution_count": 411,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sum([adjusted_score(element, 1) for element in X_test[\"odd_ratio\"]])"
]
},
{
"cell_type": "code",
"execution_count": 412,
"id": "42840b8b-0314-4b15-afb9-09a9e550a729",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"13690.0"
]
},
"execution_count": 412,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_test.sum()[\"y_has_purchased\"]"
]
},
{
"cell_type": "code",
"execution_count": 425,
"id": "8a61a53c-c98b-4c76-bcfe-a4bb0f3db42a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"36092.2248005385"
]
},
"execution_count": 425,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"obj_function(1)"
]
},
{
"cell_type": "code",
"execution_count": 423,
"id": "d29623ca-c9f7-4ef7-b5ea-45b2d2f65096",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3.9020966429798136"
]
},
"execution_count": 423,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# on devrait trouver un résultat autour de 6.04\n",
"sum([adjusted_score(element, 6.04) for element in X_test[\"odd_ratio\"]]) - y_test.sum()[\"y_has_purchased\"]"
]
},
{
"cell_type": "code",
"execution_count": 426,
"id": "6417f2a2-9e22-40c7-8297-2ed0b72e9b1d",
"metadata": {},
"outputs": [],
"source": [
"# minimization\n",
"\n",
"from scipy.optimize import minimize\n",
"\n",
"\n",
"y_sum = y_test.sum()[\"y_has_purchased\"]\n",
"initial_guess = 6\n",
"estimated_biais = minimize(lambda bias : (obj_function(bias)-y_sum)**2 ,\n",
"initial_guess , method = \"BFGS\")"
]
},
{
"cell_type": "code",
"execution_count": 430,
"id": "937606df-1730-43b6-9a95-7c626aa7a3c5",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"bias estimated : 6.042826489667565\n"
]
}
],
"source": [
"print(f\"bias estimated : {estimated_biais.x[0]}\")"
]
},
{
"cell_type": "code",
"execution_count": 435,
"id": "ad6ebcee-f1f6-46fc-8d9a-008762acae28",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_tickets</th>\n",
" <th>nb_purchases</th>\n",
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>nb_tickets_internet</th>\n",
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
" <th>...</th>\n",
" <th>avg_purchase_delay</th>\n",
" <th>avg_purchase_delay_all</th>\n",
" <th>avg_tickets_delay</th>\n",
" <th>avg_tickets_delay_all</th>\n",
" <th>decile</th>\n",
" <th>overshoot_coeff</th>\n",
" <th>ajusted_score</th>\n",
" <th>odd_ratio</th>\n",
" <th>test_adjusted_score_2</th>\n",
" <th>score_adjusted</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>100.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.177187</td>\n",
" <td>5.177187</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>5.177187</td>\n",
" <td>0.000000</td>\n",
" <td>1.294297</td>\n",
" <td>6</td>\n",
" <td>3.294104</td>\n",
" <td>0.211260</td>\n",
" <td>2.288530</td>\n",
" <td>0.533640</td>\n",
" <td>0.274689</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>55.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>426.265613</td>\n",
" <td>426.265613</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>426.265613</td>\n",
" <td>0.000000</td>\n",
" <td>426.265613</td>\n",
" <td>2</td>\n",
" <td>3.826401</td>\n",
" <td>0.063821</td>\n",
" <td>0.323109</td>\n",
" <td>0.139085</td>\n",
" <td>0.050756</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>17.0</td>\n",
" <td>1.0</td>\n",
" <td>80.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>436.033437</td>\n",
" <td>436.033437</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>436.033437</td>\n",
" <td>0.000000</td>\n",
" <td>25.649026</td>\n",
" <td>2</td>\n",
" <td>3.826401</td>\n",
" <td>0.073069</td>\n",
" <td>0.388102</td>\n",
" <td>0.162515</td>\n",
" <td>0.060349</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>120.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.196412</td>\n",
" <td>5.196412</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>5.196412</td>\n",
" <td>0.000000</td>\n",
" <td>1.299103</td>\n",
" <td>6</td>\n",
" <td>3.294104</td>\n",
" <td>0.211328</td>\n",
" <td>2.290940</td>\n",
" <td>0.533902</td>\n",
" <td>0.274899</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>34.0</td>\n",
" <td>2.0</td>\n",
" <td>416.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>478.693148</td>\n",
" <td>115.631470</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>181.530839</td>\n",
" <td>239.346574</td>\n",
" <td>10.678285</td>\n",
" <td>14.079210</td>\n",
" <td>9</td>\n",
" <td>1.268598</td>\n",
" <td>0.718781</td>\n",
" <td>10.343538</td>\n",
" <td>0.837972</td>\n",
" <td>0.631228</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96091</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>67.31</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>278.442257</td>\n",
" <td>278.442257</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>278.442257</td>\n",
" <td>0.000000</td>\n",
" <td>278.442257</td>\n",
" <td>5</td>\n",
" <td>3.260982</td>\n",
" <td>0.179296</td>\n",
" <td>1.407779</td>\n",
" <td>0.413108</td>\n",
" <td>0.188948</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96092</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>61.41</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>189.207373</td>\n",
" <td>189.207373</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>189.207373</td>\n",
" <td>0.000000</td>\n",
" <td>189.207373</td>\n",
" <td>6</td>\n",
" <td>3.294104</td>\n",
" <td>0.198694</td>\n",
" <td>1.894523</td>\n",
" <td>0.486458</td>\n",
" <td>0.238685</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96093</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>1</td>\n",
" <td>17.863019</td>\n",
" <td>0.006522</td>\n",
" <td>0.131865</td>\n",
" <td>0.061854</td>\n",
" <td>0.021356</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96094</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>79.43</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>279.312905</td>\n",
" <td>279.312905</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>279.312905</td>\n",
" <td>0.000000</td>\n",
" <td>279.312905</td>\n",
" <td>5</td>\n",
" <td>3.260982</td>\n",
" <td>0.177808</td>\n",
" <td>1.379973</td>\n",
" <td>0.408279</td>\n",
" <td>0.185910</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96095</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>2</td>\n",
" <td>3.826401</td>\n",
" <td>0.066382</td>\n",
" <td>0.340487</td>\n",
" <td>0.145477</td>\n",
" <td>0.053340</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>96096 rows × 28 columns</p>\n",
"</div>"
],
"text/plain": [
" nb_tickets nb_purchases total_amount nb_suppliers \\\n",
"0 4.0 1.0 100.00 1.0 \n",
"1 1.0 1.0 55.00 1.0 \n",
"2 17.0 1.0 80.00 1.0 \n",
"3 4.0 1.0 120.00 1.0 \n",
"4 34.0 2.0 416.00 1.0 \n",
"... ... ... ... ... \n",
"96091 1.0 1.0 67.31 1.0 \n",
"96092 1.0 1.0 61.41 1.0 \n",
"96093 0.0 0.0 0.00 0.0 \n",
"96094 1.0 1.0 79.43 1.0 \n",
"96095 0.0 0.0 0.00 0.0 \n",
"\n",
" vente_internet_max purchase_date_min purchase_date_max \\\n",
"0 0.0 5.177187 5.177187 \n",
"1 0.0 426.265613 426.265613 \n",
"2 0.0 436.033437 436.033437 \n",
"3 0.0 5.196412 5.196412 \n",
"4 0.0 478.693148 115.631470 \n",
"... ... ... ... \n",
"96091 1.0 278.442257 278.442257 \n",
"96092 1.0 189.207373 189.207373 \n",
"96093 0.0 550.000000 550.000000 \n",
"96094 1.0 279.312905 279.312905 \n",
"96095 0.0 550.000000 550.000000 \n",
"\n",
" nb_tickets_internet is_email_true opt_in ... avg_purchase_delay \\\n",
"0 0.0 True False ... 0.000000 \n",
"1 0.0 True True ... 0.000000 \n",
"2 0.0 True True ... 0.000000 \n",
"3 0.0 True False ... 0.000000 \n",
"4 0.0 True False ... 181.530839 \n",
"... ... ... ... ... ... \n",
"96091 1.0 True False ... 0.000000 \n",
"96092 1.0 True False ... 0.000000 \n",
"96093 0.0 True True ... NaN \n",
"96094 1.0 True False ... 0.000000 \n",
"96095 0.0 True False ... NaN \n",
"\n",
" avg_purchase_delay_all avg_tickets_delay avg_tickets_delay_all \\\n",
"0 5.177187 0.000000 1.294297 \n",
"1 426.265613 0.000000 426.265613 \n",
"2 436.033437 0.000000 25.649026 \n",
"3 5.196412 0.000000 1.299103 \n",
"4 239.346574 10.678285 14.079210 \n",
"... ... ... ... \n",
"96091 278.442257 0.000000 278.442257 \n",
"96092 189.207373 0.000000 189.207373 \n",
"96093 0.000000 NaN 0.000000 \n",
"96094 279.312905 0.000000 279.312905 \n",
"96095 0.000000 NaN 0.000000 \n",
"\n",
" decile overshoot_coeff ajusted_score odd_ratio \\\n",
"0 6 3.294104 0.211260 2.288530 \n",
"1 2 3.826401 0.063821 0.323109 \n",
"2 2 3.826401 0.073069 0.388102 \n",
"3 6 3.294104 0.211328 2.290940 \n",
"4 9 1.268598 0.718781 10.343538 \n",
"... ... ... ... ... \n",
"96091 5 3.260982 0.179296 1.407779 \n",
"96092 6 3.294104 0.198694 1.894523 \n",
"96093 1 17.863019 0.006522 0.131865 \n",
"96094 5 3.260982 0.177808 1.379973 \n",
"96095 2 3.826401 0.066382 0.340487 \n",
"\n",
" test_adjusted_score_2 score_adjusted \n",
"0 0.533640 0.274689 \n",
"1 0.139085 0.050756 \n",
"2 0.162515 0.060349 \n",
"3 0.533902 0.274899 \n",
"4 0.837972 0.631228 \n",
"... ... ... \n",
"96091 0.413108 0.188948 \n",
"96092 0.486458 0.238685 \n",
"96093 0.061854 0.021356 \n",
"96094 0.408279 0.185910 \n",
"96095 0.145477 0.053340 \n",
"\n",
"[96096 rows x 28 columns]"
]
},
"execution_count": 435,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# on corrige les scores trouvés en fonction\n",
"\n",
"X_test[\"score_adjusted\"] = adjusted_score(X_test[\"odd_ratio\"], bias=estimated_biais.x[0])\n",
"X_test"
]
},
{
"cell_type": "code",
"execution_count": 436,
"id": "2934bfff-23ac-4c4e-8fe6-2087afac1e0f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_tickets</th>\n",
" <th>nb_purchases</th>\n",
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>nb_tickets_internet</th>\n",
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
" <th>...</th>\n",
" <th>avg_purchase_delay</th>\n",
" <th>avg_purchase_delay_all</th>\n",
" <th>avg_tickets_delay</th>\n",
" <th>avg_tickets_delay_all</th>\n",
" <th>decile</th>\n",
" <th>overshoot_coeff</th>\n",
" <th>ajusted_score</th>\n",
" <th>odd_ratio</th>\n",
" <th>test_adjusted_score_2</th>\n",
" <th>score_adjusted</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>100.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.177187</td>\n",
" <td>5.177187</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>5.177187</td>\n",
" <td>0.000000</td>\n",
" <td>1.294297</td>\n",
" <td>6</td>\n",
" <td>3.294104</td>\n",
" <td>0.211260</td>\n",
" <td>2.288530</td>\n",
" <td>0.533640</td>\n",
" <td>0.274689</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>55.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>426.265613</td>\n",
" <td>426.265613</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>426.265613</td>\n",
" <td>0.000000</td>\n",
" <td>426.265613</td>\n",
" <td>2</td>\n",
" <td>3.826401</td>\n",
" <td>0.063821</td>\n",
" <td>0.323109</td>\n",
" <td>0.139085</td>\n",
" <td>0.050756</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>17.0</td>\n",
" <td>1.0</td>\n",
" <td>80.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>436.033437</td>\n",
" <td>436.033437</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>436.033437</td>\n",
" <td>0.000000</td>\n",
" <td>25.649026</td>\n",
" <td>2</td>\n",
" <td>3.826401</td>\n",
" <td>0.073069</td>\n",
" <td>0.388102</td>\n",
" <td>0.162515</td>\n",
" <td>0.060349</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>120.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.196412</td>\n",
" <td>5.196412</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>5.196412</td>\n",
" <td>0.000000</td>\n",
" <td>1.299103</td>\n",
" <td>6</td>\n",
" <td>3.294104</td>\n",
" <td>0.211328</td>\n",
" <td>2.290940</td>\n",
" <td>0.533902</td>\n",
" <td>0.274899</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>34.0</td>\n",
" <td>2.0</td>\n",
" <td>416.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>478.693148</td>\n",
" <td>115.631470</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>181.530839</td>\n",
" <td>239.346574</td>\n",
" <td>10.678285</td>\n",
" <td>14.079210</td>\n",
" <td>9</td>\n",
" <td>1.268598</td>\n",
" <td>0.718781</td>\n",
" <td>10.343538</td>\n",
" <td>0.837972</td>\n",
" <td>0.631228</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96091</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>67.31</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>278.442257</td>\n",
" <td>278.442257</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>278.442257</td>\n",
" <td>0.000000</td>\n",
" <td>278.442257</td>\n",
" <td>5</td>\n",
" <td>3.260982</td>\n",
" <td>0.179296</td>\n",
" <td>1.407779</td>\n",
" <td>0.413108</td>\n",
" <td>0.188948</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96092</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>61.41</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>189.207373</td>\n",
" <td>189.207373</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>189.207373</td>\n",
" <td>0.000000</td>\n",
" <td>189.207373</td>\n",
" <td>6</td>\n",
" <td>3.294104</td>\n",
" <td>0.198694</td>\n",
" <td>1.894523</td>\n",
" <td>0.486458</td>\n",
" <td>0.238685</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96093</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>1</td>\n",
" <td>17.863019</td>\n",
" <td>0.006522</td>\n",
" <td>0.131865</td>\n",
" <td>0.061854</td>\n",
" <td>0.021356</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96094</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>79.43</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>279.312905</td>\n",
" <td>279.312905</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>279.312905</td>\n",
" <td>0.000000</td>\n",
" <td>279.312905</td>\n",
" <td>5</td>\n",
" <td>3.260982</td>\n",
" <td>0.177808</td>\n",
" <td>1.379973</td>\n",
" <td>0.408279</td>\n",
" <td>0.185910</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96095</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>2</td>\n",
" <td>3.826401</td>\n",
" <td>0.066382</td>\n",
" <td>0.340487</td>\n",
" <td>0.145477</td>\n",
" <td>0.053340</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>96096 rows × 28 columns</p>\n",
"</div>"
],
"text/plain": [
" nb_tickets nb_purchases total_amount nb_suppliers \\\n",
"0 4.0 1.0 100.00 1.0 \n",
"1 1.0 1.0 55.00 1.0 \n",
"2 17.0 1.0 80.00 1.0 \n",
"3 4.0 1.0 120.00 1.0 \n",
"4 34.0 2.0 416.00 1.0 \n",
"... ... ... ... ... \n",
"96091 1.0 1.0 67.31 1.0 \n",
"96092 1.0 1.0 61.41 1.0 \n",
"96093 0.0 0.0 0.00 0.0 \n",
"96094 1.0 1.0 79.43 1.0 \n",
"96095 0.0 0.0 0.00 0.0 \n",
"\n",
" vente_internet_max purchase_date_min purchase_date_max \\\n",
"0 0.0 5.177187 5.177187 \n",
"1 0.0 426.265613 426.265613 \n",
"2 0.0 436.033437 436.033437 \n",
"3 0.0 5.196412 5.196412 \n",
"4 0.0 478.693148 115.631470 \n",
"... ... ... ... \n",
"96091 1.0 278.442257 278.442257 \n",
"96092 1.0 189.207373 189.207373 \n",
"96093 0.0 550.000000 550.000000 \n",
"96094 1.0 279.312905 279.312905 \n",
"96095 0.0 550.000000 550.000000 \n",
"\n",
" nb_tickets_internet is_email_true opt_in ... avg_purchase_delay \\\n",
"0 0.0 True False ... 0.000000 \n",
"1 0.0 True True ... 0.000000 \n",
"2 0.0 True True ... 0.000000 \n",
"3 0.0 True False ... 0.000000 \n",
"4 0.0 True False ... 181.530839 \n",
"... ... ... ... ... ... \n",
"96091 1.0 True False ... 0.000000 \n",
"96092 1.0 True False ... 0.000000 \n",
"96093 0.0 True True ... NaN \n",
"96094 1.0 True False ... 0.000000 \n",
"96095 0.0 True False ... NaN \n",
"\n",
" avg_purchase_delay_all avg_tickets_delay avg_tickets_delay_all \\\n",
"0 5.177187 0.000000 1.294297 \n",
"1 426.265613 0.000000 426.265613 \n",
"2 436.033437 0.000000 25.649026 \n",
"3 5.196412 0.000000 1.299103 \n",
"4 239.346574 10.678285 14.079210 \n",
"... ... ... ... \n",
"96091 278.442257 0.000000 278.442257 \n",
"96092 189.207373 0.000000 189.207373 \n",
"96093 0.000000 NaN 0.000000 \n",
"96094 279.312905 0.000000 279.312905 \n",
"96095 0.000000 NaN 0.000000 \n",
"\n",
" decile overshoot_coeff ajusted_score odd_ratio \\\n",
"0 6 3.294104 0.211260 2.288530 \n",
"1 2 3.826401 0.063821 0.323109 \n",
"2 2 3.826401 0.073069 0.388102 \n",
"3 6 3.294104 0.211328 2.290940 \n",
"4 9 1.268598 0.718781 10.343538 \n",
"... ... ... ... ... \n",
"96091 5 3.260982 0.179296 1.407779 \n",
"96092 6 3.294104 0.198694 1.894523 \n",
"96093 1 17.863019 0.006522 0.131865 \n",
"96094 5 3.260982 0.177808 1.379973 \n",
"96095 2 3.826401 0.066382 0.340487 \n",
"\n",
" test_adjusted_score_2 score_adjusted \n",
"0 0.533640 0.274689 \n",
"1 0.139085 0.050756 \n",
"2 0.162515 0.060349 \n",
"3 0.533902 0.274899 \n",
"4 0.837972 0.631228 \n",
"... ... ... \n",
"96091 0.413108 0.188948 \n",
"96092 0.486458 0.238685 \n",
"96093 0.061854 0.021356 \n",
"96094 0.408279 0.185910 \n",
"96095 0.145477 0.053340 \n",
"\n",
"[96096 rows x 28 columns]"
]
},
"execution_count": 436,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test"
]
},
{
"cell_type": "code",
"execution_count": 549,
"id": "0dadc6f7-9c49-4188-9ae4-8b9c84770cf6",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjoAAAGdCAYAAAAbudkLAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA4+ElEQVR4nO3df1xW9f3/8Sfy4xJILhUEvJJMP0OSsDJsiK60RNCPyKo1XfSh3JzaSImEWc71CfdpmL+dsfkxV9rHH9GWs/WZyqBWLPJnTJYKqRVL/QhiihdIBITn+0c3z7dLjLxQQY6P++123W67znmdc17nPep69r7OdY6HYRiGAAAALKhLRzcAAABwpRB0AACAZRF0AACAZRF0AACAZRF0AACAZRF0AACAZRF0AACAZRF0AACAZXl1dAMd6ezZszp27Ji6desmDw+Pjm4HAABcBMMwVFtbK4fDoS5dWp+zuaaDzrFjxxQWFtbRbQAAgDY4cuSI+vTp02rNNR10unXrJumrgQoICOjgbgAAwMWoqalRWFiY+Tnemms66Jz7uiogIICgAwBAJ3Mxl51wMTIAALAsgg4AALAsgg4AALCsa/oaHQDApTMMQ19++aWam5s7uhVYhKenp7y8vC7LrV8IOgCANmtsbFRFRYU+//zzjm4FFuPn56fevXvLx8fnkvZD0AEAtMnZs2dVXl4uT09PORwO+fj4cPNVXDLDMNTY2KgTJ06ovLxc4eHh33pTwNYQdAAAbdLY2KizZ88qLCxMfn5+Hd0OLMTX11fe3t769NNP1djYqK5du7Z5X1yMDAC4JJfyX9vAN7lcf1f8dQIAAMsi6AAAAMviGh0AwGW3tOBgux7vidED2vV46DyY0QEAAJZF0AEA4Appamrq6BaueQQdAMA157XXXtOgQYPk6+urwMBAxcXFqa6uTpL00ksv6eabb5bNZlPv3r01ffp0c7vDhw/r+9//vq677joFBARowoQJOn78uLk+KytLt912m1566SX1799fNptNhmHI6XRq6tSpCg4OVkBAgO655x7985//bPfzvhZxjc6V9Pa8ju7AfXfP7ugOAOCKqqio0IMPPqgFCxbovvvuU21trd59910ZhqEVK1Zo5syZeu655zR27Fg5nU699957kr66kd29994rf39/FRYW6ssvv1RqaqomTpyod955x9z/Rx99pD/84Q/auHGjPD09JUnjxo1Tz549tWXLFtntdq1cuVKjRo3SwYMH1bNnz44YhmsGQQcAcE2pqKjQl19+qfvvv199+/aVJA0aNEiS9OyzzyojI0OPP/64WX/HHXdIkt5880198MEHKi8vV1hYmCRp7dq1uvnmm7V7926zrrGxUWvXrlWvXr0kSX/729+0d+9eVVVVyWazSZIWLVqk119/Xa+99pqmTp3aPid+jSLoAACuKbfeeqtGjRqlQYMGKSEhQfHx8XrggQfU1NSkY8eOadSoURfcrqysTGFhYWbIkaTIyEh1795dZWVlZtDp27evGXIkqbi4WGfOnFFgYKDL/urr6/Xxxx9fgTPE17l9jc7//d//6T/+4z8UGBgoPz8/3XbbbSouLjbXG4ahrKwsORwO+fr6auTIkdq/f7/LPhoaGjRjxgwFBQXJ399fSUlJOnr0qEtNdXW1UlJSZLfbZbfblZKSotOnT7vUHD58WOPHj5e/v7+CgoKUlpamxsZGd08JAHAN8fT0VEFBgbZu3arIyEg9//zzioiIcLnW5kIMw7jgs7zOX+7v7++y/uzZs+rdu7dKSkpcXgcOHNDPf/7zy3NS+EZuBZ3q6moNHz5c3t7e2rp1q0pLS7V48WJ1797drFmwYIGWLFminJwc7d69W6GhoRo9erRqa2vNmvT0dG3atEm5ubkqKirSmTNnlJiYqObmZrMmOTlZJSUlysvLU15enkpKSpSSkmKub25u1rhx41RXV6eioiLl5uZq48aNysjIuIThAABcCzw8PDR8+HDNnTtXe/bskY+PjwoKCnTjjTfqrbfeuuA2kZGROnz4sI4cOWIuKy0tldPp1MCBA7/xWLfffrsqKyvl5eWl73znOy6voKCgy35ucOXWV1fz589XWFiYVq9ebS678cYbzf9tGIaWLVumOXPm6P7775ckvfzyywoJCdGGDRs0bdo0OZ1Ovfjii1q7dq3i4uIkSevWrVNYWJjefPNNJSQkqKysTHl5edqxY4diYmIkSatWrVJsbKwOHDigiIgI5efnq7S0VEeOHJHD4ZAkLV68WJMmTdKvf/1rBQQEXNLAAACsaefOnXrrrbcUHx+v4OBg7dy5UydOnNDAgQOVlZWlRx99VMHBwRo7dqxqa2v13nvvacaMGYqLi9Mtt9yihx56SMuWLTMvRh4xYoSGDBnyjceLi4tTbGys7r33Xs2fP18RERE6duyYtmzZonvvvbfVbXHp3Ao6b7zxhhISEvTDH/5QhYWFuv7665WamqopU6ZIksrLy1VZWan4+HhzG5vNphEjRmjbtm2aNm2aiouL1dTU5FLjcDgUFRWlbdu2KSEhQdu3b5fdbjdDjiQNHTpUdrtd27ZtU0REhLZv366oqCgz5EhSQkKCGhoaVFxcrLvvvrtF/w0NDWpoaDDf19TUuHP6AICLdDXfqTggIEB///vftWzZMtXU1Khv375avHixxo4dK0n64osvtHTpUmVmZiooKEgPPPCApK9mgV5//XXNmDFDd911l7p06aIxY8bo+eefb/V4Hh4e2rJli+bMmaOf/OQnOnHihEJDQ3XXXXcpJCTkip/vtc6toPPJJ5+YP737xS9+oV27diktLU02m00PP/ywKisrJanF/3EhISH69NNPJUmVlZXy8fFRjx49WtSc276yslLBwcEtjh8cHOxSc/5xevToIR8fH7PmfPPmzdPcuXPdOWUAgMUMHDhQeXl537h+2rRpmjZt2gXX3XDDDfrzn//8jdtmZWUpKyurxfJu3bpp+fLlWr58udv94tK4dY3O2bNndfvttys7O1uDBw/WtGnTNGXKFK1YscKl7vyLtb7pAq7Wai7mgq+Lqfm62bNny+l0mq+vf88KAACsx62g07t3b0VGRrosGzhwoA4fPixJCg0NlaQWMypVVVXm7EtoaKgaGxtVXV3das2Frn4/ceKES835x6murlZTU9M3TgXabDYFBAS4vAAAgHW5FXSGDx+uAwcOuCw7ePCgecOlfv36KTQ0VAUFBeb6xsZGFRYWatiwYZKk6OhoeXt7u9RUVFRo3759Zk1sbKycTqd27dpl1uzcuVNOp9OlZt++faqoqDBr8vPzZbPZFB0d7c5pAQAAi3LrGp0nnnhCw4YNU3Z2tiZMmKBdu3bphRde0AsvvCDpq6+S0tPTlZ2drfDwcIWHhys7O1t+fn5KTk6WJNntdk2ePFkZGRkKDAxUz549lZmZqUGDBpm/who4cKDGjBmjKVOmaOXKlZKkqVOnKjExUREREZKk+Ph4RUZGKiUlRQsXLtSpU6eUmZmpKVOmMFMDAAAkuRl07rjjDm3atEmzZ8/Wr371K/Xr10/Lli3TQw89ZNbMmjVL9fX1Sk1NVXV1tWJiYpSfn69u3bqZNUuXLpWXl5cmTJig+vp6jRo1SmvWrDGfCSJJ69evV1pamvnrrKSkJOXk5JjrPT09tXnzZqWmpmr48OHy9fVVcnKyFi1a1ObBAAAA1uJhGIbR0U10lJqaGtntdjmdziszC8RDPQFY2BdffKHy8nL169dPXbt27eh2YDGt/X258/nt9iMgAAAAOguCDgAAsCyCDgAAF+Ff//qXPDw8VFJScsH3V5M1a9a4PIfyWubWxcgAAFyU9r5GsQOuLwwLC1NFRcVlezDnmjVrlJ6ertOnT1+W/eErzOgAANAGnp6eCg0NlZcXcwbfprGxscOOTdABAFxz8vLy9L3vfU/du3dXYGCgEhMT9fHHH7vU7Nq1S4MHD1bXrl01ZMgQ7dmzx2X9+V9dXejrotdff93lsUT//Oc/dffdd6tbt24KCAhQdHS03n//fb3zzjv68Y9/LKfTKQ8PD3l4eJjPzGpsbNSsWbN0/fXXy9/
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# histogramme des probas et des probas ajustées\n",
"\n",
"plt.hist(X_test[\"score\"], label = \"score\", alpha=0.5)\n",
"plt.hist(X_test[\"score_adjusted\"], label=\"adjusted score\", alpha=0.5)\n",
"plt.legend()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 557,
"id": "646a8e9b-99dc-4e06-ab5a-42b21de6917b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.32260447885447885\n",
"0.06268731268731269\n",
"0.14246170496170496\n"
]
}
],
"source": [
"# on passe de 32% de scores supérieurs à 1/2 à 6%\n",
"\n",
"print((X_test[\"score\"]>0.5).mean())\n",
"print((X_test[\"score_adjusted\"]>0.5).mean())\n",
"print(y_test.mean()[\"y_has_purchased\"])"
]
},
{
"cell_type": "code",
"execution_count": 437,
"id": "3a60fa17-c960-4702-baa1-a7dc6cd227b0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"nombre de clients ayant acheté : 13690.0\n",
"somme des scores ajustés : 13690.000010280266\n"
]
}
],
"source": [
"# on vérifie que cette correction a permis d'avoir des résultats cohérents\n",
"\n",
"print(\"nombre de clients ayant acheté :\",y_sum)\n",
"print(\"somme des scores ajustés :\", X_test[\"score_adjusted\"].sum())"
]
},
{
"cell_type": "code",
"execution_count": 440,
"id": "3a7479a5-b6a3-47a2-8f78-4259746498f1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MSE for score : 0.15637498623391197\n",
"MSE for ajusted score : 0.08877832832116543\n"
]
}
],
"source": [
"# cet ajustement permet de plus de réduire drastiquement le MSE \n",
"\n",
"MSE_score = ((X_test[\"score\"]-X_test[\"has_purchased\"])**2).mean()\n",
"MSE_ajusted_score = ((X_test[\"score_adjusted\"]-X_test[\"has_purchased\"])**2).mean()\n",
"print(f\"MSE for score : {MSE_score}\")\n",
"print(f\"MSE for ajusted score : {MSE_ajusted_score}\")"
]
},
{
"cell_type": "code",
"execution_count": 518,
"id": "fd963072-26f7-4805-84db-5612a40dcafd",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>score</th>\n",
" <th>score_adjusted</th>\n",
" <th>has_purchased</th>\n",
" </tr>\n",
" <tr>\n",
" <th>quartile</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.169233</td>\n",
" <td>0.033442</td>\n",
" <td>0.026780</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.360811</td>\n",
" <td>0.088246</td>\n",
" <td>0.117452</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.626785</td>\n",
" <td>0.222962</td>\n",
" <td>0.209332</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.902055</td>\n",
" <td>0.652198</td>\n",
" <td>0.666549</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" score score_adjusted has_purchased\n",
"quartile \n",
"1 0.169233 0.033442 0.026780\n",
"2 0.360811 0.088246 0.117452\n",
"3 0.626785 0.222962 0.209332\n",
"4 0.902055 0.652198 0.666549"
]
},
"execution_count": 518,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# on est bcp plus proche des probas d'achat moyennes\n",
"X_test.groupby(\"quartile\")[[\"score\",\"score_adjusted\", \"has_purchased\"]].mean()"
]
},
{
"cell_type": "markdown",
"id": "0552d1c9-7edd-44ed-9954-0bc7810ec2f3",
"metadata": {},
"source": [
"Etape suivante : on peut donc calculer le potentiel de CA de chaque segment"
]
},
{
"cell_type": "code",
"execution_count": 473,
"id": "86f0740a-80b5-435b-a1ee-ae59d9143666",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_tickets</th>\n",
" <th>nb_purchases</th>\n",
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>nb_tickets_internet</th>\n",
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
" <th>...</th>\n",
" <th>decile</th>\n",
" <th>overshoot_coeff</th>\n",
" <th>ajusted_score</th>\n",
" <th>odd_ratio</th>\n",
" <th>test_adjusted_score_2</th>\n",
" <th>score_adjusted</th>\n",
" <th>nb_tickets_projected</th>\n",
" <th>total_amount_projected</th>\n",
" <th>nb_tickets_expected</th>\n",
" <th>total_amount_expected</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>100.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.177187</td>\n",
" <td>5.177187</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>6</td>\n",
" <td>3.294104</td>\n",
" <td>0.211260</td>\n",
" <td>2.288530</td>\n",
" <td>0.533640</td>\n",
" <td>0.274689</td>\n",
" <td>2.666667</td>\n",
" <td>66.666667</td>\n",
" <td>0.732503</td>\n",
" <td>18.312587</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>55.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>426.265613</td>\n",
" <td>426.265613</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>2</td>\n",
" <td>3.826401</td>\n",
" <td>0.063821</td>\n",
" <td>0.323109</td>\n",
" <td>0.139085</td>\n",
" <td>0.050756</td>\n",
" <td>0.666667</td>\n",
" <td>36.666667</td>\n",
" <td>0.033837</td>\n",
" <td>1.861053</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>17.0</td>\n",
" <td>1.0</td>\n",
" <td>80.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>436.033437</td>\n",
" <td>436.033437</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>2</td>\n",
" <td>3.826401</td>\n",
" <td>0.073069</td>\n",
" <td>0.388102</td>\n",
" <td>0.162515</td>\n",
" <td>0.060349</td>\n",
" <td>11.333333</td>\n",
" <td>53.333333</td>\n",
" <td>0.683958</td>\n",
" <td>3.218627</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>120.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.196412</td>\n",
" <td>5.196412</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>6</td>\n",
" <td>3.294104</td>\n",
" <td>0.211328</td>\n",
" <td>2.290940</td>\n",
" <td>0.533902</td>\n",
" <td>0.274899</td>\n",
" <td>2.666667</td>\n",
" <td>80.000000</td>\n",
" <td>0.733063</td>\n",
" <td>21.991884</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>34.0</td>\n",
" <td>2.0</td>\n",
" <td>416.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>478.693148</td>\n",
" <td>115.631470</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>9</td>\n",
" <td>1.268598</td>\n",
" <td>0.718781</td>\n",
" <td>10.343538</td>\n",
" <td>0.837972</td>\n",
" <td>0.631228</td>\n",
" <td>22.666667</td>\n",
" <td>277.333333</td>\n",
" <td>14.307843</td>\n",
" <td>175.060667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96091</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>67.31</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>278.442257</td>\n",
" <td>278.442257</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>5</td>\n",
" <td>3.260982</td>\n",
" <td>0.179296</td>\n",
" <td>1.407779</td>\n",
" <td>0.413108</td>\n",
" <td>0.188948</td>\n",
" <td>0.666667</td>\n",
" <td>44.873333</td>\n",
" <td>0.125966</td>\n",
" <td>8.478740</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96092</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>61.41</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>189.207373</td>\n",
" <td>189.207373</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>6</td>\n",
" <td>3.294104</td>\n",
" <td>0.198694</td>\n",
" <td>1.894523</td>\n",
" <td>0.486458</td>\n",
" <td>0.238685</td>\n",
" <td>0.666667</td>\n",
" <td>40.940000</td>\n",
" <td>0.159123</td>\n",
" <td>9.771748</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96093</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>17.863019</td>\n",
" <td>0.006522</td>\n",
" <td>0.131865</td>\n",
" <td>0.061854</td>\n",
" <td>0.021356</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96094</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>79.43</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>279.312905</td>\n",
" <td>279.312905</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>5</td>\n",
" <td>3.260982</td>\n",
" <td>0.177808</td>\n",
" <td>1.379973</td>\n",
" <td>0.408279</td>\n",
" <td>0.185910</td>\n",
" <td>0.666667</td>\n",
" <td>52.953333</td>\n",
" <td>0.123940</td>\n",
" <td>9.844555</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96095</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>2</td>\n",
" <td>3.826401</td>\n",
" <td>0.066382</td>\n",
" <td>0.340487</td>\n",
" <td>0.145477</td>\n",
" <td>0.053340</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>96096 rows × 32 columns</p>\n",
"</div>"
],
"text/plain": [
" nb_tickets nb_purchases total_amount nb_suppliers \\\n",
"0 4.0 1.0 100.00 1.0 \n",
"1 1.0 1.0 55.00 1.0 \n",
"2 17.0 1.0 80.00 1.0 \n",
"3 4.0 1.0 120.00 1.0 \n",
"4 34.0 2.0 416.00 1.0 \n",
"... ... ... ... ... \n",
"96091 1.0 1.0 67.31 1.0 \n",
"96092 1.0 1.0 61.41 1.0 \n",
"96093 0.0 0.0 0.00 0.0 \n",
"96094 1.0 1.0 79.43 1.0 \n",
"96095 0.0 0.0 0.00 0.0 \n",
"\n",
" vente_internet_max purchase_date_min purchase_date_max \\\n",
"0 0.0 5.177187 5.177187 \n",
"1 0.0 426.265613 426.265613 \n",
"2 0.0 436.033437 436.033437 \n",
"3 0.0 5.196412 5.196412 \n",
"4 0.0 478.693148 115.631470 \n",
"... ... ... ... \n",
"96091 1.0 278.442257 278.442257 \n",
"96092 1.0 189.207373 189.207373 \n",
"96093 0.0 550.000000 550.000000 \n",
"96094 1.0 279.312905 279.312905 \n",
"96095 0.0 550.000000 550.000000 \n",
"\n",
" nb_tickets_internet is_email_true opt_in ... decile \\\n",
"0 0.0 True False ... 6 \n",
"1 0.0 True True ... 2 \n",
"2 0.0 True True ... 2 \n",
"3 0.0 True False ... 6 \n",
"4 0.0 True False ... 9 \n",
"... ... ... ... ... ... \n",
"96091 1.0 True False ... 5 \n",
"96092 1.0 True False ... 6 \n",
"96093 0.0 True True ... 1 \n",
"96094 1.0 True False ... 5 \n",
"96095 0.0 True False ... 2 \n",
"\n",
" overshoot_coeff ajusted_score odd_ratio test_adjusted_score_2 \\\n",
"0 3.294104 0.211260 2.288530 0.533640 \n",
"1 3.826401 0.063821 0.323109 0.139085 \n",
"2 3.826401 0.073069 0.388102 0.162515 \n",
"3 3.294104 0.211328 2.290940 0.533902 \n",
"4 1.268598 0.718781 10.343538 0.837972 \n",
"... ... ... ... ... \n",
"96091 3.260982 0.179296 1.407779 0.413108 \n",
"96092 3.294104 0.198694 1.894523 0.486458 \n",
"96093 17.863019 0.006522 0.131865 0.061854 \n",
"96094 3.260982 0.177808 1.379973 0.408279 \n",
"96095 3.826401 0.066382 0.340487 0.145477 \n",
"\n",
" score_adjusted nb_tickets_projected total_amount_projected \\\n",
"0 0.274689 2.666667 66.666667 \n",
"1 0.050756 0.666667 36.666667 \n",
"2 0.060349 11.333333 53.333333 \n",
"3 0.274899 2.666667 80.000000 \n",
"4 0.631228 22.666667 277.333333 \n",
"... ... ... ... \n",
"96091 0.188948 0.666667 44.873333 \n",
"96092 0.238685 0.666667 40.940000 \n",
"96093 0.021356 0.000000 0.000000 \n",
"96094 0.185910 0.666667 52.953333 \n",
"96095 0.053340 0.000000 0.000000 \n",
"\n",
" nb_tickets_expected total_amount_expected \n",
"0 0.732503 18.312587 \n",
"1 0.033837 1.861053 \n",
"2 0.683958 3.218627 \n",
"3 0.733063 21.991884 \n",
"4 14.307843 175.060667 \n",
"... ... ... \n",
"96091 0.125966 8.478740 \n",
"96092 0.159123 9.771748 \n",
"96093 0.000000 0.000000 \n",
"96094 0.123940 9.844555 \n",
"96095 0.000000 0.000000 \n",
"\n",
"[96096 rows x 32 columns]"
]
},
"execution_count": 473,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# on calcule d'abord pour chaque client le nombre de tickets achetés / montant total si achat\n",
"# comme la période d'étude est d'un an et demi, sur l'année à venir on espère vendre 1.5 fois le nbre de tickets vendu\n",
"\n",
"# ensuite, on multiplie par la proba d'achat pour avoir le nombre de tickets potentiellement acheté\n",
"# et le montant total associé\n",
"\n",
"\n",
"X_test[\"nb_tickets_projected\"] = X_test[\"nb_tickets\"] / 1.5\n",
"X_test[\"total_amount_projected\"] = X_test[\"total_amount\"] / 1.5\n",
"\n",
"X_test[\"nb_tickets_expected\"] = X_test[\"score_adjusted\"] * X_test[\"nb_tickets_projected\"]\n",
"X_test[\"total_amount_expected\"] = X_test[\"score_adjusted\"] * X_test[\"total_amount_projected\"]\n",
"\n",
"X_test"
]
},
{
"cell_type": "code",
"execution_count": 474,
"id": "c8c8eec5-27d9-41cc-b62f-66246a24f1a4",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_tickets</th>\n",
" <th>total_amount</th>\n",
" <th>nb_tickets_expected</th>\n",
" <th>total_amount_expected</th>\n",
" </tr>\n",
" <tr>\n",
" <th>quartile</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.017380</td>\n",
" <td>0.475141</td>\n",
" <td>0.000590</td>\n",
" <td>0.016112</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2.085810</td>\n",
" <td>49.701732</td>\n",
" <td>0.134566</td>\n",
" <td>3.298096</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3.118100</td>\n",
" <td>88.811284</td>\n",
" <td>0.478898</td>\n",
" <td>13.258736</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>46.046362</td>\n",
" <td>2002.607230</td>\n",
" <td>26.753314</td>\n",
" <td>1246.363503</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" nb_tickets total_amount nb_tickets_expected total_amount_expected\n",
"quartile \n",
"1 0.017380 0.475141 0.000590 0.016112\n",
"2 2.085810 49.701732 0.134566 3.298096\n",
"3 3.118100 88.811284 0.478898 13.258736\n",
"4 46.046362 2002.607230 26.753314 1246.363503"
]
},
"execution_count": 474,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# potentiel de CA par segment, et comparaison avec le CA passé/1.5\n",
"\n",
"X_test.groupby(\"quartile\")[[\"nb_tickets\",\"total_amount\",\"nb_tickets_expected\",\"total_amount_expected\"]].mean()"
]
},
{
"cell_type": "code",
"execution_count": 519,
"id": "f7052cc7-054b-4b9d-935e-81611b1f6a61",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>quartile</th>\n",
" <th>nb_tickets</th>\n",
" <th>total_amount</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>554.666667</td>\n",
" <td>1.516365e+04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>23950.666667</td>\n",
" <td>5.707084e+05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>46732.000000</td>\n",
" <td>1.331044e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>261543.333333</td>\n",
" <td>1.137481e+07</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" quartile nb_tickets total_amount\n",
"0 1 554.666667 1.516365e+04\n",
"1 2 23950.666667 5.707084e+05\n",
"2 3 46732.000000 1.331044e+06\n",
"3 4 261543.333333 1.137481e+07"
]
},
"execution_count": 519,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_expected_CA = (X_test.groupby(\"quartile\")[[\"nb_tickets\",\"total_amount\"]].sum()/1.5).reset_index()\n",
"df_expected_CA"
]
},
{
"cell_type": "code",
"execution_count": 520,
"id": "655c499e-29d2-4811-bba2-e4184bc123e5",
"metadata": {},
"outputs": [],
"source": [
"df_expected_CA[[\"nb_tickets_expected\",\"total_amount_expected\"]] = (X_test.groupby(\"quartile\")[[\"nb_tickets_expected\",\"total_amount_expected\"]].sum()).reset_index()[[\"nb_tickets_expected\", \"total_amount_expected\"]]"
]
},
{
"cell_type": "code",
"execution_count": 521,
"id": "917891a5-8906-4c19-96ff-5160fb437a86",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>quartile</th>\n",
" <th>nb_tickets</th>\n",
" <th>total_amount</th>\n",
" <th>nb_tickets_expected</th>\n",
" <th>total_amount_expected</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>554.666667</td>\n",
" <td>1.516365e+04</td>\n",
" <td>28.262185</td>\n",
" <td>7.713112e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>23950.666667</td>\n",
" <td>5.707084e+05</td>\n",
" <td>2317.763439</td>\n",
" <td>5.680641e+04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>46732.000000</td>\n",
" <td>1.331044e+06</td>\n",
" <td>10766.103277</td>\n",
" <td>2.980696e+05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>261543.333333</td>\n",
" <td>1.137481e+07</td>\n",
" <td>227938.234982</td>\n",
" <td>1.061902e+07</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" quartile nb_tickets total_amount nb_tickets_expected \\\n",
"0 1 554.666667 1.516365e+04 28.262185 \n",
"1 2 23950.666667 5.707084e+05 2317.763439 \n",
"2 3 46732.000000 1.331044e+06 10766.103277 \n",
"3 4 261543.333333 1.137481e+07 227938.234982 \n",
"\n",
" total_amount_expected \n",
"0 7.713112e+02 \n",
"1 5.680641e+04 \n",
"2 2.980696e+05 \n",
"3 1.061902e+07 "
]
},
"execution_count": 521,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_expected_CA"
]
},
{
"cell_type": "code",
"execution_count": 532,
"id": "6b90ea7d-37be-49e4-b0c2-b38a37058e24",
"metadata": {},
"outputs": [],
"source": [
"# add number of customers of each segment\n",
"df_expected_CA.insert(1, \"size\", X_test.groupby(\"quartile\").size().values)"
]
},
{
"cell_type": "code",
"execution_count": 535,
"id": "7efab307-0a98-4049-afe6-b292fa3c4036",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>quartile</th>\n",
" <th>size</th>\n",
" <th>nb_tickets</th>\n",
" <th>total_amount</th>\n",
" <th>nb_tickets_expected</th>\n",
" <th>total_amount_expected</th>\n",
" <th>total_amount_recovered</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>47871</td>\n",
" <td>554.666667</td>\n",
" <td>1.516365e+04</td>\n",
" <td>28.262185</td>\n",
" <td>7.713112e+02</td>\n",
" <td>0.050866</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>17224</td>\n",
" <td>23950.666667</td>\n",
" <td>5.707084e+05</td>\n",
" <td>2317.763439</td>\n",
" <td>5.680641e+04</td>\n",
" <td>0.099537</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>22481</td>\n",
" <td>46732.000000</td>\n",
" <td>1.331044e+06</td>\n",
" <td>10766.103277</td>\n",
" <td>2.980696e+05</td>\n",
" <td>0.223937</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>8520</td>\n",
" <td>261543.333333</td>\n",
" <td>1.137481e+07</td>\n",
" <td>227938.234982</td>\n",
" <td>1.061902e+07</td>\n",
" <td>0.933556</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" quartile size nb_tickets total_amount nb_tickets_expected \\\n",
"0 1 47871 554.666667 1.516365e+04 28.262185 \n",
"1 2 17224 23950.666667 5.707084e+05 2317.763439 \n",
"2 3 22481 46732.000000 1.331044e+06 10766.103277 \n",
"3 4 8520 261543.333333 1.137481e+07 227938.234982 \n",
"\n",
" total_amount_expected total_amount_recovered \n",
"0 7.713112e+02 0.050866 \n",
"1 5.680641e+04 0.099537 \n",
"2 2.980696e+05 0.223937 \n",
"3 1.061902e+07 0.933556 "
]
},
"execution_count": 535,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_expected_CA[\"total_amount_recovered\"] = df_expected_CA[\"total_amount_expected\"]/df_expected_CA[\"total_amount\"]\n",
"df_expected_CA"
]
},
{
"cell_type": "code",
"execution_count": 539,
"id": "00cc2db8-d20b-4a0b-846c-c6199c58a834",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>count</th>\n",
" <th>mean</th>\n",
" <th>std</th>\n",
" <th>min</th>\n",
" <th>25%</th>\n",
" <th>50%</th>\n",
" <th>75%</th>\n",
" <th>max</th>\n",
" </tr>\n",
" <tr>\n",
" <th>quartile</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>47871.0</td>\n",
" <td>0.033442</td>\n",
" <td>0.013951</td>\n",
" <td>0.019591</td>\n",
" <td>0.019867</td>\n",
" <td>0.023766</td>\n",
" <td>0.048136</td>\n",
" <td>0.052262</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>17224.0</td>\n",
" <td>0.088246</td>\n",
" <td>0.028737</td>\n",
" <td>0.052283</td>\n",
" <td>0.060481</td>\n",
" <td>0.082054</td>\n",
" <td>0.115089</td>\n",
" <td>0.141983</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>22481.0</td>\n",
" <td>0.222962</td>\n",
" <td>0.048039</td>\n",
" <td>0.141993</td>\n",
" <td>0.183323</td>\n",
" <td>0.219550</td>\n",
" <td>0.268865</td>\n",
" <td>0.331754</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>8520.0</td>\n",
" <td>0.652198</td>\n",
" <td>0.201486</td>\n",
" <td>0.332049</td>\n",
" <td>0.473052</td>\n",
" <td>0.640295</td>\n",
" <td>0.827644</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" count mean std min 25% 50% 75% \\\n",
"quartile \n",
"1 47871.0 0.033442 0.013951 0.019591 0.019867 0.023766 0.048136 \n",
"2 17224.0 0.088246 0.028737 0.052283 0.060481 0.082054 0.115089 \n",
"3 22481.0 0.222962 0.048039 0.141993 0.183323 0.219550 0.268865 \n",
"4 8520.0 0.652198 0.201486 0.332049 0.473052 0.640295 0.827644 \n",
"\n",
" max \n",
"quartile \n",
"1 0.052262 \n",
"2 0.141983 \n",
"3 0.331754 \n",
"4 1.000000 "
]
},
"execution_count": 539,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# la part de CA recouvrée est tjs supérieure à la part de clients qui reviennent\n",
"# ça semble logique : ceux qui reviennent sont aussi ceux qui consomment le plus \n",
"# se voit srtt sur dernier quartile : on récupère 65% des clients (avec probas ajustées) mais 93% du CA \n",
"X_test.groupby(\"quartile\")[\"score_adjusted\"].describe()"
]
},
{
"cell_type": "markdown",
"id": "59a0850a-c40d-472a-9361-e96840e2b046",
"metadata": {},
"source": [
"## Study potential of each segment"
]
},
{
"cell_type": "code",
"execution_count": 180,
"id": "1773bac2-ab5e-4bca-bda5-aa13e36991e5",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1wAAAIjCAYAAAAX5hpkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAAB8hklEQVR4nO3dd3QV1eL28eek90MSSEIkQIDQQxcIoIB0jKiooHARBHuBKEW5XAUsVCleEQREooDEBlxskSJE6UUiUkRRqiSAGkJPINnvH7yZH4cESEIOoXw/a521cmb2zOw5s095smf22IwxRgAAAACAIudS3BUAAAAAgBsVgQsAAAAAnITABQAAAABOQuACAAAAACchcAEAAACAkxC4AAAAAMBJCFwAAAAA4CQELgAAAABwEgIXAAAAADgJgQtAnrKystS8eXM1a9ZMp0+fLu7qAACuQ3yXAASu61p8fLxsNpv18PLyUlhYmFq2bKmRI0fq0KFDuZYZNmyYbDZbgbZz8uRJDRs2TMuXLy/Qcnltq3z58oqNjS3Qei7no48+0sSJE/OcZ7PZNGzYsCLd3pUozOvvLC1atFCLFi0uOn/IkCE6dOiQFi5cKC8vL4d55cuXV69evQq13StZFs5x4MABDRs2TMnJycVdFfXq1Uvly5cv7mpcNUX9mVjYz+uiUFTtKOe7bffu3UVSr+XLl8tmszm8JsOGDbtm2tm19j2VX5MnT1Z8fPxly13qu6QgLvedVVSK+3u6uLdfWLt375bNZtObb75ZZOvctm2bhg0bVmSfBcXJrbgrgCs3c+ZMVa1aVWfOnNGhQ4e0YsUKjR49Wm+++aY+/vhjtW7d2ir76KOPqn379gVa/8mTJzV8+HBJKtCHXWG2VRgfffSRtmzZori4uFzzVq9erTJlyji9Djear776SrNmzdKqVasUFBRUpOueP3++AgICinSduDIHDhzQ8OHDVb58edWpU6e4q4MrUNjP66JAOyqc6/V7avLkySpZsuQl/4HmzO+SG9XV+u10Pdi2bZuGDx+uFi1aXDP/ICksAtcNoGbNmmrQoIH1/L777tPzzz+vZs2aqXPnzvrtt98UGhoqSSpTpozTP9hPnjwpHx+fq7Kty2ncuHGxbv96deedd+rPP/90yrrr1q3rlPXi6sl5jwO4cjfy95Qzv0uuFwX9vCzq3058Xl8bOKXwBlW2bFmNGzdOx44d09SpU63peXVVf/fdd2rRooWCg4Pl7e2tsmXL6r777tPJkye1e/dulSpVSpI0fPhw6/TFnP9o5azvxx9/1P3336/AwEBVrFjxotvKMX/+fNWqVUteXl6qUKGC/vvf/zrMv9gpJReeGtKiRQt99dVX2rNnj8PplTnyOlVjy5YtuvvuuxUYGCgvLy/VqVNHH3zwQZ7bmTt3roYMGaLw8HAFBASodevW2rFjx8Vf+PN89dVXqlOnjjw9PRUZGXnRbnZjjCZPnqw6derI29tbgYGBuv/++/XHH384lNu0aZNiY2MVEhIiT09PhYeH684779T+/fsvWQ9jjMaMGaNy5crJy8tL9erV0zfffJNn2aNHj2rAgAGKjIyUh4eHbrnlFsXFxenEiROX3Mbp06fVv39/1alTR3a7XUFBQYqJidH//ve/XGUvPKUwOztbr7/+uqpUqSJvb2+VKFFCtWrV0ltvvXXJbeYco48++kgvvviiSpcuLT8/P9111106ePCgjh07pscff1wlS5ZUyZIl9cgjj+j48eO56j148GCH/X3mmWd05MgRq0yfPn0UFBSkkydP5qrDHXfcoRo1aljP83ssW7RooZo1a2r9+vW67bbb5OPjowoVKmjUqFHKzs6+5H4X9XaWL1+uW2+9VZL0yCOPWO+hnPdNr1695Ofnp59//llt27aVv7+/WrVqJUnKzMzU66+/rqpVq8rT01OlSpXSI488osOHD192H6Rz7/MqVarI09NT1apV04cffphnufxu52KnZp3f5owx6tixo4KDg7V3716rzMmTJ1WjRg1Vq1btku29IG09Oztbb7/9tnWcSpQoocaNG2vhwoW5yiYmJqpevXry9vZW1apV9f777zvMP3z4sJ5++mlVr15dfn5+CgkJ0R133KEffvjBKnO5z+vDhw/r8ccfV0REhPU6Nm3aVEuWLLno/ub47bff1K1bN+vzp1q1anrnnXes+ZdrRxezZs0aNW3aVF5eXgoPD9fgwYN15syZPMt+/PHHiomJka+vr/z8/NSuXTtt2rTpsnXPr3feeUe33367QkJC5Ovrq+joaI0ZMyZXfQry/j1y5Ij69++vChUqyNPTUyEhIerYsaN++eUXq0xer1NqaqqeeOIJlSlTRh4eHoqMjNTw4cN19uxZq8z5p3CNHz9ekZGR8vPzU0xMjNasWeOwvpz38c6dO9WxY0f5+fkpIiJC/fv3V0ZGhkPZ/Lzfypcvr61btyopKck61uf3QBT2u0TK/3dWfn8nXEpRf0/ntI3vv/9eTZo0kY+Pj3r37l2gY3Wx3075af+X+rzOy86dO/XII48oKipKPj4+uuWWW3TXXXfp559/zlU2P205x+X2ccOGDXrwwQdVvnx5eXt7q3z58nrooYe0Z88eq0x8fLweeOABSVLLli2tdpaf01ivSQbXrZkzZxpJZv369XnOP378uHF1dTWtWrWypg0dOtScf9h37dplvLy8TJs2bcyCBQvM8uXLzZw5c0yPHj1MWlqaOX36tElMTDSSTJ8+fczq1avN6tWrzc6dOx3WV65cOfPiiy+axYsXmwULFuS5LWOMKVeunLnllltM2bJlzfvvv2++/vpr0717dyPJjB07Nte+7dq1y2H5ZcuWGUlm2bJlxhhjtm7dapo2bWrCwsKsuq1evdoqL8kMHTrUev7LL78Yf39/U7FiRfPhhx+ar776yjz00ENGkhk9enSu7ZQvX950797dfPXVV2bu3LmmbNmyJioqypw9e/aSx2bJkiXG1dXVNGvWzMybN898+umn5tZbbzVly5bN9Zo89thjxt3d3fTv398kJiaajz76yFStWtWEhoaa1NRU61gGBwebBg0amE8++cQkJSWZjz/+2Dz55JNm27Ztl6xLznHo06eP+eabb8y0adPMLbfcYsLCwkzz5s2tcidOnDB16tQxJUuWNOPHjzdLliwxb731lrHb7eaOO+4w2dnZDsexZ8+e1vMjR46YXr16mVmzZpnvvvvOJCYmmgEDBhgXFxfzwQcfONTnwmVHjhxpXF1dzdChQ83SpUtNYmKimThxohk2bNgl9yvnGJUrV8706tXLJCYmmnfffdf4+fmZli1bmjZt2pgBAwaYRYsWmdGjRxtXV1fz3HPPWctnZ2ebdu3aGTc3N/Pyyy+bRYsWmTfffNP4+vqaunXrmtOnTxtjjPnpp5+MJDN9+nSH7W/dutVIMu+8806BjqUxxjRv3twEBwebqKgo8+6775rFixebp59+2kjK9XrlpSi3k56ebr3f/vOf/1jvoX379hljjOnZs6dxd3c35cuXNyNHjjRLly413377rcnKyjLt27c3vr6+Zvjw4Wbx4sXmvffeM7fccoupXr26OXny5CX3IWebd999t/niiy/M7NmzTaVKlUxERIQpV66cVa4g27nw/Z7jwjb3119/mTJlyphGjRqZzMxMaz+9vb3N5s2bL1nvgrT1Hj16GJvNZh599FHzv//9z3zzzTfmjTfeMG+99ZZD3cqUKWOqV69uPvzwQ/Ptt9+aBx54wEgySUlJVrlffvnFPPXUUyYhIcEsX77cfPnll6ZPnz7GxcXF+jy83Od1u3btTKlSpcy0adPM8uXLzYIFC8wrr7xiEhISLrnPW7duNXa73URHR5sPP/zQLFq0yPTv39+4uLhY79PLtaOLrdfHx8dUr17dzJ071/zvf/8z7dq1sz4nz//8f+ONN4zNZjO9e/c2X375pZk3b56JiYkxvr6+ZuvWrZes/4XfGxfz/PPPmylTppjExETz3XffmQkTJpiSJUuaRx55xKFcft+/R48eNTVq1DC+vr7m1VdfNd9++635/PPPTb9+/cx3331nlbu
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# is pace of purchase a good measure ? \n",
"# we ll compare the avg purchase delay and the purchase date max\n",
"\n",
"plt.figure(figsize = [10,6])\n",
"\n",
"plt.hist(X_test[X_test[\"avg_purchase_delay\"]>0][\"avg_purchase_delay\"], alpha = 0.5, label = \"average purchase delay\")\n",
"plt.hist(X_test[X_test[\"avg_purchase_delay\"]>0][\"purchase_date_max\"], alpha=0.5, label = \"recency of the last purchase\")\n",
"plt.legend()\n",
"plt.xlabel(\"durée (jours)\")\n",
"plt.ylabel(\"fréquence\")\n",
"plt.title(\"Distribution des délais moyen entre deux achats et de l'ancienneté du dernier achat\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 199,
"id": "3ef409fe-dcf7-4c07-9be3-28b3e8ca5546",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2IAAAIjCAYAAABh3KjvAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAACMHklEQVR4nOzdeXhNV/v/8c8RmROHhCSiIRRBzRRBixJUVVWLojG2OhtqaD2qpYOxhj5VY2toUTrgUdUQitY8hhbVao2VoKQxlIRk/f7wy/46mUTEifJ+XVcuzt5r73Xv4ex97rPWXsdmjDECAAAAADhNvrwOAAAAAADuNiRiAAAAAOBkJGIAAAAA4GQkYgAAAADgZCRiAAAAAOBkJGIAAAAA4GQkYgAAAADgZCRiAAAAAOBkJGIAAAAA4GQkYgBu2H/+8x8FBgbq999/z+tQgH+dadOmydvbW5s3b87rUADgttapUyeFhYXpr7/+yutQbgkSsevYvXu3unXrppIlS8rDw0M+Pj6qXr26Ro8erTNnzuR1eLeNWbNmyWaz6dChQ3kdilPs3btXQ4cOder2du3aVaGhobe8nqFDh8pms2U6/7vvvtPEiRO1dOlS3XvvvQ7zbibGG1l20qRJmjVrVo7quR2sWbNGNptNa9assaZ17dpVDRs2vGV1NmzYUBUrVszVdQ4fPlyLFy/O1XXeKs56/1xPTEyMXn31Vc2fP1+1a9fO9nK3Iv7Q0FB17do1V9eZnXpSz/9bdf1MvR9t27Yt19a5bNkyDR06NNfWl2rDhg0aOnSo/v7771xfd247dOiQbDabw7U3dV/fTUJDQ9WyZcvrlstof90qNpstV89PZ10brmfKlClatWqVoqKiVLhw4Wwvl9vx38pjSSKWhenTp6tGjRraunWrBgwYoKioKC1atEht27bVlClT1KNHj7wO8bbxyCOPaOPGjSpatGheh+IUe/fu1bBhw+6axDPV0aNH1a1bNy1YsED3339/rq57yJAhWrRoUbbK/tsTsTvFvykRux2cPXtWbdu21bhx4/Too4/e0LI38v5A7lu2bJmGDRuW6+vdsGGDhg0b9q9IxABn2rFjh4YMGaJly5apZMmSN7TsokWLNGTIkFsUWe7Kn9cB3K42btyoF154QREREVq8eLHc3d2teREREerXr5+ioqLyMMJb659//pGXl1e2yxcpUkRFihS5hRH9u93o/rxdhYSEKC4u7pasO23rGnCjLl68KE9Pz7wOI1MFChTQb7/9dkPLpF47eH/AGS5evCgPD4+7rpULt5/q1avr1KlTN7RM6j2gWrVqtyiq3EeLWCaGDx8um82madOmOSRhqdzc3NSqVSvrdUpKikaPHq1y5crJ3d1dAQEB6ty5s44dO+awXGr3oI0bN6pu3bry9PRUaGioZs6cKUn69ttvVb16dXl5ealSpUrpkr3UbmM7d+5UmzZtVKBAAdntdj399NPpTtgFCxaoadOmKlq0qDw9PVW+fHm9/vrrunDhgkO5rl27ysfHRz/99JOaNm0qX19fNW7cWJIUHR2txx57TPfcc488PDxUunRpPffcc+n66mbUNXHnzp1q2bKlAgIC5O7uruDgYD3yyCMO++TSpUsaNGiQSpYsKTc3NxUrVkwvvfRSum8HU7sCREVFqXr16vL09FS5cuU0Y8aMjA5fOklJSXr33Xet41OkSBF169Yt3T7LTj2zZs1S27ZtJUmNGjWSzWZzaLJOPcY//PCD6tatKy8vL3Xv3l3S1W/E+/fv77C9ffr0SXdMsssYo0mTJqlq1ary9PRUoUKF9OSTT+qPP/7I1vLffvutqlatKnd3d5UsWVLvv/9+rtfz0Ucf6cEHH1RAQIC8vb1VqVIljR49WpcvX3Yol92uV6GhodqzZ4/Wrl1r7fvQ0FCdP39eBQsW1HPPPZdumUOHDsnFxUVjxoyR9H/na3R0tLp16yY/Pz95e3vr0UcfzXCbVq5cqcaNG6tAgQLy8vJSvXr1tGrVquvGKkm//PKLmjdvLi8vLxUuXFjPP/+8zp07l61lhw0bptq1a8vPz08FChRQ9erV9cknn8gYk67svHnzFB4eLh8fH/n4+Khq1ar65JNP0pXbunWrHnjgAXl5ealUqVIaOXKkUlJSrPmXLl1Sv379VLVqVdntdvn5+Sk8PFz/+9//HNZjs9l04cIFzZ492zoO1+taOXnyZFWpUkU+Pj7y9fVVuXLl9J///Mean1m32IyuL6nv1YULF6patWry8PC44RaL7J6bGbmRa7F09XocHh4ub29v+fj4qFmzZtq5c6dDmayuxRm9P7J7/bx8+bIGDhyooKAgeXl5qX79+tqyZUuG2xUXF6fnnntO99xzj9zc3FSyZEkNGzZMV65cue4+uZF60sruvUa6+p7q0KGDAgMD5e7uruLFi6tz585KTEx0KHfu3Dm98MILKly4sPz9/dWmTRsdP37coUx27pNdu3bVRx99JEnWuZ6drpXXu24MHTpUAwYMkCSVLFnSWu+1XZbT+uOPP/TUU08pODhY7u7uCgwMVOPGjRUTE2OVyaybWtruWqnvqxUrVqh79+4qUqSIvLy80u3HnLiR8/3AgQNq0aKFfHx8FBISon79+qWL4XrXDil7525qF7MxY8Zo1KhRCg0Nlaenpxo2bKhff/1Vly9f1uuvv67g4GDZ7XY9/vjjOnnyZIbbuGjRIlWuXFkeHh4qVaqU/vvf/2Zr3/z222/q2LGj9dmofPny1vl1PWfPntWzzz4rf39/+fj4qHnz5vr1119zvZ60sntfyEzq56Iff/xRderUkaenp4oVK6YhQ4YoOTnZoeyNfl7L6B6QUdfEI0eO6Omnn3bYH2PHjnW4/0nS8ePH1a5dO/n6+sput6t9+/aZfgG9bds2tWrVSn5+fvLw8FC1atX0xRdfZGufWAzSuXLlivHy8jK1a9fO9jI9e/Y0kszLL79soqKizJQpU0yRIkVMSEiIOXXqlFWuQYMGxt/f34SFhZlPPvnELF++3LRs2dJIMsOGDTOVKlUyn3/+uVm2bJmpU6eOcXd3N3/++ae1/FtvvWUkmRIlSpgBAwaY5cuXm3Hjxhlvb29TrVo1k5SUZJV95513zPjx4823335r1qxZY6ZMmWJKlixpGjVq5BB7ly5djKurqwkNDTUjRowwq1atMsuXLzfGGDN58mQzYsQIs2TJErN27Voze/ZsU6VKFRMWFuZQ18yZM40kc/DgQWOMMefPnzf+/v6mZs2a5osvvjBr1641CxYsMM8//7zZu3evMcaYlJQU06xZM5M/f34zZMgQs2LFCvP+++9b23Lp0iVr/SVKlDD33HOPqVChgvn000/N8uXLTdu2bY0ks3bt2iyPTXJysmnevLnx9vY2w4YNM9HR0ebjjz82xYoVMxUqVDD//PPPDdVz8uRJM3z4cCPJfPTRR2bjxo1m48aN5uTJk9Yx9vPzMyEhIebDDz80q1evNmvXrjUXLlwwVatWNYULFzbjxo0zK1euNB988IGx2+3moYceMikpKVluR5cuXUyJEiUcpj377LPG1dXV9OvXz0RFRZl58+aZcuXKmcDAQBMXF5fl+lauXGlcXFxM/fr1zcKFC82XX35p7r//flO8eHGT9tKQ3XoyirFv375m8uTJJioqynz//fdm/PjxpnDhwqZbt27X3b6M7Nixw5QqVcpUq1bN2vc7duyw6vL29jZ///23wzIDBgwwHh4e5q+//jLG/N/5GhISYrp3726+++47M23aNBMQEGBCQkJMfHy8texnn31mbDabad26tVm4cKH55ptvTMuWLY2Li4tZuXJllrHGxcWZgIAAU6xYMTNz5kyzbNky06lTJ2sfr169Osvlu3btaj755BMTHR1toqOjzTvvvGM8PT3NsGHDHMoNGTLESDJt2rQxX375pVmxYoUZN26cGTJkiFUm9dpTpkwZM2XKFBMdHW1efPFFI8nMnj3bKvf333+brl27ms8++8x8//33JioqyvTv39/ky5fPodzGjRuNp6enadGihXUc9uzZk+m2fP7550aSeeWVV8yKFSvMypUrzZQ
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.figure(figsize = [10,6])\n",
"\n",
"plt.hist(X_test[X_test[\"avg_purchase_delay\"]>0][\"avg_purchase_delay\"], alpha = 0.5, label = \"average purchase delay on the purchasing period\")\n",
"plt.hist(X_test[X_test[\"avg_purchase_delay\"]>0][\"purchase_date_min\"]/X_test[X_test[\"avg_purchase_delay\"]>0][\"nb_purchases\"], alpha=0.5, label = \"average purchase delay on the full period\")\n",
"plt.legend()\n",
"plt.xlabel(\"durée (jours)\")\n",
"plt.ylabel(\"fréquence\")\n",
"plt.title(\"Comparaison entre le délai-type d'achat sur la période d'achat et sur l'ensemble de la période\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "2a46a811-9169-43e2-a759-461562f4f250",
"metadata": {},
"source": [
"Il vaut mieux prendre le rythme en considérant purchase date min au dénominateur plutôt que le délai entre le \n",
"1er et le dernier achat"
]
},
{
"cell_type": "code",
"execution_count": 192,
"id": "fad27180-e1f2-4876-b0b8-2254c342fc36",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 1.473400e+04\n",
"mean 9.011960e+07\n",
"std 8.222514e+08\n",
"min 0.000000e+00\n",
"25% 7.194159e-01\n",
"50% 3.564579e+00\n",
"75% 2.645439e+01\n",
"max 1.996151e+10\n",
"dtype: float64"
]
},
"execution_count": 192,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(X_test[X_test[\"avg_purchase_delay\"]>0][\"purchase_date_max\"]/X_test[X_test[\"avg_purchase_delay\"]>0][\"avg_purchase_delay\"]).describe()"
]
},
{
"cell_type": "code",
"execution_count": 196,
"id": "c232ced3-c9b2-4e35-b89b-c18f7c99dc7a",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAh8AAAGdCAYAAACyzRGfAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAZT0lEQVR4nO3dUWyd913/8c8ZpQd7tT26rT6x6rZBuEu2rNuSTqEOkAyWSGHqnykTDNJM5YIpJR0QKhQIkf54E9gQtChI1qK1F22mKpSbrUyCdbGEllxEFa6jiCpN2k5LmaE1Vqdgu43liPb8L6qcf9103U5y8nNO+npJj7bze55z/PVu/N4vj/1U6vV6PQAAhbxnqQcAAN5dxAcAUJT4AACKEh8AQFHiAwAoSnwAAEWJDwCgKPEBABR13VIP8Favv/56XnzxxXR1daVSqSz1OADAT6Fer2dubi59fX15z3veeW/jqouPF198Mf39/Us9BgBwCSYnJ3PzzTe/4zVXXXx0dXUleWP47u7uJZ4GAPhpzM7Opr+/v/Fz/J1cdfFx4Z9auru7xQcAtJmf5pYJN5wCAEWJDwCgKPEBABQlPgCAosQHAFCU+AAAihIfAEBR4gMAKEp8AABFiQ8AoCjxAQAUJT4AgKKuugfLAVefc+fO5fTp05f9OfPz83nhhRdy2223paOjowWTJStWrEhnZ2dLPgsoQ3wAP9Hp06ezZs2apR7jbU1MTGT16tVLPQbQBPEB/EQrVqzIxMTEZX/OqVOnsm3btjz66KNZuXJlCyZ7YzagvYgP4Cfq7Oxs6e7CypUr7VbAu5gbTgGAosQHAFCU+AAAihIfAEBR4gMAKEp8AABFiQ8AoCjxAQAUJT4AgKLEBwBQlPgAAIoSHwBAUeIDAChKfAAARYkPAKAo8QEAFCU+AICixAcAUJT4AACKajo+/uu//ivbtm3L+9///nR2dubjH/94JiYmGufr9XqGhobS19eXjo6ObNiwISdPnmzp0ABA+2oqPs6ePZt169blZ3/2Z/Od73wnzzzzTL761a/mfe97X+OavXv3Zt++fRkdHc34+HhqtVo2btyYubm5Vs8OALSh65q5+G//9m/T39+fhx9+uLF22223Nf57vV7P/v37s2fPnmzZsiVJcvDgwfT29ubQoUPZvn17a6YGANpWUzsf3/72t3PnnXfmt37rt3LTTTflE5/4RB566KHG+TNnzmRqaiqbNm1qrFWr1axfvz7Hjh17289cWFjI7OzsogMAuHY1FR8/+MEPcuDAgQwMDOS73/1u7rvvvvzRH/1RvvGNbyRJpqamkiS9vb2L3tfb29s491YjIyPp6elpHP39/ZfyfQAAbaKp+Hj99dezevXqDA8P5xOf+ES2b9+eL37xizlw4MCi6yqVyqLX9Xr9orULdu/enZmZmcYxOTnZ5LcAALSTpuJj2bJl+fCHP7xobeXKlfnhD3+YJKnVakly0S7H9PT0RbshF1Sr1XR3dy86AIBrV1PxsW7dujz77LOL1p577rnceuutSZLly5enVqtlbGyscf78+fM5cuRIBgcHWzAuANDumvptlz/5kz/J4OBghoeH89u//dv5t3/7tzz44IN58MEHk7zxzy07d+7M8PBwBgYGMjAwkOHh4XR2dmbr1q1X5BsAANpLU/HxyU9+Mt/61reye/fufOUrX8ny5cuzf//+3HPPPY1rdu3alfn5+ezYsSNnz57N2rVrc/jw4XR1dbV8eACg/VTq9Xp9qYd4s9nZ2fT09GRmZsb9H3CNOX78eNasWZOJiYmsXr16qccBWqiZn9+e7QIAFCU+AICixAcAUJT4AACKEh8AQFHiAwAoSnwAAEWJDwCgKPEBABQlPgCAosQHAFCU+AAAihIfAEBR4gMAKEp8AABFiQ8AoCjxAQAUJT4AgKLEBwBQlPgAAIoSHwBAUeIDAChKfAAARYkPAKAo8QEAFCU+AICixAcAUJT4AACKEh8AQFHiAwAoSnwAAEWJDwCgKPEBABQlPgCAosQHAFCU+AAAihIfAEBR4gMAKEp8AABFiQ8AoCjxAQAU1VR8DA0NpVKpLDpqtVrjfL1ez9DQUPr6+tLR0ZENGzbk5MmTLR8aAGhfTe98fOQjH8lLL73UOJ5++unGub1792bfvn0ZHR3N+Ph4arVaNm7cmLm5uZYODQC0r6bj47rrrkutVmscH/zgB5O8seuxf//+7NmzJ1u2bMmqVaty8ODBnDt3LocOHWr54ABAe2o6Pp5//vn09fVl+fLl+Z3f+Z384Ac/SJKcOXMmU1NT2bRpU+PaarWa9evX59ixYz/28xYWFjI7O7voAACuXU3Fx9q1a/ONb3wj3/3ud/PQQw9lamoqg4OD+dGPfpSpqakkSW9v76L39Pb2Ns69nZGRkfT09DSO/v7+S/g2AIB20VR8bN68OZ/73Ofy0Y9+NJ/+9Kfzz//8z0mSgwcPNq6pVCqL3lOv1y9ae7Pdu3dnZmamcUxOTjYzEgDQZi7rV23f+9735qMf/Wief/75xm+9vHWXY3p6+qLdkDerVqvp7u5edAAA167Lio+FhYWcOnUqy5Yty/Lly1Or1TI2NtY4f/78+Rw5ciSDg4OXPSgAcG24rpmL//RP/zR33313brnllkxPT+ev/uqvMjs7m3vvvTeVSiU7d+7M8PBwBgYGMjAwkOHh4XR2dmbr1q1Xan4AoM00FR//+Z//md/93d/Nyy+/nA9+8IP5pV/6pTz55JO59dZbkyS7du3K/Px8duzYkbNnz2bt2rU5fPhwurq6rsjwAED7qdTr9fpSD/Fms7Oz6enpyczMjPs/4Bpz/PjxrFmzJhMTE1m9evVSjwO0UDM/vz3bBQAoSnwAAEWJDwCgKPEBABQlPgCAosQHAFCU+AAAihIfAEBR4gMAKEp8AABFiQ8AoCjxAQAUJT4AgKLEBwBQlPgAAIoSHwBAUeIDAChKfAAARYkPAKAo8QEAFCU+AICixAcAUJT4AACKEh8AQFHiAwAoSnwAAEWJDwCgKPEBABQlPgCAosQHAFCU+AAAihIfAEBR4gMAKEp8AABFiQ8AoCjxAQAUJT4AgKLEBwBQlPgAAIoSHwBAUeIDACjqsuJjZGQklUolO3fubKzV6/UMDQ2lr68vHR0d2bBhQ06ePHm5cwIA14hLjo/x8fE8+OCDueOOOxat7927N/v27cvo6GjGx8dTq9WycePGzM3NXfawAED7u6T4eOWVV3LPPffkoYceys///M831uv1evbv3589e/Zky5YtWbVqVQ4ePJhz587l0KFDLRsaAGhflxQf999/fz7zmc/k05/+9KL1M2fOZGpqKps2bWqsVavVrF+/PseOHXvbz1pYWMjs7OyiAwC4dl3X7Bsee+yxHD9+POPj4xedm5qaSpL09vYuWu/t7c1//Md/vO3njYyM5Mtf/nKzYwAAbaqpnY/Jycn88R//cR599NH83M/93I+9rlKpLHpdr9cvWrtg9+7dmZmZaRyTk5PNjAQAtJmmdj4mJiYyPT2dNWvWNNZee+21HD16NKOjo3n22WeTvLEDsmzZssY109PTF+2GXFCtVlOtVi9ldgCgDTW18/Hrv/7refrpp3PixInGceedd+aee+7JiRMn8gu/8Aup1WoZGxtrvOf8+fM5cuRIBgcHWz48ANB+mtr56OrqyqpVqxatvfe978373//+xvrOnTszPDycgYGBDAwMZHh4OJ2dndm6dWvrpgYA2lbTN5z+JLt27cr8/Hx27NiRs2fPZu3atTl8+HC6urpa/aUAgDZUqdfr9aUe4s1mZ2fT09OTmZmZdHd3L/U4QAsdP348a9asycTERFavXr3U4wAt1MzPb892AQCKEh8AQFHiAwAoSnwAAEWJDwCgKPEBABQlPgCAosQHAFCU+AAAihIfAEBR4gMAKEp8AABFiQ8AoCjxAQAUJT4AgKLEBwBQlPgAAIoSHwBAUeIDAChKfAAARYkPAKAo8QEAFCU+AICixAcAUJT4AACKEh8AQFHiAwAoSnwAAEWJDwCgKPEBABQlPgCAosQHAFCU+AAAihIfAEBR4gMAKEp
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.boxplot(X_test[X_test[\"avg_purchase_delay\"]>0][\"purchase_date_max\"]/X_test[X_test[\"avg_purchase_delay\"]>0][\"avg_purchase_delay\"], showfliers=False)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 188,
"id": "cdc917b9-eb2e-443f-8376-9a4ec4d24074",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 14734.000000\n",
"mean 145.979256\n",
"std 123.403697\n",
"min 0.000000\n",
"25% 38.053773\n",
"50% 111.560918\n",
"75% 225.056992\n",
"max 546.378919\n",
"Name: purchase_date_max, dtype: float64"
]
},
"execution_count": 188,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test[X_test[\"avg_purchase_delay\"]>0][\"purchase_date_max\"].describe()"
]
},
{
"cell_type": "markdown",
"id": "d386e36f-deba-43c9-8a51-eba868b39f0e",
"metadata": {},
"source": [
"Il est plus pertinent de considérer l'ensemble de la période que de couper à la date du dernier achat \\\n",
"On définit donc avg purchase delay all comme le délai moyen entre deux achats depuis que le client est \n",
"connu et jusqu'a aujourd'hui"
]
},
{
"cell_type": "code",
"execution_count": 202,
"id": "71b6ff7e-c48c-45b7-bc1a-70dafd11fbf1",
"metadata": {},
"outputs": [],
"source": [
"X_test[\"avg_purchase_delay_all\"] = (X_test[\"purchase_date_min\"]/X_test[\"nb_purchases\"]).replace([np.inf, -np.inf], 0)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "20c757fe-4f3a-406c-b3b9-dd12b57a474c",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "e65af9b9-9266-4ec5-950f-2fc2ed14140c",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "f0652202-f5bc-4141-a384-07afd96f146b",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "7b3b3398-3ddc-41ee-b669-aea86e7f6d4e",
"metadata": {},
"source": [
"Il faut aussi étudier le nombre de tickets acheté, pas seulement le nombre d'achats"
]
},
{
"cell_type": "code",
"execution_count": 203,
"id": "3b01367d-4fb0-46bb-90e8-307e6152e8bb",
"metadata": {},
"outputs": [],
"source": [
"# on def avg tickets delay de façon similaire à avg purchase delay mais en utilisant plutôt nb tickets\n",
"\n",
"X_test[\"avg_tickets_delay\"] = (X_test[\"consumption_lifetime\"]/X_test[\"nb_tickets\"]).replace([np.inf, -np.inf], 0)\n",
"X_test[\"avg_tickets_delay_all\"] = (X_test[\"purchase_date_min\"]/X_test[\"nb_tickets\"]).replace([np.inf, -np.inf], 0)"
]
},
{
"cell_type": "code",
"execution_count": 204,
"id": "0eb59297-0ec2-4181-b743-0264f95a7bee",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_tickets</th>\n",
" <th>nb_purchases</th>\n",
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>nb_tickets_internet</th>\n",
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
" <th>...</th>\n",
" <th>nb_campaigns</th>\n",
" <th>nb_campaigns_opened</th>\n",
" <th>score</th>\n",
" <th>quartile</th>\n",
" <th>has_purchased</th>\n",
" <th>consumption_lifetime</th>\n",
" <th>avg_purchase_delay</th>\n",
" <th>avg_purchase_delay_all</th>\n",
" <th>avg_tickets_delay</th>\n",
" <th>avg_tickets_delay_all</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>100.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.177187</td>\n",
" <td>5.177187</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.695913</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>5.177187</td>\n",
" <td>0.000000</td>\n",
" <td>1.294297</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>55.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>426.265613</td>\n",
" <td>426.265613</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.244205</td>\n",
" <td>1</td>\n",
" <td>1.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>426.265613</td>\n",
" <td>0.000000</td>\n",
" <td>426.265613</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>17.0</td>\n",
" <td>1.0</td>\n",
" <td>80.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>436.033437</td>\n",
" <td>436.033437</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.279592</td>\n",
" <td>2</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>436.033437</td>\n",
" <td>0.000000</td>\n",
" <td>25.649026</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>120.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.196412</td>\n",
" <td>5.196412</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.696135</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>5.196412</td>\n",
" <td>0.000000</td>\n",
" <td>1.299103</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>34.0</td>\n",
" <td>2.0</td>\n",
" <td>416.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>478.693148</td>\n",
" <td>115.631470</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.911844</td>\n",
" <td>4</td>\n",
" <td>1.0</td>\n",
" <td>363.061678</td>\n",
" <td>181.530839</td>\n",
" <td>239.346574</td>\n",
" <td>10.678285</td>\n",
" <td>14.079210</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96091</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>67.31</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>278.442257</td>\n",
" <td>278.442257</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>15.0</td>\n",
" <td>5.0</td>\n",
" <td>0.584680</td>\n",
" <td>3</td>\n",
" <td>1.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>278.442257</td>\n",
" <td>0.000000</td>\n",
" <td>278.442257</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96092</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>61.41</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>189.207373</td>\n",
" <td>189.207373</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>12.0</td>\n",
" <td>9.0</td>\n",
" <td>0.654520</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>189.207373</td>\n",
" <td>0.000000</td>\n",
" <td>189.207373</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96093</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>29.0</td>\n",
" <td>3.0</td>\n",
" <td>0.116503</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96094</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>79.43</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>279.312905</td>\n",
" <td>279.312905</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>20.0</td>\n",
" <td>4.0</td>\n",
" <td>0.579827</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>279.312905</td>\n",
" <td>0.000000</td>\n",
" <td>279.312905</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96095</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>31.0</td>\n",
" <td>4.0</td>\n",
" <td>0.254002</td>\n",
" <td>2</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>96096 rows × 22 columns</p>\n",
"</div>"
],
"text/plain": [
" nb_tickets nb_purchases total_amount nb_suppliers \\\n",
"0 4.0 1.0 100.00 1.0 \n",
"1 1.0 1.0 55.00 1.0 \n",
"2 17.0 1.0 80.00 1.0 \n",
"3 4.0 1.0 120.00 1.0 \n",
"4 34.0 2.0 416.00 1.0 \n",
"... ... ... ... ... \n",
"96091 1.0 1.0 67.31 1.0 \n",
"96092 1.0 1.0 61.41 1.0 \n",
"96093 0.0 0.0 0.00 0.0 \n",
"96094 1.0 1.0 79.43 1.0 \n",
"96095 0.0 0.0 0.00 0.0 \n",
"\n",
" vente_internet_max purchase_date_min purchase_date_max \\\n",
"0 0.0 5.177187 5.177187 \n",
"1 0.0 426.265613 426.265613 \n",
"2 0.0 436.033437 436.033437 \n",
"3 0.0 5.196412 5.196412 \n",
"4 0.0 478.693148 115.631470 \n",
"... ... ... ... \n",
"96091 1.0 278.442257 278.442257 \n",
"96092 1.0 189.207373 189.207373 \n",
"96093 0.0 550.000000 550.000000 \n",
"96094 1.0 279.312905 279.312905 \n",
"96095 0.0 550.000000 550.000000 \n",
"\n",
" nb_tickets_internet is_email_true opt_in ... nb_campaigns \\\n",
"0 0.0 True False ... 0.0 \n",
"1 0.0 True True ... 0.0 \n",
"2 0.0 True True ... 0.0 \n",
"3 0.0 True False ... 0.0 \n",
"4 0.0 True False ... 0.0 \n",
"... ... ... ... ... ... \n",
"96091 1.0 True False ... 15.0 \n",
"96092 1.0 True False ... 12.0 \n",
"96093 0.0 True True ... 29.0 \n",
"96094 1.0 True False ... 20.0 \n",
"96095 0.0 True False ... 31.0 \n",
"\n",
" nb_campaigns_opened score quartile has_purchased \\\n",
"0 0.0 0.695913 3 0.0 \n",
"1 0.0 0.244205 1 1.0 \n",
"2 0.0 0.279592 2 0.0 \n",
"3 0.0 0.696135 3 0.0 \n",
"4 0.0 0.911844 4 1.0 \n",
"... ... ... ... ... \n",
"96091 5.0 0.584680 3 1.0 \n",
"96092 9.0 0.654520 3 0.0 \n",
"96093 3.0 0.116503 1 0.0 \n",
"96094 4.0 0.579827 3 0.0 \n",
"96095 4.0 0.254002 2 0.0 \n",
"\n",
" consumption_lifetime avg_purchase_delay avg_purchase_delay_all \\\n",
"0 0.000000 0.000000 5.177187 \n",
"1 0.000000 0.000000 426.265613 \n",
"2 0.000000 0.000000 436.033437 \n",
"3 0.000000 0.000000 5.196412 \n",
"4 363.061678 181.530839 239.346574 \n",
"... ... ... ... \n",
"96091 0.000000 0.000000 278.442257 \n",
"96092 0.000000 0.000000 189.207373 \n",
"96093 0.000000 NaN 0.000000 \n",
"96094 0.000000 0.000000 279.312905 \n",
"96095 0.000000 NaN 0.000000 \n",
"\n",
" avg_tickets_delay avg_tickets_delay_all \n",
"0 0.000000 1.294297 \n",
"1 0.000000 426.265613 \n",
"2 0.000000 25.649026 \n",
"3 0.000000 1.299103 \n",
"4 10.678285 14.079210 \n",
"... ... ... \n",
"96091 0.000000 278.442257 \n",
"96092 0.000000 189.207373 \n",
"96093 NaN 0.000000 \n",
"96094 0.000000 279.312905 \n",
"96095 NaN 0.000000 \n",
"\n",
"[96096 rows x 22 columns]"
]
},
"execution_count": 204,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test"
]
},
{
"cell_type": "code",
"execution_count": 206,
"id": "d6ef721a-dac6-49e0-8e1c-518a3cf79cbc",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_purchases</th>\n",
" <th>nb_tickets</th>\n",
" <th>consumption_lifetime</th>\n",
" <th>purchase_date_min</th>\n",
" <th>avg_purchase_delay</th>\n",
" <th>avg_purchase_delay_all</th>\n",
" <th>avg_tickets_delay</th>\n",
" <th>avg_tickets_delay_all</th>\n",
" </tr>\n",
" <tr>\n",
" <th>quartile</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2.000000</td>\n",
" <td>2.714286</td>\n",
" <td>0.597093</td>\n",
" <td>450.171815</td>\n",
" <td>0.298547</td>\n",
" <td>225.085907</td>\n",
" <td>0.198968</td>\n",
" <td>174.041855</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2.592668</td>\n",
" <td>5.446707</td>\n",
" <td>26.192927</td>\n",
" <td>329.247848</td>\n",
" <td>11.435486</td>\n",
" <td>147.533946</td>\n",
" <td>5.992807</td>\n",
" <td>88.757091</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3.203764</td>\n",
" <td>6.791530</td>\n",
" <td>64.785322</td>\n",
" <td>266.488673</td>\n",
" <td>25.490483</td>\n",
" <td>107.753468</td>\n",
" <td>14.307458</td>\n",
" <td>65.942338</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>12.041836</td>\n",
" <td>46.274086</td>\n",
" <td>306.126700</td>\n",
" <td>391.637751</td>\n",
" <td>68.659817</td>\n",
" <td>92.058104</td>\n",
" <td>38.736644</td>\n",
" <td>53.575899</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" nb_purchases nb_tickets consumption_lifetime purchase_date_min \\\n",
"quartile \n",
"1 2.000000 2.714286 0.597093 450.171815 \n",
"2 2.592668 5.446707 26.192927 329.247848 \n",
"3 3.203764 6.791530 64.785322 266.488673 \n",
"4 12.041836 46.274086 306.126700 391.637751 \n",
"\n",
" avg_purchase_delay avg_purchase_delay_all avg_tickets_delay \\\n",
"quartile \n",
"1 0.298547 225.085907 0.198968 \n",
"2 11.435486 147.533946 5.992807 \n",
"3 25.490483 107.753468 14.307458 \n",
"4 68.659817 92.058104 38.736644 \n",
"\n",
" avg_tickets_delay_all \n",
"quartile \n",
"1 174.041855 \n",
"2 88.757091 \n",
"3 65.942338 \n",
"4 53.575899 "
]
},
"execution_count": 206,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test[X_test[\"avg_purchase_delay\"]>0].groupby(\"quartile\")[[\"nb_purchases\", \"nb_tickets\", \"consumption_lifetime\", \n",
" \"purchase_date_min\", \"avg_purchase_delay\", \n",
" \"avg_purchase_delay_all\", \"avg_tickets_delay\", \n",
" \"avg_tickets_delay_all\"]].mean()"
]
},
{
"cell_type": "markdown",
"id": "2ec816bf-852d-4fa7-a110-77d3e1b6f6a3",
"metadata": {},
"source": [
"Le délai moyen entre deux achats sur l'ensemble de la période"
]
},
{
"cell_type": "code",
"execution_count": 210,
"id": "8b57c418-31dc-4d0e-af80-304f4118a9e4",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_tickets</th>\n",
" <th>nb_purchases</th>\n",
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>nb_tickets_internet</th>\n",
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
" <th>...</th>\n",
" <th>nb_campaigns</th>\n",
" <th>nb_campaigns_opened</th>\n",
" <th>score</th>\n",
" <th>quartile</th>\n",
" <th>has_purchased</th>\n",
" <th>consumption_lifetime</th>\n",
" <th>avg_purchase_delay</th>\n",
" <th>avg_purchase_delay_all</th>\n",
" <th>avg_tickets_delay</th>\n",
" <th>avg_tickets_delay_all</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>3.0</td>\n",
" <td>3.0</td>\n",
" <td>35.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>219.530451</td>\n",
" <td>193.553044</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.387177</td>\n",
" <td>2</td>\n",
" <td>1.0</td>\n",
" <td>25.977407</td>\n",
" <td>8.659136</td>\n",
" <td>73.176817</td>\n",
" <td>8.659136</td>\n",
" <td>73.176817</td>\n",
" </tr>\n",
" <tr>\n",
" <th>37</th>\n",
" <td>6.0</td>\n",
" <td>2.0</td>\n",
" <td>105.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>422.518935</td>\n",
" <td>422.474444</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.258480</td>\n",
" <td>2</td>\n",
" <td>0.0</td>\n",
" <td>0.044491</td>\n",
" <td>0.022245</td>\n",
" <td>211.259468</td>\n",
" <td>0.007415</td>\n",
" <td>70.419823</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38</th>\n",
" <td>4.0</td>\n",
" <td>2.0</td>\n",
" <td>145.50</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>469.053773</td>\n",
" <td>337.012106</td>\n",
" <td>4.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.424641</td>\n",
" <td>2</td>\n",
" <td>1.0</td>\n",
" <td>132.041667</td>\n",
" <td>66.020833</td>\n",
" <td>234.526887</td>\n",
" <td>33.010417</td>\n",
" <td>117.263443</td>\n",
" </tr>\n",
" <tr>\n",
" <th>51</th>\n",
" <td>4.0</td>\n",
" <td>4.0</td>\n",
" <td>276.00</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>317.012106</td>\n",
" <td>294.012106</td>\n",
" <td>4.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.353000</td>\n",
" <td>2</td>\n",
" <td>0.0</td>\n",
" <td>23.000000</td>\n",
" <td>5.750000</td>\n",
" <td>79.253027</td>\n",
" <td>5.750000</td>\n",
" <td>79.253027</td>\n",
" </tr>\n",
" <tr>\n",
" <th>67</th>\n",
" <td>11.0</td>\n",
" <td>2.0</td>\n",
" <td>210.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>373.627303</td>\n",
" <td>255.476065</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.463581</td>\n",
" <td>2</td>\n",
" <td>0.0</td>\n",
" <td>118.151238</td>\n",
" <td>59.075619</td>\n",
" <td>186.813652</td>\n",
" <td>10.741022</td>\n",
" <td>33.966118</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>71408</th>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>62.51</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>490.113715</td>\n",
" <td>489.507940</td>\n",
" <td>2.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>12.0</td>\n",
" <td>6.0</td>\n",
" <td>0.469953</td>\n",
" <td>2</td>\n",
" <td>0.0</td>\n",
" <td>0.605775</td>\n",
" <td>0.302888</td>\n",
" <td>245.056858</td>\n",
" <td>0.302888</td>\n",
" <td>245.056858</td>\n",
" </tr>\n",
" <tr>\n",
" <th>71439</th>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>28.54</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>505.334005</td>\n",
" <td>505.324873</td>\n",
" <td>2.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>16.0</td>\n",
" <td>13.0</td>\n",
" <td>0.499401</td>\n",
" <td>2</td>\n",
" <td>0.0</td>\n",
" <td>0.009132</td>\n",
" <td>0.004566</td>\n",
" <td>252.667002</td>\n",
" <td>0.004566</td>\n",
" <td>252.667002</td>\n",
" </tr>\n",
" <tr>\n",
" <th>74420</th>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>115.90</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>522.320521</td>\n",
" <td>522.318229</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>35.0</td>\n",
" <td>9.0</td>\n",
" <td>0.453181</td>\n",
" <td>2</td>\n",
" <td>1.0</td>\n",
" <td>0.002292</td>\n",
" <td>0.001146</td>\n",
" <td>261.160260</td>\n",
" <td>0.001146</td>\n",
" <td>261.160260</td>\n",
" </tr>\n",
" <tr>\n",
" <th>79490</th>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>73.06</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>541.175509</td>\n",
" <td>521.153692</td>\n",
" <td>2.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>36.0</td>\n",
" <td>4.0</td>\n",
" <td>0.463122</td>\n",
" <td>2</td>\n",
" <td>0.0</td>\n",
" <td>20.021817</td>\n",
" <td>10.010909</td>\n",
" <td>270.587755</td>\n",
" <td>10.010909</td>\n",
" <td>270.587755</td>\n",
" </tr>\n",
" <tr>\n",
" <th>89618</th>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>134.66</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>527.497685</td>\n",
" <td>506.694931</td>\n",
" <td>2.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>30.0</td>\n",
" <td>0.0</td>\n",
" <td>0.449862</td>\n",
" <td>2</td>\n",
" <td>0.0</td>\n",
" <td>20.802755</td>\n",
" <td>10.401377</td>\n",
" <td>263.748843</td>\n",
" <td>10.401377</td>\n",
" <td>263.748843</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1473 rows × 22 columns</p>\n",
"</div>"
],
"text/plain": [
" nb_tickets nb_purchases total_amount nb_suppliers \\\n",
"30 3.0 3.0 35.00 1.0 \n",
"37 6.0 2.0 105.00 1.0 \n",
"38 4.0 2.0 145.50 1.0 \n",
"51 4.0 4.0 276.00 1.0 \n",
"67 11.0 2.0 210.00 1.0 \n",
"... ... ... ... ... \n",
"71408 2.0 2.0 62.51 1.0 \n",
"71439 2.0 2.0 28.54 1.0 \n",
"74420 2.0 2.0 115.90 1.0 \n",
"79490 2.0 2.0 73.06 1.0 \n",
"89618 2.0 2.0 134.66 1.0 \n",
"\n",
" vente_internet_max purchase_date_min purchase_date_max \\\n",
"30 0.0 219.530451 193.553044 \n",
"37 0.0 422.518935 422.474444 \n",
"38 1.0 469.053773 337.012106 \n",
"51 1.0 317.012106 294.012106 \n",
"67 0.0 373.627303 255.476065 \n",
"... ... ... ... \n",
"71408 1.0 490.113715 489.507940 \n",
"71439 1.0 505.334005 505.324873 \n",
"74420 0.0 522.320521 522.318229 \n",
"79490 1.0 541.175509 521.153692 \n",
"89618 1.0 527.497685 506.694931 \n",
"\n",
" nb_tickets_internet is_email_true opt_in ... nb_campaigns \\\n",
"30 0.0 True True ... 0.0 \n",
"37 0.0 True True ... 0.0 \n",
"38 4.0 True True ... 0.0 \n",
"51 4.0 True True ... 0.0 \n",
"67 0.0 True True ... 0.0 \n",
"... ... ... ... ... ... \n",
"71408 2.0 True False ... 12.0 \n",
"71439 2.0 True False ... 16.0 \n",
"74420 0.0 True False ... 35.0 \n",
"79490 2.0 True False ... 36.0 \n",
"89618 2.0 True False ... 30.0 \n",
"\n",
" nb_campaigns_opened score quartile has_purchased \\\n",
"30 0.0 0.387177 2 1.0 \n",
"37 0.0 0.258480 2 0.0 \n",
"38 0.0 0.424641 2 1.0 \n",
"51 0.0 0.353000 2 0.0 \n",
"67 0.0 0.463581 2 0.0 \n",
"... ... ... ... ... \n",
"71408 6.0 0.469953 2 0.0 \n",
"71439 13.0 0.499401 2 0.0 \n",
"74420 9.0 0.453181 2 1.0 \n",
"79490 4.0 0.463122 2 0.0 \n",
"89618 0.0 0.449862 2 0.0 \n",
"\n",
" consumption_lifetime avg_purchase_delay avg_purchase_delay_all \\\n",
"30 25.977407 8.659136 73.176817 \n",
"37 0.044491 0.022245 211.259468 \n",
"38 132.041667 66.020833 234.526887 \n",
"51 23.000000 5.750000 79.253027 \n",
"67 118.151238 59.075619 186.813652 \n",
"... ... ... ... \n",
"71408 0.605775 0.302888 245.056858 \n",
"71439 0.009132 0.004566 252.667002 \n",
"74420 0.002292 0.001146 261.160260 \n",
"79490 20.021817 10.010909 270.587755 \n",
"89618 20.802755 10.401377 263.748843 \n",
"\n",
" avg_tickets_delay avg_tickets_delay_all \n",
"30 8.659136 73.176817 \n",
"37 0.007415 70.419823 \n",
"38 33.010417 117.263443 \n",
"51 5.750000 79.253027 \n",
"67 10.741022 33.966118 \n",
"... ... ... \n",
"71408 0.302888 245.056858 \n",
"71439 0.004566 252.667002 \n",
"74420 0.001146 261.160260 \n",
"79490 10.010909 270.587755 \n",
"89618 10.401377 263.748843 \n",
"\n",
"[1473 rows x 22 columns]"
]
},
"execution_count": 210,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test[(X_test[\"avg_purchase_delay\"]>0) & (X_test[\"quartile\"]==\"2\")]"
]
},
{
"cell_type": "code",
"execution_count": 214,
"id": "5119ba18-9a89-4819-b98b-d0ae8e31291e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_tickets</th>\n",
" <th>nb_purchases</th>\n",
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>nb_tickets_internet</th>\n",
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
" <th>...</th>\n",
" <th>nb_campaigns</th>\n",
" <th>nb_campaigns_opened</th>\n",
" <th>score</th>\n",
" <th>quartile</th>\n",
" <th>has_purchased</th>\n",
" <th>consumption_lifetime</th>\n",
" <th>avg_purchase_delay</th>\n",
" <th>avg_purchase_delay_all</th>\n",
" <th>avg_tickets_delay</th>\n",
" <th>avg_tickets_delay_all</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>136</th>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>50.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.179282</td>\n",
" <td>4.441181</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.690843</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>0.738102</td>\n",
" <td>0.369051</td>\n",
" <td>2.589641</td>\n",
" <td>0.369051</td>\n",
" <td>2.589641</td>\n",
" </tr>\n",
" <tr>\n",
" <th>187</th>\n",
" <td>3.0</td>\n",
" <td>2.0</td>\n",
" <td>117.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.190961</td>\n",
" <td>4.422014</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.694387</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>0.768947</td>\n",
" <td>0.384473</td>\n",
" <td>2.595480</td>\n",
" <td>0.256316</td>\n",
" <td>1.730320</td>\n",
" </tr>\n",
" <tr>\n",
" <th>229</th>\n",
" <td>4.0</td>\n",
" <td>2.0</td>\n",
" <td>196.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.144676</td>\n",
" <td>5.123021</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.697071</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>0.021655</td>\n",
" <td>0.010828</td>\n",
" <td>2.572338</td>\n",
" <td>0.005414</td>\n",
" <td>1.286169</td>\n",
" </tr>\n",
" <tr>\n",
" <th>312</th>\n",
" <td>4.0</td>\n",
" <td>2.0</td>\n",
" <td>200.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.180069</td>\n",
" <td>5.061979</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.697224</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>0.118090</td>\n",
" <td>0.059045</td>\n",
" <td>2.590035</td>\n",
" <td>0.029523</td>\n",
" <td>1.295017</td>\n",
" </tr>\n",
" <tr>\n",
" <th>439</th>\n",
" <td>4.0</td>\n",
" <td>2.0</td>\n",
" <td>156.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.167558</td>\n",
" <td>5.112234</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.696639</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>0.055324</td>\n",
" <td>0.027662</td>\n",
" <td>2.583779</td>\n",
" <td>0.013831</td>\n",
" <td>1.291889</td>\n",
" </tr>\n",
" <tr>\n",
" <th>613</th>\n",
" <td>4.0</td>\n",
" <td>2.0</td>\n",
" <td>156.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>3.285567</td>\n",
" <td>2.801887</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.478423</td>\n",
" <td>2</td>\n",
" <td>0.0</td>\n",
" <td>0.483681</td>\n",
" <td>0.241840</td>\n",
" <td>1.642784</td>\n",
" <td>0.120920</td>\n",
" <td>0.821392</td>\n",
" </tr>\n",
" <tr>\n",
" <th>713</th>\n",
" <td>4.0</td>\n",
" <td>2.0</td>\n",
" <td>100.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.183241</td>\n",
" <td>5.060972</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.696068</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>0.122269</td>\n",
" <td>0.061134</td>\n",
" <td>2.591620</td>\n",
" <td>0.030567</td>\n",
" <td>1.295810</td>\n",
" </tr>\n",
" <tr>\n",
" <th>967</th>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>60.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>3.251076</td>\n",
" <td>3.127894</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.691127</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>0.123183</td>\n",
" <td>0.061591</td>\n",
" <td>1.625538</td>\n",
" <td>0.061591</td>\n",
" <td>1.625538</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1042</th>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>106.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.140903</td>\n",
" <td>5.133646</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.690563</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>0.007257</td>\n",
" <td>0.003628</td>\n",
" <td>2.570451</td>\n",
" <td>0.003628</td>\n",
" <td>2.570451</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1096</th>\n",
" <td>4.0</td>\n",
" <td>2.0</td>\n",
" <td>110.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.190764</td>\n",
" <td>4.646551</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.696727</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>0.544213</td>\n",
" <td>0.272106</td>\n",
" <td>2.595382</td>\n",
" <td>0.136053</td>\n",
" <td>1.297691</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1124</th>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>50.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.192303</td>\n",
" <td>5.144618</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.689933</td>\n",
" <td>3</td>\n",
" <td>1.0</td>\n",
" <td>0.047685</td>\n",
" <td>0.023843</td>\n",
" <td>2.596152</td>\n",
" <td>0.023843</td>\n",
" <td>2.596152</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1451</th>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>60.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.126782</td>\n",
" <td>5.118449</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.690032</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>0.008333</td>\n",
" <td>0.004167</td>\n",
" <td>2.563391</td>\n",
" <td>0.004167</td>\n",
" <td>2.563391</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1728</th>\n",
" <td>4.0</td>\n",
" <td>2.0</td>\n",
" <td>100.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.137326</td>\n",
" <td>4.958299</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.696165</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>0.179028</td>\n",
" <td>0.089514</td>\n",
" <td>2.568663</td>\n",
" <td>0.044757</td>\n",
" <td>1.284332</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1740</th>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>60.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.183495</td>\n",
" <td>5.176933</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.690001</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>0.006563</td>\n",
" <td>0.003281</td>\n",
" <td>2.591748</td>\n",
" <td>0.003281</td>\n",
" <td>2.591748</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1843</th>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>102.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.184803</td>\n",
" <td>5.180162</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.690491</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>0.004641</td>\n",
" <td>0.002321</td>\n",
" <td>2.592402</td>\n",
" <td>0.002321</td>\n",
" <td>2.592402</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1862</th>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>106.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.197801</td>\n",
" <td>5.191470</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.690534</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>0.006331</td>\n",
" <td>0.003166</td>\n",
" <td>2.598900</td>\n",
" <td>0.003166</td>\n",
" <td>2.598900</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1984</th>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>88.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.189468</td>\n",
" <td>5.182257</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.690328</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>0.007211</td>\n",
" <td>0.003605</td>\n",
" <td>2.594734</td>\n",
" <td>0.003605</td>\n",
" <td>2.594734</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2041</th>\n",
" <td>3.0</td>\n",
" <td>2.0</td>\n",
" <td>147.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>4.597095</td>\n",
" <td>4.373079</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.694326</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>0.224016</td>\n",
" <td>0.112008</td>\n",
" <td>2.298547</td>\n",
" <td>0.074672</td>\n",
" <td>1.532365</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2115</th>\n",
" <td>3.0</td>\n",
" <td>2.0</td>\n",
" <td>75.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.182986</td>\n",
" <td>5.129433</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.692971</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>0.053553</td>\n",
" <td>0.026777</td>\n",
" <td>2.591493</td>\n",
" <td>0.017851</td>\n",
" <td>1.727662</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2384</th>\n",
" <td>4.0</td>\n",
" <td>2.0</td>\n",
" <td>196.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.171771</td>\n",
" <td>4.604873</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.697762</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>0.566898</td>\n",
" <td>0.283449</td>\n",
" <td>2.585885</td>\n",
" <td>0.141725</td>\n",
" <td>1.292943</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>20 rows × 22 columns</p>\n",
"</div>"
],
"text/plain": [
" nb_tickets nb_purchases total_amount nb_suppliers \\\n",
"136 2.0 2.0 50.0 1.0 \n",
"187 3.0 2.0 117.0 1.0 \n",
"229 4.0 2.0 196.0 1.0 \n",
"312 4.0 2.0 200.0 1.0 \n",
"439 4.0 2.0 156.0 1.0 \n",
"613 4.0 2.0 156.0 1.0 \n",
"713 4.0 2.0 100.0 1.0 \n",
"967 2.0 2.0 60.0 1.0 \n",
"1042 2.0 2.0 106.0 1.0 \n",
"1096 4.0 2.0 110.0 1.0 \n",
"1124 2.0 2.0 50.0 1.0 \n",
"1451 2.0 2.0 60.0 1.0 \n",
"1728 4.0 2.0 100.0 1.0 \n",
"1740 2.0 2.0 60.0 1.0 \n",
"1843 2.0 2.0 102.0 1.0 \n",
"1862 2.0 2.0 106.0 1.0 \n",
"1984 2.0 2.0 88.0 1.0 \n",
"2041 3.0 2.0 147.0 1.0 \n",
"2115 3.0 2.0 75.0 1.0 \n",
"2384 4.0 2.0 196.0 1.0 \n",
"\n",
" vente_internet_max purchase_date_min purchase_date_max \\\n",
"136 0.0 5.179282 4.441181 \n",
"187 0.0 5.190961 4.422014 \n",
"229 0.0 5.144676 5.123021 \n",
"312 0.0 5.180069 5.061979 \n",
"439 0.0 5.167558 5.112234 \n",
"613 0.0 3.285567 2.801887 \n",
"713 0.0 5.183241 5.060972 \n",
"967 0.0 3.251076 3.127894 \n",
"1042 0.0 5.140903 5.133646 \n",
"1096 0.0 5.190764 4.646551 \n",
"1124 0.0 5.192303 5.144618 \n",
"1451 0.0 5.126782 5.118449 \n",
"1728 0.0 5.137326 4.958299 \n",
"1740 0.0 5.183495 5.176933 \n",
"1843 0.0 5.184803 5.180162 \n",
"1862 0.0 5.197801 5.191470 \n",
"1984 0.0 5.189468 5.182257 \n",
"2041 0.0 4.597095 4.373079 \n",
"2115 0.0 5.182986 5.129433 \n",
"2384 0.0 5.171771 4.604873 \n",
"\n",
" nb_tickets_internet is_email_true opt_in ... nb_campaigns \\\n",
"136 0.0 True False ... 0.0 \n",
"187 0.0 True False ... 0.0 \n",
"229 0.0 True False ... 0.0 \n",
"312 0.0 True False ... 0.0 \n",
"439 0.0 True False ... 0.0 \n",
"613 0.0 True True ... 0.0 \n",
"713 0.0 True False ... 0.0 \n",
"967 0.0 True False ... 0.0 \n",
"1042 0.0 True False ... 0.0 \n",
"1096 0.0 True False ... 0.0 \n",
"1124 0.0 True False ... 0.0 \n",
"1451 0.0 True False ... 0.0 \n",
"1728 0.0 True False ... 0.0 \n",
"1740 0.0 True False ... 0.0 \n",
"1843 0.0 True False ... 0.0 \n",
"1862 0.0 True False ... 0.0 \n",
"1984 0.0 True False ... 0.0 \n",
"2041 0.0 True False ... 0.0 \n",
"2115 0.0 True False ... 0.0 \n",
"2384 0.0 True False ... 0.0 \n",
"\n",
" nb_campaigns_opened score quartile has_purchased \\\n",
"136 0.0 0.690843 3 0.0 \n",
"187 0.0 0.694387 3 0.0 \n",
"229 0.0 0.697071 3 0.0 \n",
"312 0.0 0.697224 3 0.0 \n",
"439 0.0 0.696639 3 0.0 \n",
"613 0.0 0.478423 2 0.0 \n",
"713 0.0 0.696068 3 0.0 \n",
"967 0.0 0.691127 3 0.0 \n",
"1042 0.0 0.690563 3 0.0 \n",
"1096 0.0 0.696727 3 0.0 \n",
"1124 0.0 0.689933 3 1.0 \n",
"1451 0.0 0.690032 3 0.0 \n",
"1728 0.0 0.696165 3 0.0 \n",
"1740 0.0 0.690001 3 0.0 \n",
"1843 0.0 0.690491 3 0.0 \n",
"1862 0.0 0.690534 3 0.0 \n",
"1984 0.0 0.690328 3 0.0 \n",
"2041 0.0 0.694326 3 0.0 \n",
"2115 0.0 0.692971 3 0.0 \n",
"2384 0.0 0.697762 3 0.0 \n",
"\n",
" consumption_lifetime avg_purchase_delay avg_purchase_delay_all \\\n",
"136 0.738102 0.369051 2.589641 \n",
"187 0.768947 0.384473 2.595480 \n",
"229 0.021655 0.010828 2.572338 \n",
"312 0.118090 0.059045 2.590035 \n",
"439 0.055324 0.027662 2.583779 \n",
"613 0.483681 0.241840 1.642784 \n",
"713 0.122269 0.061134 2.591620 \n",
"967 0.123183 0.061591 1.625538 \n",
"1042 0.007257 0.003628 2.570451 \n",
"1096 0.544213 0.272106 2.595382 \n",
"1124 0.047685 0.023843 2.596152 \n",
"1451 0.008333 0.004167 2.563391 \n",
"1728 0.179028 0.089514 2.568663 \n",
"1740 0.006563 0.003281 2.591748 \n",
"1843 0.004641 0.002321 2.592402 \n",
"1862 0.006331 0.003166 2.598900 \n",
"1984 0.007211 0.003605 2.594734 \n",
"2041 0.224016 0.112008 2.298547 \n",
"2115 0.053553 0.026777 2.591493 \n",
"2384 0.566898 0.283449 2.585885 \n",
"\n",
" avg_tickets_delay avg_tickets_delay_all \n",
"136 0.369051 2.589641 \n",
"187 0.256316 1.730320 \n",
"229 0.005414 1.286169 \n",
"312 0.029523 1.295017 \n",
"439 0.013831 1.291889 \n",
"613 0.120920 0.821392 \n",
"713 0.030567 1.295810 \n",
"967 0.061591 1.625538 \n",
"1042 0.003628 2.570451 \n",
"1096 0.136053 1.297691 \n",
"1124 0.023843 2.596152 \n",
"1451 0.004167 2.563391 \n",
"1728 0.044757 1.284332 \n",
"1740 0.003281 2.591748 \n",
"1843 0.002321 2.592402 \n",
"1862 0.003166 2.598900 \n",
"1984 0.003605 2.594734 \n",
"2041 0.074672 1.532365 \n",
"2115 0.017851 1.727662 \n",
"2384 0.141725 1.292943 \n",
"\n",
"[20 rows x 22 columns]"
]
},
"execution_count": 214,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test[(X_test[\"avg_purchase_delay\"]>0) & (X_test[\"purchase_date_min\"]<10)].head(20)"
]
},
{
"cell_type": "code",
"execution_count": 217,
"id": "91ec6a21-89dd-40cd-91fc-8dfab132a9e8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"y_has_purchased 13690.0\n",
"dtype: float64"
]
},
"execution_count": 217,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_test.sum()"
]
},
{
"cell_type": "code",
"execution_count": 218,
"id": "3223968c-409e-4110-8dcc-fe319d34d44f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"36092.22480054577"
]
},
"execution_count": 218,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test[\"score\"].sum()"
]
},
{
"cell_type": "code",
"execution_count": 241,
"id": "0233ab78-81d7-41a2-b948-4bc24f51c9e9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.20933232507450736"
]
},
"execution_count": 241,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test[X_test[\"quartile\"]==\"3\"][\"has_purchased\"].mean()"
]
},
{
"cell_type": "markdown",
"id": "c3bf1a55-7d46-42c7-9436-b68ce8c7ef24",
"metadata": {},
"source": [
"Autre méthode \\\n",
"On considère la durée totale sur laquelle les features ont été observées (1 an et demi) sans se soucier de la \n",
"date du 1er achat. \n",
"Et on extrapole le rythme d'achat en considérant que le client devrait acheter nb_tickets/1.5 tickets durant l'année à venir. "
]
},
{
"cell_type": "code",
"execution_count": 240,
"id": "d594a3ee-22cb-45b5-a6fa-4439c0aad01c",
"metadata": {},
"outputs": [],
"source": [
"period_duration_years = 1.5\n",
"\n",
"expected_tickets_purchased = X_test[\"nb_tickets\"]/period_duration_years\n",
"expected_amount = X_test[\"total_amount\"]/period_duration_years"
]
},
{
"cell_type": "code",
"execution_count": 297,
"id": "807f9810-a691-4e51-af51-cdb7f0b4bd40",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_tickets</th>\n",
" <th>nb_purchases</th>\n",
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>nb_tickets_internet</th>\n",
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
" <th>...</th>\n",
" <th>quartile</th>\n",
" <th>has_purchased</th>\n",
" <th>consumption_lifetime</th>\n",
" <th>avg_purchase_delay</th>\n",
" <th>avg_purchase_delay_all</th>\n",
" <th>avg_tickets_delay</th>\n",
" <th>avg_tickets_delay_all</th>\n",
" <th>decile</th>\n",
" <th>overshoot_coeff</th>\n",
" <th>ajusted_score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>100.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.177187</td>\n",
" <td>5.177187</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>5.177187</td>\n",
" <td>0.000000</td>\n",
" <td>1.294297</td>\n",
" <td>6</td>\n",
" <td>3.294104</td>\n",
" <td>0.211260</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>55.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>426.265613</td>\n",
" <td>426.265613</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>1.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>426.265613</td>\n",
" <td>0.000000</td>\n",
" <td>426.265613</td>\n",
" <td>2</td>\n",
" <td>3.826401</td>\n",
" <td>0.063821</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>17.0</td>\n",
" <td>1.0</td>\n",
" <td>80.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>436.033437</td>\n",
" <td>436.033437</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>2</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>436.033437</td>\n",
" <td>0.000000</td>\n",
" <td>25.649026</td>\n",
" <td>2</td>\n",
" <td>3.826401</td>\n",
" <td>0.073069</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>120.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.196412</td>\n",
" <td>5.196412</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>5.196412</td>\n",
" <td>0.000000</td>\n",
" <td>1.299103</td>\n",
" <td>6</td>\n",
" <td>3.294104</td>\n",
" <td>0.211328</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>34.0</td>\n",
" <td>2.0</td>\n",
" <td>416.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>478.693148</td>\n",
" <td>115.631470</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>4</td>\n",
" <td>1.0</td>\n",
" <td>363.061678</td>\n",
" <td>181.530839</td>\n",
" <td>239.346574</td>\n",
" <td>10.678285</td>\n",
" <td>14.079210</td>\n",
" <td>9</td>\n",
" <td>1.268598</td>\n",
" <td>0.718781</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96091</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>67.31</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>278.442257</td>\n",
" <td>278.442257</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>3</td>\n",
" <td>1.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>278.442257</td>\n",
" <td>0.000000</td>\n",
" <td>278.442257</td>\n",
" <td>5</td>\n",
" <td>3.260982</td>\n",
" <td>0.179296</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96092</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>61.41</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>189.207373</td>\n",
" <td>189.207373</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>189.207373</td>\n",
" <td>0.000000</td>\n",
" <td>189.207373</td>\n",
" <td>6</td>\n",
" <td>3.294104</td>\n",
" <td>0.198694</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96093</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>1</td>\n",
" <td>17.863019</td>\n",
" <td>0.006522</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96094</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>79.43</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>279.312905</td>\n",
" <td>279.312905</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>279.312905</td>\n",
" <td>0.000000</td>\n",
" <td>279.312905</td>\n",
" <td>5</td>\n",
" <td>3.260982</td>\n",
" <td>0.177808</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96095</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>2</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>2</td>\n",
" <td>3.826401</td>\n",
" <td>0.066382</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>96096 rows × 25 columns</p>\n",
"</div>"
],
"text/plain": [
" nb_tickets nb_purchases total_amount nb_suppliers \\\n",
"0 4.0 1.0 100.00 1.0 \n",
"1 1.0 1.0 55.00 1.0 \n",
"2 17.0 1.0 80.00 1.0 \n",
"3 4.0 1.0 120.00 1.0 \n",
"4 34.0 2.0 416.00 1.0 \n",
"... ... ... ... ... \n",
"96091 1.0 1.0 67.31 1.0 \n",
"96092 1.0 1.0 61.41 1.0 \n",
"96093 0.0 0.0 0.00 0.0 \n",
"96094 1.0 1.0 79.43 1.0 \n",
"96095 0.0 0.0 0.00 0.0 \n",
"\n",
" vente_internet_max purchase_date_min purchase_date_max \\\n",
"0 0.0 5.177187 5.177187 \n",
"1 0.0 426.265613 426.265613 \n",
"2 0.0 436.033437 436.033437 \n",
"3 0.0 5.196412 5.196412 \n",
"4 0.0 478.693148 115.631470 \n",
"... ... ... ... \n",
"96091 1.0 278.442257 278.442257 \n",
"96092 1.0 189.207373 189.207373 \n",
"96093 0.0 550.000000 550.000000 \n",
"96094 1.0 279.312905 279.312905 \n",
"96095 0.0 550.000000 550.000000 \n",
"\n",
" nb_tickets_internet is_email_true opt_in ... quartile \\\n",
"0 0.0 True False ... 3 \n",
"1 0.0 True True ... 1 \n",
"2 0.0 True True ... 2 \n",
"3 0.0 True False ... 3 \n",
"4 0.0 True False ... 4 \n",
"... ... ... ... ... ... \n",
"96091 1.0 True False ... 3 \n",
"96092 1.0 True False ... 3 \n",
"96093 0.0 True True ... 1 \n",
"96094 1.0 True False ... 3 \n",
"96095 0.0 True False ... 2 \n",
"\n",
" has_purchased consumption_lifetime avg_purchase_delay \\\n",
"0 0.0 0.000000 0.000000 \n",
"1 1.0 0.000000 0.000000 \n",
"2 0.0 0.000000 0.000000 \n",
"3 0.0 0.000000 0.000000 \n",
"4 1.0 363.061678 181.530839 \n",
"... ... ... ... \n",
"96091 1.0 0.000000 0.000000 \n",
"96092 0.0 0.000000 0.000000 \n",
"96093 0.0 0.000000 NaN \n",
"96094 0.0 0.000000 0.000000 \n",
"96095 0.0 0.000000 NaN \n",
"\n",
" avg_purchase_delay_all avg_tickets_delay avg_tickets_delay_all \\\n",
"0 5.177187 0.000000 1.294297 \n",
"1 426.265613 0.000000 426.265613 \n",
"2 436.033437 0.000000 25.649026 \n",
"3 5.196412 0.000000 1.299103 \n",
"4 239.346574 10.678285 14.079210 \n",
"... ... ... ... \n",
"96091 278.442257 0.000000 278.442257 \n",
"96092 189.207373 0.000000 189.207373 \n",
"96093 0.000000 NaN 0.000000 \n",
"96094 279.312905 0.000000 279.312905 \n",
"96095 0.000000 NaN 0.000000 \n",
"\n",
" decile overshoot_coeff ajusted_score \n",
"0 6 3.294104 0.211260 \n",
"1 2 3.826401 0.063821 \n",
"2 2 3.826401 0.073069 \n",
"3 6 3.294104 0.211328 \n",
"4 9 1.268598 0.718781 \n",
"... ... ... ... \n",
"96091 5 3.260982 0.179296 \n",
"96092 6 3.294104 0.198694 \n",
"96093 1 17.863019 0.006522 \n",
"96094 5 3.260982 0.177808 \n",
"96095 2 3.826401 0.066382 \n",
"\n",
"[96096 rows x 25 columns]"
]
},
"execution_count": 297,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test"
]
},
{
"cell_type": "markdown",
"id": "ab7489e3-58e8-4be8-b870-60c869ba7953",
"metadata": {},
"source": [
"Estimation de l'overshoot : méthode plus rigoureuse \n",
"\n",
"on étudie le rapport entre le score et has purchased\n",
"plus exactement entre score/(1-score) et has_purchased/(1-has_purchased) - permet de coller à structure du logit"
]
},
{
"cell_type": "code",
"execution_count": 301,
"id": "3587dd1d-73a7-4810-9330-4b29caeb1e9f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"quartile\n",
"1 0.203706\n",
"2 0.564483\n",
"3 1.679424\n",
"4 9.209851\n",
"Name: score, dtype: float64"
]
},
"execution_count": 301,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"score_odd_ratio_quartile = X_test.groupby(\"quartile\")[\"score\"].mean()/(1-X_test.groupby(\"quartile\")[\"score\"].mean())\n",
"score_odd_ratio_quartile"
]
},
{
"cell_type": "code",
"execution_count": 302,
"id": "1a7dcc8c-33c5-4abf-828f-ba17dceb3287",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"quartile\n",
"1 0.027517\n",
"2 0.133083\n",
"3 0.264754\n",
"4 1.998944\n",
"Name: has_purchased, dtype: float64"
]
},
"execution_count": 302,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_odd_ratio_quartile = X_test.groupby(\"quartile\")[\"has_purchased\"].mean()/(1-X_test.groupby(\"quartile\")[\"has_purchased\"].mean())\n",
"y_odd_ratio_quartile"
]
},
{
"cell_type": "markdown",
"id": "6307f5f8-3597-422b-86ef-cdcac3648862",
"metadata": {},
"source": [
"### PB : a-t-on le même résultat de calcul du biais sur X_train et y_train ?"
]
},
{
"cell_type": "code",
"execution_count": 478,
"id": "c857531d-3002-4047-b206-a31cc11c451c",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_tickets</th>\n",
" <th>nb_purchases</th>\n",
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>nb_tickets_internet</th>\n",
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
" <th>...</th>\n",
" <th>decile</th>\n",
" <th>overshoot_coeff</th>\n",
" <th>ajusted_score</th>\n",
" <th>odd_ratio</th>\n",
" <th>test_adjusted_score_2</th>\n",
" <th>score_adjusted</th>\n",
" <th>nb_tickets_projected</th>\n",
" <th>total_amount_projected</th>\n",
" <th>nb_tickets_expected</th>\n",
" <th>total_amount_expected</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>100.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.177187</td>\n",
" <td>5.177187</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>6</td>\n",
" <td>3.294104</td>\n",
" <td>0.211260</td>\n",
" <td>2.288530</td>\n",
" <td>0.533640</td>\n",
" <td>0.274689</td>\n",
" <td>2.666667</td>\n",
" <td>66.666667</td>\n",
" <td>0.732503</td>\n",
" <td>18.312587</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>55.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>426.265613</td>\n",
" <td>426.265613</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>2</td>\n",
" <td>3.826401</td>\n",
" <td>0.063821</td>\n",
" <td>0.323109</td>\n",
" <td>0.139085</td>\n",
" <td>0.050756</td>\n",
" <td>0.666667</td>\n",
" <td>36.666667</td>\n",
" <td>0.033837</td>\n",
" <td>1.861053</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>17.0</td>\n",
" <td>1.0</td>\n",
" <td>80.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>436.033437</td>\n",
" <td>436.033437</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>2</td>\n",
" <td>3.826401</td>\n",
" <td>0.073069</td>\n",
" <td>0.388102</td>\n",
" <td>0.162515</td>\n",
" <td>0.060349</td>\n",
" <td>11.333333</td>\n",
" <td>53.333333</td>\n",
" <td>0.683958</td>\n",
" <td>3.218627</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>120.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.196412</td>\n",
" <td>5.196412</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>6</td>\n",
" <td>3.294104</td>\n",
" <td>0.211328</td>\n",
" <td>2.290940</td>\n",
" <td>0.533902</td>\n",
" <td>0.274899</td>\n",
" <td>2.666667</td>\n",
" <td>80.000000</td>\n",
" <td>0.733063</td>\n",
" <td>21.991884</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>34.0</td>\n",
" <td>2.0</td>\n",
" <td>416.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>478.693148</td>\n",
" <td>115.631470</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>9</td>\n",
" <td>1.268598</td>\n",
" <td>0.718781</td>\n",
" <td>10.343538</td>\n",
" <td>0.837972</td>\n",
" <td>0.631228</td>\n",
" <td>22.666667</td>\n",
" <td>277.333333</td>\n",
" <td>14.307843</td>\n",
" <td>175.060667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96091</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>67.31</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>278.442257</td>\n",
" <td>278.442257</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>5</td>\n",
" <td>3.260982</td>\n",
" <td>0.179296</td>\n",
" <td>1.407779</td>\n",
" <td>0.413108</td>\n",
" <td>0.188948</td>\n",
" <td>0.666667</td>\n",
" <td>44.873333</td>\n",
" <td>0.125966</td>\n",
" <td>8.478740</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96092</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>61.41</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>189.207373</td>\n",
" <td>189.207373</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>6</td>\n",
" <td>3.294104</td>\n",
" <td>0.198694</td>\n",
" <td>1.894523</td>\n",
" <td>0.486458</td>\n",
" <td>0.238685</td>\n",
" <td>0.666667</td>\n",
" <td>40.940000</td>\n",
" <td>0.159123</td>\n",
" <td>9.771748</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96093</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>17.863019</td>\n",
" <td>0.006522</td>\n",
" <td>0.131865</td>\n",
" <td>0.061854</td>\n",
" <td>0.021356</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96094</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>79.43</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>279.312905</td>\n",
" <td>279.312905</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>5</td>\n",
" <td>3.260982</td>\n",
" <td>0.177808</td>\n",
" <td>1.379973</td>\n",
" <td>0.408279</td>\n",
" <td>0.185910</td>\n",
" <td>0.666667</td>\n",
" <td>52.953333</td>\n",
" <td>0.123940</td>\n",
" <td>9.844555</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96095</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>2</td>\n",
" <td>3.826401</td>\n",
" <td>0.066382</td>\n",
" <td>0.340487</td>\n",
" <td>0.145477</td>\n",
" <td>0.053340</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>96096 rows × 32 columns</p>\n",
"</div>"
],
"text/plain": [
" nb_tickets nb_purchases total_amount nb_suppliers \\\n",
"0 4.0 1.0 100.00 1.0 \n",
"1 1.0 1.0 55.00 1.0 \n",
"2 17.0 1.0 80.00 1.0 \n",
"3 4.0 1.0 120.00 1.0 \n",
"4 34.0 2.0 416.00 1.0 \n",
"... ... ... ... ... \n",
"96091 1.0 1.0 67.31 1.0 \n",
"96092 1.0 1.0 61.41 1.0 \n",
"96093 0.0 0.0 0.00 0.0 \n",
"96094 1.0 1.0 79.43 1.0 \n",
"96095 0.0 0.0 0.00 0.0 \n",
"\n",
" vente_internet_max purchase_date_min purchase_date_max \\\n",
"0 0.0 5.177187 5.177187 \n",
"1 0.0 426.265613 426.265613 \n",
"2 0.0 436.033437 436.033437 \n",
"3 0.0 5.196412 5.196412 \n",
"4 0.0 478.693148 115.631470 \n",
"... ... ... ... \n",
"96091 1.0 278.442257 278.442257 \n",
"96092 1.0 189.207373 189.207373 \n",
"96093 0.0 550.000000 550.000000 \n",
"96094 1.0 279.312905 279.312905 \n",
"96095 0.0 550.000000 550.000000 \n",
"\n",
" nb_tickets_internet is_email_true opt_in ... decile \\\n",
"0 0.0 True False ... 6 \n",
"1 0.0 True True ... 2 \n",
"2 0.0 True True ... 2 \n",
"3 0.0 True False ... 6 \n",
"4 0.0 True False ... 9 \n",
"... ... ... ... ... ... \n",
"96091 1.0 True False ... 5 \n",
"96092 1.0 True False ... 6 \n",
"96093 0.0 True True ... 1 \n",
"96094 1.0 True False ... 5 \n",
"96095 0.0 True False ... 2 \n",
"\n",
" overshoot_coeff ajusted_score odd_ratio test_adjusted_score_2 \\\n",
"0 3.294104 0.211260 2.288530 0.533640 \n",
"1 3.826401 0.063821 0.323109 0.139085 \n",
"2 3.826401 0.073069 0.388102 0.162515 \n",
"3 3.294104 0.211328 2.290940 0.533902 \n",
"4 1.268598 0.718781 10.343538 0.837972 \n",
"... ... ... ... ... \n",
"96091 3.260982 0.179296 1.407779 0.413108 \n",
"96092 3.294104 0.198694 1.894523 0.486458 \n",
"96093 17.863019 0.006522 0.131865 0.061854 \n",
"96094 3.260982 0.177808 1.379973 0.408279 \n",
"96095 3.826401 0.066382 0.340487 0.145477 \n",
"\n",
" score_adjusted nb_tickets_projected total_amount_projected \\\n",
"0 0.274689 2.666667 66.666667 \n",
"1 0.050756 0.666667 36.666667 \n",
"2 0.060349 11.333333 53.333333 \n",
"3 0.274899 2.666667 80.000000 \n",
"4 0.631228 22.666667 277.333333 \n",
"... ... ... ... \n",
"96091 0.188948 0.666667 44.873333 \n",
"96092 0.238685 0.666667 40.940000 \n",
"96093 0.021356 0.000000 0.000000 \n",
"96094 0.185910 0.666667 52.953333 \n",
"96095 0.053340 0.000000 0.000000 \n",
"\n",
" nb_tickets_expected total_amount_expected \n",
"0 0.732503 18.312587 \n",
"1 0.033837 1.861053 \n",
"2 0.683958 3.218627 \n",
"3 0.733063 21.991884 \n",
"4 14.307843 175.060667 \n",
"... ... ... \n",
"96091 0.125966 8.478740 \n",
"96092 0.159123 9.771748 \n",
"96093 0.000000 0.000000 \n",
"96094 0.123940 9.844555 \n",
"96095 0.000000 0.000000 \n",
"\n",
"[96096 rows x 32 columns]"
]
},
"execution_count": 478,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test"
]
},
{
"cell_type": "code",
"execution_count": 479,
"id": "af371c21-a121-41ce-92a2-e01bdac8ad81",
"metadata": {},
"outputs": [],
"source": [
"y_pred_prob_train = logit_grid.predict_proba(X_train)[:, 1]\n"
]
},
{
"cell_type": "code",
"execution_count": 484,
"id": "1e1ddbe4-037a-4866-ae35-161e6ba14ffd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"somme des scores calculés sur X train : 84127.81461345348\n",
"somme des y train : 32154.0\n"
]
}
],
"source": [
"# globalement, on a toujours une somme de scores 3 fois supérieure (même si le biais semble atténué)\n",
"print(\"somme des scores calculés sur X train : \",y_pred_prob_train.sum())\n",
"print(\"somme des y train : \", y_train.sum()[\"y_has_purchased\"])"
]
},
{
"cell_type": "code",
"execution_count": 493,
"id": "ff61821b-b643-4002-88d8-8a0ec1268e73",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_tickets</th>\n",
" <th>nb_purchases</th>\n",
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>nb_tickets_internet</th>\n",
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
" <th>gender_female</th>\n",
" <th>gender_male</th>\n",
" <th>nb_campaigns</th>\n",
" <th>nb_campaigns_opened</th>\n",
" <th>score</th>\n",
" <th>odd_ratio</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>60.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>355.268981</td>\n",
" <td>355.268981</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.493834</td>\n",
" <td>0.975638</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>8.0</td>\n",
" <td>3.0</td>\n",
" <td>140.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>373.540289</td>\n",
" <td>219.262269</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.722704</td>\n",
" <td>2.606253</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>50.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.202442</td>\n",
" <td>5.202442</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.689866</td>\n",
" <td>2.224409</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3.0</td>\n",
" <td>1.0</td>\n",
" <td>90.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.178958</td>\n",
" <td>5.178958</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.693078</td>\n",
" <td>2.258158</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>78.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.174039</td>\n",
" <td>5.174039</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.690209</td>\n",
" <td>2.227980</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>224208</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>34.0</td>\n",
" <td>3.0</td>\n",
" <td>0.250218</td>\n",
" <td>0.333721</td>\n",
" </tr>\n",
" <tr>\n",
" <th>224209</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>20.00</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>392.501030</td>\n",
" <td>392.501030</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>23.0</td>\n",
" <td>6.0</td>\n",
" <td>0.524745</td>\n",
" <td>1.104135</td>\n",
" </tr>\n",
" <tr>\n",
" <th>224210</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>8.0</td>\n",
" <td>4.0</td>\n",
" <td>0.117175</td>\n",
" <td>0.132728</td>\n",
" </tr>\n",
" <tr>\n",
" <th>224211</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>97.11</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>172.334074</td>\n",
" <td>172.334074</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>13.0</td>\n",
" <td>5.0</td>\n",
" <td>0.643851</td>\n",
" <td>1.807814</td>\n",
" </tr>\n",
" <tr>\n",
" <th>224212</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>4.0</td>\n",
" <td>4.0</td>\n",
" <td>0.250170</td>\n",
" <td>0.333636</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>224213 rows × 16 columns</p>\n",
"</div>"
],
"text/plain": [
" nb_tickets nb_purchases total_amount nb_suppliers \\\n",
"0 2.0 1.0 60.00 1.0 \n",
"1 8.0 3.0 140.00 1.0 \n",
"2 2.0 1.0 50.00 1.0 \n",
"3 3.0 1.0 90.00 1.0 \n",
"4 2.0 1.0 78.00 1.0 \n",
"... ... ... ... ... \n",
"224208 0.0 0.0 0.00 0.0 \n",
"224209 1.0 1.0 20.00 1.0 \n",
"224210 0.0 0.0 0.00 0.0 \n",
"224211 1.0 1.0 97.11 1.0 \n",
"224212 0.0 0.0 0.00 0.0 \n",
"\n",
" vente_internet_max purchase_date_min purchase_date_max \\\n",
"0 0.0 355.268981 355.268981 \n",
"1 0.0 373.540289 219.262269 \n",
"2 0.0 5.202442 5.202442 \n",
"3 0.0 5.178958 5.178958 \n",
"4 0.0 5.174039 5.174039 \n",
"... ... ... ... \n",
"224208 0.0 550.000000 550.000000 \n",
"224209 1.0 392.501030 392.501030 \n",
"224210 0.0 550.000000 550.000000 \n",
"224211 1.0 172.334074 172.334074 \n",
"224212 0.0 550.000000 550.000000 \n",
"\n",
" nb_tickets_internet is_email_true opt_in gender_female \\\n",
"0 0.0 True False 0 \n",
"1 0.0 True False 0 \n",
"2 0.0 True False 0 \n",
"3 0.0 True False 0 \n",
"4 0.0 True False 1 \n",
"... ... ... ... ... \n",
"224208 0.0 True False 0 \n",
"224209 1.0 True False 0 \n",
"224210 0.0 True True 0 \n",
"224211 1.0 True False 0 \n",
"224212 0.0 True False 0 \n",
"\n",
" gender_male nb_campaigns nb_campaigns_opened score odd_ratio \n",
"0 1 0.0 0.0 0.493834 0.975638 \n",
"1 1 0.0 0.0 0.722704 2.606253 \n",
"2 1 0.0 0.0 0.689866 2.224409 \n",
"3 1 0.0 0.0 0.693078 2.258158 \n",
"4 0 0.0 0.0 0.690209 2.227980 \n",
"... ... ... ... ... ... \n",
"224208 1 34.0 3.0 0.250218 0.333721 \n",
"224209 1 23.0 6.0 0.524745 1.104135 \n",
"224210 1 8.0 4.0 0.117175 0.132728 \n",
"224211 1 13.0 5.0 0.643851 1.807814 \n",
"224212 1 4.0 4.0 0.250170 0.333636 \n",
"\n",
"[224213 rows x 16 columns]"
]
},
"execution_count": 493,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_train[\"score\"] = y_pred_prob_train\n",
"# X_train[\"odd_ratio\"] = X_train[\"score\"]/(1-X_train[\"score\"])\n",
"X_train"
]
},
{
"cell_type": "code",
"execution_count": 491,
"id": "240afa08-692d-4c2d-93c7-c8c8a46afdb3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 2.241790e+05\n",
"mean 5.824134e+10\n",
"std 1.462083e+13\n",
"min 1.207494e-01\n",
"25% 1.476621e-01\n",
"50% 3.338869e-01\n",
"75% 1.427047e+00\n",
"max 4.503600e+15\n",
"Name: odd_ratio, dtype: float64"
]
},
"execution_count": 491,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_train[\"odd_ratio\"][X_train[\"odd_ratio\"]<np.inf].describe()"
]
},
{
"cell_type": "code",
"execution_count": 494,
"id": "863ff04a-c4de-44cd-af9d-1e5032624592",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_tickets</th>\n",
" <th>nb_purchases</th>\n",
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>nb_tickets_internet</th>\n",
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
" <th>gender_female</th>\n",
" <th>gender_male</th>\n",
" <th>nb_campaigns</th>\n",
" <th>nb_campaigns_opened</th>\n",
" <th>score</th>\n",
" <th>odd_ratio</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>60.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>355.268981</td>\n",
" <td>355.268981</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.493834</td>\n",
" <td>0.975638</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>8.0</td>\n",
" <td>3.0</td>\n",
" <td>140.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>373.540289</td>\n",
" <td>219.262269</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.722704</td>\n",
" <td>2.606253</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>50.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.202442</td>\n",
" <td>5.202442</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.689866</td>\n",
" <td>2.224409</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3.0</td>\n",
" <td>1.0</td>\n",
" <td>90.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.178958</td>\n",
" <td>5.178958</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.693078</td>\n",
" <td>2.258158</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>78.00</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>5.174039</td>\n",
" <td>5.174039</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.690209</td>\n",
" <td>2.227980</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>224208</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>34.0</td>\n",
" <td>3.0</td>\n",
" <td>0.250218</td>\n",
" <td>0.333721</td>\n",
" </tr>\n",
" <tr>\n",
" <th>224209</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>20.00</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>392.501030</td>\n",
" <td>392.501030</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>23.0</td>\n",
" <td>6.0</td>\n",
" <td>0.524745</td>\n",
" <td>1.104135</td>\n",
" </tr>\n",
" <tr>\n",
" <th>224210</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>8.0</td>\n",
" <td>4.0</td>\n",
" <td>0.117175</td>\n",
" <td>0.132728</td>\n",
" </tr>\n",
" <tr>\n",
" <th>224211</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>97.11</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>172.334074</td>\n",
" <td>172.334074</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>13.0</td>\n",
" <td>5.0</td>\n",
" <td>0.643851</td>\n",
" <td>1.807814</td>\n",
" </tr>\n",
" <tr>\n",
" <th>224212</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>4.0</td>\n",
" <td>4.0</td>\n",
" <td>0.250170</td>\n",
" <td>0.333636</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>224213 rows × 16 columns</p>\n",
"</div>"
],
"text/plain": [
" nb_tickets nb_purchases total_amount nb_suppliers \\\n",
"0 2.0 1.0 60.00 1.0 \n",
"1 8.0 3.0 140.00 1.0 \n",
"2 2.0 1.0 50.00 1.0 \n",
"3 3.0 1.0 90.00 1.0 \n",
"4 2.0 1.0 78.00 1.0 \n",
"... ... ... ... ... \n",
"224208 0.0 0.0 0.00 0.0 \n",
"224209 1.0 1.0 20.00 1.0 \n",
"224210 0.0 0.0 0.00 0.0 \n",
"224211 1.0 1.0 97.11 1.0 \n",
"224212 0.0 0.0 0.00 0.0 \n",
"\n",
" vente_internet_max purchase_date_min purchase_date_max \\\n",
"0 0.0 355.268981 355.268981 \n",
"1 0.0 373.540289 219.262269 \n",
"2 0.0 5.202442 5.202442 \n",
"3 0.0 5.178958 5.178958 \n",
"4 0.0 5.174039 5.174039 \n",
"... ... ... ... \n",
"224208 0.0 550.000000 550.000000 \n",
"224209 1.0 392.501030 392.501030 \n",
"224210 0.0 550.000000 550.000000 \n",
"224211 1.0 172.334074 172.334074 \n",
"224212 0.0 550.000000 550.000000 \n",
"\n",
" nb_tickets_internet is_email_true opt_in gender_female \\\n",
"0 0.0 True False 0 \n",
"1 0.0 True False 0 \n",
"2 0.0 True False 0 \n",
"3 0.0 True False 0 \n",
"4 0.0 True False 1 \n",
"... ... ... ... ... \n",
"224208 0.0 True False 0 \n",
"224209 1.0 True False 0 \n",
"224210 0.0 True True 0 \n",
"224211 1.0 True False 0 \n",
"224212 0.0 True False 0 \n",
"\n",
" gender_male nb_campaigns nb_campaigns_opened score odd_ratio \n",
"0 1 0.0 0.0 0.493834 0.975638 \n",
"1 1 0.0 0.0 0.722704 2.606253 \n",
"2 1 0.0 0.0 0.689866 2.224409 \n",
"3 1 0.0 0.0 0.693078 2.258158 \n",
"4 0 0.0 0.0 0.690209 2.227980 \n",
"... ... ... ... ... ... \n",
"224208 1 34.0 3.0 0.250218 0.333721 \n",
"224209 1 23.0 6.0 0.524745 1.104135 \n",
"224210 1 8.0 4.0 0.117175 0.132728 \n",
"224211 1 13.0 5.0 0.643851 1.807814 \n",
"224212 1 4.0 4.0 0.250170 0.333636 \n",
"\n",
"[224213 rows x 16 columns]"
]
},
"execution_count": 494,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# on utilise le second score comme valeur de remplacement quand score = 1\n",
"X_train_second_score = X_train[\"score\"][X_train[\"score\"]<1].max()\n",
"\n",
"X_train[\"score\"] = X_train[\"score\"].apply(lambda x : X_train_second_score if x==1 else x)\n",
"X_train"
]
},
{
"cell_type": "code",
"execution_count": 498,
"id": "b2690332-9f2e-4597-ab13-cef073de367f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.9999999999999998"
]
},
"execution_count": 498,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_train[\"score\"].max()"
]
},
{
"cell_type": "code",
"execution_count": 499,
"id": "e749e3b5-f5f9-4ab5-a0c1-ee99c5e88a26",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 2.242130e+05\n",
"mean 7.411652e+11\n",
"std 5.734858e+13\n",
"min 1.207494e-01\n",
"25% 1.476621e-01\n",
"50% 3.338869e-01\n",
"75% 1.427525e+00\n",
"max 4.503600e+15\n",
"Name: odd_ratio, dtype: float64"
]
},
"execution_count": 499,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_train[\"odd_ratio\"] = X_train[\"score\"]/(1-X_train[\"score\"])\n",
"X_train[\"odd_ratio\"].describe()"
]
},
{
"cell_type": "code",
"execution_count": 500,
"id": "84fea40a-896f-4e74-8d3c-18ecbe9f4c5f",
"metadata": {},
"outputs": [],
"source": [
"def obj_function_X_train(bias) :\n",
" obj = sum([adjusted_score(element, bias) for element in X_train[\"odd_ratio\"]]) # - y_test.sum()[\"y_has_purchased\"]\n",
" return obj"
]
},
{
"cell_type": "code",
"execution_count": 501,
"id": "9886995b-59d7-4fdf-acb0-981338a4e083",
"metadata": {},
"outputs": [],
"source": [
"# minimization\n",
"\n",
"from scipy.optimize import minimize\n",
"\n",
"\n",
"y_train_sum = y_train.sum()[\"y_has_purchased\"]\n",
"initial_guess = 6\n",
"estimated_biais_train = minimize(lambda bias : (obj_function_X_train(bias)-y_train_sum)**2 ,\n",
"initial_guess , method = \"BFGS\")"
]
},
{
"cell_type": "code",
"execution_count": 502,
"id": "80cb872f-2aac-4c77-b935-2d05e0199837",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"bias estimated on train set: 5.947447991192572\n"
]
}
],
"source": [
"# biais de 5.95 contre 6.04 pour le test set, OK\n",
"print(f\"bias estimated on train set: {estimated_biais_train.x[0]}\")"
]
},
{
"cell_type": "markdown",
"id": "25d8c4e0-ca60-4aeb-8aa9-9cfa8efdf52a",
"metadata": {},
"source": [
"### construction d'une fonction de généralisation de la méthode de calcul du biais\n",
"\n",
"Le biais est calculé de la façon suivante. \n",
"En notant $\\hat{p(x_i)}$ le score calculé et $p(x_i)$ le vrai score (sans biais), et $\\beta$ le logarithme du biais, on a : \\\n",
"$\\ln{\\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}}} = \\beta + \\ln{\\frac{p(x_i)}{1-p(x_i)}}$ \\\n",
"$ \\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}} = \\exp(\\beta) . \\frac{p(x_i)}{1-p(x_i)} $ \\\n",
"Ce qu'on appelle biais et qu'on estime dans le code par la suite est : $B=\\exp(\\beta) $. Les probabilités ne sont donc pas biaisées si $B=1$. Il y a surestimation si $B>1$. \n",
"\n",
"On cherche le B qui permette d'ajuster les probabilités de telle sorte que la somme des scores soit égale à la somme des y_has_purchased. Cela revient à résoudre : \n",
"\n",
"\\begin{equation}\n",
"\\sum_{i}{\\frac{\\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}}}{B+\\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}}}} = \\sum_{i}{Y_i}\n",
"\\end{equation}\n",
"\n",
"C'est ce que fait la fonction find_bias"
]
},
{
"cell_type": "code",
"execution_count": 733,
"id": "41f588ad-b093-47f9-a2c9-52428c61d8d8",
"metadata": {},
"outputs": [],
"source": [
"def adjusted_score(odd_ratio, bias) :\n",
" adjusted_score = odd_ratio/(bias+odd_ratio)\n",
" return adjusted_score"
]
},
{
"cell_type": "code",
"execution_count": 734,
"id": "208900ab-0211-4e0a-a235-e4ea3a6957ce",
"metadata": {},
"outputs": [],
"source": [
"# fonction qui prend un vecteur en entrée et remplace les 1 par la seconde plus grande valeur\n",
"# permet de remplacer les 1 par une valeur de score très proche, et d'ainsi éviter des odd ratio infinis\n",
"\n",
"def adjust_score_1(score) :\n",
" second_best_score = np.array([element for element in score if element !=1]).max()\n",
" new_score = np.array([element if element!=1 else second_best_score for element in score])\n",
" \n",
" return new_score\n"
]
},
{
"cell_type": "code",
"execution_count": 735,
"id": "942c3952-577e-4e18-87a8-e15ed3040241",
"metadata": {},
"outputs": [],
"source": [
"def odd_ratio(score) :\n",
" return score / (1 - score)"
]
},
{
"cell_type": "code",
"execution_count": 768,
"id": "f34e16f6-1596-492e-8ff2-0703173e815e",
"metadata": {},
"outputs": [],
"source": [
"# definition of a function that automatically detects the bias\n",
"\n",
"def find_bias(odd_ratios, y_objective, initial_guess=6) :\n",
" \"\"\"\n",
" results = minimize(lambda bias : (sum([adjusted_score(element, bias) for element in list(odd_ratios)]) - y_objective)**2 ,\n",
" initial_guess , method = \"BFGS\")\n",
"\n",
" estimated_bias = results.x[0]\n",
" \"\"\"\n",
"\n",
" # faster method\n",
" bias_estimated = fsolve(lambda bias : sum([adjusted_score(element, bias) for element in list(odd_ratios)]) - y_objective, x0=6)\n",
" \n",
" return bias_estimated[0]"
]
},
{
"cell_type": "code",
"execution_count": 761,
"id": "8cc3a658-5ab5-482b-ba26-b12a3bf9c81b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([6.0428265])"
]
},
"execution_count": 761,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# autre méthode : avec fsolve\n",
"\n",
"from scipy.optimize import fsolve\n",
"\n",
"bias_estimated = fsolve(lambda bias : sum([adjusted_score(element, bias) for element in list(odd_ratios)]) - y_objective, x0=6)\n",
"bias_estimated"
]
},
{
"cell_type": "code",
"execution_count": 760,
"id": "92be0759-2583-411d-a0b0-f09fd53ff367",
"metadata": {},
"outputs": [],
"source": [
"import time"
]
},
{
"cell_type": "code",
"execution_count": 763,
"id": "58eb3320-fd4a-4b21-9cfe-6b9f7533a730",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"résultat : [6.0428265]\n",
"tps de calcul 2.112041473388672\n",
"résultat : 6.042826489667565\n",
"tps de calcul 3.9603891372680664\n"
]
}
],
"source": [
"# comparaison du temps pris par les deux opérations\n",
"\n",
"temps_debut = time.time()\n",
"bias_estimated_1 = fsolve(lambda bias : sum([adjusted_score(element, bias) for element in list(odd_ratios)]) - y_objective, x0=6)\n",
"temps_fin = time.time()\n",
"\n",
"temps_ecoule = temps_fin - temps_debut\n",
"print(\"résultat : \",bias_estimated_1)\n",
"print(\"tps de calcul\", temps_ecoule)\n",
"\n",
"temps_debut = time.time()\n",
"bias_estimated_2 = minimize(lambda bias : (sum([adjusted_score(element, bias) for element in list(odd_ratios)]) - y_objective)**2 ,\n",
" x0=6 , method = \"BFGS\").x[0]\n",
"temps_fin = time.time()\n",
"\n",
"temps_ecoule = temps_fin - temps_debut\n",
"print(\"résultat : \",bias_estimated_2)\n",
"print(\"tps de calcul\", temps_ecoule)"
]
},
{
"cell_type": "code",
"execution_count": 755,
"id": "5e6c5b4a-4a13-43ed-af96-e5892563057a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([2.28853049, 0.3231094 , 0.38810178, ..., 0.13186529, 1.37997272,\n",
" 0.34048672])"
]
},
"execution_count": 755,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"odd_ratios"
]
},
{
"cell_type": "code",
"execution_count": 749,
"id": "6ef9088a-3ae7-419a-b009-cb5aae4ab4c7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"36092.2248005385"
]
},
"execution_count": 749,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sum([adjusted_score(element, 1) for element in list(odd_ratios)]) # - y_objective"
]
},
{
"cell_type": "code",
"execution_count": 704,
"id": "5fcd2467-9119-4bba-af38-f7833173c2d7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[0, 1]"
]
},
"execution_count": 704,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"[element for element in np.array([0,1])]"
]
},
{
"cell_type": "code",
"execution_count": 544,
"id": "e20820a3-30a4-4e24-8c65-6178c4d7e9c1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"5.947447991192572"
]
},
"execution_count": 544,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# the function works well !!\n",
"\n",
"bias_train_set = find_bias(odd_ratios = X_train[\"odd_ratio\"], y_objective = y_train_sum, initial_guess = 6)\n",
"bias_train_set"
]
},
{
"cell_type": "code",
"execution_count": 716,
"id": "c17e4a3c-a3de-425b-a3da-1e15e33cb403",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([2.28853049, 0.3231094 , 0.38810178, ..., 0.13186529, 1.37997272,\n",
" 0.34048672])"
]
},
"execution_count": 716,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"odd_ratio = odd_ratio(adjust_score_1(X_test[\"score\"]))\n",
"odd_ratio"
]
},
{
"cell_type": "code",
"execution_count": 751,
"id": "0aad15bd-e820-4eda-b229-64bd1f90f7f5",
"metadata": {},
"outputs": [],
"source": [
"# definition of the values for the pb\n",
"\n",
"new_score = adjust_score_1(X_test[\"score\"])\n",
"\n",
"odd_ratios = odd_ratio(np.array(new_score))\n",
"\n",
"y_objective = y_test[\"y_has_purchased\"].sum()"
]
},
{
"cell_type": "code",
"execution_count": 752,
"id": "498560c3-e446-4dcc-bb19-47f2910d5fbb",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(array([0.69591281, 0.2442046 , 0.27959173, ..., 0.11650264, 0.57982712,\n",
" 0.25400231]),\n",
" array([2.28853049, 0.3231094 , 0.38810178, ..., 0.13186529, 1.37997272,\n",
" 0.34048672]),\n",
" 13690.0)"
]
},
"execution_count": 752,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"new_score, odd_ratios, y_objective"
]
},
{
"cell_type": "code",
"execution_count": 769,
"id": "03f4a8f1-f568-4a7d-9501-8a7467a9a864",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"6.042826497117542"
]
},
"execution_count": 769,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# computation with the function defined\n",
"\n",
"bias_test_set = find_bias(odd_ratios = odd_ratios, \n",
" y_objective = y_objective,\n",
" initial_guess=6)\n",
"bias_test_set"
]
},
{
"cell_type": "code",
"execution_count": 770,
"id": "d0ea666d-33e8-46e8-9a4d-f17091dbfa93",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"5.947447998640124"
]
},
"execution_count": 770,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"biais_train_set = find_bias(odd_ratios = odd_ratio(adjust_score_1(X_train[\"score\"])), \n",
" y_objective = y_train[\"y_has_purchased\"].sum(),\n",
" initial_guess=6)\n",
"biais_train_set"
]
},
{
"cell_type": "code",
"execution_count": 772,
"id": "1c1bdbc6-4fa7-45fb-ba27-b4c02ff1ff9c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"5.947447991192572"
]
},
"execution_count": 772,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bias_train_set"
]
},
{
"cell_type": "code",
"execution_count": 776,
"id": "eced1d08-5230-4449-8024-105111fe5873",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"betâ test - betâ train = 0.015909647078591174\n"
]
}
],
"source": [
"# différence des beta (log du biais)\n",
"print(\"betâ test - betâ train = \",np.log(bias_test_set/bias_train_set))"
]
},
{
"cell_type": "markdown",
"id": "d2d5aca0-7e8b-4039-9bb2-ff5011c436a6",
"metadata": {},
"source": [
"## Random forest"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "da8873e5-c4e7-4580-8567-70e411c029ab",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_tickets</th>\n",
" <th>nb_purchases</th>\n",
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>nb_tickets_internet</th>\n",
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
" <th>gender_female</th>\n",
" <th>gender_male</th>\n",
" <th>nb_campaigns</th>\n",
" <th>nb_campaigns_opened</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>43000</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>14.0</td>\n",
" <td>12.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>183923</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>19.0</td>\n",
" <td>11.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>97373</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>7.0</td>\n",
" <td>2.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>66956</th>\n",
" <td>7.0</td>\n",
" <td>2.0</td>\n",
" <td>254.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>378.343062</td>\n",
" <td>370.453947</td>\n",
" <td>7.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>116487</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>5.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>83146</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>35.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>37.474040</td>\n",
" <td>37.474040</td>\n",
" <td>1.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>9.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>223586</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>23.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>56489</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>4.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>141236</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>550.000000</td>\n",
" <td>550.000000</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>6.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6999</th>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>20.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>171.446921</td>\n",
" <td>171.446921</td>\n",
" <td>0.0</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10000 rows × 14 columns</p>\n",
"</div>"
],
"text/plain": [
" nb_tickets nb_purchases total_amount nb_suppliers \\\n",
"43000 0.0 0.0 0.0 0.0 \n",
"183923 0.0 0.0 0.0 0.0 \n",
"97373 0.0 0.0 0.0 0.0 \n",
"66956 7.0 2.0 254.0 1.0 \n",
"116487 0.0 0.0 0.0 0.0 \n",
"... ... ... ... ... \n",
"83146 1.0 1.0 35.0 1.0 \n",
"223586 0.0 0.0 0.0 0.0 \n",
"56489 0.0 0.0 0.0 0.0 \n",
"141236 0.0 0.0 0.0 0.0 \n",
"6999 2.0 1.0 20.0 1.0 \n",
"\n",
" vente_internet_max purchase_date_min purchase_date_max \\\n",
"43000 0.0 550.000000 550.000000 \n",
"183923 0.0 550.000000 550.000000 \n",
"97373 0.0 550.000000 550.000000 \n",
"66956 1.0 378.343062 370.453947 \n",
"116487 0.0 550.000000 550.000000 \n",
"... ... ... ... \n",
"83146 1.0 37.474040 37.474040 \n",
"223586 0.0 550.000000 550.000000 \n",
"56489 0.0 550.000000 550.000000 \n",
"141236 0.0 550.000000 550.000000 \n",
"6999 0.0 171.446921 171.446921 \n",
"\n",
" nb_tickets_internet is_email_true opt_in gender_female \\\n",
"43000 0.0 True True 0 \n",
"183923 0.0 True True 0 \n",
"97373 0.0 True False 0 \n",
"66956 7.0 True False 0 \n",
"116487 0.0 True False 1 \n",
"... ... ... ... ... \n",
"83146 1.0 True False 0 \n",
"223586 0.0 True True 0 \n",
"56489 0.0 True True 0 \n",
"141236 0.0 True False 0 \n",
"6999 0.0 True True 1 \n",
"\n",
" gender_male nb_campaigns nb_campaigns_opened \n",
"43000 1 14.0 12.0 \n",
"183923 1 19.0 11.0 \n",
"97373 0 7.0 2.0 \n",
"66956 1 0.0 0.0 \n",
"116487 0 5.0 0.0 \n",
"... ... ... ... \n",
"83146 1 9.0 3.0 \n",
"223586 1 23.0 1.0 \n",
"56489 1 4.0 0.0 \n",
"141236 1 6.0 0.0 \n",
"6999 0 0.0 0.0 \n",
"\n",
"[10000 rows x 14 columns]"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_train_subsample"
]
},
{
"cell_type": "markdown",
"id": "fcbb8bea-e9d3-4fd4-8b47-7e796c788a1f",
"metadata": {},
"source": [
"### Preprocessing"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "55e0c6d8-9e98-47be-9d5d-41e06505ceba",
"metadata": {},
"outputs": [],
"source": [
"# no need to standardize variables in a random forest\n",
"# we just encode categorical variables\n",
"\n",
"categorical_features = ['opt_in', 'is_email_true'] \n",
"\n",
"# Transformer for the categorical features\n",
"categorical_transformer = Pipeline(steps=[\n",
" #(\"imputer\", SimpleImputer(strategy=\"most_frequent\")), # Impute missing values with the most frequent\n",
" (\"onehot\", OneHotEncoder(handle_unknown='ignore', sparse_output=False))\n",
"])\n",
"\n",
"preproc = ColumnTransformer(\n",
" transformers=[\n",
" (\"cat\", categorical_transformer, categorical_features)\n",
" ]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "27af28da-d2bb-4eff-b842-18cec9740c84",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>#sk-container-id-2 {\n",
" /* Definition of color scheme common for light and dark mode */\n",
" --sklearn-color-text: black;\n",
" --sklearn-color-line: gray;\n",
" /* Definition of color scheme for unfitted estimators */\n",
" --sklearn-color-unfitted-level-0: #fff5e6;\n",
" --sklearn-color-unfitted-level-1: #f6e4d2;\n",
" --sklearn-color-unfitted-level-2: #ffe0b3;\n",
" --sklearn-color-unfitted-level-3: chocolate;\n",
" /* Definition of color scheme for fitted estimators */\n",
" --sklearn-color-fitted-level-0: #f0f8ff;\n",
" --sklearn-color-fitted-level-1: #d4ebff;\n",
" --sklearn-color-fitted-level-2: #b3dbfd;\n",
" --sklearn-color-fitted-level-3: cornflowerblue;\n",
"\n",
" /* Specific color for light theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-icon: #696969;\n",
"\n",
" @media (prefers-color-scheme: dark) {\n",
" /* Redefinition of color scheme for dark theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-icon: #878787;\n",
" }\n",
"}\n",
"\n",
"#sk-container-id-2 {\n",
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"#sk-container-id-2 pre {\n",
" padding: 0;\n",
"}\n",
"\n",
"#sk-container-id-2 input.sk-hidden--visually {\n",
" border: 0;\n",
" clip: rect(1px 1px 1px 1px);\n",
" clip: rect(1px, 1px, 1px, 1px);\n",
" height: 1px;\n",
" margin: -1px;\n",
" overflow: hidden;\n",
" padding: 0;\n",
" position: absolute;\n",
" width: 1px;\n",
"}\n",
"\n",
"#sk-container-id-2 div.sk-dashed-wrapped {\n",
" border: 1px dashed var(--sklearn-color-line);\n",
" margin: 0 0.4em 0.5em 0.4em;\n",
" box-sizing: border-box;\n",
" padding-bottom: 0.4em;\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"#sk-container-id-2 div.sk-container {\n",
" /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
" but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
" so we also need the `!important` here to be able to override the\n",
" default hidden behavior on the sphinx rendered scikit-learn.org.\n",
" See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
" display: inline-block !important;\n",
" position: relative;\n",
"}\n",
"\n",
"#sk-container-id-2 div.sk-text-repr-fallback {\n",
" display: none;\n",
"}\n",
"\n",
"div.sk-parallel-item,\n",
"div.sk-serial,\n",
"div.sk-item {\n",
" /* draw centered vertical line to link estimators */\n",
" background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
" background-size: 2px 100%;\n",
" background-repeat: no-repeat;\n",
" background-position: center center;\n",
"}\n",
"\n",
"/* Parallel-specific style estimator block */\n",
"\n",
"#sk-container-id-2 div.sk-parallel-item::after {\n",
" content: \"\";\n",
" width: 100%;\n",
" border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
" flex-grow: 1;\n",
"}\n",
"\n",
"#sk-container-id-2 div.sk-parallel {\n",
" display: flex;\n",
" align-items: stretch;\n",
" justify-content: center;\n",
" background-color: var(--sklearn-color-background);\n",
" position: relative;\n",
"}\n",
"\n",
"#sk-container-id-2 div.sk-parallel-item {\n",
" display: flex;\n",
" flex-direction: column;\n",
"}\n",
"\n",
"#sk-container-id-2 div.sk-parallel-item:first-child::after {\n",
" align-self: flex-end;\n",
" width: 50%;\n",
"}\n",
"\n",
"#sk-container-id-2 div.sk-parallel-item:last-child::after {\n",
" align-self: flex-start;\n",
" width: 50%;\n",
"}\n",
"\n",
"#sk-container-id-2 div.sk-parallel-item:only-child::after {\n",
" width: 0;\n",
"}\n",
"\n",
"/* Serial-specific style estimator block */\n",
"\n",
"#sk-container-id-2 div.sk-serial {\n",
" display: flex;\n",
" flex-direction: column;\n",
" align-items: center;\n",
" background-color: var(--sklearn-color-background);\n",
" padding-right: 1em;\n",
" padding-left: 1em;\n",
"}\n",
"\n",
"\n",
"/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
"clickable and can be expanded/collapsed.\n",
"- Pipeline and ColumnTransformer use this feature and define the default style\n",
"- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
"*/\n",
"\n",
"/* Pipeline and ColumnTransformer style (default) */\n",
"\n",
"#sk-container-id-2 div.sk-toggleable {\n",
" /* Default theme specific background. It is overwritten whether we have a\n",
" specific estimator or a Pipeline/ColumnTransformer */\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"/* Toggleable label */\n",
"#sk-container-id-2 label.sk-toggleable__label {\n",
" cursor: pointer;\n",
" display: block;\n",
" width: 100%;\n",
" margin-bottom: 0;\n",
" padding: 0.5em;\n",
" box-sizing: border-box;\n",
" text-align: center;\n",
"}\n",
"\n",
"#sk-container-id-2 label.sk-toggleable__label-arrow:before {\n",
" /* Arrow on the left of the label */\n",
" content: \"▸\";\n",
" float: left;\n",
" margin-right: 0.25em;\n",
" color: var(--sklearn-color-icon);\n",
"}\n",
"\n",
"#sk-container-id-2 label.sk-toggleable__label-arrow:hover:before {\n",
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"/* Toggleable content - dropdown */\n",
"\n",
"#sk-container-id-2 div.sk-toggleable__content {\n",
" max-height: 0;\n",
" max-width: 0;\n",
" overflow: hidden;\n",
" text-align: left;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-2 div.sk-toggleable__content.fitted {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-2 div.sk-toggleable__content pre {\n",
" margin: 0.2em;\n",
" border-radius: 0.25em;\n",
" color: var(--sklearn-color-text);\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-2 div.sk-toggleable__content.fitted pre {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-2 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
" /* Expand drop-down */\n",
" max-height: 200px;\n",
" max-width: 100%;\n",
" overflow: auto;\n",
"}\n",
"\n",
"#sk-container-id-2 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
" content: \"▾\";\n",
"}\n",
"\n",
"/* Pipeline/ColumnTransformer-specific style */\n",
"\n",
"#sk-container-id-2 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-2 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator-specific style */\n",
"\n",
"/* Colorize estimator box */\n",
"#sk-container-id-2 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-2 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-2 div.sk-label label.sk-toggleable__label,\n",
"#sk-container-id-2 div.sk-label label {\n",
" /* The background is the default theme color */\n",
" color: var(--sklearn-color-text-on-default-background);\n",
"}\n",
"\n",
"/* On hover, darken the color of the background */\n",
"#sk-container-id-2 div.sk-label:hover label.sk-toggleable__label {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"/* Label box, darken color on hover, fitted */\n",
"#sk-container-id-2 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator label */\n",
"\n",
"#sk-container-id-2 div.sk-label label {\n",
" font-family: monospace;\n",
" font-weight: bold;\n",
" display: inline-block;\n",
" line-height: 1.2em;\n",
"}\n",
"\n",
"#sk-container-id-2 div.sk-label-container {\n",
" text-align: center;\n",
"}\n",
"\n",
"/* Estimator-specific */\n",
"#sk-container-id-2 div.sk-estimator {\n",
" font-family: monospace;\n",
" border: 1px dotted var(--sklearn-color-border-box);\n",
" border-radius: 0.25em;\n",
" box-sizing: border-box;\n",
" margin-bottom: 0.5em;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-2 div.sk-estimator.fitted {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"/* on hover */\n",
"#sk-container-id-2 div.sk-estimator:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-2 div.sk-estimator.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
"\n",
"/* Common style for \"i\" and \"?\" */\n",
"\n",
".sk-estimator-doc-link,\n",
"a:link.sk-estimator-doc-link,\n",
"a:visited.sk-estimator-doc-link {\n",
" float: right;\n",
" font-size: smaller;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1em;\n",
" height: 1em;\n",
" width: 1em;\n",
" text-decoration: none !important;\n",
" margin-left: 1ex;\n",
" /* unfitted */\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted,\n",
"a:link.sk-estimator-doc-link.fitted,\n",
"a:visited.sk-estimator-doc-link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"/* Span, style for the box shown on hovering the info icon */\n",
".sk-estimator-doc-link span {\n",
" display: none;\n",
" z-index: 9999;\n",
" position: relative;\n",
" font-weight: normal;\n",
" right: .2ex;\n",
" padding: .5ex;\n",
" margin: .5ex;\n",
" width: min-content;\n",
" min-width: 20ex;\n",
" max-width: 50ex;\n",
" color: var(--sklearn-color-text);\n",
" box-shadow: 2pt 2pt 4pt #999;\n",
" /* unfitted */\n",
" background: var(--sklearn-color-unfitted-level-0);\n",
" border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted span {\n",
" /* fitted */\n",
" background: var(--sklearn-color-fitted-level-0);\n",
" border: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link:hover span {\n",
" display: block;\n",
"}\n",
"\n",
"/* \"?\"-specific style due to the `<a>` HTML tag */\n",
"\n",
"#sk-container-id-2 a.estimator_doc_link {\n",
" float: right;\n",
" font-size: 1rem;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1rem;\n",
" height: 1rem;\n",
" width: 1rem;\n",
" text-decoration: none;\n",
" /* unfitted */\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
"}\n",
"\n",
"#sk-container-id-2 a.estimator_doc_link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"#sk-container-id-2 a.estimator_doc_link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"#sk-container-id-2 a.estimator_doc_link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"</style><div id=\"sk-container-id-2\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>ColumnTransformer(transformers=[(&#x27;cat&#x27;,\n",
" Pipeline(steps=[(&#x27;onehot&#x27;,\n",
" OneHotEncoder(handle_unknown=&#x27;ignore&#x27;,\n",
" sparse_output=False))]),\n",
" [&#x27;opt_in&#x27;, &#x27;is_email_true&#x27;])])</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-2\" type=\"checkbox\" ><label for=\"sk-estimator-id-2\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">&nbsp;&nbsp;ColumnTransformer<a class=\"sk-estimator-doc-link \" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.compose.ColumnTransformer.html\">?<span>Documentation for ColumnTransformer</span></a><span class=\"sk-estimator-doc-link \">i<span>Not fitted</span></span></label><div class=\"sk-toggleable__content \"><pre>ColumnTransformer(transformers=[(&#x27;cat&#x27;,\n",
" Pipeline(steps=[(&#x27;onehot&#x27;,\n",
" OneHotEncoder(handle_unknown=&#x27;ignore&#x27;,\n",
" sparse_output=False))]),\n",
" [&#x27;opt_in&#x27;, &#x27;is_email_true&#x27;])])</pre></div> </div></div><div class=\"sk-parallel\"><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-3\" type=\"checkbox\" ><label for=\"sk-estimator-id-3\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">cat</label><div class=\"sk-toggleable__content \"><pre>[&#x27;opt_in&#x27;, &#x27;is_email_true&#x27;]</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-4\" type=\"checkbox\" ><label for=\"sk-estimator-id-4\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">&nbsp;OneHotEncoder<a class=\"sk-estimator-doc-link \" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.preprocessing.OneHotEncoder.html\">?<span>Documentation for OneHotEncoder</span></a></label><div class=\"sk-toggleable__content \"><pre>OneHotEncoder(handle_unknown=&#x27;ignore&#x27;, sparse_output=False)</pre></div> </div></div></div></div></div></div></div></div></div></div></div>"
],
"text/plain": [
"ColumnTransformer(transformers=[('cat',\n",
" Pipeline(steps=[('onehot',\n",
" OneHotEncoder(handle_unknown='ignore',\n",
" sparse_output=False))]),\n",
" ['opt_in', 'is_email_true'])])"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"preproc"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0cb46acb-647f-469d-b5e1-510bf1283196",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "1ce9acf4-3514-4056-a71a-c7654e25b9de",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "dfdd4601-4866-4102-b620-4f10648e7981",
"metadata": {},
"source": [
"### Pipeline"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "eeefae73-afe7-4441-a04c-bd6a04beedd2",
"metadata": {},
"outputs": [],
"source": [
"# Define models and parameters for GridSearch\n",
"model = {\n",
" 'model': RandomForestClassifier(),\n",
" 'params': {\n",
" 'randforest__n_estimators': [100, 150, 200, 250, 300],\n",
" 'randforest__max_depth': [None, 15, 20, 25, 30, 35, 40],\n",
" }\n",
" }\n",
"\n",
"# Test each model using GridSearchCV\n",
"pipe = Pipeline(steps=[('preprocessor', preproc), ('randforest', model['model'])])\n",
"clf = GridSearchCV(pipe, model['params'], cv=3)\n",
"clf.fit(X_train, y_train)\n",
"\n",
"print(f\"Model: {model['model']}\")\n",
"print(f\"Best parameters: {clf.best_params_}\")\n",
"print('Best classification accuracy in train is: {}'.format(clf.best_score_))\n",
"print('Classification accuracy on test is: {}'.format(clf.score(X_test, y_test)))\n",
"print(\"------\")"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "2a88f13b-05bc-4a70-b08b-8b07c118cedc",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>#sk-container-id-7 {\n",
" /* Definition of color scheme common for light and dark mode */\n",
" --sklearn-color-text: black;\n",
" --sklearn-color-line: gray;\n",
" /* Definition of color scheme for unfitted estimators */\n",
" --sklearn-color-unfitted-level-0: #fff5e6;\n",
" --sklearn-color-unfitted-level-1: #f6e4d2;\n",
" --sklearn-color-unfitted-level-2: #ffe0b3;\n",
" --sklearn-color-unfitted-level-3: chocolate;\n",
" /* Definition of color scheme for fitted estimators */\n",
" --sklearn-color-fitted-level-0: #f0f8ff;\n",
" --sklearn-color-fitted-level-1: #d4ebff;\n",
" --sklearn-color-fitted-level-2: #b3dbfd;\n",
" --sklearn-color-fitted-level-3: cornflowerblue;\n",
"\n",
" /* Specific color for light theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-icon: #696969;\n",
"\n",
" @media (prefers-color-scheme: dark) {\n",
" /* Redefinition of color scheme for dark theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-icon: #878787;\n",
" }\n",
"}\n",
"\n",
"#sk-container-id-7 {\n",
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"#sk-container-id-7 pre {\n",
" padding: 0;\n",
"}\n",
"\n",
"#sk-container-id-7 input.sk-hidden--visually {\n",
" border: 0;\n",
" clip: rect(1px 1px 1px 1px);\n",
" clip: rect(1px, 1px, 1px, 1px);\n",
" height: 1px;\n",
" margin: -1px;\n",
" overflow: hidden;\n",
" padding: 0;\n",
" position: absolute;\n",
" width: 1px;\n",
"}\n",
"\n",
"#sk-container-id-7 div.sk-dashed-wrapped {\n",
" border: 1px dashed var(--sklearn-color-line);\n",
" margin: 0 0.4em 0.5em 0.4em;\n",
" box-sizing: border-box;\n",
" padding-bottom: 0.4em;\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"#sk-container-id-7 div.sk-container {\n",
" /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
" but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
" so we also need the `!important` here to be able to override the\n",
" default hidden behavior on the sphinx rendered scikit-learn.org.\n",
" See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
" display: inline-block !important;\n",
" position: relative;\n",
"}\n",
"\n",
"#sk-container-id-7 div.sk-text-repr-fallback {\n",
" display: none;\n",
"}\n",
"\n",
"div.sk-parallel-item,\n",
"div.sk-serial,\n",
"div.sk-item {\n",
" /* draw centered vertical line to link estimators */\n",
" background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
" background-size: 2px 100%;\n",
" background-repeat: no-repeat;\n",
" background-position: center center;\n",
"}\n",
"\n",
"/* Parallel-specific style estimator block */\n",
"\n",
"#sk-container-id-7 div.sk-parallel-item::after {\n",
" content: \"\";\n",
" width: 100%;\n",
" border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
" flex-grow: 1;\n",
"}\n",
"\n",
"#sk-container-id-7 div.sk-parallel {\n",
" display: flex;\n",
" align-items: stretch;\n",
" justify-content: center;\n",
" background-color: var(--sklearn-color-background);\n",
" position: relative;\n",
"}\n",
"\n",
"#sk-container-id-7 div.sk-parallel-item {\n",
" display: flex;\n",
" flex-direction: column;\n",
"}\n",
"\n",
"#sk-container-id-7 div.sk-parallel-item:first-child::after {\n",
" align-self: flex-end;\n",
" width: 50%;\n",
"}\n",
"\n",
"#sk-container-id-7 div.sk-parallel-item:last-child::after {\n",
" align-self: flex-start;\n",
" width: 50%;\n",
"}\n",
"\n",
"#sk-container-id-7 div.sk-parallel-item:only-child::after {\n",
" width: 0;\n",
"}\n",
"\n",
"/* Serial-specific style estimator block */\n",
"\n",
"#sk-container-id-7 div.sk-serial {\n",
" display: flex;\n",
" flex-direction: column;\n",
" align-items: center;\n",
" background-color: var(--sklearn-color-background);\n",
" padding-right: 1em;\n",
" padding-left: 1em;\n",
"}\n",
"\n",
"\n",
"/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
"clickable and can be expanded/collapsed.\n",
"- Pipeline and ColumnTransformer use this feature and define the default style\n",
"- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
"*/\n",
"\n",
"/* Pipeline and ColumnTransformer style (default) */\n",
"\n",
"#sk-container-id-7 div.sk-toggleable {\n",
" /* Default theme specific background. It is overwritten whether we have a\n",
" specific estimator or a Pipeline/ColumnTransformer */\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"/* Toggleable label */\n",
"#sk-container-id-7 label.sk-toggleable__label {\n",
" cursor: pointer;\n",
" display: block;\n",
" width: 100%;\n",
" margin-bottom: 0;\n",
" padding: 0.5em;\n",
" box-sizing: border-box;\n",
" text-align: center;\n",
"}\n",
"\n",
"#sk-container-id-7 label.sk-toggleable__label-arrow:before {\n",
" /* Arrow on the left of the label */\n",
" content: \"▸\";\n",
" float: left;\n",
" margin-right: 0.25em;\n",
" color: var(--sklearn-color-icon);\n",
"}\n",
"\n",
"#sk-container-id-7 label.sk-toggleable__label-arrow:hover:before {\n",
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"/* Toggleable content - dropdown */\n",
"\n",
"#sk-container-id-7 div.sk-toggleable__content {\n",
" max-height: 0;\n",
" max-width: 0;\n",
" overflow: hidden;\n",
" text-align: left;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-7 div.sk-toggleable__content.fitted {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-7 div.sk-toggleable__content pre {\n",
" margin: 0.2em;\n",
" border-radius: 0.25em;\n",
" color: var(--sklearn-color-text);\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-7 div.sk-toggleable__content.fitted pre {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-7 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
" /* Expand drop-down */\n",
" max-height: 200px;\n",
" max-width: 100%;\n",
" overflow: auto;\n",
"}\n",
"\n",
"#sk-container-id-7 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
" content: \"▾\";\n",
"}\n",
"\n",
"/* Pipeline/ColumnTransformer-specific style */\n",
"\n",
"#sk-container-id-7 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-7 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator-specific style */\n",
"\n",
"/* Colorize estimator box */\n",
"#sk-container-id-7 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-7 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-7 div.sk-label label.sk-toggleable__label,\n",
"#sk-container-id-7 div.sk-label label {\n",
" /* The background is the default theme color */\n",
" color: var(--sklearn-color-text-on-default-background);\n",
"}\n",
"\n",
"/* On hover, darken the color of the background */\n",
"#sk-container-id-7 div.sk-label:hover label.sk-toggleable__label {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"/* Label box, darken color on hover, fitted */\n",
"#sk-container-id-7 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator label */\n",
"\n",
"#sk-container-id-7 div.sk-label label {\n",
" font-family: monospace;\n",
" font-weight: bold;\n",
" display: inline-block;\n",
" line-height: 1.2em;\n",
"}\n",
"\n",
"#sk-container-id-7 div.sk-label-container {\n",
" text-align: center;\n",
"}\n",
"\n",
"/* Estimator-specific */\n",
"#sk-container-id-7 div.sk-estimator {\n",
" font-family: monospace;\n",
" border: 1px dotted var(--sklearn-color-border-box);\n",
" border-radius: 0.25em;\n",
" box-sizing: border-box;\n",
" margin-bottom: 0.5em;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-7 div.sk-estimator.fitted {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"/* on hover */\n",
"#sk-container-id-7 div.sk-estimator:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-7 div.sk-estimator.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
"\n",
"/* Common style for \"i\" and \"?\" */\n",
"\n",
".sk-estimator-doc-link,\n",
"a:link.sk-estimator-doc-link,\n",
"a:visited.sk-estimator-doc-link {\n",
" float: right;\n",
" font-size: smaller;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1em;\n",
" height: 1em;\n",
" width: 1em;\n",
" text-decoration: none !important;\n",
" margin-left: 1ex;\n",
" /* unfitted */\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted,\n",
"a:link.sk-estimator-doc-link.fitted,\n",
"a:visited.sk-estimator-doc-link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"/* Span, style for the box shown on hovering the info icon */\n",
".sk-estimator-doc-link span {\n",
" display: none;\n",
" z-index: 9999;\n",
" position: relative;\n",
" font-weight: normal;\n",
" right: .2ex;\n",
" padding: .5ex;\n",
" margin: .5ex;\n",
" width: min-content;\n",
" min-width: 20ex;\n",
" max-width: 50ex;\n",
" color: var(--sklearn-color-text);\n",
" box-shadow: 2pt 2pt 4pt #999;\n",
" /* unfitted */\n",
" background: var(--sklearn-color-unfitted-level-0);\n",
" border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted span {\n",
" /* fitted */\n",
" background: var(--sklearn-color-fitted-level-0);\n",
" border: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link:hover span {\n",
" display: block;\n",
"}\n",
"\n",
"/* \"?\"-specific style due to the `<a>` HTML tag */\n",
"\n",
"#sk-container-id-7 a.estimator_doc_link {\n",
" float: right;\n",
" font-size: 1rem;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1rem;\n",
" height: 1rem;\n",
" width: 1rem;\n",
" text-decoration: none;\n",
" /* unfitted */\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
"}\n",
"\n",
"#sk-container-id-7 a.estimator_doc_link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"#sk-container-id-7 a.estimator_doc_link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"#sk-container-id-7 a.estimator_doc_link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"</style><div id=\"sk-container-id-7\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>Pipeline(steps=[(&#x27;preprocessor&#x27;,\n",
" ColumnTransformer(transformers=[(&#x27;cat&#x27;,\n",
" Pipeline(steps=[(&#x27;onehot&#x27;,\n",
" OneHotEncoder(handle_unknown=&#x27;ignore&#x27;,\n",
" sparse_output=False))]),\n",
" [&#x27;opt_in&#x27;,\n",
" &#x27;is_email_true&#x27;])])),\n",
" (&#x27;random_forest&#x27;,\n",
" RandomForestClassifier(class_weight={0.0: 0.5837086520288036,\n",
" 1.0: 3.486549107420539}))])</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-27\" type=\"checkbox\" ><label for=\"sk-estimator-id-27\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">&nbsp;&nbsp;Pipeline<a class=\"sk-estimator-doc-link \" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.pipeline.Pipeline.html\">?<span>Documentation for Pipeline</span></a><span class=\"sk-estimator-doc-link \">i<span>Not fitted</span></span></label><div class=\"sk-toggleable__content \"><pre>Pipeline(steps=[(&#x27;preprocessor&#x27;,\n",
" ColumnTransformer(transformers=[(&#x27;cat&#x27;,\n",
" Pipeline(steps=[(&#x27;onehot&#x27;,\n",
" OneHotEncoder(handle_unknown=&#x27;ignore&#x27;,\n",
" sparse_output=False))]),\n",
" [&#x27;opt_in&#x27;,\n",
" &#x27;is_email_true&#x27;])])),\n",
" (&#x27;random_forest&#x27;,\n",
" RandomForestClassifier(class_weight={0.0: 0.5837086520288036,\n",
" 1.0: 3.486549107420539}))])</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-28\" type=\"checkbox\" ><label for=\"sk-estimator-id-28\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">&nbsp;preprocessor: ColumnTransformer<a class=\"sk-estimator-doc-link \" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.compose.ColumnTransformer.html\">?<span>Documentation for preprocessor: ColumnTransformer</span></a></label><div class=\"sk-toggleable__content \"><pre>ColumnTransformer(transformers=[(&#x27;cat&#x27;,\n",
" Pipeline(steps=[(&#x27;onehot&#x27;,\n",
" OneHotEncoder(handle_unknown=&#x27;ignore&#x27;,\n",
" sparse_output=False))]),\n",
" [&#x27;opt_in&#x27;, &#x27;is_email_true&#x27;])])</pre></div> </div></div><div class=\"sk-parallel\"><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-29\" type=\"checkbox\" ><label for=\"sk-estimator-id-29\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">cat</label><div class=\"sk-toggleable__content \"><pre>[&#x27;opt_in&#x27;, &#x27;is_email_true&#x27;]</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-30\" type=\"checkbox\" ><label for=\"sk-estimator-id-30\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">&nbsp;OneHotEncoder<a class=\"sk-estimator-doc-link \" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.preprocessing.OneHotEncoder.html\">?<span>Documentation for OneHotEncoder</span></a></label><div class=\"sk-toggleable__content \"><pre>OneHotEncoder(handle_unknown=&#x27;ignore&#x27;, sparse_output=False)</pre></div> </div></div></div></div></div></div></div></div></div><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-31\" type=\"checkbox\" ><label for=\"sk-estimator-id-31\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">&nbsp;RandomForestClassifier<a class=\"sk-estimator-doc-link \" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.ensemble.RandomForestClassifier.html\">?<span>Documentation for RandomForestClassifier</span></a></label><div class=\"sk-toggleable__content \"><pre>RandomForestClassifier(class_weight={0.0: 0.5837086520288036,\n",
" 1.0: 3.486549107420539})</pre></div> </div></div></div></div></div></div>"
],
"text/plain": [
"Pipeline(steps=[('preprocessor',\n",
" ColumnTransformer(transformers=[('cat',\n",
" Pipeline(steps=[('onehot',\n",
" OneHotEncoder(handle_unknown='ignore',\n",
" sparse_output=False))]),\n",
" ['opt_in',\n",
" 'is_email_true'])])),\n",
" ('random_forest',\n",
" RandomForestClassifier(class_weight={0.0: 0.5837086520288036,\n",
" 1.0: 3.486549107420539}))])"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Pipeline - on joue sur : max_depth\n",
"\n",
"param_grid = {\"random_forest__max_depth\" : [None, 10, 20, 40, 50, 60]}\n",
"\n",
"pipeline = Pipeline(steps=[\n",
" ('preprocessor', preproc),\n",
" ('random_forest', RandomForestClassifier(bootstrap = False, class_weight = weight_dict,\n",
" )) \n",
"])\n",
"\n",
"pipeline.set_output(transform=\"pandas\")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "494dca83-4d60-4e49-8689-7d7ac612bb83",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'estimator': DecisionTreeClassifier(),\n",
" 'n_estimators': 100,\n",
" 'estimator_params': ('criterion',\n",
" 'max_depth',\n",
" 'min_samples_split',\n",
" 'min_samples_leaf',\n",
" 'min_weight_fraction_leaf',\n",
" 'max_features',\n",
" 'max_leaf_nodes',\n",
" 'min_impurity_decrease',\n",
" 'random_state',\n",
" 'ccp_alpha',\n",
" 'monotonic_cst'),\n",
" 'bootstrap': True,\n",
" 'oob_score': False,\n",
" 'n_jobs': None,\n",
" 'random_state': None,\n",
" 'verbose': 0,\n",
" 'warm_start': False,\n",
" 'class_weight': None,\n",
" 'max_samples': None,\n",
" 'criterion': 'gini',\n",
" 'max_depth': None,\n",
" 'min_samples_split': 2,\n",
" 'min_samples_leaf': 1,\n",
" 'min_weight_fraction_leaf': 0.0,\n",
" 'max_features': 'sqrt',\n",
" 'max_leaf_nodes': None,\n",
" 'min_impurity_decrease': 0.0,\n",
" 'monotonic_cst': None,\n",
" 'ccp_alpha': 0.0}"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"RandomForestClassifier().__dict__"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "ee7cbc1c-7c31-4111-82a3-995141e2f13f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>#sk-container-id-8 {\n",
" /* Definition of color scheme common for light and dark mode */\n",
" --sklearn-color-text: black;\n",
" --sklearn-color-line: gray;\n",
" /* Definition of color scheme for unfitted estimators */\n",
" --sklearn-color-unfitted-level-0: #fff5e6;\n",
" --sklearn-color-unfitted-level-1: #f6e4d2;\n",
" --sklearn-color-unfitted-level-2: #ffe0b3;\n",
" --sklearn-color-unfitted-level-3: chocolate;\n",
" /* Definition of color scheme for fitted estimators */\n",
" --sklearn-color-fitted-level-0: #f0f8ff;\n",
" --sklearn-color-fitted-level-1: #d4ebff;\n",
" --sklearn-color-fitted-level-2: #b3dbfd;\n",
" --sklearn-color-fitted-level-3: cornflowerblue;\n",
"\n",
" /* Specific color for light theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-icon: #696969;\n",
"\n",
" @media (prefers-color-scheme: dark) {\n",
" /* Redefinition of color scheme for dark theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-icon: #878787;\n",
" }\n",
"}\n",
"\n",
"#sk-container-id-8 {\n",
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"#sk-container-id-8 pre {\n",
" padding: 0;\n",
"}\n",
"\n",
"#sk-container-id-8 input.sk-hidden--visually {\n",
" border: 0;\n",
" clip: rect(1px 1px 1px 1px);\n",
" clip: rect(1px, 1px, 1px, 1px);\n",
" height: 1px;\n",
" margin: -1px;\n",
" overflow: hidden;\n",
" padding: 0;\n",
" position: absolute;\n",
" width: 1px;\n",
"}\n",
"\n",
"#sk-container-id-8 div.sk-dashed-wrapped {\n",
" border: 1px dashed var(--sklearn-color-line);\n",
" margin: 0 0.4em 0.5em 0.4em;\n",
" box-sizing: border-box;\n",
" padding-bottom: 0.4em;\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"#sk-container-id-8 div.sk-container {\n",
" /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
" but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
" so we also need the `!important` here to be able to override the\n",
" default hidden behavior on the sphinx rendered scikit-learn.org.\n",
" See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
" display: inline-block !important;\n",
" position: relative;\n",
"}\n",
"\n",
"#sk-container-id-8 div.sk-text-repr-fallback {\n",
" display: none;\n",
"}\n",
"\n",
"div.sk-parallel-item,\n",
"div.sk-serial,\n",
"div.sk-item {\n",
" /* draw centered vertical line to link estimators */\n",
" background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
" background-size: 2px 100%;\n",
" background-repeat: no-repeat;\n",
" background-position: center center;\n",
"}\n",
"\n",
"/* Parallel-specific style estimator block */\n",
"\n",
"#sk-container-id-8 div.sk-parallel-item::after {\n",
" content: \"\";\n",
" width: 100%;\n",
" border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
" flex-grow: 1;\n",
"}\n",
"\n",
"#sk-container-id-8 div.sk-parallel {\n",
" display: flex;\n",
" align-items: stretch;\n",
" justify-content: center;\n",
" background-color: var(--sklearn-color-background);\n",
" position: relative;\n",
"}\n",
"\n",
"#sk-container-id-8 div.sk-parallel-item {\n",
" display: flex;\n",
" flex-direction: column;\n",
"}\n",
"\n",
"#sk-container-id-8 div.sk-parallel-item:first-child::after {\n",
" align-self: flex-end;\n",
" width: 50%;\n",
"}\n",
"\n",
"#sk-container-id-8 div.sk-parallel-item:last-child::after {\n",
" align-self: flex-start;\n",
" width: 50%;\n",
"}\n",
"\n",
"#sk-container-id-8 div.sk-parallel-item:only-child::after {\n",
" width: 0;\n",
"}\n",
"\n",
"/* Serial-specific style estimator block */\n",
"\n",
"#sk-container-id-8 div.sk-serial {\n",
" display: flex;\n",
" flex-direction: column;\n",
" align-items: center;\n",
" background-color: var(--sklearn-color-background);\n",
" padding-right: 1em;\n",
" padding-left: 1em;\n",
"}\n",
"\n",
"\n",
"/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
"clickable and can be expanded/collapsed.\n",
"- Pipeline and ColumnTransformer use this feature and define the default style\n",
"- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
"*/\n",
"\n",
"/* Pipeline and ColumnTransformer style (default) */\n",
"\n",
"#sk-container-id-8 div.sk-toggleable {\n",
" /* Default theme specific background. It is overwritten whether we have a\n",
" specific estimator or a Pipeline/ColumnTransformer */\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"/* Toggleable label */\n",
"#sk-container-id-8 label.sk-toggleable__label {\n",
" cursor: pointer;\n",
" display: block;\n",
" width: 100%;\n",
" margin-bottom: 0;\n",
" padding: 0.5em;\n",
" box-sizing: border-box;\n",
" text-align: center;\n",
"}\n",
"\n",
"#sk-container-id-8 label.sk-toggleable__label-arrow:before {\n",
" /* Arrow on the left of the label */\n",
" content: \"▸\";\n",
" float: left;\n",
" margin-right: 0.25em;\n",
" color: var(--sklearn-color-icon);\n",
"}\n",
"\n",
"#sk-container-id-8 label.sk-toggleable__label-arrow:hover:before {\n",
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"/* Toggleable content - dropdown */\n",
"\n",
"#sk-container-id-8 div.sk-toggleable__content {\n",
" max-height: 0;\n",
" max-width: 0;\n",
" overflow: hidden;\n",
" text-align: left;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-8 div.sk-toggleable__content.fitted {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-8 div.sk-toggleable__content pre {\n",
" margin: 0.2em;\n",
" border-radius: 0.25em;\n",
" color: var(--sklearn-color-text);\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-8 div.sk-toggleable__content.fitted pre {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-8 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
" /* Expand drop-down */\n",
" max-height: 200px;\n",
" max-width: 100%;\n",
" overflow: auto;\n",
"}\n",
"\n",
"#sk-container-id-8 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
" content: \"▾\";\n",
"}\n",
"\n",
"/* Pipeline/ColumnTransformer-specific style */\n",
"\n",
"#sk-container-id-8 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-8 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator-specific style */\n",
"\n",
"/* Colorize estimator box */\n",
"#sk-container-id-8 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-8 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-8 div.sk-label label.sk-toggleable__label,\n",
"#sk-container-id-8 div.sk-label label {\n",
" /* The background is the default theme color */\n",
" color: var(--sklearn-color-text-on-default-background);\n",
"}\n",
"\n",
"/* On hover, darken the color of the background */\n",
"#sk-container-id-8 div.sk-label:hover label.sk-toggleable__label {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"/* Label box, darken color on hover, fitted */\n",
"#sk-container-id-8 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator label */\n",
"\n",
"#sk-container-id-8 div.sk-label label {\n",
" font-family: monospace;\n",
" font-weight: bold;\n",
" display: inline-block;\n",
" line-height: 1.2em;\n",
"}\n",
"\n",
"#sk-container-id-8 div.sk-label-container {\n",
" text-align: center;\n",
"}\n",
"\n",
"/* Estimator-specific */\n",
"#sk-container-id-8 div.sk-estimator {\n",
" font-family: monospace;\n",
" border: 1px dotted var(--sklearn-color-border-box);\n",
" border-radius: 0.25em;\n",
" box-sizing: border-box;\n",
" margin-bottom: 0.5em;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-8 div.sk-estimator.fitted {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"/* on hover */\n",
"#sk-container-id-8 div.sk-estimator:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-8 div.sk-estimator.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
"\n",
"/* Common style for \"i\" and \"?\" */\n",
"\n",
".sk-estimator-doc-link,\n",
"a:link.sk-estimator-doc-link,\n",
"a:visited.sk-estimator-doc-link {\n",
" float: right;\n",
" font-size: smaller;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1em;\n",
" height: 1em;\n",
" width: 1em;\n",
" text-decoration: none !important;\n",
" margin-left: 1ex;\n",
" /* unfitted */\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted,\n",
"a:link.sk-estimator-doc-link.fitted,\n",
"a:visited.sk-estimator-doc-link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"/* Span, style for the box shown on hovering the info icon */\n",
".sk-estimator-doc-link span {\n",
" display: none;\n",
" z-index: 9999;\n",
" position: relative;\n",
" font-weight: normal;\n",
" right: .2ex;\n",
" padding: .5ex;\n",
" margin: .5ex;\n",
" width: min-content;\n",
" min-width: 20ex;\n",
" max-width: 50ex;\n",
" color: var(--sklearn-color-text);\n",
" box-shadow: 2pt 2pt 4pt #999;\n",
" /* unfitted */\n",
" background: var(--sklearn-color-unfitted-level-0);\n",
" border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted span {\n",
" /* fitted */\n",
" background: var(--sklearn-color-fitted-level-0);\n",
" border: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link:hover span {\n",
" display: block;\n",
"}\n",
"\n",
"/* \"?\"-specific style due to the `<a>` HTML tag */\n",
"\n",
"#sk-container-id-8 a.estimator_doc_link {\n",
" float: right;\n",
" font-size: 1rem;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1rem;\n",
" height: 1rem;\n",
" width: 1rem;\n",
" text-decoration: none;\n",
" /* unfitted */\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
"}\n",
"\n",
"#sk-container-id-8 a.estimator_doc_link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"#sk-container-id-8 a.estimator_doc_link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"#sk-container-id-8 a.estimator_doc_link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"</style><div id=\"sk-container-id-8\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>GridSearchCV(cv=3,\n",
" estimator=Pipeline(steps=[(&#x27;preprocessor&#x27;,\n",
" ColumnTransformer(transformers=[(&#x27;cat&#x27;,\n",
" Pipeline(steps=[(&#x27;onehot&#x27;,\n",
" OneHotEncoder(handle_unknown=&#x27;ignore&#x27;,\n",
" sparse_output=False))]),\n",
" [&#x27;opt_in&#x27;,\n",
" &#x27;is_email_true&#x27;])])),\n",
" (&#x27;random_forest&#x27;,\n",
" RandomForestClassifier(class_weight={0.0: 0.5837086520288036,\n",
" 1.0: 3.486549107420539}))]),\n",
" param_grid={&#x27;random_forest__max_depth&#x27;: [None, 10, 20, 40, 50,\n",
" 60]},\n",
" scoring=make_scorer(f1_score, response_method=&#x27;predict&#x27;))</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-32\" type=\"checkbox\" ><label for=\"sk-estimator-id-32\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">&nbsp;&nbsp;GridSearchCV<a class=\"sk-estimator-doc-link \" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.model_selection.GridSearchCV.html\">?<span>Documentation for GridSearchCV</span></a><span class=\"sk-estimator-doc-link \">i<span>Not fitted</span></span></label><div class=\"sk-toggleable__content \"><pre>GridSearchCV(cv=3,\n",
" estimator=Pipeline(steps=[(&#x27;preprocessor&#x27;,\n",
" ColumnTransformer(transformers=[(&#x27;cat&#x27;,\n",
" Pipeline(steps=[(&#x27;onehot&#x27;,\n",
" OneHotEncoder(handle_unknown=&#x27;ignore&#x27;,\n",
" sparse_output=False))]),\n",
" [&#x27;opt_in&#x27;,\n",
" &#x27;is_email_true&#x27;])])),\n",
" (&#x27;random_forest&#x27;,\n",
" RandomForestClassifier(class_weight={0.0: 0.5837086520288036,\n",
" 1.0: 3.486549107420539}))]),\n",
" param_grid={&#x27;random_forest__max_depth&#x27;: [None, 10, 20, 40, 50,\n",
" 60]},\n",
" scoring=make_scorer(f1_score, response_method=&#x27;predict&#x27;))</pre></div> </div></div><div class=\"sk-parallel\"><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-33\" type=\"checkbox\" ><label for=\"sk-estimator-id-33\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">estimator: Pipeline</label><div class=\"sk-toggleable__content \"><pre>Pipeline(steps=[(&#x27;preprocessor&#x27;,\n",
" ColumnTransformer(transformers=[(&#x27;cat&#x27;,\n",
" Pipeline(steps=[(&#x27;onehot&#x27;,\n",
" OneHotEncoder(handle_unknown=&#x27;ignore&#x27;,\n",
" sparse_output=False))]),\n",
" [&#x27;opt_in&#x27;,\n",
" &#x27;is_email_true&#x27;])])),\n",
" (&#x27;random_forest&#x27;,\n",
" RandomForestClassifier(class_weight={0.0: 0.5837086520288036,\n",
" 1.0: 3.486549107420539}))])</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-serial\"><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-34\" type=\"checkbox\" ><label for=\"sk-estimator-id-34\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">&nbsp;preprocessor: ColumnTransformer<a class=\"sk-estimator-doc-link \" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.compose.ColumnTransformer.html\">?<span>Documentation for preprocessor: ColumnTransformer</span></a></label><div class=\"sk-toggleable__content \"><pre>ColumnTransformer(transformers=[(&#x27;cat&#x27;,\n",
" Pipeline(steps=[(&#x27;onehot&#x27;,\n",
" OneHotEncoder(handle_unknown=&#x27;ignore&#x27;,\n",
" sparse_output=False))]),\n",
" [&#x27;opt_in&#x27;, &#x27;is_email_true&#x27;])])</pre></div> </div></div><div class=\"sk-parallel\"><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-35\" type=\"checkbox\" ><label for=\"sk-estimator-id-35\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">cat</label><div class=\"sk-toggleable__content \"><pre>[&#x27;opt_in&#x27;, &#x27;is_email_true&#x27;]</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-36\" type=\"checkbox\" ><label for=\"sk-estimator-id-36\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">&nbsp;OneHotEncoder<a class=\"sk-estimator-doc-link \" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.preprocessing.OneHotEncoder.html\">?<span>Documentation for OneHotEncoder</span></a></label><div class=\"sk-toggleable__content \"><pre>OneHotEncoder(handle_unknown=&#x27;ignore&#x27;, sparse_output=False)</pre></div> </div></div></div></div></div></div></div></div></div><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-37\" type=\"checkbox\" ><label for=\"sk-estimator-id-37\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">&nbsp;RandomForestClassifier<a class=\"sk-estimator-doc-link \" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.ensemble.RandomForestClassifier.html\">?<span>Documentation for RandomForestClassifier</span></a></label><div class=\"sk-toggleable__content \"><pre>RandomForestClassifier(class_weight={0.0: 0.5837086520288036,\n",
" 1.0: 3.486549107420539})</pre></div> </div></div></div></div></div></div></div></div></div></div></div>"
],
"text/plain": [
"GridSearchCV(cv=3,\n",
" estimator=Pipeline(steps=[('preprocessor',\n",
" ColumnTransformer(transformers=[('cat',\n",
" Pipeline(steps=[('onehot',\n",
" OneHotEncoder(handle_unknown='ignore',\n",
" sparse_output=False))]),\n",
" ['opt_in',\n",
" 'is_email_true'])])),\n",
" ('random_forest',\n",
" RandomForestClassifier(class_weight={0.0: 0.5837086520288036,\n",
" 1.0: 3.486549107420539}))]),\n",
" param_grid={'random_forest__max_depth': [None, 10, 20, 40, 50,\n",
" 60]},\n",
" scoring=make_scorer(f1_score, response_method='predict'))"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# pipeline on the subsample\n",
"\n",
"random_forest_grid = GridSearchCV(pipeline, param_grid, cv=3, scoring = f1_scorer #, error_score=\"raise\"\n",
" )\n",
"\n",
"random_forest_grid"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "3f149137-6313-4b4e-99d6-b3af7f296ad7",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/mamba/lib/python3.11/site-packages/sklearn/base.py:1351: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
" return fit_method(estimator, *args, **kwargs)\n",
"/opt/mamba/lib/python3.11/site-packages/sklearn/base.py:1351: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
" return fit_method(estimator, *args, **kwargs)\n",
"/opt/mamba/lib/python3.11/site-packages/sklearn/base.py:1351: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
" return fit_method(estimator, *args, **kwargs)\n",
"/opt/mamba/lib/python3.11/site-packages/sklearn/base.py:1351: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
" return fit_method(estimator, *args, **kwargs)\n",
"/opt/mamba/lib/python3.11/site-packages/sklearn/base.py:1351: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
" return fit_method(estimator, *args, **kwargs)\n",
"/opt/mamba/lib/python3.11/site-packages/sklearn/base.py:1351: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
" return fit_method(estimator, *args, **kwargs)\n",
"/opt/mamba/lib/python3.11/site-packages/sklearn/base.py:1351: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
" return fit_method(estimator, *args, **kwargs)\n",
"/opt/mamba/lib/python3.11/site-packages/sklearn/base.py:1351: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
" return fit_method(estimator, *args, **kwargs)\n",
"/opt/mamba/lib/python3.11/site-packages/sklearn/base.py:1351: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
" return fit_method(estimator, *args, **kwargs)\n",
"/opt/mamba/lib/python3.11/site-packages/sklearn/base.py:1351: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
" return fit_method(estimator, *args, **kwargs)\n",
"/opt/mamba/lib/python3.11/site-packages/sklearn/base.py:1351: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
" return fit_method(estimator, *args, **kwargs)\n",
"/opt/mamba/lib/python3.11/site-packages/sklearn/base.py:1351: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
" return fit_method(estimator, *args, **kwargs)\n",
"/opt/mamba/lib/python3.11/site-packages/sklearn/base.py:1351: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
" return fit_method(estimator, *args, **kwargs)\n",
"/opt/mamba/lib/python3.11/site-packages/sklearn/base.py:1351: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
" return fit_method(estimator, *args, **kwargs)\n",
"/opt/mamba/lib/python3.11/site-packages/sklearn/base.py:1351: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
" return fit_method(estimator, *args, **kwargs)\n",
"/opt/mamba/lib/python3.11/site-packages/sklearn/base.py:1351: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
" return fit_method(estimator, *args, **kwargs)\n",
"/opt/mamba/lib/python3.11/site-packages/sklearn/base.py:1351: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
" return fit_method(estimator, *args, **kwargs)\n",
"/opt/mamba/lib/python3.11/site-packages/sklearn/base.py:1351: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
" return fit_method(estimator, *args, **kwargs)\n",
"/opt/mamba/lib/python3.11/site-packages/sklearn/base.py:1351: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
" return fit_method(estimator, *args, **kwargs)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Returned hyperparameter: {'random_forest__max_depth': None}\n",
"Best classification F1 score in train is: 0.33107422141513826\n",
"Classification F1 score on test is: 0.31752789604029275\n"
]
}
],
"source": [
"# run the pipeline on the full sample\n",
"\n",
"random_forest_grid.fit(X_train, y_train)\n",
"\n",
"# print results\n",
"print('Returned hyperparameter: {}'.format(random_forest_grid.best_params_))\n",
"print('Best classification F1 score in train is: {}'.format(random_forest_grid.best_score_))\n",
"print('Classification F1 score on test is: {}'.format(random_forest_grid.score(X_test, y_test)))"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "cd79f942-abd0-48c9-aa0d-0d22673abeec",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'scoring': make_scorer(f1_score, response_method='predict'),\n",
" 'estimator': Pipeline(steps=[('preprocessor',\n",
" ColumnTransformer(transformers=[('cat',\n",
" Pipeline(steps=[('onehot',\n",
" OneHotEncoder(handle_unknown='ignore',\n",
" sparse_output=False))]),\n",
" ['opt_in',\n",
" 'is_email_true'])])),\n",
" ('random_forest',\n",
" RandomForestClassifier(bootstrap=False,\n",
" class_weight={0.0: 0.5837086520288036,\n",
" 1.0: 3.486549107420539}))]),\n",
" 'n_jobs': None,\n",
" 'refit': True,\n",
" 'cv': 3,\n",
" 'verbose': 0,\n",
" 'pre_dispatch': '2*n_jobs',\n",
" 'error_score': nan,\n",
" 'return_train_score': False,\n",
" 'param_grid': {'random_forest__max_depth': [None, 10, 20, 40, 50, 60]},\n",
" 'multimetric_': False,\n",
" 'best_index_': 0,\n",
" 'best_score_': 0.33107422141513826,\n",
" 'best_params_': {'random_forest__max_depth': None},\n",
" 'best_estimator_': Pipeline(steps=[('preprocessor',\n",
" ColumnTransformer(transformers=[('cat',\n",
" Pipeline(steps=[('onehot',\n",
" OneHotEncoder(handle_unknown='ignore',\n",
" sparse_output=False))]),\n",
" ['opt_in',\n",
" 'is_email_true'])])),\n",
" ('random_forest',\n",
" RandomForestClassifier(bootstrap=False,\n",
" class_weight={0.0: 0.5837086520288036,\n",
" 1.0: 3.486549107420539}))]),\n",
" 'refit_time_': 2.2247676849365234,\n",
" 'feature_names_in_': array(['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',\n",
" 'vente_internet_max', 'purchase_date_min', 'purchase_date_max',\n",
" 'nb_tickets_internet', 'is_email_true', 'opt_in', 'gender_female',\n",
" 'gender_male', 'nb_campaigns', 'nb_campaigns_opened'], dtype=object),\n",
" 'scorer_': make_scorer(f1_score, response_method='predict'),\n",
" 'cv_results_': {'mean_fit_time': array([1.64734515, 1.4220806 , 1.43256299, 1.68632547, 1.4271005 ,\n",
" 1.42404906]),\n",
" 'std_fit_time': array([0.32811727, 0.01915 , 0.02151065, 0.2729267 , 0.02447776,\n",
" 0.02384922]),\n",
" 'mean_score_time': array([0.14065607, 0.13571024, 0.13531415, 0.17512798, 0.13398822,\n",
" 0.13499872]),\n",
" 'std_score_time': array([0.00759402, 0.00653712, 0.00743453, 0.04901062, 0.00848726,\n",
" 0.00789539]),\n",
" 'param_random_forest__max_depth': masked_array(data=[None, 10, 20, 40, 50, 60],\n",
" mask=[False, False, False, False, False, False],\n",
" fill_value='?',\n",
" dtype=object),\n",
" 'params': [{'random_forest__max_depth': None},\n",
" {'random_forest__max_depth': 10},\n",
" {'random_forest__max_depth': 20},\n",
" {'random_forest__max_depth': 40},\n",
" {'random_forest__max_depth': 50},\n",
" {'random_forest__max_depth': 60}],\n",
" 'split0_test_score': array([0.19168873, 0.19168873, 0.19168873, 0.19168873, 0.19168873,\n",
" 0.19168873]),\n",
" 'split1_test_score': array([0.34428494, 0.34428494, 0.34428494, 0.34428494, 0.34428494,\n",
" 0.34428494]),\n",
" 'split2_test_score': array([0.45724899, 0.45724899, 0.45724899, 0.45724899, 0.45724899,\n",
" 0.45724899]),\n",
" 'mean_test_score': array([0.33107422, 0.33107422, 0.33107422, 0.33107422, 0.33107422,\n",
" 0.33107422]),\n",
" 'std_test_score': array([0.10881622, 0.10881622, 0.10881622, 0.10881622, 0.10881622,\n",
" 0.10881622]),\n",
" 'rank_test_score': array([1, 1, 1, 1, 1, 1], dtype=int32)},\n",
" 'n_splits_': 3}"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"random_forest_grid.__dict__"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "1806fe6d-cf98-459d-b05a-eb95972281dc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy Score: 0.48955211455211456\n",
"F1 Score: 0.31752789604029275\n",
"Recall Score: 0.8335281227173119\n"
]
}
],
"source": [
"# print results for the best model\n",
"\n",
"y_pred = random_forest_grid.predict(X_test)\n",
"\n",
"# Calculate the F1 score\n",
"acc = accuracy_score(y_test, y_pred)\n",
"print(f\"Accuracy Score: {acc}\")\n",
"\n",
"f1 = f1_score(y_test, y_pred)\n",
"print(f\"F1 Score: {f1}\")\n",
"\n",
"recall = recall_score(y_test, y_pred)\n",
"print(f\"Recall Score: {recall}\")"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "1a6a8e07-bd93-496b-986e-d219c03b82c5",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAi0AAAHFCAYAAAA+FskAAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAABhdklEQVR4nO3deVxU5f4H8M/IMgLCyCIMKJomkoRbmICWu4CyZFZqGEkZet2IBO2i5VIp7pZiaqZiLtFimqlxUVGKiygiJCipJYpeGXEZByEcEM/vD3+eGgEFPeMw+nnf13m9nHO+5znPmdvU1+/zPOfIBEEQQERERNTANTJ0B4iIiIjqgkkLERERGQUmLURERGQUmLQQERGRUWDSQkREREaBSQsREREZBSYtREREZBSYtBAREZFRYNJCRERERoFJCz3Wjh49irfeegutW7dG48aN0aRJEzz33HOYP38+rl69qtdrZ2dno1evXlAoFJDJZPj0008lv4ZMJsPMmTMlb/d+EhISIJPJIJPJsH///mrHBUFA27ZtIZPJ0Lt37we6xueff46EhIR6nbN///5a+0RExs/U0B0g0pfVq1dj3LhxcHd3x+TJk+Hh4YHKykocPnwYK1euxIEDB7B161a9Xf/tt99GWVkZEhMTYWtri6eeekryaxw4cAAtWrSQvN26sra2xpo1a6olJqmpqfjzzz9hbW39wG1//vnncHBwQHh4eJ3Pee6553DgwAF4eHg88HWJqOFi0kKPpQMHDmDs2LEYMGAAtm3bBrlcLh4bMGAAoqOjkZSUpNc+5OXlISIiAgMHDtTbNXx8fPTWdl0MGzYMmzZtwvLly2FjYyPuX7NmDXx9fVFSUvJI+lFZWQmZTAYbGxuDfydEpD8cHqLH0pw5cyCTyfDFF1/oJCx3mJubIyQkRPx869YtzJ8/H8888wzkcjkcHR3x5ptv4vz58zrn9e7dG56ensjMzMSLL74IS0tLtGnTBnPnzsWtW7cA/D10cvPmTaxYsUIcRgGAmTNnin/+pzvnnDlzRtyXkpKC3r17w97eHhYWFmjZsiVeeeUV/PXXX2JMTcNDeXl5eOmll2Bra4vGjRujc+fOWL9+vU7MnWGUr7/+GtOmTYOLiwtsbGzQv39/nDhxom5fMoDXX38dAPD111+L+zQaDbZs2YK33367xnNmzZoFb29v2NnZwcbGBs899xzWrFmDf7679amnnsKxY8eQmpoqfn93KlV3+r5hwwZER0ejefPmkMvl+OOPP6oND12+fBmurq7o3r07KisrxfaPHz8OKysrhIWF1fleicjwmLTQY6eqqgopKSnw8vKCq6trnc4ZO3Ys3n//fQwYMADbt2/Hxx9/jKSkJHTv3h2XL1/WiVWpVBgxYgTeeOMNbN++HQMHDkRsbCw2btwIAAgMDMSBAwcAAK+++ioOHDggfq6rM2fOIDAwEObm5li7di2SkpIwd+5cWFlZoaKiotbzTpw4ge7du+PYsWNYunQpfvjhB3h4eCA8PBzz58+vFj916lScPXsWX375Jb744gucOnUKwcHBqKqqqlM/bWxs8Oqrr2Lt2rXivq+//hqNGjXCsGHDar23MWPG4Ntvv8UPP/yAIUOGYOLEifj444/FmK1bt6JNmzbo0qWL+P3dPZQXGxuLwsJCrFy5Ej/99BMcHR2rXcvBwQGJiYnIzMzE+++/DwD466+/8Nprr6Fly5ZYuXJlne6TiBoIgegxo1KpBADC8OHD6xSfn58vABDGjRuns//gwYMCAGHq1Knivl69egkAhIMHD+rEenh4CP7+/jr7AAjjx4/X2Tdjxgyhpp/dunXrBABCQUGBIAiC8P333wsAhJycnHv2HYAwY8YM8fPw4cMFuVwuFBYW6sQNHDhQsLS0FK5duyYIgiDs27dPACAMGjRIJ+7bb78VAAgHDhy453Xv9DczM1NsKy8vTxAEQXj++eeF8PBwQRAE4dlnnxV69epVaztVVVVCZWWl8NFHHwn29vbCrVu3xGO1nXvnej179qz12L59+3T2z5s3TwAgbN26VRg5cqRgYWEhHD169J73SEQNDyst9MTbt28fAFSb8NmtWze0b98ee/fu1dmvVCrRrVs3nX0dO3bE2bNnJetT586dYW5ujtGjR2P9+vU4ffp0nc5LSUlBv379qlWYwsPD8ddff1Wr+PxziAy4fR8A6nUvvXr1wtNPP421a9ciNzcXmZmZtQ4N3elj//79oVAoYGJiAjMzM0yfPh1XrlxBcXFxna/7yiuv1Dl28uTJCAwMxOuvv47169dj2bJl6NChQ53PJ6KGgUkLPXYcHBxgaWmJgoKCOsVfuXIFAODs7FztmIuLi3j8Dnt7+2pxcrkc5eXlD9Dbmj399NPYs2cPHB0dMX78eDz99NN4+umn8dlnn93zvCtXrtR6H3eO/9Pd93Jn/k997kUmk+Gtt97Cxo0bsXLlSrRr1w4vvvhijbGHDh2Cn58fgNuru/773/8iMzMT06ZNq/d1a7rPe/UxPDwcN27cgFKp5FwWIiPFpIUeOyYmJujXrx+ysrKqTaStyZ3/cBcVFVU7duHCBTg4OEjWt8aNGwMAtFqtzv67580AwIsvvoiffvoJGo0GGRkZ8PX1RVRUFBITE2tt397evtb7ACDpvfxTeHg4Ll++jJUrV+Ktt96qNS4xMRFmZmbYsWMHhg4diu7du6Nr164PdM2aJjTXpqioCOPHj0fnzp1x5coVxMTEPNA1iciwmLTQYyk2NhaCICAiIqLGiauVlZX46aefAAB9+/YFAHEi7R2ZmZnIz89Hv379JOvXnRUwR48e1dl/py81MTExgbe3N5YvXw4AOHLkSK2x/fr1Q0pKipik3PHVV1/B0tJSb8uBmzdvjsmTJyM4OBgjR46sNU4mk8HU1BQmJibivvLycmzYsKFarFTVq6qqKrz++uuQyWT4+eefERcXh2XLluGHH3546LaJ6NHic1roseTr64sVK1Zg3Lhx8PLywtixY/Hss8+isrIS2dnZ+OKLL+Dp6Yng4GC4u7tj9OjRWLZsGRo1aoSBAwfizJkz+PDDD+Hq6or33ntPsn4NGjQIdnZ2GDVqFD766COYmpoiISEB586d04lbuXIlUlJSEBgYiJYtW+LGjRviCp3+/fvX2v6MGTOwY8cO9OnTB9OnT4ednR02bdqEnTt3Yv78+VAoFJLdy93mzp1735jAwEAsXrwYoaGhGD16NK5cuYKFCxfWuCy9Q4cOSExMxDfffIM2bdqgcePGDzQPZcaMGfj111+RnJwMpVKJ6OhopKamYtSoUejSpQtat25d7zaJyDCYtNBjKyIiAt26dcOSJUswb948qFQqmJmZoV27dggNDcWECRPE2BUrVuDpp5/GmjVrsHz5cigUCgQEBCAuLq7GOSwPysbGBklJSYiKisIbb7yBpk2b4p133sHAgQPxzjvviHGdO3dGcnIyZsyYAZVKhSZNmsDT0xPbt28X54TUxN3dHenp6Zg6dSrGjx+P8vJytG/fHuvWravXk2X1pW/fvli7di3mzZuH4OBgNG/eHBEREXB0dMSoUaN0YmfNmoWioiJERETg+vXraNWqlc5zbOpi9+7diIuLw4cffqhTMUtISECXLl0wbNgwpKWlwdzcXIrbIyI9kwnCP57oRERERNRAcU4LERERGQUmLURERGQUmLQQERGRUWDSQkREREaBSQsREREZBSYtREREZBSYtBAREZFReCwfLtdm0i5Dd4GoQSrax98G0d3Ks+P1fg2LLhPuH1QHj6KvDRkrLURERGQUHstKCxERUYMiY41ACkxaiIiI9E0mM3QPHgtMWoiIiPSNlRZJ8FskIiIio8BKCxERkb5xeEgSTFqIiIj0jcNDkuC3SEREREaBlRYiIiJ94/CQJJi0EBER6RuHhyTBb5GIiOgJEBcXB5lMhqioKHFfeHg4ZDKZzubj46NznlarxcSJE+Hg4AArKyuEhITg/PnzOjFqtRphYWFQKBRQKBQICwvDtWvXdGIKCwsRHBwMKysrODg4IDIyEhUVFfW6ByYtRERE+iaTSbM9oMzMTHzxxRf
"text/plain": [
"<Figure size 640x480 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# confusion matrix \n",
"\n",
"draw_confusion_matrix(y_test, y_pred)"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "1e1b3e42-1075-4a4a-bf44-3dadde3dbed1",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABIQAAAK8CAYAAACeK2TMAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3QUhdfG8W96LwQIJSQ0qVJCEQWUKihFQQERFdGf2LC8YsfeC6LYsXdFRAEVEMGCSFEpAQSkl4QSIIT0np33j4HgsAETSDK72edzDsfsZDa5QZ5suMzc62UYhoGIiIiIiIiIiHgMb7sLEBERERERERGRqqWGkIiIiIiIiIiIh1FDSERERERERETEw6ghJCIiIiIiIiLiYdQQEhERERERERHxMGoIiYiIiIiIiIh4GDWEREREREREREQ8jBpCIiIiIiIiIiIeRg0hEREREREREREPo4aQiIiIiJt7/PHHqVmzJrt27bK7FBEREXETagiJiIhUorVr13LttdfSuHFjAgMDCQ0NpWPHjkycOJHU1FRbavroo4/w8vJixYoVlfp5du7ciZeXV8kvb29vatSoQd++fZk/f/4Jnzdv3jwGDRpE7dq1CQgIIDY2ljFjxrBhw4YTPuf333/nsssuIyYmBn9/fyIiIujWrRtTpkwhOzu7Mr48Wxz9f7dz507L8UcffZRLLrmEESNGUFBQUOpzH3vsMby8vCqsloULF+Ll5cXChQsr7GOWplGjRlxzzTXles7SpUt57LHHSEtLc3pfr1696NWrV4XUJiIi4s7UEBIREakk7777Lp06dWL58uXcc889zJs3j5kzZzJixAjeeustrrvuOrtLrBK33XYby5Yt4/fff2fSpEls2bKFgQMHsmjRIqdz7733XgYMGIDD4eDNN99kwYIFPProoyxfvpyOHTsyY8YMp+c8+uij9OjRgz179vDkk0+yYMECvvzyS/r27ctjjz3GQw89VBVfpu3eeustateuzfjx4+0upULNnDmThx9+uFzPWbp0KY8//nipDaE333yTN998s4KqExERcV++dhcgIiJSHS1btoybb76Zfv36MWvWLAICAkre169fP+666y7mzZtXpTUVFhZW6BUiZRUXF8c555wDQPfu3WnWrBk9e/bk/fffp0ePHiXnTZ06lRdeeIGbb77Z8hf2Hj16MGrUKHr27Mno0aOJj4+nSZMmAEyfPp0nnniC6667jnfffdfy9Q0YMIB7772XZcuWVdFXai9fX1/mzJljdxkVrkOHDhX68Vq3bl2hH09ERMRd6QohERGRSvDMM8/g5eXFO++8Y2kGHeXv78/FF19c8tjhcDBx4kRatmxJQEAA0dHRXH311ezevdvyvBPdPnP8bTBHb+f59NNPueuuu4iJiSEgIICtW7eWnHP48GGuvfZaoqKiCAkJ4aKLLmL79u1OH/unn36ib9++hIeHExwcTPfu3fn5559P4XfF1LlzZwD2799vOf70009To0YNJk2a5PSckJAQXnvtNXJycpg8eXLJ8SeeeIIaNWrw6quvltrsCgsLo3///qdc6/F69epFmzZtWLZsGd26dSMoKIhGjRrx4YcfAjBnzhw6duxIcHAwbdu2LbXpt3jxYvr27UtYWBjBwcF069at1EbOH3/8Qffu3QkMDKR+/fpMmDCBwsLCUuuaNm0aXbt2JSQkhNDQUPr378/KlSvL9DUd/9wLLriAhISEcvyuWH333Xd07dqV4OBgwsLC6NevX6lNuW+//ZZ27doREBBAkyZNeOWVV0q9re34P/MOh4OnnnqKFi1aEBQURGRkJO3ateOVV14BzFvj7rnnHgAaN25ccsvi0VvbSrtlbO/evVx22WWEhYURERHByJEj+eOPP/Dy8uKjjz4qOe9Et5tdc801NGrUyHKsoKCAp556qiTTtWvX5tprr+XgwYNl+40UERGpZGoIiYiIVLDi4mJ++eUXOnXqRGxsbJmec/PNN3PffffRr18/vvvuO5588knmzZtHt27dSElJOeVaJkyYQGJiIm+99Rbff/890dHRJe+77rrr8Pb25osvvuDll1/mr7/+olevXpbbbD777DP69+9PeHg4H3/8MV999RVRUVFccMEFp9wU2rFjBwDNmzcvObZv3z7Wr19P//79CQ4OLvV5Xbt2JTo6mgULFpQ8Z926dSd9TlkcbZ499thjZTo/OTmZa6+9lrFjx/Ltt9/Stm1b/ve///HEE08wYcIE7r33Xr755htCQ0MZOnQoe/fuLXnub7/9Rp8+fUhPT+f9999n6tSphIWFcdFFFzFt2rSS8zZs2EDfvn1JS0vjo48+4q233iIhIYGnnnrKqZ5nnnmGUaNG0bp1a7766is++eQTMjIyOO+881i3bt1Jv5bjn/vpp5+SmZnJeeedd9KZTSfyxRdfMGTIEMLDw5k6dSrvv/8+hw8fplevXixevLjkvHnz5nHppZdSs2ZNpk2bxsSJE5k6dSoff/zxf36OiRMn8thjjzFq1CjmzJnDtGnTuO6660r+3I4dO5bbbrsNgBkzZrBs2TKWLVtGx44dS/14ubm5nH/++cyfP59nn32W6dOnU7duXUaOHFnur/8oh8PBkCFDeO6557jiiiuYM2cOzz33HAsWLKBXr17k5uae8scWERGpMIaIiIhUqOTkZAMwLr/88jKd/88//xiAMW7cOMvxP//80wCMBx54oORYw4YNjTFjxjh9jJ49exo9e/Ysefzrr78agNGjRw+ncz/88EMDMC655BLL8SVLlhiA8dRTTxmGYRjZ2dlGVFSUcdFFF1nOKy4uNtq3b2906dLlpF/Xjh07DMB4/vnnjcLCQiMvL89YvXq10bVrV6NevXrGjh07Ss79448/DMC4//77T/oxzz77bCMoKKhcz/kvCxcuNHx8fIzHH3/8P8/t2bOnARgrVqwoOXbo0CHDx8fHCAoKMvbs2VNyfPXq1QZgvPrqqyXHzjnnHCM6OtrIzMwsOVZUVGS0adPGaNCggeFwOAzDMIyRI0caQUFBRnJysuW8li1bGkDJ711iYqLh6+tr3HLLLZY6MzIyjOjoaGP48OElxx599FHj3z/6HX3ubbfdZnluZmamUbduXeOyyy476e/F0T9jv/76q2EY5p+L+vXrG23btjWKi4stHy86Otro1q1bybGzzjrLiI2NNfLz8y3n1axZ0zj+x9Pj/8wPHjzYiI+PP2ltL7zwguX36d+Oz8qUKVMMwPj2228t511//fUGYHz44YcnfO5RY8aMMRo2bFjyeOrUqQZgfPPNN5bzli9fbgDGm2++edL6RUREqoKuEBIREbHZr7/+CuB0K1iXLl1o1arVad2eNWzYsBO+78orr7Q87tatGw0bNiypZ+nSpaSmpjJmzBiKiopKfjkcDi688EKWL19epg1e9913H35+fgQGBhIfH8+6dev4/vvvnW6xKQvDMCp8DlLPnj0pKirikUceKdP59erVo1OnTiWPo6KiiI6OJj4+nvr165ccb9WqFUDJKvjs7Gz+/PNPhg8fTmhoaMl5Pj4+jB49mt27d7Np0ybA/DPRt29f6tSpYznv+KtWfvzxR4qKivjf//5nOR4WFkbv3r357bffTvh1HH3u1Vdfbfn/GxgYSM+ePcu9PWzTpk3s3buX0aNH4+197EfM0NBQhg0bxh9//EFOTg7Z2dmsWLGCoUOH4u/vbznvoosu+s/P06VLF9asWcO4ceP48ccfycjIKFedx/v1118JCwuz3MIJcMUVV5zyx5w9ezaRkZFcdNFFlt/b+Ph46tatW+mb2URERMpCQ6VFREQqWK1atQgODi65Neq/HDp0CDAbDcerX79+SUPhVJT2MY+qW7duqceO1nN0xs/w4cNP+DFSU1MJCQk5aQ3/93//x1VXXUV+fj5//PEHDz30EEOGDGHNmjXUrFkTMAdPA//5e7Zr166S2/DK+pyKFhUV5XTM39/f6fjRZkdeXh5gzmwyDOOE/5/h2J+FQ4cOnfD/z78d/X/UrVs3p3OPNu9O5OhzzzrrrFLf/++mTln8159jh8NR8ntgGIal2XVUaceON2H
"text/plain": [
"<Figure size 1400x800 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# ROC curve\n",
"\n",
"# Calcul des taux de faux positifs (FPR) et de vrais positifs (TPR)\n",
"y_pred_prob = random_forest_grid.predict_proba(X_test)[:, 1]\n",
"\n",
"fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)\n",
"\n",
"# Calcul de l'aire sous la courbe ROC (AUC)\n",
"roc_auc = auc(fpr, tpr)\n",
"\n",
"plt.figure(figsize = (14, 8))\n",
"plt.plot(fpr, tpr, label=\"ROC curve(area = %0.3f)\" % roc_auc)\n",
"plt.plot([0, 1], [0, 1], color=\"red\",label=\"Random Baseline\", linestyle=\"--\")\n",
"plt.grid(color='gray', linestyle='--', linewidth=0.5)\n",
"plt.xlabel('Taux de faux positifs (FPR)')\n",
"plt.ylabel('Taux de vrais positifs (TPR)')\n",
"plt.title('Courbe ROC : random forest')\n",
"plt.legend(loc=\"lower right\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "854f6242-813f-400a-be43-7414a859b355",
"metadata": {},
"source": [
"## Naive Bayes "
]
},
{
"cell_type": "code",
"execution_count": 219,
"id": "b083d10d-8510-4a07-974b-e0c324175d7f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>#sk-container-id-5 {\n",
" /* Definition of color scheme common for light and dark mode */\n",
" --sklearn-color-text: black;\n",
" --sklearn-color-line: gray;\n",
" /* Definition of color scheme for unfitted estimators */\n",
" --sklearn-color-unfitted-level-0: #fff5e6;\n",
" --sklearn-color-unfitted-level-1: #f6e4d2;\n",
" --sklearn-color-unfitted-level-2: #ffe0b3;\n",
" --sklearn-color-unfitted-level-3: chocolate;\n",
" /* Definition of color scheme for fitted estimators */\n",
" --sklearn-color-fitted-level-0: #f0f8ff;\n",
" --sklearn-color-fitted-level-1: #d4ebff;\n",
" --sklearn-color-fitted-level-2: #b3dbfd;\n",
" --sklearn-color-fitted-level-3: cornflowerblue;\n",
"\n",
" /* Specific color for light theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-icon: #696969;\n",
"\n",
" @media (prefers-color-scheme: dark) {\n",
" /* Redefinition of color scheme for dark theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-icon: #878787;\n",
" }\n",
"}\n",
"\n",
"#sk-container-id-5 {\n",
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"#sk-container-id-5 pre {\n",
" padding: 0;\n",
"}\n",
"\n",
"#sk-container-id-5 input.sk-hidden--visually {\n",
" border: 0;\n",
" clip: rect(1px 1px 1px 1px);\n",
" clip: rect(1px, 1px, 1px, 1px);\n",
" height: 1px;\n",
" margin: -1px;\n",
" overflow: hidden;\n",
" padding: 0;\n",
" position: absolute;\n",
" width: 1px;\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-dashed-wrapped {\n",
" border: 1px dashed var(--sklearn-color-line);\n",
" margin: 0 0.4em 0.5em 0.4em;\n",
" box-sizing: border-box;\n",
" padding-bottom: 0.4em;\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-container {\n",
" /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
" but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
" so we also need the `!important` here to be able to override the\n",
" default hidden behavior on the sphinx rendered scikit-learn.org.\n",
" See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
" display: inline-block !important;\n",
" position: relative;\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-text-repr-fallback {\n",
" display: none;\n",
"}\n",
"\n",
"div.sk-parallel-item,\n",
"div.sk-serial,\n",
"div.sk-item {\n",
" /* draw centered vertical line to link estimators */\n",
" background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
" background-size: 2px 100%;\n",
" background-repeat: no-repeat;\n",
" background-position: center center;\n",
"}\n",
"\n",
"/* Parallel-specific style estimator block */\n",
"\n",
"#sk-container-id-5 div.sk-parallel-item::after {\n",
" content: \"\";\n",
" width: 100%;\n",
" border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
" flex-grow: 1;\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-parallel {\n",
" display: flex;\n",
" align-items: stretch;\n",
" justify-content: center;\n",
" background-color: var(--sklearn-color-background);\n",
" position: relative;\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-parallel-item {\n",
" display: flex;\n",
" flex-direction: column;\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-parallel-item:first-child::after {\n",
" align-self: flex-end;\n",
" width: 50%;\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-parallel-item:last-child::after {\n",
" align-self: flex-start;\n",
" width: 50%;\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-parallel-item:only-child::after {\n",
" width: 0;\n",
"}\n",
"\n",
"/* Serial-specific style estimator block */\n",
"\n",
"#sk-container-id-5 div.sk-serial {\n",
" display: flex;\n",
" flex-direction: column;\n",
" align-items: center;\n",
" background-color: var(--sklearn-color-background);\n",
" padding-right: 1em;\n",
" padding-left: 1em;\n",
"}\n",
"\n",
"\n",
"/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
"clickable and can be expanded/collapsed.\n",
"- Pipeline and ColumnTransformer use this feature and define the default style\n",
"- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
"*/\n",
"\n",
"/* Pipeline and ColumnTransformer style (default) */\n",
"\n",
"#sk-container-id-5 div.sk-toggleable {\n",
" /* Default theme specific background. It is overwritten whether we have a\n",
" specific estimator or a Pipeline/ColumnTransformer */\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"/* Toggleable label */\n",
"#sk-container-id-5 label.sk-toggleable__label {\n",
" cursor: pointer;\n",
" display: block;\n",
" width: 100%;\n",
" margin-bottom: 0;\n",
" padding: 0.5em;\n",
" box-sizing: border-box;\n",
" text-align: center;\n",
"}\n",
"\n",
"#sk-container-id-5 label.sk-toggleable__label-arrow:before {\n",
" /* Arrow on the left of the label */\n",
" content: \"▸\";\n",
" float: left;\n",
" margin-right: 0.25em;\n",
" color: var(--sklearn-color-icon);\n",
"}\n",
"\n",
"#sk-container-id-5 label.sk-toggleable__label-arrow:hover:before {\n",
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"/* Toggleable content - dropdown */\n",
"\n",
"#sk-container-id-5 div.sk-toggleable__content {\n",
" max-height: 0;\n",
" max-width: 0;\n",
" overflow: hidden;\n",
" text-align: left;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-toggleable__content.fitted {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-toggleable__content pre {\n",
" margin: 0.2em;\n",
" border-radius: 0.25em;\n",
" color: var(--sklearn-color-text);\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-toggleable__content.fitted pre {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-5 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
" /* Expand drop-down */\n",
" max-height: 200px;\n",
" max-width: 100%;\n",
" overflow: auto;\n",
"}\n",
"\n",
"#sk-container-id-5 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
" content: \"▾\";\n",
"}\n",
"\n",
"/* Pipeline/ColumnTransformer-specific style */\n",
"\n",
"#sk-container-id-5 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator-specific style */\n",
"\n",
"/* Colorize estimator box */\n",
"#sk-container-id-5 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-label label.sk-toggleable__label,\n",
"#sk-container-id-5 div.sk-label label {\n",
" /* The background is the default theme color */\n",
" color: var(--sklearn-color-text-on-default-background);\n",
"}\n",
"\n",
"/* On hover, darken the color of the background */\n",
"#sk-container-id-5 div.sk-label:hover label.sk-toggleable__label {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"/* Label box, darken color on hover, fitted */\n",
"#sk-container-id-5 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator label */\n",
"\n",
"#sk-container-id-5 div.sk-label label {\n",
" font-family: monospace;\n",
" font-weight: bold;\n",
" display: inline-block;\n",
" line-height: 1.2em;\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-label-container {\n",
" text-align: center;\n",
"}\n",
"\n",
"/* Estimator-specific */\n",
"#sk-container-id-5 div.sk-estimator {\n",
" font-family: monospace;\n",
" border: 1px dotted var(--sklearn-color-border-box);\n",
" border-radius: 0.25em;\n",
" box-sizing: border-box;\n",
" margin-bottom: 0.5em;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-estimator.fitted {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"/* on hover */\n",
"#sk-container-id-5 div.sk-estimator:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-5 div.sk-estimator.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
"\n",
"/* Common style for \"i\" and \"?\" */\n",
"\n",
".sk-estimator-doc-link,\n",
"a:link.sk-estimator-doc-link,\n",
"a:visited.sk-estimator-doc-link {\n",
" float: right;\n",
" font-size: smaller;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1em;\n",
" height: 1em;\n",
" width: 1em;\n",
" text-decoration: none !important;\n",
" margin-left: 1ex;\n",
" /* unfitted */\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted,\n",
"a:link.sk-estimator-doc-link.fitted,\n",
"a:visited.sk-estimator-doc-link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"/* Span, style for the box shown on hovering the info icon */\n",
".sk-estimator-doc-link span {\n",
" display: none;\n",
" z-index: 9999;\n",
" position: relative;\n",
" font-weight: normal;\n",
" right: .2ex;\n",
" padding: .5ex;\n",
" margin: .5ex;\n",
" width: min-content;\n",
" min-width: 20ex;\n",
" max-width: 50ex;\n",
" color: var(--sklearn-color-text);\n",
" box-shadow: 2pt 2pt 4pt #999;\n",
" /* unfitted */\n",
" background: var(--sklearn-color-unfitted-level-0);\n",
" border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted span {\n",
" /* fitted */\n",
" background: var(--sklearn-color-fitted-level-0);\n",
" border: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link:hover span {\n",
" display: block;\n",
"}\n",
"\n",
"/* \"?\"-specific style due to the `<a>` HTML tag */\n",
"\n",
"#sk-container-id-5 a.estimator_doc_link {\n",
" float: right;\n",
" font-size: 1rem;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1rem;\n",
" height: 1rem;\n",
" width: 1rem;\n",
" text-decoration: none;\n",
" /* unfitted */\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
"}\n",
"\n",
"#sk-container-id-5 a.estimator_doc_link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"#sk-container-id-5 a.estimator_doc_link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"#sk-container-id-5 a.estimator_doc_link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"</style><div id=\"sk-container-id-5\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>GaussianNB()</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-24\" type=\"checkbox\" checked><label for=\"sk-estimator-id-24\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;&nbsp;GaussianNB<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.naive_bayes.GaussianNB.html\">?<span>Documentation for GaussianNB</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></label><div class=\"sk-toggleable__content fitted\"><pre>GaussianNB()</pre></div> </div></div></div></div>"
],
"text/plain": [
"GaussianNB()"
]
},
"execution_count": 219,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clf = GaussianNB()\n",
"clf.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 234,
"id": "a5459639-be3d-4292-89d2-061f276dc9a8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy Score: 0.8780906593406593\n",
"F1 Score: 0.3673381217259815\n",
"Recall Score: 0.24842951059167276\n"
]
}
],
"source": [
"# print results for the best model\n",
"\n",
"y_pred = clf.predict(X_test)\n",
"\n",
"# Calculate the F1 score\n",
"acc = accuracy_score(y_test, y_pred)\n",
"print(f\"Accuracy Score: {acc}\")\n",
"\n",
"f1 = f1_score(y_test, y_pred)\n",
"print(f\"F1 Score: {f1}\")\n",
"\n",
"recall = recall_score(y_test, y_pred)\n",
"print(f\"Recall Score: {recall}\")"
]
},
{
"cell_type": "code",
"execution_count": 239,
"id": "22d3d4d0-36b4-4561-9bc7-3a408914f089",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"somme des probas de y prédites : 4889.8913137503505\n",
"nombre de y valant 1 : y_has_purchased 13690.0\n",
"dtype: float64\n"
]
}
],
"source": [
"# le bayes naif sous-estime les probas d'achat (les autres modèles surestiment pr avoir un bon recall) w\n",
"print(f\"somme des probas de y prédites : {y_pred_prob.sum()}\")\n",
"print(f\"nombre de y valant 1 : {y_test.sum()}\")"
]
},
{
"cell_type": "code",
"execution_count": 236,
"id": "e962eeed-4099-407b-a619-a34a539a404a",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABIQAAAK7CAYAAACDLlR0AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd3gU5frG8e+mNxIIIQFC7026IB1EUIqKFJGjHkSwHI56xA7YEBUVFI96rD97QUQBC4igiIig0qT33gKEQEJ62fn9MRAYFjCBJLObuT/XlYudd2d3nwXuneTJzPu6DMMwEBERERERERERx/CzuwARERERERERESlZagiJiIiIiIiIiDiMGkIiIiIiIiIiIg6jhpCIiIiIiIiIiMOoISQiIiIiIiIi4jBqCImIiIiIiIiIOIwaQiIiIiIiIiIiDqOGkIiIiIiIiIiIw6ghJCIiIiIiIiLiMGoIiYiI+KjVq1czbNgwatasSUhICBEREbRs2ZIXXniBpKQkW2r64IMPcLlcLFu2rFhfZ+fOnbhcrvwvPz8/ypUrR/fu3Zk7d+45Hzdnzhz69OlDhQoVCA4OpmrVqgwdOpT169ef8zG//vor119/PfHx8QQFBREVFUX79u154403SEtLK463V2Rq1KjBLbfcUuKv27VrV8u/T2BgIDVq1GD48OHs2rWrxOsRERERTwF2FyAiIiKF98477zBy5Ejq16/Pgw8+SKNGjcjJyWHZsmW8+eabLFmyhBkzZthdZrG7++67+cc//kFeXh4bN25k3Lhx9O7dm/nz59O5c2fLvg899BATJ07kqquu4vXXXycuLo7Nmzfz0ksv0bJlSz777DP69+9vecwTTzzBU089Rfv27Rk/fjy1a9cmPT2dxYsX8+STT7J582YmT55ckm+5UGbMmEFkZKQtr12rVi0+/fRTALKzs1m7di3jxo1j3rx5bNy4kbCwMFvqEhEREZMaQiIiIj5myZIl/Otf/6JHjx7MnDmT4ODg/Pt69OjB/fffz5w5c0q0ppycHFwuV4m+JkC1atW47LLLAOjQoQN169alS5cuvPvuu5aG0JQpU5g4cSL/+te/eP311/PHO3fuzJAhQ+jSpQs333wzzZs3p1atWgBMmzaNp556iuHDh/POO+9Y3l+vXr146KGHWLJkSQm90wvTokUL2147NDQ0/98GzL/rkJAQhg8fzqJFi+jZs6dttYmIiIguGRMREfE5zz77LC6Xi7ffftvSDDopKCiIa665Jn/b7Xbzwgsv0KBBA4KDg4mNjeWf//wne/futTzuXJcXde3ala5du+ZvL1iwAJfLxccff8z9999PfHw8wcHBbN26NX+fo0ePMmzYMKKjowkPD+fqq69m+/btHs/9448/0r17dyIjIwkLC6NDhw789NNPF/C3YmrdujUABw8etIw/88wzlCtXjkmTJnk8Jjw8nFdffZX09HTL2T5PPfUU5cqV45VXXjlrs6tMmTJF2tTo2rUrTZo0YenSpXTq1ImwsDBq1arFc889h9vtzt8vMzOT+++/n+bNmxMVFUV0dDTt2rXj66+/9njO0/9NDx8+TFBQEI899pjHfhs3bsTlcvHKK6/kjyUkJHDHHXdQpUoVgoKCqFmzJuPGjSM3N/eC32NUVBQAgYGB+WNbt25l2LBh1K1bl7CwMOLj47n66qtZs2ZN/j6pqamULVuWO+64w+M5d+7cib+/PxMnTix07W+88QbNmjUjIiKCMmXK0KBBA8aMGXPB709ERMSXqCEkIiLiQ/Ly8pg/fz6tWrWiatWqBXrMv/71Lx5++GF69OjBN998w/jx45kzZw7t27cnMTHxgmsZPXo0u3fv5s033+Tbb78lNjY2/77hw4fj5+fHZ599xssvv8yff/5J165dOXbsWP4+n3zyCT179iQyMpIPP/yQL774gujoaK688soLbgrt2LEDgHr16uWPHThwgHXr1tGzZ89zXqbUrl07YmNjmTdvXv5j1q5de97HFMTJ5tmTTz5ZoP0TEhK48cYbuemmm/jmm2/o1asXo0eP5pNPPsnfJysri6SkJB544AFmzpzJlClT6NixI/379+ejjz4653NXqFCBvn378uGHH1oaTADvv/8+QUFB3Hjjjfl1tGnThh9++IHHH3+c77//nuHDhzNhwgRuu+22Ar//3NxccnNzSU9P588//+Spp56iVq1atG/fPn+f/fv3U758eZ577jnmzJnD//73PwICAmjbti2bNm0CICIigltvvZVPP/2U5ORky2u8/vrrBAUFceuttxaq9s8//5yRI0fSpUsXZsyYwcyZMxk1apTXzwslIiJSZAwRERHxGQkJCQZg3HDDDQXaf8OGDQZgjBw50jL+xx9/GIAxZsyY/LHq1asbQ4cO9XiOLl26GF26dMnf/vnnnw3A6Ny5s8e+77//vgEY1113nWX8t99+MwDj6aefNgzDMNLS0ozo6Gjj6quvtuyXl5dnNGvWzGjTps1539eOHTsMwHj++eeNnJwcIzMz0/jrr7+Mdu3aGZUqVTJ27NiRv+/vv/9uAMYjjzxy3uds27atERoaWqjH/J0FCxYY/v7+xrhx4/523y5duhiA8ccff1jGGzVqZFx55ZXnfFxubq6Rk5NjDB8+3GjRooXlvjP/Tb/55hsDMObOnWt5fOXKlY0BAwbkj91xxx1GRESEsWvXLsvzTZo0yQCMdevWFei9nPlVr149Y8OGDed9bG5urpGdnW3UrVvXGDVqVP74tm3bDD8/P2Py5Mn5YxkZGUb58uWNYcOGFbr2u+66yyhbtux5axERESnNdIaQiIhIKfbzzz8DeFwK1qZNGxo2bHhRl2cNGDDgnPedPNPkpPbt21O9evX8ehYvXkxSUhJDhw7NP4skNzcXt9vNVVddxdKlSwt0psbDDz9MYGAgISEhNG/enLVr1/Ltt99So0aNQr8fwzCKfB6kLl26kJuby+OPP16g/StWrEibNm0sY02bNvVYmWvatGl06NCBiIgIAgICCAwM5N1332XDhg3nff5evXpRsWJF3n///fyxH374gf379+efYQPw3Xff0a1bNypXrmz59+nVqxcAv/zyy9++l9q1a7N06VKWLl3KkiVL+OyzzwgNDaV79+5s2bIlf7/c3FyeffZZGjVqRFBQEAEBAQQFBbFlyxbL+6lVqxZ9+/bl9ddfxzAMAD777DOOHDnCXXfdVeja27Rpw7FjxxgyZAhff/31RZ0tJyIi4ovUEBIREfEhMTExhIWF5V8a9XeOHDkCQKVKlTzuq1y5cv79F+Jsz3lSxYoVzzp28vVOzvEzcOBAAgMDLV/PP/88hmGQlJT0tzX85z//YenSpSxatIhJkyaRk5PDtddea3lf1apVA/jbv7Ndu3blX4ZX0McUtfLly3uMBQcHk5GRkb89ffp0rr/+euLj4/nkk09YsmQJS5cu5dZbbyUzM/O8zx8QEMDNN9/MjBkz8i/f++CDD6hUqRJXXnll/n4HDx7k22+/9fi3ady4MUCBmichISG0bt2a1q1bc9lllzFkyBC+//57Dhw4YGmQ3XfffTz22GP069ePb7/9lj/++IOlS5fSrFkzy/sG8997y5Yt+Zf2/e9//6Ndu3a0bNmy0LXffPPNvPfee+zatYsBAwYQGxtL27Zt859bRESktNMqYyIiIj7E39+f7t278/3337N3716qVKly3v1PNhgOHDjgse/+/fuJiYnJ3w4JCSErK8vjORITEy37nXS+s2kSEhLOOlanTh2A/Od79dVXLStRnS4uLu6cz39SlSpV8ieS7tChAxUrVuSmm27iiSee4LXXXgPMxlXjxo2ZO3cu6enpZ50TaMmSJRw8eJBBgwblP+aSSy4572Ps8sknn1CzZk2mTp1q+Tc427/d2QwbNoyJEyfy+eefM3jwYL755hvuvfde/P398/eJiYmhadOmPPPMM2d9jsqVK19Q7ZUqVSImJoZVq1ZZ3s8///lPnn32Wcu+iYmJlC1b1jJ2+eWX06RJE1577TUiIiJYsWKFZX6lwtY+bNgwhg0bRlpaGgsXLuSJJ56gb9++bN68mer
"text/plain": [
"<Figure size 1400x800 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# ROC curve\n",
"\n",
"# Calcul des taux de faux positifs (FPR) et de vrais positifs (TPR)\n",
"y_pred_prob = clf.predict_proba(X_test)[:, 1]\n",
"\n",
"fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)\n",
"\n",
"# Calcul de l'aire sous la courbe ROC (AUC)\n",
"roc_auc = auc(fpr, tpr)\n",
"\n",
"plt.figure(figsize = (14, 8))\n",
"plt.plot(fpr, tpr, label=\"ROC curve(area = %0.3f)\" % roc_auc)\n",
"plt.plot([0, 1], [0, 1], color=\"red\",label=\"Random Baseline\", linestyle=\"--\")\n",
"plt.grid(color='gray', linestyle='--', linewidth=0.5)\n",
"plt.xlabel('Taux de faux positifs (FPR)')\n",
"plt.ylabel('Taux de vrais positifs (TPR)')\n",
"plt.title('Courbe ROC : naive Bayes')\n",
"plt.legend(loc=\"lower right\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "ad1a0b57-e382-4ae3-90b6-1f790099711b",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/mamba/lib/python3.11/site-packages/numpy/core/fromnumeric.py:86: FutureWarning: The behavior of DataFrame.sum with axis=None is deprecated, in a future version this will reduce over both axes and return a scalar. To retain the old behavior, pass axis=0 (or do not pass axis)\n",
" return reduction(axis=axis, out=out, **passkwargs)\n"
]
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjcAAAHFCAYAAAAOmtghAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAABtrklEQVR4nO3deVhU1f8H8PewzLCDgCCbLO64C6lg7nvmlqWmuWeZmbuW+SvTLMvSTL+5lLtZmmtZ5q644YKCG4iKCKggArLvM+f3BzI5gjqDMwwM79fz8DzOmbu85wrcD/eee45ECCFAREREZCCM9B2AiIiISJtY3BAREZFBYXFDREREBoXFDRERERkUFjdERERkUFjcEBERkUFhcUNEREQGhcUNERERGRQWN0RERGRQWNwQVUGXL1/GqFGj4O3tDTMzM1hZWaFFixZYuHAhUlJS9JJp/fr1kEgkCAkJ0cv+jx07BolEgmPHjmlle3fu3IFEIsH333//3OW8vLwwcuRIlbbQ0FC0b98etra2kEgkWLJkCfbu3YsvvvhCK9mIDJ2JvgMQUfn65ZdfMH78eNSrVw8zZsyAr68vCgoKEBISgpUrVyI4OBi7du3Sd8wqY9euXbCxsVFpGz16NLKysrBlyxZUq1YNXl5emD9/Pn766ScWOERqYHFDVIUEBwfjgw8+QNeuXbF7927IZDLle127dsW0adOwb9++cs1UUFAAiURSrvusSJo3b16i7erVqxg7dix69uyph0RElR9vSxFVIV9//TUkEgl+/vlnlcKmmFQqRZ8+fZSvFQoFFi5ciPr160Mmk8HJyQnDhw/H3bt3VdYr7dYKAHTo0AEdOnRQvi6+9bNp0yZMmzYNbm5ukMlkuHXrlnKZR48eYdSoUbC3t4elpSV69+6N27dvl9j2oUOH0LlzZ9jY2MDCwgJt2rTB4cOH1ToO169fR48ePWBhYQFHR0eMGzcOGRkZpS77MvtRx5PHrvjWXGFhIVasWAGJRAKJRIKRI0fip59+AgBlm0QiwZ07d7SWg8iQsLghqiLkcjmOHDkCPz8/eHh4qLXOBx98gI8//hhdu3bFX3/9hS+//BL79u1DYGAgkpKSypxl1qxZiI2NxcqVK7Fnzx44OTkp3xszZgyMjIzw22+/YcmSJTh37hw6dOiA1NRU5TK//vorunXrBhsbG2zYsAF//PEH7O3t0b179xcWHg8ePED79u1x9epVLF++HJs2bUJmZiYmTJhQYtmX2U9Z9OrVC8HBwQCAN998E8HBwQgODsZnn32GN998EwCUbcHBwXBxcdF6BiKDIIioSkhISBAAxODBg9VaPiIiQgAQ48ePV2k/e/asACA+/fRTZZunp6cYMWJEiW20b99etG/fXvn66NGjAoBo165diWXXrVsnAIj+/furtJ86dUoAEPPnzxdCCJGVlSXs7e1F7969VZaTy+WiadOmomXLls/9XB9//LGQSCQiLCxMpb1r164CgDh69KhW9hMdHS0AiO++++65y5V27ACIDz/8UKXtww8/FPyVTaQeXrkholIdPXoUAErcbmrZsiUaNGjwUlcuBgwY8Mz3hg4dqvI6MDAQnp6eyjynT59GSkoKRowYgcLCQuWXQqFAjx49cP78eWRlZT1z+0ePHkXDhg3RtGlTlfYhQ4aovH7Z/RCR/rBDMVEV4ejoCAsLC0RHR6u1fHJyMgCUeuvD1dUVMTExZc7yvNspNWrUKLWtOM+DBw8AQHmbpjQpKSmwtLQs9b3k5GR4e3u/cL8vux8i0h8WN0RVhLGxMTp37ox///0Xd+/ehbu7+3OXd3BwAADEx8eXWPb+/ftwdHRUvjYzM0NeXl6JbSQlJaksV+x5T0clJCSU2la7dm0AUG5v2bJlaN26danbcHZ2fub2HRwcnrmPJ73sfohIf3hbiqgKmTVrFoQQGDt2LPLz80u8X1BQgD179gAAOnXqBKCoU+2Tzp8/j4iICHTu3FnZ5uXlhcuXL6ssd+PGDURGRmqccfPmzSqvT58+jZiYGOVTV23atIGdnR3Cw8Ph7+9f6pdUKn3m9jt27Ihr167h0qVLKu2//fabyuuX3Y+2FT/dlpOTU277JKqseOWGqAoJCAjAihUrMH78ePj5+eGDDz5Aw4YNUVBQgNDQUPz8889o1KgRevfujXr16uG9997DsmXLYGRkhJ49e+LOnTv47LPP4OHhgSlTpii3O2zYMLzzzjsYP348BgwYgJiYGCxcuBDVq1fXOGNISAjeffddvPXWW4iLi8Ps2bPh5uaG8ePHAwCsrKywbNkyjBgxAikpKXjzzTfh5OSEhw8f4tKlS3j48CFWrFjxzO1PnjwZa9euRa9evTB//nw4Oztj8+bNuH79uspyL7ufYleuXMH27dtLtL/yyivw9PRU+7g0btwYAPDtt9+iZ8+eMDY2RpMmTcq1wCKqNPTdo5mIyl9YWJgYMWKEqFmzppBKpcLS0lI0b95cfP755yIxMVG5nFwuF99++62oW7euMDU1FY6OjuKdd94RcXFxKttTKBRi4cKFwsfHR5iZmQl/f39x5MiRZz4ttW3bthKZip+WOnDggBg2bJiws7MT5ubm4rXXXhM3b94ssXxQUJDo1auXsLe3F6ampsLNzU306tWr1G0/LTw8XHTt2lWYmZkJe3t7MWbMGPHnn3+qPC31svspflrqWV/r1q0TQqj/tFReXp549913RfXq1YVEIhEARHR09As/K1FVJBFCCP2UVURERETaxz43REREZFBY3BAREZFBYXFDREREBoXFDRERERkUFjdERERkUFjcEBERkUGpcoP4KRQK3L9/H9bW1s8dAp6IiIgqDiEEMjIy4OrqCiOj51+bqXLFzf379+Hh4aHvGERERFQGcXFxL5wbr8oVN9bW1gCKDo6NjY2e0xAREZE60tPT4eHhoTyPP0+VK26Kb0XZ2NiwuCEiIqpk1OlSwg7FREREZFBY3BAREZFBYXFDREREBqXK9blRl1wuR0FBgb5jUCViamoKY2NjfccgIqryWNw8RQiBhIQEpKam6jsKVUJ2dnaoUaMGx1AiItIjFjdPKS5snJycYGFhwZMUqUUIgezsbCQmJgIAXFxc9JyIiKjqYnHzBLlcrixsHBwc9B2HKhlzc3MAQGJiIpycnHiLiohIT9ih+AnFfWwsLCz0nIQqq+LvHfbXIiLSHxY3peCtKCorfu8QEekfixsiIiIyKHotbo4fP47evXvD1dUVEokEu3fvfuE6QUFB8PPzg5mZGXx8fLBy5UrdByW1qPt/+LKOHTsGiUTyUk+0ffHFF2jWrJny9ciRI9GvXz/layEE3nvvPdjb20MikSAsLKzM+yIiovKl1+ImKysLTZs2xf/+9z+1lo+OjsZrr72Gtm3bIjQ0FJ9++ikmTpyIHTt26Dhp5ZCQkICPPvoIPj4+kMlk8PDwQO/evXH48GF9R6vwfvzxR6xfv175et++fVi/fj3+/vtvxMfHo1GjRuVWvBER0cvR69NSPXv2RM+ePdVefuXKlahZsyaWLFkCAGjQoAFCQkLw/fffY8CAATpKWTncuXMHbdq0gZ2dHRYuXIgmTZqgoKAA+/fvx4cffojr16/rbN/5+fmQSqU62355sLW1VXkdFRUFFxcXBAYG6ikREVHloVAIZBfIkVsgR36hAgoh4F5Nfw/nVKpHwYODg9GtWzeVtu7du2PNmjUoKCiAqalpiXXy8vKQl5enfJ2enq7znPowfvx4SCQSnDt3DpaWlsr2hg0bYvTo0crXsbGx+Oijj3D48GEYGRmhR48eWLZsGZydnQEU3Z5JTU1VuUIxefJkhIWF4dixYwCADh06oFGjRpBKpdi4cSMaNmyIoKAgAEB8fDx69uyJY8eOoUaNGli4cCHeeust5bbu3buHqVOn4sCBAzAyMsKrr76KH3/8EV5eXs/8bHv37sXkyZMRFxeH1q1bY8SIESWWOX36ND755BOcP38ejo6O6N+/PxYsWKByLJ7nyc89cuRIbNiwAUDRrTZPT0/lcv379wcAeHp64s6dO2ptm4joeYQQkCsEFAJQCAHF49dyhUD
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# utilisation d'une métrique plus adaptée aux modèles de marketing : courbe de lift\n",
"\n",
"# Tri des prédictions de probabilités et des vraies valeurs\n",
"sorted_indices = np.argsort(y_pred_prob)[::-1]\n",
"y_pred_prob_sorted = y_pred_prob[sorted_indices]\n",
"y_test_sorted = y_test.iloc[sorted_indices]\n",
"\n",
"# Calcul du gain cumulatif\n",
"cumulative_gain = np.cumsum(y_test_sorted) / np.sum(y_test_sorted)\n",
"\n",
"# Tracé de la courbe de lift\n",
"plt.plot(np.linspace(0, 1, len(cumulative_gain)), cumulative_gain, label='Courbe de lift')\n",
"plt.xlabel('Part de clients identifiés sans modèle ')\n",
"plt.ylabel('Part de clients identifiés avec modèle')\n",
"plt.title('Courbe de Lift')\n",
"plt.legend()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "7cbb1fec-97b9-4780-9488-5b8eff5aee0d",
"metadata": {},
"source": [
"## From model to segmentation"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "d97ca3df-3778-469c-a077-495b3ee25051",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(array([9.0362e+04, 2.7200e+02, 1.6700e+02, 1.0000e+02, 8.6000e+01,\n",
" 5.7000e+01, 6.6000e+01, 6.3000e+01, 4.5000e+01, 5.1000e+01,\n",
" 5.4000e+01, 3.6000e+01, 5.3000e+01, 5.3000e+01, 5.3000e+01,\n",
" 5.1000e+01, 7.7000e+01, 1.1800e+02, 1.2700e+02, 4.2050e+03]),\n",
" array([8.76852176e-09, 5.00000083e-02, 1.00000008e-01, 1.50000007e-01,\n",
" 2.00000007e-01, 2.50000007e-01, 3.00000006e-01, 3.50000006e-01,\n",
" 4.00000005e-01, 4.50000005e-01, 5.00000004e-01, 5.50000004e-01,\n",
" 6.00000004e-01, 6.50000003e-01, 7.00000003e-01, 7.50000002e-01,\n",
" 8.00000002e-01, 8.50000001e-01, 9.00000001e-01, 9.50000000e-01,\n",
" 1.00000000e+00]),\n",
" <BarContainer object of 20 artists>)"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjoAAAGdCAYAAAAbudkLAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAj20lEQVR4nO3da3CU5d3H8V/IiZBJViAkYSVV6EQkBpUGDQlamAIJlZBxeoA2dCstAk6UECVyGKyC0yZyEK1GESgVy8EwFWmdAjFpa1MiRyNpDaB2BCWUhKAsmwDpJob7eeFwP11CkY0mcS+/n5l9kXv/u3vtNeh+ubO7BFmWZQkAAMBAPbp7AQAAAJ2F0AEAAMYidAAAgLEIHQAAYCxCBwAAGIvQAQAAxiJ0AACAsQgdAABgrJDuXkB3unDhgk6cOKGoqCgFBQV193IAAMBVsCxLTU1Ncjqd6tHjyudsvtahc+LECSUkJHT3MgAAQAfU1tZqwIABV5z5WodOVFSUpM82Kjo6uptXAwAArkZjY6MSEhLs1/Er+VqHzsVfV0VHRxM6AAAEmKt52wlvRgYAAMYidAAAgLEIHQAAYCxCBwAAGIvQAQAAxiJ0AACAsQgdAABgLEIHAAAYi9ABAADGInQAAICxCB0AAGAsQgcAABiL0AEAAMYidAAAgLFCunsBJrt+/rZOud8Pn5jQKfcLAIBpOKMDAACMRegAAABjEToAAMBYhA4AADAWoQMAAIxF6AAAAGMROgAAwFiEDgAAMBahAwAAjEXoAAAAYxE6AADAWIQOAAAwFqEDAACMRegAAABjEToAAMBYhA4AADAWoQMAAIxF6AAAAGMROgAAwFiEDgAAMBahAwAAjEXoAAAAYxE6AADAWIQOAAAwFqEDAACMRegAAABjEToAAMBYhA4AADAWoQMAAIxF6AAAAGMROgAAwFiEDgAAMBahAwAAjEXoAAAAYxE6AADAWH6FzqeffqpHHnlEAwcOVEREhAYNGqTHH39cFy5csGcsy9KiRYvkdDoVERGh0aNH6+DBgz734/V6NWvWLMXExCgyMlLZ2dk6fvy4z4zb7ZbL5ZLD4ZDD4ZDL5dKZM2d8Zo4dO6aJEycqMjJSMTExysvLU0tLi59bAAAATOVX6CxZskQvvPCCiouLdfjwYS1dulTLli3Ts88+a88sXbpUK1asUHFxsfbv36/4+HiNGzdOTU1N9kx+fr62bt2qkpISVVZW6uzZs8rKylJbW5s9k5OTo+rqapWWlqq0tFTV1dVyuVz29W1tbZowYYLOnTunyspKlZSUaMuWLZozZ84X2Q8AAGCQIMuyrKsdzsrKUlxcnNauXWsf+/73v69evXpp/fr1sixLTqdT+fn5mjdvnqTPzt7ExcVpyZIlmjlzpjwej/r166f169dr8uTJkqQTJ04oISFB27dvV2Zmpg4fPqykpCTt2bNHqampkqQ9e/YoLS1N7777rgYPHqwdO3YoKytLtbW1cjqdkqSSkhJNnTpVDQ0Nio6O/tzn09jYKIfDIY/Hc1Xz/rp+/rYv/T4l6cMnJnTK/QIAEAj8ef3264zOHXfcob/85S96//33JUn/+Mc/VFlZqbvuukuSdPToUdXX1ysjI8O+TXh4uEaNGqVdu3ZJkqqqqtTa2uoz43Q6lZycbM/s3r1bDofDjhxJGjFihBwOh89McnKyHTmSlJmZKa/Xq6qqKn+eFgAAMFSIP8Pz5s2Tx+PRjTfeqODgYLW1telXv/qVfvzjH0uS6uvrJUlxcXE+t4uLi9NHH31kz4SFhal3797tZi7evr6+XrGxse0ePzY21mfm0sfp3bu3wsLC7JlLeb1eeb1e++fGxsarfu4AACDw+HVGZ/PmzdqwYYM2bdqkt99+Wy+99JKWL1+ul156yWcuKCjI52fLstodu9SlM5eb78jMfysqKrLf3OxwOJSQkHDFNQEAgMDmV+g8/PDDmj9/vn70ox9p6NChcrlcevDBB1VUVCRJio+Pl6R2Z1QaGhrssy/x8fFqaWmR2+2+4szJkyfbPf6pU6d8Zi59HLfbrdbW1nZnei5asGCBPB6PfamtrfXn6QMAgADjV+icP39ePXr43iQ4ONj+ePnAgQMVHx+v8vJy+/qWlhZVVFQoPT1dkpSSkqLQ0FCfmbq6OtXU1NgzaWlp8ng82rdvnz2zd+9eeTwen5mamhrV1dXZM2VlZQoPD1dKSspl1x8eHq7o6GifCwAAMJdf79GZOHGifvWrX+kb3/iGbrrpJh04cEArVqzQz3/+c0mf/SopPz9fhYWFSkxMVGJiogoLC9WrVy/l5ORIkhwOh6ZNm6Y5c+aob9++6tOnjwoKCjR06FCNHTtWkjRkyBCNHz9e06dP16pVqyRJM2bMUFZWlgYPHixJysjIUFJSklwul5YtW6bTp0+roKBA06dPJ2AAAIAkP0Pn2Wef1S9+8Qvl5uaqoaFBTqdTM2fO1KOPPmrPzJ07V83NzcrNzZXb7VZqaqrKysoUFRVlzzz11FMKCQnRpEmT1NzcrDFjxmjdunUKDg62ZzZu3Ki8vDz701nZ2dkqLi62rw8ODta2bduUm5urkSNHKiIiQjk5OVq+fHmHNwMAAJjFr+/RMQ3fowMAQODptO/RAQAACCSEDgAAMBahAwAAjEXoAAAAYxE6AADAWIQOAAAwFqEDAACMRegAAABjEToAAMBYhA4AADAWoQMAAIxF6AAAAGMROgAAwFiEDgAAMBahAwAAjEXoAAAAYxE6AADAWIQOAAAwFqEDAACMRegAAABjEToAAMBYhA4AADAWoQMAAIxF6AAAAGMROgAAwFiEDgAAMBahAwAAjEXoAAAAYxE6AADAWIQOAAAwFqEDAACMRegAAABjEToAAMBYhA4AADAWoQMAAIxF6AAAAGMROgAAwFiEDgAAMBahAwAAjEXoAAAAYxE6AADAWIQOAAAwFqEDAACMRegAAABjEToAAMBYhA4AADAWoQMAAIxF6AAAAGMROgAAwFiEDgAAMBahAwAAjEXoAAAAYxE6AADAWIQOAAAwFqEDAACMRegAAABjEToAAMBYhA4AADAWoQMAAIxF6AAAAGMROgAAwFiEDgAAMBahAwAAjEXoAAAAYxE6AADAWIQOAAAwFqEDAACM5Xfo/Pvf/9ZPfvIT9e3bV7169dKtt96qqqoq+3rLsrRo0SI5nU5FRERo9OjROnjwoM99eL1ezZo1SzExMYqMjFR2draOHz/uM+N2u+VyueRwOORwOORyuXTmzBmfmWPHjmnixImKjIxUTEyM8vLy1NLS4u9TAgAAhvIrdNxut0aOHKnQ0FDt2LFDhw4d0pNPPqlrrrnGnlm6dKlWrFih4uJi7d+/X/Hx8Ro3bpyamprsmfz8fG3dulUlJSWqrKzU2bNnlZWVpba2NnsmJydH1dXVKi0tVWlpqaqrq+Vyuezr29raNGHCBJ07d06VlZUqKSnRli1bNGfOnC+wHQAAwCRBlmVZVzs8f/58vfnmm9q5c+dlr7csS06nU/n5+Zo3b56kz87exMXFacmSJZo5c6Y8Ho/69eun9evXa/LkyZKkEydOKCEhQdu3b1dmZqYOHz6spKQk7dmzR6mpqZKkPXv2KC0tTe+++64GDx6sHTt2KCsrS7W1tXI6nZKkkpISTZ06VQ0NDYqOjv7c59PY2CiHwyGPx3NV8/66fv62L/0+JenDJyZ0yv0CABAI/Hn99uuMzmuvvabhw4frhz/8oWJjYzVs2DCtWbPGvv7o0aOqr69XRkaGfSw8PFyjRo3Srl27JElVVVVqbW31mXE6nUpOTrZndu/eLYfDYUeOJI0YMUIOh8NnJjk52Y4cScrMzJTX6/X5VRoAAPj68it0jhw5opUrVyoxMVGvv/667rvvPuXl5el3v/udJKm+vl6SFBcX53O7uLg4+7r6+nqFhYWpd+/eV5yJjY1t9/ixsbE+M5c+Tu/evRUWFmbPXMrr9aqxsdHnAgAAzBXiz/CFCxc0fPhwFRYWSpKGDRumgwcPauXKlfrpT39qzwUFBfnczrKsdscudenM5eY7MvPfioqKtHjx4iuuAwA
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.hist(y_pred_prob, bins=20)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "b4ae4508-d5ac-4b22-a546-6c724278f8c3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([8.76852176e-09, 8.76852176e-09, 8.76852176e-09, ...,\n",
" 1.00000000e+00, 1.00000000e+00, 1.00000000e+00])"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.sort(y_pred_prob)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "ace9c778-0ab4-4e28-8ca0-364040d122e6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4527"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"(y_pred_prob>0.8).sum()"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "4a202a7e-e7fe-479c-8be3-7b2b93fe9d7b",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAqgAAAHFCAYAAAA+OgtFAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAABT90lEQVR4nO3deXwTdf7H8XfatGkptByFAqW0BVQKyGER5RK5ioioP0EQQQ5BAQ8Q0BVEBVGXBZFlUQFdRRYFrCCyHghULnFhlVNFdD2QSyg3tHL2+P7+gMSmSSGF0hx9PR+PPGi++c7kMzPJzJuZyYzFGGMEAAAA+IggbxcAAAAA5EVABQAAgE8hoAIAAMCnEFABAADgUwioAAAA8CkEVAAAAPgUAioAAAB8CgEVAAAAPoWACgAAAJ9ySQH122+/Vb9+/ZSYmKiwsDCVLl1a1113nSZOnKgjR444+t18882yWCyyWCwKCgpSmTJlVKtWLd19991asGCBcnNzXcadkJDgGCb/448//rj0KQ1gmzdvVqtWrRQVFSWLxaIpU6a47Xfy5EmNHTtWq1atcnlt7NixslgsOnTo0JUt1gfs2LFDFotFs2bNcrTZp7+w5s6dW+D8vlwJCQnq27fvFRn3pUpISNBtt91WpOO0WCx65JFHLtpv1apVslgsTp9fd8vt5ptv1s033+x4fqHPfXHx9Dvqi2bNmiWLxaINGzZ4u5QLuv/++3XLLbd4u4zL8tZbbyk2NlYnTpzweJinn35a1atXl9VqVdmyZa9ccT5o7969Gjt2rLZs2eLtUgJS3759lZCQ4LX3txZ2gH/+85966KGHdM011+iJJ55QnTp1lJWVpQ0bNmjGjBlat26dPvzwQ0f/GjVqaM6cOZKkEydO6LffftOiRYt09913q2XLlvr4448VFRXl9B7NmzfXpEmTXN67VKlShS23RLj//vt14sQJvffeeypXrlyBH6iTJ0/queeekySnDTikAQMGXNLGbe7cudq6dasee+yxoi8KTq677jqtW7dOderUuWC/adOmOT33hc+9p99RXJrNmzfrX//6l7766itvl3JZ+vTpowkTJmjixImOz+yF/Pvf/9aLL76o0aNHq2PHjrLZbMVQpe/Yu3evnnvuOSUkJKhhw4beLifgPPPMMxo6dKjX3r9QAXXdunUaPHiw2rdvr0WLFjl9Gdq3b68RI0ZoyZIlTsOEh4frxhtvdGobMGCA3n77bd1///168MEHlZqa6vR62bJlXYbxBSdPnvTJkLx161Y98MAD6tixo7dLueJOnTql8PDwIh9vtWrVVK1atSIfr6/z1c+0O5GRkR6tFy4WYL3B176jxhidPn36inyXvOFvf/ubmjRposaNG3u7FCdZWVmyWCyyWj3b1FqtVg0cOFDPP/+8nnzyyYt+N7du3SpJGjJkiCpVqnTZ9Ur+sU7IyclRdna2t8sIeDVr1vTq+xfqEP9f//pXWSwWvfHGG27/pxYaGqrbb7/do3H169dPt956q+bPn6+dO3cWpowCHTt2TCNGjFCNGjVks9lUqVIl3Xrrrfrxxx8luT9EKLk/5Nu3b1+VLl1a3333nVJSUlSmTBm1bdtWjz32mCIiIpSRkeHy/t27d1dMTIyysrIcbampqWratKkiIiJUunRpdejQQZs3b/ZoerZu3ao77rhD5cqVU1hYmBo2bKh//etfjtfth96ys7M1ffp0x6kQ7uzYsUMVK1aUJD333HOOvvkPIe/fv189evRQVFSUYmJidP/99+v48eNOfYwxmjZtmho2bKjw8HCVK1dOXbt21fbt2y86TfZDsps3b9Zdd92lyMhIRUVFqVevXjp48KBTX/vh5IULF6pRo0YKCwtz7FVIT0/XwIEDVa1aNYWGhioxMVHPPfecy0pr79696tatm8qUKaOoqCh1795d6enpBdaV39y5c9W0aVOVLl1apUuXVsOGDfXWW29JOrc37tNPP9XOnTudTkWxO3v2rF544QXVrl1bNptNFStWVL9+/VymMysrS3/5y19UuXJllSpVSi1atNDXX3990Xkp/fnZnThxol588UVVr15dYWFhaty4sZYvX+52Gjdt2qSuXbuqXLlyjhXQ6dOnNWrUKCUmJio0NFSxsbF6+OGHdezYMbfv++GHH6p+/foKCwtTjRo1NHXqVKfXT58+rREjRqhhw4aKiopS+fLl1bRpU/373/8ucFpef/11XX311bLZbKpTp47ee+89p9cL+v7ml/cQ/4U+92vWrJHFYtG8efNcxjF79mxZLBatX7/+gu9VlN9Re72eLs+CDr+5+yzbT6OYMWOGkpKSZLPZHHX++OOP6tGjh2JiYmSz2VS9enX17t1bZ86ccRpHZmamBg8erOjoaFWoUEF33XWX9u7d69QnNTVVKSkpqlKlisLDw5WUlKSRI0e6HLLevn277rnnHlWtWlU2m00xMTFq27aty6FaT9af+/fv14cffqj77rvPqT03N1cvvPCCrrnmGoWHh6ts2bKqX7++/vGPfzj182T6L7acpT8/n++8845GjBih2NhY2Ww2/fLLL5Kkzz//XG3btlVkZKRKlSql5s2buyxTSerZs6cyMjJcPv/5JSQk6Omnn5YkxcTEyGKxaOzYsY5pnzhxomPdU6lSJfXu3Vt79uxxGsfNN9+sevXq6YsvvlCzZs1UqlQp3X///QW+pyfLzb7evtg6QpJ27dqlXr16qVKlSrLZbEpKStLLL7/sdPpf3u/ECy+8oMTERNlsNq1cuVLXX3+9pHN5wv7dss8DTz9j7nz11Vfq3LmzKlSooLCwMNWsWdPlKNmXX36ptm3bqkyZMipVqpSaNWumTz/91KmP/fu/YsUKPfDAA6pQoYIiIyPVu3dvnThxQunp6erWrZvKli2rKlWq6PHHH3fKD4VZH/zyyy/q16+frrrqKpUqVUqxsbHq3LmzvvvuO6d+9s/pvHnzNHr0aFWtWlWRkZFq166d/ve//zn1dbeO8XT7v3nzZt12222OZVu1alV16tTJ5TN4QcZD2dnZplSpUuaGG27wdBDTqlUrU7du3QJfnzFjhpFk3nnnHUdbfHy8ufXWW01WVpbTIycn54LvlZGRYerWrWsiIiLMuHHjzNKlS80HH3xghg4dalasWGGMMWblypVGklm5cqXTsL/99puRZN5++21HW58+fUxISIhJSEgw48ePN8uXLzdLly4133zzjZFk/vnPfzqN4+jRo8Zms5nhw4c72l588UVjsVjM/fffbz755BOzcOFC07RpUxMREWG+//77C07Pjz/+aMqUKWNq1qxpZs+ebT799FPTo0cPI8lMmDDBGGPMgQMHzLp164wk07VrV7Nu3Tqzbt06t+M7ffq0WbJkiZFk+vfv7+j7yy+/GGOMGTNmjJFkrrnmGvPss8+atLQ0M3nyZGOz2Uy/fv2cxvXAAw+YkJAQM2LECLNkyRIzd+5cU7t2bRMTE2PS09MvOF3294mPjzdPPPGEWbp0qZk8ebKJiIgwjRo1MmfPnnX0jY+PN1WqVDE1atQwM2fONCtXrjRff/212bdvn4mLizPx8fHm9ddfN59//rl5/vnnjc1mM3379nUMf/LkSZOUlGSioqLMK6+8YpYuXWqGDBliqlev7rK87XXl9cwzzxhJ5q677jLz5883y5YtM5MnTzbPPPOMMcaY77//3jRv3txUrlzZMT/t8z8nJ8fccsstJiIiwjz33HMmLS3NvPnmmyY2NtbUqVPHnDx50vE+ffr0MRaLxTzxxBOO94iNjTWRkZGmT58+F5yf9s9uXFycadGihfnggw/M/PnzzfXXX29CQkLM2rVr3c77J5980qSlpZlFixaZ3Nxc06FDB2O1Ws0zzzxjli1bZiZNmuRYJqdPn3ZaJrGxsaZ69epm5syZZvHixaZnz55GknnppZcc/Y4dO2b69u1r3nnnHbNixQqzZMkS8/jjj5ugoCDzr3/9y2ka7PXXqVPHzJs3z3z00UfmlltuMZLM/PnzHf3cfX/dLbdWrVqZVq1aGWMu/rlv1KiRad68uct8vf766831119/wXlf1N9RYwq
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# number of observations\n",
"N = len(y_pred_prob)\n",
"\n",
"# sort the data in ascending order \n",
"y_pred_prob_sorted = np.sort(y_pred_prob) \n",
"\n",
"# get the cdf values of y \n",
"steps = np.arange(N) / N\n",
" \n",
"# plotting \n",
"plt.xlabel('X') \n",
"plt.ylabel('P(score<=X)') \n",
" \n",
"plt.title('CDF curve of the predicted probability of purchasec(score) for sports companies') \n",
" \n",
"plt.plot(y_pred_prob_sorted, steps) \n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "e87efb96-71e6-4571-9a48-576ff5ebcbdc",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0. , 0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 ,\n",
" 0.55, 0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1. ])"
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# on regarde de plus près les quantiles (on identifie 2 clusters, où est le cut-off ?)\n",
"\n",
"np.linspace(0,1, 21)"
]
},
{
"cell_type": "code",
"execution_count": 59,
"id": "ccd8373c-85c4-451d-b918-7bb84713c9ea",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(90634,)"
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_pred_prob_sorted[y_pred_prob < 0.1].shape"
]
},
{
"cell_type": "code",
"execution_count": 65,
"id": "75a2c582-3020-4e2e-9a41-0da75c5dbbed",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"score du quantile 0.0 : 1.0\n",
"score du quantile 0.05 : 1.1703610048497538e-08\n",
"score du quantile 0.1 : 1.1916538583855572e-08\n",
"score du quantile 0.15000000000000002 : 1.672960453020865e-08\n",
"score du quantile 0.2 : 2.261530896018714e-08\n",
"score du quantile 0.25 : 4.429426100901144e-08\n",
"score du quantile 0.30000000000000004 : 5.527720441770875e-08\n",
"score du quantile 0.35000000000000003 : 6.583003552085313e-08\n",
"score du quantile 0.4 : 1.0150014636815537e-07\n",
"score du quantile 0.45 : 1.045553983975125e-07\n",
"score du quantile 0.5 : 1.8254643649033717e-07\n",
"score du quantile 0.55 : 1.0036337913333724e-06\n",
"score du quantile 0.6000000000000001 : 3.6006418270834777e-06\n",
"score du quantile 0.65 : 8.750051427856617e-06\n",
"score du quantile 0.7000000000000001 : 1.7761176996762073e-05\n",
"score du quantile 0.75 : 3.658511676930477e-05\n",
"score du quantile 0.8 : 7.449089979671675e-05\n",
"score du quantile 0.8500000000000001 : 0.0001599334998042523\n",
"score du quantile 0.9 : 0.0006156933309033692\n",
"score du quantile 0.9500000000000001 : 0.5161846499348189\n",
"score du quantile 1.0 : 1.0\n"
]
}
],
"source": [
"for step in np.linspace(0,1, 21) :\n",
" score_reached = y_pred_prob_sorted[int(step*N)-1]\n",
" print(f\"score du quantile {step} : {score_reached}\")\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 67,
"id": "3e7d04c4-1add-4ef3-bca5-c2f68356b669",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"score du quantile 0.94 : 0.046364832132301186\n",
"score du quantile 0.941 : 0.060426331367796585\n",
"score du quantile 0.942 : 0.07560789365683944\n",
"score du quantile 0.943 : 0.0961854989484283\n",
"score du quantile 0.944 : 0.12036366182214445\n",
"score du quantile 0.945 : 0.15326229828189683\n",
"score du quantile 0.946 : 0.20141929276940546\n",
"score du quantile 0.947 : 0.26129057078459816\n",
"score du quantile 0.948 : 0.34459110917836233\n",
"score du quantile 0.949 : 0.42441766527261676\n",
"score du quantile 0.95 : 0.5161846499348189\n",
"score du quantile 0.951 : 0.6281715747542238\n",
"score du quantile 0.952 : 0.7161294443763133\n",
"score du quantile 0.953 : 0.8098274658632696\n",
"score du quantile 0.954 : 0.8628210594682936\n",
"score du quantile 0.955 : 0.9031546758694196\n",
"score du quantile 0.956 : 0.9406325197642711\n",
"score du quantile 0.957 : 0.9717094630837765\n",
"score du quantile 0.958 : 0.9853416074407844\n",
"score du quantile 0.959 : 0.99263528504162\n",
"score du quantile 0.96 : 0.9965103675841931\n"
]
}
],
"source": [
"# le saut survient entre le quantile 0.94 et 0.955\n",
"# on peut prendre le quantile 0.95 / score = 0.52 comme cut-off approximatif\n",
"for step in np.linspace(0.94,0.96, 21) :\n",
" score_reached = y_pred_prob_sorted[int(step*N)-1]\n",
" print(f\"score du quantile {step} : {score_reached}\")"
]
},
{
"cell_type": "code",
"execution_count": 90,
"id": "5d8bb4ea-0030-4d23-8cff-26c9ed54ca71",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>#sk-container-id-4 {\n",
" /* Definition of color scheme common for light and dark mode */\n",
" --sklearn-color-text: black;\n",
" --sklearn-color-line: gray;\n",
" /* Definition of color scheme for unfitted estimators */\n",
" --sklearn-color-unfitted-level-0: #fff5e6;\n",
" --sklearn-color-unfitted-level-1: #f6e4d2;\n",
" --sklearn-color-unfitted-level-2: #ffe0b3;\n",
" --sklearn-color-unfitted-level-3: chocolate;\n",
" /* Definition of color scheme for fitted estimators */\n",
" --sklearn-color-fitted-level-0: #f0f8ff;\n",
" --sklearn-color-fitted-level-1: #d4ebff;\n",
" --sklearn-color-fitted-level-2: #b3dbfd;\n",
" --sklearn-color-fitted-level-3: cornflowerblue;\n",
"\n",
" /* Specific color for light theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-icon: #696969;\n",
"\n",
" @media (prefers-color-scheme: dark) {\n",
" /* Redefinition of color scheme for dark theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-icon: #878787;\n",
" }\n",
"}\n",
"\n",
"#sk-container-id-4 {\n",
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"#sk-container-id-4 pre {\n",
" padding: 0;\n",
"}\n",
"\n",
"#sk-container-id-4 input.sk-hidden--visually {\n",
" border: 0;\n",
" clip: rect(1px 1px 1px 1px);\n",
" clip: rect(1px, 1px, 1px, 1px);\n",
" height: 1px;\n",
" margin: -1px;\n",
" overflow: hidden;\n",
" padding: 0;\n",
" position: absolute;\n",
" width: 1px;\n",
"}\n",
"\n",
"#sk-container-id-4 div.sk-dashed-wrapped {\n",
" border: 1px dashed var(--sklearn-color-line);\n",
" margin: 0 0.4em 0.5em 0.4em;\n",
" box-sizing: border-box;\n",
" padding-bottom: 0.4em;\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"#sk-container-id-4 div.sk-container {\n",
" /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
" but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
" so we also need the `!important` here to be able to override the\n",
" default hidden behavior on the sphinx rendered scikit-learn.org.\n",
" See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
" display: inline-block !important;\n",
" position: relative;\n",
"}\n",
"\n",
"#sk-container-id-4 div.sk-text-repr-fallback {\n",
" display: none;\n",
"}\n",
"\n",
"div.sk-parallel-item,\n",
"div.sk-serial,\n",
"div.sk-item {\n",
" /* draw centered vertical line to link estimators */\n",
" background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
" background-size: 2px 100%;\n",
" background-repeat: no-repeat;\n",
" background-position: center center;\n",
"}\n",
"\n",
"/* Parallel-specific style estimator block */\n",
"\n",
"#sk-container-id-4 div.sk-parallel-item::after {\n",
" content: \"\";\n",
" width: 100%;\n",
" border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
" flex-grow: 1;\n",
"}\n",
"\n",
"#sk-container-id-4 div.sk-parallel {\n",
" display: flex;\n",
" align-items: stretch;\n",
" justify-content: center;\n",
" background-color: var(--sklearn-color-background);\n",
" position: relative;\n",
"}\n",
"\n",
"#sk-container-id-4 div.sk-parallel-item {\n",
" display: flex;\n",
" flex-direction: column;\n",
"}\n",
"\n",
"#sk-container-id-4 div.sk-parallel-item:first-child::after {\n",
" align-self: flex-end;\n",
" width: 50%;\n",
"}\n",
"\n",
"#sk-container-id-4 div.sk-parallel-item:last-child::after {\n",
" align-self: flex-start;\n",
" width: 50%;\n",
"}\n",
"\n",
"#sk-container-id-4 div.sk-parallel-item:only-child::after {\n",
" width: 0;\n",
"}\n",
"\n",
"/* Serial-specific style estimator block */\n",
"\n",
"#sk-container-id-4 div.sk-serial {\n",
" display: flex;\n",
" flex-direction: column;\n",
" align-items: center;\n",
" background-color: var(--sklearn-color-background);\n",
" padding-right: 1em;\n",
" padding-left: 1em;\n",
"}\n",
"\n",
"\n",
"/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
"clickable and can be expanded/collapsed.\n",
"- Pipeline and ColumnTransformer use this feature and define the default style\n",
"- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
"*/\n",
"\n",
"/* Pipeline and ColumnTransformer style (default) */\n",
"\n",
"#sk-container-id-4 div.sk-toggleable {\n",
" /* Default theme specific background. It is overwritten whether we have a\n",
" specific estimator or a Pipeline/ColumnTransformer */\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"/* Toggleable label */\n",
"#sk-container-id-4 label.sk-toggleable__label {\n",
" cursor: pointer;\n",
" display: block;\n",
" width: 100%;\n",
" margin-bottom: 0;\n",
" padding: 0.5em;\n",
" box-sizing: border-box;\n",
" text-align: center;\n",
"}\n",
"\n",
"#sk-container-id-4 label.sk-toggleable__label-arrow:before {\n",
" /* Arrow on the left of the label */\n",
" content: \"▸\";\n",
" float: left;\n",
" margin-right: 0.25em;\n",
" color: var(--sklearn-color-icon);\n",
"}\n",
"\n",
"#sk-container-id-4 label.sk-toggleable__label-arrow:hover:before {\n",
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"/* Toggleable content - dropdown */\n",
"\n",
"#sk-container-id-4 div.sk-toggleable__content {\n",
" max-height: 0;\n",
" max-width: 0;\n",
" overflow: hidden;\n",
" text-align: left;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-4 div.sk-toggleable__content.fitted {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-4 div.sk-toggleable__content pre {\n",
" margin: 0.2em;\n",
" border-radius: 0.25em;\n",
" color: var(--sklearn-color-text);\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-4 div.sk-toggleable__content.fitted pre {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-4 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
" /* Expand drop-down */\n",
" max-height: 200px;\n",
" max-width: 100%;\n",
" overflow: auto;\n",
"}\n",
"\n",
"#sk-container-id-4 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
" content: \"▾\";\n",
"}\n",
"\n",
"/* Pipeline/ColumnTransformer-specific style */\n",
"\n",
"#sk-container-id-4 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-4 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator-specific style */\n",
"\n",
"/* Colorize estimator box */\n",
"#sk-container-id-4 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-4 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-4 div.sk-label label.sk-toggleable__label,\n",
"#sk-container-id-4 div.sk-label label {\n",
" /* The background is the default theme color */\n",
" color: var(--sklearn-color-text-on-default-background);\n",
"}\n",
"\n",
"/* On hover, darken the color of the background */\n",
"#sk-container-id-4 div.sk-label:hover label.sk-toggleable__label {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"/* Label box, darken color on hover, fitted */\n",
"#sk-container-id-4 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator label */\n",
"\n",
"#sk-container-id-4 div.sk-label label {\n",
" font-family: monospace;\n",
" font-weight: bold;\n",
" display: inline-block;\n",
" line-height: 1.2em;\n",
"}\n",
"\n",
"#sk-container-id-4 div.sk-label-container {\n",
" text-align: center;\n",
"}\n",
"\n",
"/* Estimator-specific */\n",
"#sk-container-id-4 div.sk-estimator {\n",
" font-family: monospace;\n",
" border: 1px dotted var(--sklearn-color-border-box);\n",
" border-radius: 0.25em;\n",
" box-sizing: border-box;\n",
" margin-bottom: 0.5em;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-4 div.sk-estimator.fitted {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"/* on hover */\n",
"#sk-container-id-4 div.sk-estimator:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-4 div.sk-estimator.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
"\n",
"/* Common style for \"i\" and \"?\" */\n",
"\n",
".sk-estimator-doc-link,\n",
"a:link.sk-estimator-doc-link,\n",
"a:visited.sk-estimator-doc-link {\n",
" float: right;\n",
" font-size: smaller;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1em;\n",
" height: 1em;\n",
" width: 1em;\n",
" text-decoration: none !important;\n",
" margin-left: 1ex;\n",
" /* unfitted */\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted,\n",
"a:link.sk-estimator-doc-link.fitted,\n",
"a:visited.sk-estimator-doc-link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"/* Span, style for the box shown on hovering the info icon */\n",
".sk-estimator-doc-link span {\n",
" display: none;\n",
" z-index: 9999;\n",
" position: relative;\n",
" font-weight: normal;\n",
" right: .2ex;\n",
" padding: .5ex;\n",
" margin: .5ex;\n",
" width: min-content;\n",
" min-width: 20ex;\n",
" max-width: 50ex;\n",
" color: var(--sklearn-color-text);\n",
" box-shadow: 2pt 2pt 4pt #999;\n",
" /* unfitted */\n",
" background: var(--sklearn-color-unfitted-level-0);\n",
" border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted span {\n",
" /* fitted */\n",
" background: var(--sklearn-color-fitted-level-0);\n",
" border: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link:hover span {\n",
" display: block;\n",
"}\n",
"\n",
"/* \"?\"-specific style due to the `<a>` HTML tag */\n",
"\n",
"#sk-container-id-4 a.estimator_doc_link {\n",
" float: right;\n",
" font-size: 1rem;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1rem;\n",
" height: 1rem;\n",
" width: 1rem;\n",
" text-decoration: none;\n",
" /* unfitted */\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
"}\n",
"\n",
"#sk-container-id-4 a.estimator_doc_link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"#sk-container-id-4 a.estimator_doc_link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"#sk-container-id-4 a.estimator_doc_link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"</style><div id=\"sk-container-id-4\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>KMeans(n_clusters=2, random_state=0)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-4\" type=\"checkbox\" checked><label for=\"sk-estimator-id-4\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;&nbsp;KMeans<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.cluster.KMeans.html\">?<span>Documentation for KMeans</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></label><div class=\"sk-toggleable__content fitted\"><pre>KMeans(n_clusters=2, random_state=0)</pre></div> </div></div></div></div>"
],
"text/plain": [
"KMeans(n_clusters=2, random_state=0)"
]
},
"execution_count": 90,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# simple K-means pour déterminer le seuil qui sépare les 2 clusters apparents\n",
"\n",
"from sklearn.cluster import KMeans\n",
"\n",
"kmeans = KMeans(n_clusters=2, random_state=0)\n",
"\n",
"kmeans.fit(y_pred_prob.reshape(-1,1))"
]
},
{
"cell_type": "code",
"execution_count": 91,
"id": "afbf8247-4cb1-455b-96df-7e9a87407413",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 0, 0, ..., 0, 0, 0], dtype=int32)"
]
},
"execution_count": 91,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_clusters = kmeans.predict(y_pred_prob.reshape(-1,1))\n",
"y_clusters"
]
},
{
"cell_type": "code",
"execution_count": 93,
"id": "e4747b82-1967-4043-bcd1-7659dbd87a2a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4846"
]
},
"execution_count": 93,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_clusters[y_clusters==1].size"
]
},
{
"cell_type": "code",
"execution_count": 94,
"id": "2853083a-99a4-4ae9-9e8d-ddf175cca7ee",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.9495712620712621"
]
},
"execution_count": 94,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 5% des individus sont dans le cluster 1\n",
"1 - y_clusters.mean()"
]
},
{
"cell_type": "markdown",
"id": "d18c8a4c-7d19-4d24-a304-cb26a533303e",
"metadata": {},
"source": [
"Intérêt du K-means : permet d'identifier un seuil de passage d'un cluster à l'autre quand le cluster est restreint, comme ici où on isole les clients avec la proba d'achat dans le quantile 0.95, et on les sépare des 95% restant"
]
},
{
"cell_type": "code",
"execution_count": 95,
"id": "77f59f30-1dc6-43b8-98b7-d179a966786a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"part d'individus dans le cluster 0 : 0.9495712620712621\n",
"seuil de passage du cluster 0 au cluster 1 : 0.4855790414879801\n"
]
}
],
"source": [
"# seuil de split \n",
"\n",
"size_cluster_0 = 1 - y_clusters.mean()\n",
"seuil_cluster = y_pred_prob_sorted[int(1 - y_clusters.mean()*N)]\n",
"\n",
"print(f\"part d'individus dans le cluster 0 : {size_cluster_0}\")\n",
"print(f\"seuil de passage du cluster 0 au cluster 1 : {seuil_cluster}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}