2771 lines
387 KiB
Plaintext
2771 lines
387 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "ac01a6ea-bef6-4ace-89ff-1dc03a4215c2",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Segmentation des clients par régression logistique"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 39,
|
|||
|
"id": "bca785be-39f7-4583-9bd8-67c1134ae275",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"import numpy as np\n",
|
|||
|
"import os\n",
|
|||
|
"import s3fs\n",
|
|||
|
"import re\n",
|
|||
|
"from sklearn.linear_model import LogisticRegression\n",
|
|||
|
"from sklearn.ensemble import RandomForestClassifier\n",
|
|||
|
"from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n",
|
|||
|
"from sklearn.preprocessing import StandardScaler\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score\n",
|
|||
|
"import pickle"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 3,
|
|||
|
"id": "8be4cda5-fd19-437f-bf23-9af20be537e9",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# import scipy\n",
|
|||
|
"import scikitplot as skplt"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 4,
|
|||
|
"id": "14378e7b-240f-4df7-9ce8-5e60920a7729",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"'1.11.4'"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 4,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import scipy\n",
|
|||
|
"scipy.__version__ # il faut cette version pour éviter les problèmes"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 5,
|
|||
|
"id": "3bf57816-b023-4e84-9450-095620bddebc",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Create filesystem object\n",
|
|||
|
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
|
|||
|
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 6,
|
|||
|
"id": "27002f2f-a78a-414c-8e4f-b15bf6dd9e40",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"/tmp/ipykernel_2186/1677066092.py:7: DtypeWarning: Columns (11,40) have mixed types. Specify dtype option on import or set low_memory=False.\n",
|
|||
|
" dataset_train = pd.read_csv(file_in, sep=\",\")\n",
|
|||
|
"/tmp/ipykernel_2186/1677066092.py:12: DtypeWarning: Columns (40) have mixed types. Specify dtype option on import or set low_memory=False.\n",
|
|||
|
" dataset_test = pd.read_csv(file_in, sep=\",\")\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Importation des données\n",
|
|||
|
"BUCKET = \"projet-bdc2324-team1/1_Output/Logistique Regression databases - First approach\"\n",
|
|||
|
"\n",
|
|||
|
"FILE_PATH_S3 = BUCKET + \"/\" + \"dataset_train.csv\"\n",
|
|||
|
"\n",
|
|||
|
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|||
|
" dataset_train = pd.read_csv(file_in, sep=\",\")\n",
|
|||
|
"\n",
|
|||
|
"FILE_PATH_S3 = BUCKET + \"/\" + \"dataset_test.csv\"\n",
|
|||
|
"\n",
|
|||
|
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|||
|
" dataset_test = pd.read_csv(file_in, sep=\",\")\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 7,
|
|||
|
"id": "c3928b55-8821-46da-b3b5-a036efd6d2cf",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>event_type_id</th>\n",
|
|||
|
" <th>name_event_types</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>offre muséale individuel</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>4.0</td>\n",
|
|||
|
" <td>spectacle vivant</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>5.0</td>\n",
|
|||
|
" <td>offre muséale groupe</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" event_type_id name_event_types\n",
|
|||
|
"0 2.0 offre muséale individuel\n",
|
|||
|
"1 4.0 spectacle vivant\n",
|
|||
|
"2 5.0 offre muséale groupe\n",
|
|||
|
"3 NaN NaN"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 7,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"dataset_train[['event_type_id', 'name_event_types']].drop_duplicates()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 8,
|
|||
|
"id": "7e8a9d4d-7e55-4173-a7f4-8b8baa9610d2",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"#Choose type of event \n",
|
|||
|
"type_event_choosed = 5\n",
|
|||
|
"\n",
|
|||
|
"dataset_test = dataset_test[(dataset_test['event_type_id'] == type_event_choosed) | np.isnan(dataset_test['event_type_id'])]\n",
|
|||
|
"dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)\n",
|
|||
|
"dataset_train = dataset_train[(dataset_train['event_type_id'] == type_event_choosed) | np.isnan(dataset_train['event_type_id'])]\n",
|
|||
|
"dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 9,
|
|||
|
"id": "e20ced8f-df1c-43bb-8d15-79f414c8225c",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"customer_id 0.000000\n",
|
|||
|
"event_type_id 0.967882\n",
|
|||
|
"nb_tickets 0.000000\n",
|
|||
|
"nb_purchases 0.000000\n",
|
|||
|
"total_amount 0.000000\n",
|
|||
|
"nb_suppliers 0.000000\n",
|
|||
|
"vente_internet_max 0.000000\n",
|
|||
|
"purchase_date_min 0.967882\n",
|
|||
|
"purchase_date_max 0.967882\n",
|
|||
|
"time_between_purchase 0.967882\n",
|
|||
|
"nb_tickets_internet 0.000000\n",
|
|||
|
"name_event_types 0.967882\n",
|
|||
|
"avg_amount 0.967882\n",
|
|||
|
"street_id 0.000000\n",
|
|||
|
"is_partner 0.000000\n",
|
|||
|
"gender 0.000000\n",
|
|||
|
"is_email_true 0.000000\n",
|
|||
|
"opt_in 0.000000\n",
|
|||
|
"structure_id 0.856471\n",
|
|||
|
"mcp_contact_id 0.297844\n",
|
|||
|
"last_buying_date 0.642312\n",
|
|||
|
"max_price 0.642312\n",
|
|||
|
"ticket_sum 0.000000\n",
|
|||
|
"average_price 0.107403\n",
|
|||
|
"fidelity 0.000000\n",
|
|||
|
"average_purchase_delay 0.642312\n",
|
|||
|
"average_price_basket 0.642312\n",
|
|||
|
"average_ticket_basket 0.642312\n",
|
|||
|
"total_price 0.534909\n",
|
|||
|
"purchase_count 0.000000\n",
|
|||
|
"first_buying_date 0.642312\n",
|
|||
|
"country 0.066622\n",
|
|||
|
"tenant_id 0.000000\n",
|
|||
|
"gender_label 0.000000\n",
|
|||
|
"gender_female 0.000000\n",
|
|||
|
"gender_male 0.000000\n",
|
|||
|
"gender_other 0.000000\n",
|
|||
|
"country_fr 0.066622\n",
|
|||
|
"nb_campaigns 0.000000\n",
|
|||
|
"nb_campaigns_opened 0.000000\n",
|
|||
|
"time_to_open 0.553988\n",
|
|||
|
"y_has_purchased 0.000000\n",
|
|||
|
"dtype: float64"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 9,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"dataset_train.isna().sum()/len(dataset_train)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 10,
|
|||
|
"id": "05e29adb-7eef-416f-8f7b-248229eee0fe",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"nb_tickets 0\n",
|
|||
|
"nb_purchases 0\n",
|
|||
|
"total_amount 0\n",
|
|||
|
"nb_suppliers 0\n",
|
|||
|
"vente_internet_max 0\n",
|
|||
|
"nb_tickets_internet 0\n",
|
|||
|
"opt_in 0\n",
|
|||
|
"fidelity 0\n",
|
|||
|
"nb_campaigns 0\n",
|
|||
|
"nb_campaigns_opened 0\n",
|
|||
|
"dtype: int64"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 10,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"dataset_train[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet', 'opt_in', 'fidelity', 'nb_campaigns', 'nb_campaigns_opened']].isna().sum()\n",
|
|||
|
"# pas de NaN, OK !"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 11,
|
|||
|
"id": "2ce94258-e2d1-472a-81fc-fc11e247b423",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"228.0"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 11,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"dataset_train['y_has_purchased'].sum()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 12,
|
|||
|
"id": "34bae3f7-d579-4f80-a38d-a83eb5ea8a7b",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Accuracy: 0.9986037223669636\n",
|
|||
|
"Confusion Matrix:\n",
|
|||
|
" [[128000 37]\n",
|
|||
|
" [ 142 19]]\n",
|
|||
|
"Classification Report:\n",
|
|||
|
" precision recall f1-score support\n",
|
|||
|
"\n",
|
|||
|
" 0.0 1.00 1.00 1.00 128037\n",
|
|||
|
" 1.0 0.34 0.12 0.18 161\n",
|
|||
|
"\n",
|
|||
|
" accuracy 1.00 128198\n",
|
|||
|
" macro avg 0.67 0.56 0.59 128198\n",
|
|||
|
"weighted avg 1.00 1.00 1.00 128198\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"\n",
|
|||
|
"reg_columns = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet', 'opt_in', 'fidelity', 'nb_campaigns', 'nb_campaigns_opened']\n",
|
|||
|
"\n",
|
|||
|
"X_train = dataset_train[reg_columns]\n",
|
|||
|
"y_train = dataset_train['y_has_purchased']\n",
|
|||
|
"X_test = dataset_test[reg_columns]\n",
|
|||
|
"y_test = dataset_test['y_has_purchased']\n",
|
|||
|
"\n",
|
|||
|
"# Fit and transform the scaler on the training data\n",
|
|||
|
"scaler = StandardScaler()\n",
|
|||
|
"\n",
|
|||
|
"# Transform the test data using the same scaler\n",
|
|||
|
"X_train_scaled = scaler.fit_transform(X_train)\n",
|
|||
|
"X_test_scaled = scaler.fit_transform(X_test)\n",
|
|||
|
"\n",
|
|||
|
"# Create and fit the L1-penalised logistic regression model\n",
|
|||
|
"logit_model = LogisticRegression(penalty='l1', solver='liblinear', C=1.0)\n",
|
|||
|
"logit_model.fit(X_train_scaled, y_train)\n",
|
|||
|
"\n",
|
|||
|
"y_pred = logit_model.predict(X_test_scaled)\n",
|
|||
|
"\n",
|
|||
|
"#Evaluation du modèle \n",
|
|||
|
"accuracy = accuracy_score(y_test, y_pred)\n",
|
|||
|
"conf_matrix = confusion_matrix(y_test, y_pred)\n",
|
|||
|
"class_report = classification_report(y_test, y_pred)\n",
|
|||
|
"\n",
|
|||
|
"print(\"Accuracy:\", accuracy)\n",
|
|||
|
"print(\"Confusion Matrix:\\n\", conf_matrix)\n",
|
|||
|
"print(\"Classification Report:\\n\", class_report)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 13,
|
|||
|
"id": "ccc78c36-3287-46e6-89ac-7494c1a7106a",
|
|||
|
"metadata": {
|
|||
|
"scrolled": true
|
|||
|
},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjYAAAHFCAYAAADhWLMfAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAABT1ElEQVR4nO3dfVyN9/8H8Nfp7qjU0Y3KIYQ0qY1lEpubodDN7A5ri8ZiQprcrG1otpXwxcj90GasGdpsaJk21ohEUxhDxJRCTkqq1fX7w8+1HWUV51wnx+v5fVyPh67rfa7rfU7fw3vvz+dzXTJBEAQQERER6QEDXSdAREREpCksbIiIiEhvsLAhIiIivcHChoiIiPQGCxsiIiLSGyxsiIiISG+wsCEiIiK9wcKGiIiI9AYLGyIiItIbLGxIrx07dgxvvvkmnJyc0KRJEzRt2hRPP/005s2bh+vXr2v12kePHkWfPn2gUCggk8mwePFijV9DJpMhKipK4+etS3x8PGQyGWQyGX755ZcaxwVBQIcOHSCTydC3b98Husby5csRHx/foNf88ssv982JiB4PRrpOgEhb1qxZg9DQULi4uGDatGlwdXVFZWUlDh8+jJUrV+LAgQNITEzU2vVHjx6N0tJSJCQkwMrKCm3bttX4NQ4cOIBWrVpp/Lz1ZWFhgbVr19YoXvbu3YuzZ8/CwsLigc+9fPly2NraIjg4uN6vefrpp3HgwAG4uro+8HWJ6NHGwob00oEDBzB+/HgMHDgQ3377LeRyuXhs4MCBiIiIQFJSklZzyM7ORkhICAYPHqy1a/To0UNr566P4cOHY+PGjVi2bBksLS3F/WvXroWXlxeKi4slyaOyshIymQyWlpY6/0yISLc4FEV6KTo6GjKZDKtXr1Yrau4yMTFBQECA+HN1dTXmzZuHJ554AnK5HHZ2dhg5ciQuXbqk9rq+ffvCzc0N6enpeO6552BmZoZ27dph7ty5qK6uBvDPMM3ff/+NFStWiEM2ABAVFSX++d/uvub8+fPivpSUFPTt2xc2NjYwNTVF69at8fLLL+PWrVtiTG1DUdnZ2XjhhRdgZWWFJk2aoEuXLvj888/VYu4O2Xz11Vd4//33oVQqYWlpiQEDBuDUqVP1+5ABvPbaawCAr776StynUqmwdetWjB49utbXfPjhh/D09IS1tTUsLS3x9NNPY+3atfj383jbtm2L48ePY+/eveLnd7fjdTf3DRs2ICIiAi1btoRcLseZM2dqDEVdvXoVjo6O6NmzJyorK8XznzhxAubm5ggKCqr3eyWiRwMLG9I7VVVVSElJgYeHBxwdHev1mvHjx2PGjBkYOHAgtm/fjo8++ghJSUno2bMnrl69qhabn5+P119/HW+88Qa2b9+OwYMHIzIyEl9++SUAwNfXFwcOHAAAvPLKKzhw4ID4c32dP38evr6+MDExwbp165CUlIS5c+fC3NwcFRUV933dqVOn0LNnTxw/fhxLlizBtm3b4OrqiuDgYMybN69G/HvvvYcLFy7gs88+w+rVq/Hnn3/C398fVVVV9crT0tISr7zyCtatWyfu++qrr2BgYIDhw4ff972NGzcOmzdvxrZt2/DSSy9h0qRJ+Oijj8SYxMREtGvXDl27dhU/v3uHDSMjI5Gbm4uVK1fi+++/h52dXY1r2draIiEhAenp6ZgxYwYA4NatW3j11VfRunVrrFy5sl7vk4geIQKRnsnPzxcACCNGjKhX/MmTJwUAQmhoqNr+gwcPCgCE9957T9zXp08fAYBw8OBBtVhXV1fBx8dHbR8AYcKECWr7Zs+eLdT2tVu/fr0AQMjJyREEQRC2bNkiABAyMzP/M3cAwuzZs8WfR4wYIcjlciE3N1ctbvDgwYKZmZlw48YNQRAE4eeffxYACEOGDFGL27x5swBAOHDgwH9e926+6enp4rmys7MFQRCEZ555RggODhYEQRA6d+4s9OnT577nqaqqEiorK4
U5c+YINjY2QnV1tXjsfq+9e73evXvf99jPP/+stj82NlYAICQmJgqjRo0STE1NhWPHjv3neySiRxM7NvTY+/nnnwGgxiTV7t27o1OnTtizZ4/afgcHB3Tv3l1t35NPPokLFy5oLKcuXbrAxMQEY8eOxeeff45z587V63UpKSno379/jU5VcHAwbt26VaNz9O/hOODO+wDQoPfSp08ftG/fHuvWrUNWVhbS09PvOwx1N8cBAwZAoVDA0NAQxsbGmDVrFq5du4aCgoJ6X/fll1+ud+y0adPg6+uL1157DZ9//jmWLl0Kd3f3er+eiB4dLGxI79ja2sLMzAw5OTn1ir927RoAoEWLFjWOKZVK8fhdNjY2NeLkcjnKysoeINvatW/fHj/99BPs7OwwYcIEtG/fHu3bt8enn376n6+7du3afd/H3eP/du97uTsfqSHvRSaT4c0338SXX36JlStXomPHjnjuuedqjT106BC8vb0B3Fm19ttvvyE9PR3vv/9+g69b2/v8rxyDg4Nx+/ZtODg4cG4NkR5jYUN6x9DQEP3790dGRkaNyb+1ufuPe15eXo1jly9fhq2trcZya9KkCQCgvLxcbf+983gA4LnnnsP3338PlUqFtLQ0eHl5ITw8HAkJCfc9v42NzX3fBwCNvpd/Cw4OxtWrV7Fy5Uq8+eab941LSEiAsbExfvjhBwwbNgw9e/ZEt27dHuiatU3Cvp+8vDxMmDABXbp0wbVr1zB16tQHuiYRNX4sbEgvRUZGQhAEhISE1DrZtrKyEt9//z0A4PnnnwcAcfLvXenp6Th58iT69++vsbzuruw5duyY2v67udTG0NAQnp6eWLZsGQDgyJEj943t378/UlJSxELmri+++AJmZmZaWwrdsmVLTJs2Df7+/hg1atR942QyGYyMjGBoaCjuKysrw4YNG2rEaqoLVlVVhddeew0ymQy7du1CTEwMli5dim3btj30uYmo8eF9bEgveXl5YcWKFQgNDYWHhwfGjx+Pzp07o7KyEkePHsXq1avh5uYGf39/uLi4YOzYsVi6dCkMDAwwePBgnD9/HjNnzoSjoyPeeecdjeU1ZMgQWFtbY8yYMZgzZw6MjIwQHx+PixcvqsWtXLkSKSkp8PX1RevWrXH79m1x5dGAAQPue/7Zs2fjhx9+QL9+/TBr1ixYW1tj48aN2LFjB+bNmweFQqGx93KvuXPn1hnj6+uLhQsXIjAwEGPHjsW1a9ewYMGCWpfku7u7IyEhAV9//TXatWuHJk2aPNC8mNmzZ+PXX39FcnIyHBwcEBERgb1792LMmDHo2rUrnJycGnxOImq8WNiQ3goJCUH37t2xaNEixMbGIj8/H8bGxujYsSMCAwMxceJEMXbFihVo37491q5di2XLlkGhUGDQoEGIiYmpdU7Ng7K0tERSUhLCw8PxxhtvoFmzZnjrrbcwePBgvPXWW2Jcly5dkJycjNmzZyM/Px9NmzaFm5sbtm/fLs5RqY2Liwv279+P9957DxMmTEBZWRk6deqE9evXN+gOvtry/PPPY926dYiNjYW/vz9atmyJkJAQ2NnZYcyYMWqxH374IfLy8hASEoKbN2+iTZs2avf5qY/du3cjJiYGM2fOVOu8xcfHo2vXrhg+fDhSU1NhYmKiibdHRI2ATBD+dVcsIiIiokcY59gQERGR3mBhQ0RERHqDhQ0RERHpDRY2REREpDdY2BAREZHeYGFDREREeoOFDREREekNvbxBn2nXiXUHET2GitLjdJ0CUaPTRIJ/CTX171LZUX6H68KODREREekNvezYEBERNSoy9hGkwsKGiIhI22QyXWfw2GBhQ0REpG3s2EiGnzQRERHpDXZsiIiItI1DUZJhYUNERKRtHIqSDD9pIiIi0hvs2BAREWkbh6Ikw8KGiIhI2zgUJRl+0kRERKQ32LEhIiLSNg5FSYaFDRERkbZxKEoy/KSJiIhIb7BjQ0REpG0cipIMCxsiIiJt41CUZF
jYEBERaRs7NpJhCUlERER6g4UNERGRtskMNLM10L59++Dv7w+lUgmZTIZvv/1WPFZZWYkZM2bA3d0d5ubmUCqVGDl
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 2 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])\n",
|
|||
|
"plt.xlabel('Predicted')\n",
|
|||
|
"plt.ylabel('Actual')\n",
|
|||
|
"plt.title('Confusion Matrix')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "fe6e14d2-001d-4585-9344-f240b84ce4af",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Ajout TP : test d'une nouvelle pipeline"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 14,
|
|||
|
"id": "3782988b-52f9-4172-92d4-68948bf259c9",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# étape supplémentaire : suppression du client 1 (outlier car client anonyme)\n",
|
|||
|
"\n",
|
|||
|
"dataset_train = dataset_train[dataset_train[\"customer_id\"]!=1]"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 15,
|
|||
|
"id": "9d19f8c0-ed31-46cd-8879-47810fa099d6",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# definition des variables utilisées\n",
|
|||
|
"\n",
|
|||
|
"numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'nb_tickets_internet', 'fidelity', 'nb_campaigns', 'nb_campaigns_opened']\n",
|
|||
|
"# categorical_features = [\"opt_in\"]\n",
|
|||
|
"encoded_features = [\"opt_in\", \"vente_internet_max\"]\n",
|
|||
|
"features = numeric_features + encoded_features\n",
|
|||
|
"X_train = dataset_train[features]\n",
|
|||
|
"y_train = dataset_train['y_has_purchased']\n",
|
|||
|
"X_test = dataset_test[features]\n",
|
|||
|
"y_test = dataset_test['y_has_purchased']"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 16,
|
|||
|
"id": "412ddfad-3d20-4fa0-afaa-79ec87b3122d",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"count 122016.000000\n",
|
|||
|
"mean 0.307656\n",
|
|||
|
"std 3.135563\n",
|
|||
|
"min 0.000000\n",
|
|||
|
"25% 0.000000\n",
|
|||
|
"50% 0.000000\n",
|
|||
|
"75% 0.000000\n",
|
|||
|
"max 907.000000\n",
|
|||
|
"Name: fidelity, dtype: float64"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 16,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"### variable fidelity\n",
|
|||
|
"\n",
|
|||
|
"X_train[\"fidelity\"].describe() # sûrement un problème d'outlier pour fidelity\n",
|
|||
|
"# X_train[\"total_amount\"].describe()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 17,
|
|||
|
"id": "97e1cd25-0961-45dd-af7f-78ab1d8088ee",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>nb_tickets</th>\n",
|
|||
|
" <th>nb_purchases</th>\n",
|
|||
|
" <th>total_amount</th>\n",
|
|||
|
" <th>nb_suppliers</th>\n",
|
|||
|
" <th>nb_tickets_internet</th>\n",
|
|||
|
" <th>fidelity</th>\n",
|
|||
|
" <th>nb_campaigns</th>\n",
|
|||
|
" <th>nb_campaigns_opened</th>\n",
|
|||
|
" <th>opt_in</th>\n",
|
|||
|
" <th>vente_internet_max</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>173</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>15</th>\n",
|
|||
|
" <td>2233.0</td>\n",
|
|||
|
" <td>66.0</td>\n",
|
|||
|
" <td>25703.0</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>94</td>\n",
|
|||
|
" <td>130.0</td>\n",
|
|||
|
" <td>60.0</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>24</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>224</td>\n",
|
|||
|
" <td>16.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>28</th>\n",
|
|||
|
" <td>557.0</td>\n",
|
|||
|
" <td>25.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>175.0</td>\n",
|
|||
|
" <td>34</td>\n",
|
|||
|
" <td>32.0</td>\n",
|
|||
|
" <td>15.0</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>34</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>24</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>144823</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>9</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>144824</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>120</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>144868</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>907</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>144877</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>150595</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>6</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>279 rows × 10 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" nb_tickets nb_purchases total_amount nb_suppliers \\\n",
|
|||
|
"3 0.0 0.0 0.0 0.0 \n",
|
|||
|
"15 2233.0 66.0 25703.0 2.0 \n",
|
|||
|
"24 0.0 0.0 0.0 0.0 \n",
|
|||
|
"28 557.0 25.0 0.0 2.0 \n",
|
|||
|
"34 0.0 0.0 0.0 0.0 \n",
|
|||
|
"... ... ... ... ... \n",
|
|||
|
"144823 0.0 0.0 0.0 0.0 \n",
|
|||
|
"144824 0.0 0.0 0.0 0.0 \n",
|
|||
|
"144868 0.0 0.0 0.0 0.0 \n",
|
|||
|
"144877 0.0 0.0 0.0 0.0 \n",
|
|||
|
"150595 0.0 0.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" nb_tickets_internet fidelity nb_campaigns nb_campaigns_opened \\\n",
|
|||
|
"3 0.0 173 2.0 0.0 \n",
|
|||
|
"15 2.0 94 130.0 60.0 \n",
|
|||
|
"24 0.0 224 16.0 0.0 \n",
|
|||
|
"28 175.0 34 32.0 15.0 \n",
|
|||
|
"34 0.0 24 0.0 0.0 \n",
|
|||
|
"... ... ... ... ... \n",
|
|||
|
"144823 0.0 9 0.0 0.0 \n",
|
|||
|
"144824 0.0 120 0.0 0.0 \n",
|
|||
|
"144868 0.0 907 0.0 0.0 \n",
|
|||
|
"144877 0.0 8 0.0 0.0 \n",
|
|||
|
"150595 0.0 6 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" opt_in vente_internet_max \n",
|
|||
|
"3 True 0.0 \n",
|
|||
|
"15 True 1.0 \n",
|
|||
|
"24 True 0.0 \n",
|
|||
|
"28 True 1.0 \n",
|
|||
|
"34 True 0.0 \n",
|
|||
|
"... ... ... \n",
|
|||
|
"144823 True 0.0 \n",
|
|||
|
"144824 True 0.0 \n",
|
|||
|
"144868 True 0.0 \n",
|
|||
|
"144877 True 0.0 \n",
|
|||
|
"150595 True 0.0 \n",
|
|||
|
"\n",
|
|||
|
"[279 rows x 10 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 17,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"X_train[X_train[\"fidelity\"]>5]"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 18,
|
|||
|
"id": "fc17957e-b684-41cd-880f-049a4ffcc7dc",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>customer_id</th>\n",
|
|||
|
" <th>event_type_id</th>\n",
|
|||
|
" <th>nb_tickets</th>\n",
|
|||
|
" <th>nb_purchases</th>\n",
|
|||
|
" <th>total_amount</th>\n",
|
|||
|
" <th>nb_suppliers</th>\n",
|
|||
|
" <th>vente_internet_max</th>\n",
|
|||
|
" <th>purchase_date_min</th>\n",
|
|||
|
" <th>purchase_date_max</th>\n",
|
|||
|
" <th>time_between_purchase</th>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <th>tenant_id</th>\n",
|
|||
|
" <th>gender_label</th>\n",
|
|||
|
" <th>gender_female</th>\n",
|
|||
|
" <th>gender_male</th>\n",
|
|||
|
" <th>gender_other</th>\n",
|
|||
|
" <th>country_fr</th>\n",
|
|||
|
" <th>nb_campaigns</th>\n",
|
|||
|
" <th>nb_campaigns_opened</th>\n",
|
|||
|
" <th>time_to_open</th>\n",
|
|||
|
" <th>y_has_purchased</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>1311</td>\n",
|
|||
|
" <td>male</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>1311</td>\n",
|
|||
|
" <td>male</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>125.0</td>\n",
|
|||
|
" <td>71.0</td>\n",
|
|||
|
" <td>1 days 04:13:20.492957746</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6</th>\n",
|
|||
|
" <td>5</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>1311</td>\n",
|
|||
|
" <td>male</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7</th>\n",
|
|||
|
" <td>6</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>1311</td>\n",
|
|||
|
" <td>male</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>17.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>8</th>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>1311</td>\n",
|
|||
|
" <td>female</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>27.0</td>\n",
|
|||
|
" <td>13.0</td>\n",
|
|||
|
" <td>5 days 18:07:22.615384615</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>152554</th>\n",
|
|||
|
" <td>1256102</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>1311</td>\n",
|
|||
|
" <td>female</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>152555</th>\n",
|
|||
|
" <td>1256103</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>1311</td>\n",
|
|||
|
" <td>other</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>152556</th>\n",
|
|||
|
" <td>1256104</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>1311</td>\n",
|
|||
|
" <td>other</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>152557</th>\n",
|
|||
|
" <td>1256105</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>1311</td>\n",
|
|||
|
" <td>other</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>152558</th>\n",
|
|||
|
" <td>1256106</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>1311</td>\n",
|
|||
|
" <td>other</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>122016 rows × 42 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" customer_id event_type_id nb_tickets nb_purchases total_amount \\\n",
|
|||
|
"3 2 NaN 0.0 0.0 0.0 \n",
|
|||
|
"4 3 NaN 0.0 0.0 0.0 \n",
|
|||
|
"6 5 NaN 0.0 0.0 0.0 \n",
|
|||
|
"7 6 NaN 0.0 0.0 0.0 \n",
|
|||
|
"8 7 NaN 0.0 0.0 0.0 \n",
|
|||
|
"... ... ... ... ... ... \n",
|
|||
|
"152554 1256102 NaN 0.0 0.0 0.0 \n",
|
|||
|
"152555 1256103 NaN 0.0 0.0 0.0 \n",
|
|||
|
"152556 1256104 NaN 0.0 0.0 0.0 \n",
|
|||
|
"152557 1256105 NaN 0.0 0.0 0.0 \n",
|
|||
|
"152558 1256106 NaN 0.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" nb_suppliers vente_internet_max purchase_date_min \\\n",
|
|||
|
"3 0.0 0.0 NaN \n",
|
|||
|
"4 0.0 0.0 NaN \n",
|
|||
|
"6 0.0 0.0 NaN \n",
|
|||
|
"7 0.0 0.0 NaN \n",
|
|||
|
"8 0.0 0.0 NaN \n",
|
|||
|
"... ... ... ... \n",
|
|||
|
"152554 0.0 0.0 NaN \n",
|
|||
|
"152555 0.0 0.0 NaN \n",
|
|||
|
"152556 0.0 0.0 NaN \n",
|
|||
|
"152557 0.0 0.0 NaN \n",
|
|||
|
"152558 0.0 0.0 NaN \n",
|
|||
|
"\n",
|
|||
|
" purchase_date_max time_between_purchase ... tenant_id gender_label \\\n",
|
|||
|
"3 NaN NaN ... 1311 male \n",
|
|||
|
"4 NaN NaN ... 1311 male \n",
|
|||
|
"6 NaN NaN ... 1311 male \n",
|
|||
|
"7 NaN NaN ... 1311 male \n",
|
|||
|
"8 NaN NaN ... 1311 female \n",
|
|||
|
"... ... ... ... ... ... \n",
|
|||
|
"152554 NaN NaN ... 1311 female \n",
|
|||
|
"152555 NaN NaN ... 1311 other \n",
|
|||
|
"152556 NaN NaN ... 1311 other \n",
|
|||
|
"152557 NaN NaN ... 1311 other \n",
|
|||
|
"152558 NaN NaN ... 1311 other \n",
|
|||
|
"\n",
|
|||
|
" gender_female gender_male gender_other country_fr nb_campaigns \\\n",
|
|||
|
"3 0 1 0 1.0 2.0 \n",
|
|||
|
"4 0 1 0 1.0 125.0 \n",
|
|||
|
"6 0 1 0 1.0 2.0 \n",
|
|||
|
"7 0 1 0 1.0 17.0 \n",
|
|||
|
"8 1 0 0 1.0 27.0 \n",
|
|||
|
"... ... ... ... ... ... \n",
|
|||
|
"152554 1 0 0 1.0 0.0 \n",
|
|||
|
"152555 0 0 1 NaN 0.0 \n",
|
|||
|
"152556 0 0 1 NaN 0.0 \n",
|
|||
|
"152557 0 0 1 NaN 0.0 \n",
|
|||
|
"152558 0 0 1 NaN 0.0 \n",
|
|||
|
"\n",
|
|||
|
" nb_campaigns_opened time_to_open y_has_purchased \n",
|
|||
|
"3 0.0 NaN 0.0 \n",
|
|||
|
"4 71.0 1 days 04:13:20.492957746 0.0 \n",
|
|||
|
"6 0.0 NaN 0.0 \n",
|
|||
|
"7 0.0 NaN 0.0 \n",
|
|||
|
"8 13.0 5 days 18:07:22.615384615 0.0 \n",
|
|||
|
"... ... ... ... \n",
|
|||
|
"152554 0.0 NaN 0.0 \n",
|
|||
|
"152555 0.0 NaN 0.0 \n",
|
|||
|
"152556 0.0 NaN 0.0 \n",
|
|||
|
"152557 0.0 NaN 0.0 \n",
|
|||
|
"152558 0.0 NaN 0.0 \n",
|
|||
|
"\n",
|
|||
|
"[122016 rows x 42 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 18,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# on transforme opt_in en indicatrice\n",
|
|||
|
"\n",
|
|||
|
"dataset_train[\"opt_in\"] = dataset_train[\"opt_in\"].astype(int)\n",
|
|||
|
"dataset_train"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 16,
|
|||
|
"id": "8ad69b5d-e2e2-4d70-b8f0-ea0d37f7fe0c",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# definition des variables utilisées\n",
|
|||
|
"\n",
|
|||
|
"numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'nb_tickets_internet', 'fidelity', 'nb_campaigns', 'nb_campaigns_opened']\n",
|
|||
|
"# categorical_features = [\"opt_in\"]\n",
|
|||
|
"encoded_features = [\"opt_in\", \"vente_internet_max\"]\n",
|
|||
|
"features = numeric_features + encoded_features\n",
|
|||
|
"X_train = dataset_train[features]\n",
|
|||
|
"y_train = dataset_train['y_has_purchased']\n",
|
|||
|
"X_test = dataset_test[features]\n",
|
|||
|
"y_test = dataset_test['y_has_purchased']"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "3ed647a6-db9a-4737-b819-57cb81691ea2",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Autre ajout : travail de preprocessing des données - étude des outliers"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 60,
|
|||
|
"id": "3771eeb1-5221-44e5-a5cd-15475fbe4858",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"count 128198.000000\n",
|
|||
|
"mean 0.582536\n",
|
|||
|
"std 181.774597\n",
|
|||
|
"min 0.000000\n",
|
|||
|
"25% 0.000000\n",
|
|||
|
"50% 0.000000\n",
|
|||
|
"75% 0.000000\n",
|
|||
|
"max 65082.000000\n",
|
|||
|
"Name: nb_purchases, dtype: float64"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 60,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# 1. number of purchases\n",
|
|||
|
"\n",
|
|||
|
"X_train[\"nb_purchases\"].describe()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 84,
|
|||
|
"id": "63c44b80-88cd-4339-91b9-3764e2690316",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>nb_tickets</th>\n",
|
|||
|
" <th>nb_purchases</th>\n",
|
|||
|
" <th>total_amount</th>\n",
|
|||
|
" <th>nb_suppliers</th>\n",
|
|||
|
" <th>nb_tickets_internet</th>\n",
|
|||
|
" <th>fidelity</th>\n",
|
|||
|
" <th>nb_campaigns</th>\n",
|
|||
|
" <th>nb_campaigns_opened</th>\n",
|
|||
|
" <th>opt_in</th>\n",
|
|||
|
" <th>vente_internet_max</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>122983.0</td>\n",
|
|||
|
" <td>65082.0</td>\n",
|
|||
|
" <td>878762.5</td>\n",
|
|||
|
" <td>5.0</td>\n",
|
|||
|
" <td>9.0</td>\n",
|
|||
|
" <td>330831</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>15</th>\n",
|
|||
|
" <td>2773.0</td>\n",
|
|||
|
" <td>81.0</td>\n",
|
|||
|
" <td>32338.0</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>94</td>\n",
|
|||
|
" <td>126.0</td>\n",
|
|||
|
" <td>50.0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>28</th>\n",
|
|||
|
" <td>282.0</td>\n",
|
|||
|
" <td>15.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>53.0</td>\n",
|
|||
|
" <td>34</td>\n",
|
|||
|
" <td>32.0</td>\n",
|
|||
|
" <td>13.0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>29</th>\n",
|
|||
|
" <td>40.0</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>24.0</td>\n",
|
|||
|
" <td>17.0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>31</th>\n",
|
|||
|
" <td>52.0</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>5</td>\n",
|
|||
|
" <td>22.0</td>\n",
|
|||
|
" <td>6.0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>147155</th>\n",
|
|||
|
" <td>4.0</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>44.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>4.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>147242</th>\n",
|
|||
|
" <td>3.0</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>40.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>3.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>147414</th>\n",
|
|||
|
" <td>12.0</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>132.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>12.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>147636</th>\n",
|
|||
|
" <td>15.0</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>165.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>15.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>147950</th>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>29.0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>747 rows × 10 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" nb_tickets nb_purchases total_amount nb_suppliers \\\n",
|
|||
|
"2 122983.0 65082.0 878762.5 5.0 \n",
|
|||
|
"15 2773.0 81.0 32338.0 2.0 \n",
|
|||
|
"28 282.0 15.0 0.0 2.0 \n",
|
|||
|
"29 40.0 2.0 0.0 1.0 \n",
|
|||
|
"31 52.0 2.0 0.0 1.0 \n",
|
|||
|
"... ... ... ... ... \n",
|
|||
|
"147155 4.0 2.0 44.0 1.0 \n",
|
|||
|
"147242 3.0 2.0 40.0 1.0 \n",
|
|||
|
"147414 12.0 2.0 132.0 1.0 \n",
|
|||
|
"147636 15.0 2.0 165.0 1.0 \n",
|
|||
|
"147950 2.0 2.0 29.0 1.0 \n",
|
|||
|
"\n",
|
|||
|
" nb_tickets_internet fidelity nb_campaigns nb_campaigns_opened \\\n",
|
|||
|
"2 9.0 330831 0.0 0.0 \n",
|
|||
|
"15 2.0 94 126.0 50.0 \n",
|
|||
|
"28 53.0 34 32.0 13.0 \n",
|
|||
|
"29 0.0 4 24.0 17.0 \n",
|
|||
|
"31 0.0 5 22.0 6.0 \n",
|
|||
|
"... ... ... ... ... \n",
|
|||
|
"147155 4.0 2 0.0 0.0 \n",
|
|||
|
"147242 3.0 2 0.0 0.0 \n",
|
|||
|
"147414 12.0 2 0.0 0.0 \n",
|
|||
|
"147636 15.0 2 0.0 0.0 \n",
|
|||
|
"147950 2.0 2 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" opt_in vente_internet_max \n",
|
|||
|
"2 0 1.0 \n",
|
|||
|
"15 1 1.0 \n",
|
|||
|
"28 1 1.0 \n",
|
|||
|
"29 1 0.0 \n",
|
|||
|
"31 1 0.0 \n",
|
|||
|
"... ... ... \n",
|
|||
|
"147155 0 1.0 \n",
|
|||
|
"147242 0 1.0 \n",
|
|||
|
"147414 0 1.0 \n",
|
|||
|
"147636 0 1.0 \n",
|
|||
|
"147950 0 1.0 \n",
|
|||
|
"\n",
|
|||
|
"[747 rows x 10 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 84,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"X_train[X_train[\"nb_purchases\"]>1]"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 65,
|
|||
|
"id": "032fbc5a-9044-41bd-b992-78077a6c8432",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"1.0"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 65,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"np.quantile(X_train[\"nb_purchases\"], 0.99)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 70,
|
|||
|
"id": "cad9f7cb-8b71-49a6-874b-e15cb9d7a204",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"count 128198.000000\n",
|
|||
|
"mean 1.946941\n",
|
|||
|
"std 343.940117\n",
|
|||
|
"min 0.000000\n",
|
|||
|
"25% 0.000000\n",
|
|||
|
"50% 0.000000\n",
|
|||
|
"75% 0.000000\n",
|
|||
|
"max 122983.000000\n",
|
|||
|
"Name: nb_tickets, dtype: float64\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"23.0"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 70,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"### 2. nb tickets\n",
|
|||
|
"\n",
|
|||
|
"print(X_train[\"nb_tickets\"].describe())\n",
|
|||
|
"np.quantile(X_train[\"nb_tickets\"], 0.99)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 73,
|
|||
|
"id": "6bb0c86d-eb61-473d-a29b-c59e7e5af489",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"count 128198.000000\n",
|
|||
|
"mean 10.496193\n",
|
|||
|
"std 2457.094272\n",
|
|||
|
"min 0.000000\n",
|
|||
|
"25% 0.000000\n",
|
|||
|
"50% 0.000000\n",
|
|||
|
"75% 0.000000\n",
|
|||
|
"max 878762.500000\n",
|
|||
|
"Name: total_amount, dtype: float64\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"44.0"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 73,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# 3. total amount\n",
|
|||
|
"\n",
|
|||
|
"print(X_train[\"total_amount\"].describe())\n",
|
|||
|
"np.quantile(X_train[\"total_amount\"], 0.99)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 76,
|
|||
|
"id": "ab6fded3-d8a5-4bb4-8f2d-472ea0e5e755",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"count 128198.000000\n",
|
|||
|
"mean 2.924687\n",
|
|||
|
"std 923.990506\n",
|
|||
|
"min 0.000000\n",
|
|||
|
"25% 0.000000\n",
|
|||
|
"50% 0.000000\n",
|
|||
|
"75% 1.000000\n",
|
|||
|
"max 330831.000000\n",
|
|||
|
"Name: fidelity, dtype: float64\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"2.0"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 76,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# 4. fidelity\n",
|
|||
|
"\n",
|
|||
|
"print(X_train[\"fidelity\"].describe())\n",
|
|||
|
"np.quantile(X_train[\"fidelity\"], 0.99)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 79,
|
|||
|
"id": "c1f0ac75-71a4-43fb-844b-e006acf5927b",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"count 128198.000000\n",
|
|||
|
"mean 24.276463\n",
|
|||
|
"std 37.899868\n",
|
|||
|
"min 0.000000\n",
|
|||
|
"25% 1.000000\n",
|
|||
|
"50% 4.000000\n",
|
|||
|
"75% 28.000000\n",
|
|||
|
"max 299.000000\n",
|
|||
|
"Name: nb_campaigns, dtype: float64\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"133.0"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 79,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# 5. nb campaigns - semble pas aberrant meme si forte variance\n",
|
|||
|
"\n",
|
|||
|
"print(X_train[\"nb_campaigns\"].describe())\n",
|
|||
|
"np.quantile(X_train[\"nb_campaigns\"], 0.99)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 80,
|
|||
|
"id": "8bb01064-1c23-4100-ace8-56f155e0b4ab",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"### on retire les outliers - variables : nb purchases, nb tickets, total amount, fidelity\n",
|
|||
|
"\n",
|
|||
|
"p99_nb_purchases = np.quantile(X_train[\"nb_purchases\"], 0.99)\n",
|
|||
|
"p99_nb_tickets = np.quantile(X_train[\"nb_tickets\"], 0.99)\n",
|
|||
|
"p99_total_amount = np.quantile(X_train[\"total_amount\"], 0.99)\n",
|
|||
|
"p99_fidelity = np.quantile(X_train[\"fidelity\"], 0.99)\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 98,
|
|||
|
"id": "b2b43ab6-16aa-41bc-9a62-47ab769c5bf2",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# filtre - on enlève les valeurs aberrantes sur les variables problématiques (retire 2% des valeurs en tt)\n",
|
|||
|
"\n",
|
|||
|
"X_train = X_train.loc[(X_train[\"nb_purchases\"] <= p99_nb_purchases) &\n",
|
|||
|
"(X_train[\"nb_tickets\"] <= p99_nb_tickets) &\n",
|
|||
|
"(X_train[\"total_amount\"] <= p99_total_amount) &\n",
|
|||
|
"(X_train[\"fidelity\"] <= p99_fidelity)]"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 99,
|
|||
|
"id": "b254a671-9e57-4123-ae65-55c852eb64cd",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>nb_tickets</th>\n",
|
|||
|
" <th>nb_purchases</th>\n",
|
|||
|
" <th>total_amount</th>\n",
|
|||
|
" <th>nb_suppliers</th>\n",
|
|||
|
" <th>nb_tickets_internet</th>\n",
|
|||
|
" <th>fidelity</th>\n",
|
|||
|
" <th>nb_campaigns</th>\n",
|
|||
|
" <th>nb_campaigns_opened</th>\n",
|
|||
|
" <th>opt_in</th>\n",
|
|||
|
" <th>vente_internet_max</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>12.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>8</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>24.0</td>\n",
|
|||
|
" <td>10.0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>9</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>14.0</td>\n",
|
|||
|
" <td>7.0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>10</th>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>23.0</td>\n",
|
|||
|
" <td>11.0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" nb_tickets nb_purchases total_amount nb_suppliers nb_tickets_internet \\\n",
|
|||
|
"6 0.0 0.0 0.0 0.0 0.0 \n",
|
|||
|
"7 0.0 0.0 0.0 0.0 0.0 \n",
|
|||
|
"8 0.0 0.0 0.0 0.0 0.0 \n",
|
|||
|
"9 0.0 0.0 0.0 0.0 0.0 \n",
|
|||
|
"10 0.0 0.0 0.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" fidelity nb_campaigns nb_campaigns_opened opt_in vente_internet_max \n",
|
|||
|
"6 1 2.0 0.0 1 0.0 \n",
|
|||
|
"7 1 12.0 0.0 1 0.0 \n",
|
|||
|
"8 1 24.0 10.0 1 0.0 \n",
|
|||
|
"9 1 14.0 7.0 1 0.0 \n",
|
|||
|
"10 1 23.0 11.0 1 0.0 "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 99,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"X_train.head()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 101,
|
|||
|
"id": "86d90380-6ad2-4c6b-a103-53e4c1fa59e0",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>customer_id</th>\n",
|
|||
|
" <th>event_type_id</th>\n",
|
|||
|
" <th>nb_tickets</th>\n",
|
|||
|
" <th>nb_purchases</th>\n",
|
|||
|
" <th>total_amount</th>\n",
|
|||
|
" <th>nb_suppliers</th>\n",
|
|||
|
" <th>vente_internet_max</th>\n",
|
|||
|
" <th>purchase_date_min</th>\n",
|
|||
|
" <th>purchase_date_max</th>\n",
|
|||
|
" <th>time_between_purchase</th>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <th>tenant_id</th>\n",
|
|||
|
" <th>gender_label</th>\n",
|
|||
|
" <th>gender_female</th>\n",
|
|||
|
" <th>gender_male</th>\n",
|
|||
|
" <th>gender_other</th>\n",
|
|||
|
" <th>country_fr</th>\n",
|
|||
|
" <th>nb_campaigns</th>\n",
|
|||
|
" <th>nb_campaigns_opened</th>\n",
|
|||
|
" <th>time_to_open</th>\n",
|
|||
|
" <th>y_has_purchased</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6</th>\n",
|
|||
|
" <td>5</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>1311</td>\n",
|
|||
|
" <td>male</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>2.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>7</th>\n",
|
|||
|
" <td>6</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>1311</td>\n",
|
|||
|
" <td>male</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>12.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>8</th>\n",
|
|||
|
" <td>7</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>1311</td>\n",
|
|||
|
" <td>female</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>24.0</td>\n",
|
|||
|
" <td>10.0</td>\n",
|
|||
|
" <td>5 days 11:58:52</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>9</th>\n",
|
|||
|
" <td>8</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>1311</td>\n",
|
|||
|
" <td>female</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>14.0</td>\n",
|
|||
|
" <td>7.0</td>\n",
|
|||
|
" <td>0 days 13:29:25.714285714</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>10</th>\n",
|
|||
|
" <td>9</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>1311</td>\n",
|
|||
|
" <td>female</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>23.0</td>\n",
|
|||
|
" <td>11.0</td>\n",
|
|||
|
" <td>0 days 17:17:44.090909090</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>152645</th>\n",
|
|||
|
" <td>1256102</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>1311</td>\n",
|
|||
|
" <td>female</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>152646</th>\n",
|
|||
|
" <td>1256103</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>1311</td>\n",
|
|||
|
" <td>other</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>152647</th>\n",
|
|||
|
" <td>1256104</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>1311</td>\n",
|
|||
|
" <td>other</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>152648</th>\n",
|
|||
|
" <td>1256105</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>1311</td>\n",
|
|||
|
" <td>other</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>152649</th>\n",
|
|||
|
" <td>1256106</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>1311</td>\n",
|
|||
|
" <td>other</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" <td>0.0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>125792 rows × 42 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" customer_id event_type_id nb_tickets nb_purchases total_amount \\\n",
|
|||
|
"6 5 NaN 0.0 0.0 0.0 \n",
|
|||
|
"7 6 NaN 0.0 0.0 0.0 \n",
|
|||
|
"8 7 NaN 0.0 0.0 0.0 \n",
|
|||
|
"9 8 NaN 0.0 0.0 0.0 \n",
|
|||
|
"10 9 NaN 0.0 0.0 0.0 \n",
|
|||
|
"... ... ... ... ... ... \n",
|
|||
|
"152645 1256102 NaN 0.0 0.0 0.0 \n",
|
|||
|
"152646 1256103 NaN 0.0 0.0 0.0 \n",
|
|||
|
"152647 1256104 NaN 0.0 0.0 0.0 \n",
|
|||
|
"152648 1256105 NaN 0.0 0.0 0.0 \n",
|
|||
|
"152649 1256106 NaN 0.0 0.0 0.0 \n",
|
|||
|
"\n",
|
|||
|
" nb_suppliers vente_internet_max purchase_date_min \\\n",
|
|||
|
"6 0.0 0.0 NaN \n",
|
|||
|
"7 0.0 0.0 NaN \n",
|
|||
|
"8 0.0 0.0 NaN \n",
|
|||
|
"9 0.0 0.0 NaN \n",
|
|||
|
"10 0.0 0.0 NaN \n",
|
|||
|
"... ... ... ... \n",
|
|||
|
"152645 0.0 0.0 NaN \n",
|
|||
|
"152646 0.0 0.0 NaN \n",
|
|||
|
"152647 0.0 0.0 NaN \n",
|
|||
|
"152648 0.0 0.0 NaN \n",
|
|||
|
"152649 0.0 0.0 NaN \n",
|
|||
|
"\n",
|
|||
|
" purchase_date_max time_between_purchase ... tenant_id gender_label \\\n",
|
|||
|
"6 NaN NaN ... 1311 male \n",
|
|||
|
"7 NaN NaN ... 1311 male \n",
|
|||
|
"8 NaN NaN ... 1311 female \n",
|
|||
|
"9 NaN NaN ... 1311 female \n",
|
|||
|
"10 NaN NaN ... 1311 female \n",
|
|||
|
"... ... ... ... ... ... \n",
|
|||
|
"152645 NaN NaN ... 1311 female \n",
|
|||
|
"152646 NaN NaN ... 1311 other \n",
|
|||
|
"152647 NaN NaN ... 1311 other \n",
|
|||
|
"152648 NaN NaN ... 1311 other \n",
|
|||
|
"152649 NaN NaN ... 1311 other \n",
|
|||
|
"\n",
|
|||
|
" gender_female gender_male gender_other country_fr nb_campaigns \\\n",
|
|||
|
"6 0 1 0 1.0 2.0 \n",
|
|||
|
"7 0 1 0 1.0 12.0 \n",
|
|||
|
"8 1 0 0 1.0 24.0 \n",
|
|||
|
"9 1 0 0 1.0 14.0 \n",
|
|||
|
"10 1 0 0 1.0 23.0 \n",
|
|||
|
"... ... ... ... ... ... \n",
|
|||
|
"152645 1 0 0 1.0 0.0 \n",
|
|||
|
"152646 0 0 1 NaN 0.0 \n",
|
|||
|
"152647 0 0 1 NaN 0.0 \n",
|
|||
|
"152648 0 0 1 NaN 0.0 \n",
|
|||
|
"152649 0 0 1 NaN 0.0 \n",
|
|||
|
"\n",
|
|||
|
" nb_campaigns_opened time_to_open y_has_purchased \n",
|
|||
|
"6 0.0 NaN 0.0 \n",
|
|||
|
"7 0.0 NaN 0.0 \n",
|
|||
|
"8 10.0 5 days 11:58:52 0.0 \n",
|
|||
|
"9 7.0 0 days 13:29:25.714285714 0.0 \n",
|
|||
|
"10 11.0 0 days 17:17:44.090909090 0.0 \n",
|
|||
|
"... ... ... ... \n",
|
|||
|
"152645 0.0 NaN 0.0 \n",
|
|||
|
"152646 0.0 NaN 0.0 \n",
|
|||
|
"152647 0.0 NaN 0.0 \n",
|
|||
|
"152648 0.0 NaN 0.0 \n",
|
|||
|
"152649 0.0 NaN 0.0 \n",
|
|||
|
"\n",
|
|||
|
"[125792 rows x 42 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 101,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"dataset_train = dataset_train.loc[(dataset_train[\"nb_purchases\"] <= p99_nb_purchases) &\n",
|
|||
|
"(dataset_train[\"nb_tickets\"] <= p99_nb_tickets) &\n",
|
|||
|
"(dataset_train[\"total_amount\"] <= p99_total_amount) &\n",
|
|||
|
"(dataset_train[\"fidelity\"] <= p99_fidelity)]\n",
|
|||
|
"\n",
|
|||
|
"dataset_train"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "f9487c48-b973-4d9e-abb9-902800ab778f",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"En retirant les outliers (observations au-delà des seuils p99), on supprime la plupart des clients ayant acheté à nouveau (161 → 19 acheteurs dans le jeu d'entraînement). Il faut donc trouver une autre méthode de preprocessing qui ne dégrade pas le dataset."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 102,
|
|||
|
"id": "9fe7513b-f23b-4bee-957d-f98919d6eb30",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"19.0"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 102,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"dataset_train[\"y_has_purchased\"].sum() # pb : on passe de 161 à 19 clients ayant acheté ..."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "b531aebb-3b2f-4c62-ae01-84bdf8e45f49",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"### Construction de la pipeline pour le modèle de régression logistique et résultats"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 17,
|
|||
|
"id": "1476da0d-cbb5-46ac-9f97-10855eec0108",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# importations pr créer la pipeline\n",
|
|||
|
"\n",
|
|||
|
"from sklearn.neighbors import KNeighborsClassifier\n",
|
|||
|
"from sklearn.pipeline import Pipeline\n",
|
|||
|
"from sklearn.compose import ColumnTransformer\n",
|
|||
|
"from sklearn.preprocessing import OneHotEncoder\n",
|
|||
|
"from sklearn.impute import SimpleImputer\n",
|
|||
|
"from sklearn.linear_model import LogisticRegression\n",
|
|||
|
"from sklearn.model_selection import GridSearchCV\n",
|
|||
|
"from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler\n",
|
|||
|
"from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 18,
|
|||
|
"id": "f905cb6f-b0be-4a47-ac8d-7b3e16ff1dce",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# debut de la pipeline\n",
|
|||
|
"numeric_transformer = Pipeline(steps=[\n",
|
|||
|
" # (\"imputer\", SimpleImputer(strategy=\"mean\")), # NaN remplacés par la moyenne, mais peu importe car on a supprimé les valeurs manquantes\n",
|
|||
|
" (\"scaler\", StandardScaler())])\n",
|
|||
|
"\"\"\"\n",
|
|||
|
"categorical_transformer = Pipeline(steps=[\n",
|
|||
|
" (\"imputer\", SimpleImputer(strategy=\"constant\", fill_value=\"Not defined\")),\n",
|
|||
|
" (\"onehot\", OneHotEncoder(handle_unknown='ignore'))]) # to deal with missing categorical data\n",
|
|||
|
"\n",
|
|||
|
"\"\"\"\n",
|
|||
|
"preproc = ColumnTransformer(transformers=[(\"num\", numeric_transformer, numeric_features)])\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 20,
|
|||
|
"id": "d322fb8f-1e97-4a44-96ca-c0f5d7ebd383",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Returned hyperparameter: {'logreg__C': 0.0009765625, 'logreg__class_weight': 'balanced'}\n",
|
|||
|
"Best classification accuracy in train is: 0.25403118665289387\n",
|
|||
|
"Classification accuracy on test is: 0.0495079950799508\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# on doit prendre une métrique adaptée aux datasets déséquilibrés\n",
|
|||
|
"balanced_scorer = make_scorer(balanced_accuracy_score)\n",
|
|||
|
"f1_scorer = make_scorer(f1_score)\n",
|
|||
|
"\n",
|
|||
|
"parameter_space = np.logspace(-10, 6, 17, base=2)\n",
|
|||
|
"\n",
|
|||
|
"pipe = Pipeline([('preprocessor', preproc), ('logreg', LogisticRegression(max_iter=500))]) # prendre 5k iter\n",
|
|||
|
"# on met plus de poids sur les observations rares (utile pr gérer le déséquilibre du dataset)\n",
|
|||
|
"parameters4 = {'logreg__C': parameter_space, 'logreg__class_weight': ['balanced']} \n",
|
|||
|
"clf4 = GridSearchCV(pipe, parameters4, cv=3, scoring = f1_scorer)\n",
|
|||
|
"clf4.fit(X_train, y_train)\n",
|
|||
|
"\n",
|
|||
|
"# print results\n",
|
|||
|
"# print(clf4.cv_results_)\n",
|
|||
|
"print('Returned hyperparameter: {}'.format(clf4.best_params_))\n",
|
|||
|
"print('Best classification accuracy in train is: {}'.format(clf4.best_score_))\n",
|
|||
|
"print('Classification accuracy on test is: {}'.format(clf4.score(X_test, y_test)))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 21,
|
|||
|
"id": "b32bb668-c816-4055-b786-e548eb71f318",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Accuracy: 0.9517777188411676\n",
|
|||
|
"Confusion Matrix:\n",
|
|||
|
" [[121855 6182]\n",
|
|||
|
" [ 0 161]]\n",
|
|||
|
"Classification Report:\n",
|
|||
|
" precision recall f1-score support\n",
|
|||
|
"\n",
|
|||
|
" 0.0 1.00 0.95 0.98 128037\n",
|
|||
|
" 1.0 0.03 1.00 0.05 161\n",
|
|||
|
"\n",
|
|||
|
" accuracy 0.95 128198\n",
|
|||
|
" macro avg 0.51 0.98 0.51 128198\n",
|
|||
|
"weighted avg 1.00 0.95 0.97 128198\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# visualisation des résultats \n",
|
|||
|
"\n",
|
|||
|
"y_pred = clf4.predict(X_test)\n",
|
|||
|
"\n",
|
|||
|
"#Evaluation du modèle \n",
|
|||
|
"accuracy = accuracy_score(y_test, y_pred)\n",
|
|||
|
"conf_matrix = confusion_matrix(y_test, y_pred)\n",
|
|||
|
"class_report = classification_report(y_test, y_pred)\n",
|
|||
|
"\n",
|
|||
|
"print(\"Accuracy:\", accuracy)\n",
|
|||
|
"print(\"Confusion Matrix:\\n\", conf_matrix)\n",
|
|||
|
"print(\"Classification Report:\\n\", class_report)\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 22,
|
|||
|
"id": "faebbecb-3f85-4181-8005-2f52180fa37e",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjYAAAHFCAYAAADhWLMfAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAABV70lEQVR4nO3de1xVVf7/8dcR4YgoR0QBMcwbkYSTimVopeZd0a6jRpFMhpa3SDSzi2I1oOZXy7vddMbJodJsrJTRtCxT1FBKzbLygo4gXhATEQj27w9/npkjqKDnoqf3s8d+PGTvz1577ePjxMfPWmtvk2EYBiIiIiJuoJqrOyAiIiJiL0psRERExG0osRERERG3ocRGRERE3IYSGxEREXEbSmxERETEbSixEREREbehxEZERETchhIbERERcRtKbMStff/99/zlL3+hSZMm1KhRg1q1atGmTRumTp3KiRMnHHrt7du307FjRywWCyaTiddff93u1zCZTCQlJdm93ctZtGgRJpMJk8nEl19+We64YRg0b94ck8lEp06drugac+fOZdGiRVU658svv7xon0Tkj6G6qzsg4ihvvfUWw4YNIywsjLFjxxIeHk5JSQnffvst8+fPZ9OmTSxfvtxh13/88ccpKCggNTUVPz8/GjdubPdrbNq0iRtuuMHu7VZW7dq1eeedd8olL+vXr+fXX3+ldu3aV9z23LlzqVevHnFxcZU+p02bNmzatInw8PArvq6IXN+U2Ihb2rRpE0899RTdunXj448/xmw2W49169aNxMRE0tLSHNqHnTt3Eh8fT69evRx2jTvuuMNhbVfGgAEDeO+995gzZw6+vr7W/e+88w5RUVGcOnXKKf0oKSnBZDLh6+vr8s9ERFxLQ1HilpKTkzGZTLz55ps2Sc15Xl5e9OvXz/pzWVkZU6dO5eabb8ZsNhMQEMBjjz3GoUOHbM7r1KkTERERbN26lbvuuouaNWvStGlTJk+eTFlZGfDfYZrff/+defPmWYdsAJKSkqx//l/nz9m/f79137p16+jUqRP+/v54e3vTqFEjHnzwQc6cOWONqWgoaufOndx77734+flRo0YNWrVqxd/+9jebmPNDNv/85z954YUXCA4OxtfXl65du/LTTz9V7kMGHn74YQD++c9/Wvfl5+ezbNkyHn/88QrPmTRpEu3ataNu3br4+vrSpk0b3nnnHf73fbyNGzdm165drF+/3vr5na94ne/74sWLSUxMpGHDhpjNZn755ZdyQ1HHjh0jJCSE9u3bU1JSYm3/hx9+wMfHh9jY2Erfq4hcH5TYiNspLS1l3bp1REZGEhISUqlznnrqKcaNG0e3bt1YsWIFr7zyCmlpabRv355jx47ZxObk5PDII4/w6KOPsmLFCnr16sX48eP5xz/+AUCfPn3YtGkTAA899BCbNm2y/lxZ+/fvp0+fPnh5efHuu++SlpbG5MmT8fHxobi4+KLn/fTTT7Rv355du3Yxc+ZMPvroI8LDw4mLi2Pq1Knl4p9//nkOHDjA22+/zZtvvsnPP/9M3759KS0trVQ/fX19eeihh3j33Xet+/75z39SrVo1BgwYcNF7Gzp0KB988AEfffQRDzzwACNHjuSVV16xxixfvpymTZvSunVr6+d34bDh+PHjycrKYv78+XzyyScEBASUu1a9evVITU1l69atjBs3DoAzZ87w5z//mUaNGjF//vxK3aeIXEcMETeTk5NjAMbAgQMrFb97924DMIYNG2azf/PmzQZgPP/889Z9HTt2NABj8+bNNrHh4eFGjx49bPYBxvDhw232TZw40ajoa7dw4UIDMPbt22cYhmEsXbrUAIzMzMxL9h0wJk6caP154MCBhtlsNrKysmzievXqZdSsWdM4efKkYRiG8cUXXxiA0bt3b5u4Dz74wACMTZs2XfK65/u7detWa1s7d+40DMMwbrvtNiMuLs4wDMO45ZZbjI4dO160ndLSUq
OkpMR4+eWXDX9/f6OsrMx67GLnnr/e3XfffdFjX3zxhc3+KVOmGICxfPlyY9CgQYa3t7fx/fffX/IeReT6pIqN/OF98cUXAOUmqd5+++20aNGCtWvX2uwPCgri9ttvt9n3pz/9iQMHDtitT61atcLLy4shQ4bwt7/9jb1791bqvHXr1tGlS5dylaq4uDjOnDlTrnL0v8NxcO4+gCrdS8eOHWnWrBnvvvsuO3bsYOvWrRcdhjrfx65du2KxWPDw8MDT05MJEyZw/PhxcnNzK33dBx98sNKxY8eOpU+fPjz88MP87W9/Y9asWbRs2bLS54vI9UOJjbidevXqUbNmTfbt21ep+OPHjwPQoEGDcseCg4Otx8/z9/cvF2c2myksLLyC3lasWbNmfP755wQEBDB8+HCaNWtGs2bNeOONNy553vHjxy96H+eP/68L7+X8fKSq3IvJZOIvf/kL//jHP5g/fz433XQTd911V4WxW7ZsoXv37sC5VWvffPMNW7du5YUXXqjydSu6z0v1MS4ujrNnzxIUFKS5NSJuTImNuB0PDw+6dOlCRkZGucm/FTn/yz07O7vcscOHD1OvXj279a1GjRoAFBUV2ey/cB4PwF133cUnn3xCfn4+6enpREVFkZCQQGpq6kXb9/f3v+h9AHa9l/8VFxfHsWPHmD9/Pn/5y18uGpeamoqnpyeffvop/fv3p3379rRt2/aKrlnRJOyLyc7OZvjw4bRq1Yrjx48zZsyYK7qmiFz7lNiIWxo/fjyGYRAfH1/hZNuSkhI++eQTAO655x4A6+Tf87Zu3cru3bvp0qWL3fp1fmXP999/b7P/fF8q4uHhQbt27ZgzZw4A27Ztu2hsly5dWLdunTWROe/vf/87NWvWdNhS6IYNGzJ27Fj69u3LoEGDLhpnMpmoXr06Hh4e1n2FhYUsXry4XKy9qmClpaU8/PDDmEwmVq1aRUpKCrNmzeKjjz666rZF5Nqj59iIW4qKimLevHkMGzaMyMhInnrqKW655RZKSkrYvn07b775JhEREfTt25ewsDCGDBnCrFmzqFatGr169WL//v289NJLhISE8Mwzz9itX71796Zu3boMHjyYl19+merVq7No0SIOHjxoEzd//nzWrVtHnz59aNSoEWfPnrWuPOratetF2584cSKffvopnTt3ZsKECdStW5f33nuPzz77jKlTp2KxWOx2LxeaPHnyZWP69OnD9OnTiYmJYciQIRw/fpxp06ZVuCS/ZcuWpKam8v7779O0aVNq1KhxRfNiJk6cyNdff83q1asJCgoiMTGR9evXM3jwYFq3bk2TJk2q3KaIXLuU2Ijbio+P5/bbb2fGjBlMmTKFnJwcPD09uemmm4iJiWHEiBHW2Hnz5tGsWTPeeecd5syZg8VioWfPnqSkpFQ4p+ZK+fr6kpaWRkJCAo8++ih16tThiSeeoFevXjzxxBPWuFatWrF69WomTpxITk4OtWrVIiIighUrVljnqFQkLCyMjRs38vzzzzN8+HAKCwtp0aIFCxcurNITfB3lnnvu4d1332XKlCn07duXhg0bEh8fT0BAAIMHD7aJnTRpEtnZ2cTHx/Pbb79x44032jznpzLWrFlDSkoKL730kk3lbdGiRbRu3ZoBAwawYcMGvLy87HF7InINMBnG/zwVS0REROQ6pjk2IiIi4jaU2IiIiIjbUGIjIiIibkOJjYiIiLgNJTYiIiLiNpTYiIiIiNtQYiMiIiJuwy0f0OfdesTlg0T+gA5/c+mXaIr8EfnV9Lh80FWy1++lwu2z7dKOO1PFRkRExE199dVX9O3bl+DgYEwmEx9//LH1WElJCePGjaNly5b4+PgQHBzMY489Vu5dc0VFRYwcOZJ69erh4+NDv379yr1gOC8vj9jYWCwWCxaLhdjYWE6ePGkTk5WVRd++ffHx8aFevXqMGjWq3Lv8duzYQceOHfH29qZhw4a8/PLLVPU5wkpsREREHM1UzT5bFRUUFH
Drrbcye3b5Ss+ZM2fYtm0bL730Etu2beOjjz5iz5499OvXzyYuISGB5cuXk5qayoYNGzh9+jTR0dGUlpZaY2JiYsj
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 2 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# matrice de confusion\n",
|
|||
|
"\n",
|
|||
|
"sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])\n",
|
|||
|
"plt.xlabel('Predicted')\n",
|
|||
|
"plt.ylabel('Actual')\n",
|
|||
|
"plt.title('Confusion Matrix')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 30,
|
|||
|
"id": "dc66d09e-3f7b-4f6d-a60f-c21a3a057c6d",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAArMAAAIiCAYAAADb3UD9AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAACJyElEQVR4nOzdd1hT1x8G8DcJGwFFhqKIew8UF1q34l5Vwb133VWrdY+6V63bOurGKlpXVax7L9Qq1okbVECGbMj5/cGPQASVIOGS8H6eJ4/3ntybvMlF+Obk3HNlQggBIiIiIiIdJJc6ABERERFRRrGYJSIiIiKdxWKWiIiIiHQWi1kiIiIi0lksZomIiIhIZ7GYJSIiIiKdxWKWiIiIiHQWi1kiIiIi0lksZomIiIhIZ7GYJSKir5oxYwby5s2L58+fSx2FiEgNi1kiHXfnzh306dMHRYoUgYmJCXLlyoUqVapgwYIFCA4OliTT5s2bIZPJcP36da0+z7NnzyCTyVQ3uVyOPHnyoFGjRjh+/Phn9zt69ChatmwJW1tbGBsbw9HREb169YKvr+9n9zl37hzc3d1RoEABGBkZwcrKCrVq1cLq1asRERGhjZcniaRj9+zZM7X2adOmoX379ujUqRNiY2PT3Hf69OmQyWSZluX06dOQyWQ4ffp0pj1mWgoXLozevXtrtM/Fixcxffp0hISEpLqvfv36qF+/fqZkI6KvYzFLpMPWr18PFxcXXLt2DePGjcPRo0exb98+dOrUCWvWrEG/fv2kjpglhg8fjkuXLuHcuXNYtGgRHj16hBYtWuDs2bOpth0/fjyaN28OpVKJVatWwdvbG9OmTcO1a9dQpUoVeHl5pdpn2rRpqFu3Ll6/fo1Zs2bB29sbu3btQqNGjTB9+nRMnjw5K16m5NasWQNbW1uMHj1a6iiZat++fZgyZYpG+1y8eBEzZsxIs5hdtWoVVq1alUnpiOhrDKQOQEQZc+nSJQwZMgRNmjTB/v37YWxsrLqvSZMm+PHHH3H06NEszRQXF5epPXPpVahQIdSsWRMAULt2bZQoUQL16tXDhg0bULduXdV2O3fuxMKFCzFkyBC1YqNu3bro0qUL6tWrhx49esDZ2RlFixYFAPz555+YOXMm+vXrh/Xr16u9vubNm2P8+PG4dOlSFr1SaRkYGODw4cNSx8h0lStXztTHK1u2bKY+HhF9GXtmiXTUnDlzIJPJsG7dOrVCNomRkRHatGmjWlcqlViwYAFKly4NY2Nj2NnZoWfPnnj16pXafp/7yvXTr06TvgLeunUrfvzxRxQoUADGxsZ4/PixapsPHz6gT58+sLa2hrm5OVq3bo2nT5+meuwTJ06gUaNGsLS0hJmZGWrXro1//vknA+9KoqpVqwIA3r59q9b+yy+/IE+ePFi0aFGqfczNzfHbb78hMjISS5cuVbXPnDkTefLkwfLly9Ms1C0sLODm5pbhrJ+qX78+ypcvj0uXLqFWrVowNTVF4cKFsWnTJgDA4cOHUaVKFZiZmaFChQppfmA5f/48GjVqBAsLC5iZmaFWrVppFqGXL19G7dq1YWJiAgcHB0ycOBFxcXFp5vL09ISrqyvMzc2RK1cuuLm54caNG+l6TZ/u27RpU/j4+Gjwrqg7cOAAXF1dYWZmBgsLCzRp0iTNDxR//fUXKlasCGNjYxQtWhS//vprmkMhPv2ZVyqVmD17NkqVKgVTU1Pkzp0bFStWxK+//gogcTjFuHHjAABFihRRDXNJGg6R1jCDN2/ewN3dHRYWFrCysoKHhwcuX74MmUyGzZs3q7b73BCF3r17o3DhwmptsbGxmD17tur/tK2tLfr06YP379+n740k0hMsZol0UEJCAk6ePAkXFxc4Ojqma58hQ4bgp59+QpMmTXDgwAHMmjULR48eRa1atRAYGJjhLBMnTsSLFy+wZs0aHDx4EHZ2dqr7+vXrB7lcjh07dmDZsmW4evUq6tevr/
bV7LZt2+Dm5gZLS0v88ccf2L17N6ytrdG0adMMF7R+fn4AgJIlS6ra/P39ce/ePbi5ucHMzCzN/VxdXWFnZwdvb2/VPnfv3v3iPumRVPhPnz49XdsHBASgT58+6N+/P/766y9UqFABffv2xcyZMzFx4kSMHz8ee/fuRa5cudCuXTu8efNGte+ZM2fQsGFDhIaGYsOGDdi5cycsLCzQunVreHp6qrbz9fVFo0aNEBISgs2bN2PNmjXw8fHB7NmzU+WZM2cOunTpgrJly2L37t3YsmULwsLCUKdOHdy9e/eLr+XTfbdu3Yrw8HDUqVPni2OUP2fHjh1o27YtLC0tsXPnTmzYsAEfPnxA/fr1cf78edV2R48exffff4+8efPC09MTCxYswM6dO/HHH3989TkWLFiA6dOno0uXLjh8+DA8PT3Rr18/1c9t//79MXz4cACAl5cXLl26hEuXLqFKlSppPl5UVBQaN26M48ePY+7cufjzzz+RL18+eHh4aPz6kyiVSrRt2xbz5s1D165dcfjwYcybNw/e3t6oX78+oqKiMvzYRDpHEJHOCQgIEABE586d07X9/fv3BQAxdOhQtfYrV64IAOLnn39WtTk5OYlevXqleox69eqJevXqqdZPnTolAIi6deum2nbTpk0CgGjfvr1a+4ULFwQAMXv2bCGEEBEREcLa2lq0bt1abbuEhARRqVIlUb169S++Lj8/PwFAzJ8/X8TFxYno6Ghx69Yt4erqKvLnzy/8/PxU216+fFkAEBMmTPjiY9aoUUOYmppqtM/XnD59WigUCjFjxoyvbluvXj0BQFy/fl3VFhQUJBQKhTA1NRWvX79Wtd+6dUsAEMuXL1e11axZU9jZ2Ynw8HBVW3x8vChfvrwoWLCgUCqVQgghPDw8hKmpqQgICFDbrnTp0gKA6r178eKFMDAwED/88INazrCwMGFnZyc6duyoaps2bZpI+Wclad/hw4er7RseHi7y5csn3N3dv/heJP2MnTp1SgiR+HPh4OAgKlSoIBISEtQez87OTtSqVUvVVq1aNeHo6ChiYmLUtsubN6/49E/fpz/zrVq1Es7Ozl/MtnDhQrX3KaVP/6+sXr1aABB//fWX2nYDBgwQAMSmTZs+u2+SXr16CScnJ9X6zp07BQCxd+9ete2uXbsmAIhVq1Z9MT+RPmHPLFEOcOrUKQBINXygevXqKFOmzDd9pd+hQ4fP3tetWze19Vq1asHJyUmV5+LFiwgODkavXr0QHx+vuimVSjRr1gzXrl1L10wBP/30EwwNDWFiYgJnZ2fcvXsXBw8eTPW1bHoIITJ93G+9evUQHx+PqVOnpmv7/Pnzw8XFRbVubW0NOzs7ODs7w8HBQdVepkwZAFBNlxUREYErV66gY8eOyJUrl2o7hUKBHj164NWrV3jw4AGAxJ+JRo0awd7eXm27T3sLjx07hvj4ePTt21et3cLCAg0aNMCZM2c++zqS9u3Zs6fa8TUxMUG9evU0nqXgwYMHePPmDXr06AG5PPnPV65cudChQwdcvnwZkZGRiIiIwPXr19GuXTsYGRmpbde6deuvPk/16tVx+/ZtDB06FMeOHUNYWJhGOT916tQpWFhYqA37AYCuXbtm+DEPHTqE3Llzo3Xr1mrvrbOzM/Lly6f1GSCIshOeAEakg2xsbGBmZqb6Ov1rgoKCACQWSZ9ycHD4prlD03rMJPny5UuzLSlP0pjWjh07fvYxgoODYW5u/sUMI0eORPfu3RETE4PLly9j8uTJaNu2LW7fvo28efMCSDxJDMBX37Pnz5+rhm6kd5/MZm1tnarNyMgoVXtSoRYdHQ0gcYyyEOKzxxlI/lkICgr67PFJKekY1apVK9W2SR88Pidp32rVqqV5f8qCND2+9nOsVCpV74EQQq1QT5JW26cmTpwIc3NzbNu2DWvWrIFCoUDdunUxf/581XhsTXOn9bxpvf/p9fbtW4SEhKgV6yl9y9AhIl3DYpZIBykUCj
Rq1Ah///03Xr16hYIFC35x+6SCzt/fP9W2b968gY2NjWrdxMQEMTExqR4jMDBQbbskX+rFDAgISLOtePHiAKB6vN9
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 800x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# on trace la courbe ROC\n",
|
|||
|
"\n",
|
|||
|
"# Prédictions sur l'ensemble de test\n",
|
|||
|
"y_pred_prob = clf4.predict_proba(X_test)[:, 1]\n",
|
|||
|
"\n",
|
|||
|
"# Calcul des taux de faux positifs (FPR) et de vrais positifs (TPR)\n",
|
|||
|
"fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)\n",
|
|||
|
"\n",
|
|||
|
"# Calcul de l'aire sous la courbe ROC (AUC)\n",
|
|||
|
"roc_auc = auc(fpr, tpr)\n",
|
|||
|
"\n",
|
|||
|
"# Tracé de la courbe ROC\n",
|
|||
|
"plt.figure(figsize=(8, 6))\n",
|
|||
|
"plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'AUC = {roc_auc:.2f}')\n",
|
|||
|
"plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')\n",
|
|||
|
"plt.xlabel('Taux de faux positifs (FPR)')\n",
|
|||
|
"plt.ylabel('Taux de vrais positifs (TPR)')\n",
|
|||
|
"plt.title('Courbe ROC : modèle logistique')\n",
|
|||
|
"plt.legend(loc='lower right')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 24,
|
|||
|
"id": "b36a11db-5d7a-487a-9b22-f02339e6d413",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAr4AAAIhCAYAAACot7njAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAABYH0lEQVR4nO3deXyU1d3///dkkkwWSCAEQiAQwr4pShAFpCAgFfza+rtdaLVFKVapG0u1ihtirbS2WqsVtApS76pFLXrX+0YErVJaEGVTFBBkS1hCCJCF7Mmc3x+nk8mQAEmYZEiu1/PxmEdynTnXNWcyoO8cPtc5LmOMEQAAANDChYV6AAAAAEBTIPgCAADAEQi+AAAAcASCLwAAAByB4AsAAABHIPgCAADAEQi+AAAAcASCLwAAAByB4AsAAABHIPgCqPLll19qypQpSktLU1RUlFq1aqXBgwfrySef1LFjx0IypsWLF8vlcmn9+vWN+jp79+6Vy+WqeoSFhaldu3aaOHGi1q5dG9C3ej+Xy6W4uDgNHz5cb7zxRr1ec/Xq1fJ4PNq3b1+tzw8ePFgul0u/+93van3+0UcflcvlUk5OTq3PDxw4UKNHj67RfvjwYd1///0677zz1KpVK0VFRalXr16aPn26du7cecZx+z4T3yM8PFwpKSmaMmWKDhw4UNXvk08+CejndrvVvn17XXXVVY3+eZ5KeXm55s6dq27dusnj8ahv37567rnn6nTuP/7xD/3kJz9R3759FRsbq86dO+v73/++NmzYUGv/jRs3aty4cWrVqpXatGmj//qv/9Lu3bsD+uzYsUORkZHauHHjWb83AGdG8AUgSXrppZeUnp6uzz//XPfee6+WL1+ud955R9ddd51eeOEFTZ06NdRDbBJ33XWX1q5dq9WrV2vevHn64osvdNlll2nTpk0B/a699lqtXbtWa9as0QsvvKD8/HzdcMMNev311+v0OsYYzZgxQz/96U+Vmppa4/nNmzdXvebChQvP/o39x2effabzzjtPCxcu1LXXXqulS5dq+fLluueee7Rx40YNHTq0ztd65ZVXtHbtWq1cuVI//elP9cYbb2jkyJEqLCwM6PfEE09o7dq1+uSTT/Twww9rzZo1GjVqVJ1CdrDdfvvtmjdvnu644w598MEH+v/+v/9P06dP1xNPPHHGcxcsWKC9e/dq+vTpWrZsmf7whz8oOztbl1xyif7xj38E9N2+fbtGjx6tsrIyvfnmm1q0aJF27NihkSNH6siRI1X9evfurRtvvFEzZ84M+nsFUAsDwPHWrFlj3G63ueKKK0xJSUmN50tLS83//M//NOmYysrKTHl5uXnllVeMJPP555836uvt2bPHSDK//e1vA9o/+ugjI8nccsstVW2SzB133BHQb+/evUaS+c53vlOn11u2bJmRZLZv317r83fccYeRZK688kojyfz73/+u0WfOnDlGkjly5Eit1xgwYIAZNWpU1XFeXp7p2LGj6dKli8nMzKz1nLfeeuuMYz/VZ/Lwww8bSeYvf/mLMcaYjz/+2Eiqcc0///nPRpJ55JFHzvhawfTVV18Zl8tlnnjiiYD2n/70pyY6OtocPXr0tOcfPny4RltBQYFJSkoyY8eODWi/7rrrTGJiosnLy6tq27t3r4mIiDC/+MUvAvquX7/+lJ8xgOBixheAnnjiCblcLv3pT3+Sx+Op8XxkZKS+973vVR17vV49+eST6tu3rzwejzp06KDJkydr//79Aed169ZNN998c43rjR49OuCf4H3/JP7f//3f+vnPf67OnTvL4/Ho22+/repz/PhxTZkyRQkJCYqNjdVVV11V45+NJenDDz/U2LFjFRcXp5iYGI0YMUIfffRRA34q1iWXXCJJpyxH8ElNTVX79u11+PDhOl13wYIFuuiii9SnT58az5WUlOj1119Xenq6fv/730uSFi1aVM+R1/TSSy8pKytLTz75pFJSUmrtc+
211zb4+nX9WQ0ZMkSS6vyzCpZ3331XxhhNmTIloH3KlCkqLi7W8uXLT3t+hw4darS1atVK/fv3V2ZmZlVbRUWF/vd//1fXXHON4uLiqtpTU1N12WWX6Z133gm4Rnp6uvr166cXXnihIW8LQD0QfAGHq6ys1D/+8Q+lp6erS5cudTrnZz/7me677z5dfvnl+vvf/65f/vKXWr58uYYPH37KetO6mD17tjIyMvTCCy/ovffeCwgaU6dOVVhYmF5//XU988wz+uyzzzR69Gjl5uZW9fnLX/6i8ePHKy4uTn/+85/15ptvKiEhQd/97ncbHH594bt9+/an7ZeXl6djx46pd+/eZ7xmWVmZPvzwQ1122WW1Pr906VIdP35cP/nJT9SrVy9deumlWrJkiU6cOFH/N1DNihUr5Ha7ddVVV53VdU6lrj+rPXv2SFKdflbGGFVUVNTpcSZfffWV2rdvr44dOwa0n3/++VXP11deXp42btyoAQMGVLXt2rVLxcXFVdc9+bW+/fZblZSUBLSPHj1a77//vowx9R4DgLoLD/UAAIRWTk6OioqKlJaWVqf+27dv15/+9CfdfvvtATcFXXjhhbr44ov1+9//Xr/61a8aNJYePXrorbfeqvW5IUOGBNS6DhgwQCNGjNDzzz+vBx98UEVFRZo+fbr+3//7fwEzahMnTtTgwYP1wAMPaN26dWccg9frVUVFhSorK/X1119r2rRpkqQbb7wxoJ8vkBljtHfvXt1zzz2KiYnRnDlzzvgamzdvVnFxsQYPHlzr8wsXLlRUVJRuuOEGSTb0T5kyRW+++aZ+8pOfnPH6p5KRkaH27dsrNja2wdeorrKyUhUVFSopKdGqVav0+OOPq3Xr1gH/OiD5f6bl5eXatGmTfv7zn6t///51ei+rVq065S8IJ9uzZ4+6det2yuePHj2qhISEGu2xsbGKjIzU0aNH6/Q61d1xxx0qLCzUgw8+GPA6kmp9rYSEBBljdPz4cSUnJ1e1Dx48WAsWLNA333yjvn371nscAOqG4AugXj7++GNJqlHCMHToUPXr108fffRRg4PvNddcc8rnTg6ew4cPV2pqqj7++GM9+OCDWrNmjY4dO6abbrqpxuzfFVdcoSeffFKFhYVnDH333Xef7rvvvqrjpKQkvfjii5o4cWJAv/nz52v+/PlVxxEREXrnnXeUnp5+xvd58OBBSbX/0/mePXv08ccf64c//KHatGkjSbruuut09913a9GiRWcVfOvDGKPKysqAtvDwwP9l+EobfM477zwtWLBASUlJAe2TJk0KOE5OTtaaNWuq3t/p+G64rItOnTqdsY/L5WrQc7V5+OGH9dprr+m5556r9XOvz2v5/iwcOHCA4As0IoIv4HCJiYmKiYmp+ufnM/HNZlWfrfLp1KnTGes7T6e2a/qc/M/TvjbfeHz1oqerUT127NgZg+/06dP1ox/9SGFhYWrTpo3S0tJqDTDXX3+97r33XpWXl2vLli2aPXu2fvCDH2jjxo3q1avXaV+juLhYkhQVFVXjuUWLFskYo2uvvTagjON73/ueXnvtNW3fvr0qGPmC6MkB1aeiokIRERFVx127dtXOnTvr9AtAbTOtJ8+ovvrqq+rXr5/Cw8OVlJR0ys/vN7/5jcaMGaOioiKtWLFC8+bN09VXX61169bVWlNeXatWrXTBBRecto/PycH8ZO3atdPmzZtrtBcWFqqsrKzWGdpTmTt3rh5//HH96le/0p133lnjdSTVOoN87NgxuVyuGqHf92fB92cDQOMg+AIO53a7NXbsWL3//vvav3//KW968vH9T/3QoUM1+h48eFCJiYlVx1FRUSotLa1xjZycnIB+PqebIcvKyqq1rWfPnpJUdb3nnnuuxkykz8kzkbVJSUmpuvnqdNq3b1/Vb9iwYerXr59GjRqlmTNn6n//939Pe65vrCevjez1erV48WJJ0n/913/Veu6iRYv05JNPSvK/nwMHDtR4b8YYHT
p0KOC9fPe739WKFSv03nvv6Qc/+MFpx1jbTOvJM6r9+vWr08+qe/fuVf2+853vKDo6Wg899JCee+453XPPPac9N5i
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 800x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Calcul des valeurs de précision et de rappel à différents seuils\n",
|
|||
|
"precision, recall, thresholds = precision_recall_curve(y_test, y_pred_prob)\n",
|
|||
|
"\n",
|
|||
|
"# Calcul de l'aire sous la courbe PR (AUC-PR)\n",
|
|||
|
"average_precision = average_precision_score(y_test, y_pred_prob)\n",
|
|||
|
"\n",
|
|||
|
"# Tracé de la courbe PR\n",
|
|||
|
"plt.figure(figsize=(8, 6))\n",
|
|||
|
"plt.step(recall, precision, color='b', alpha=0.2, where='post')\n",
|
|||
|
"plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')\n",
|
|||
|
"plt.xlabel('Rappel')\n",
|
|||
|
"plt.ylabel('Précision')\n",
|
|||
|
"plt.ylim([0.0, 1.05])\n",
|
|||
|
"plt.xlim([0.0, 1.0])\n",
|
|||
|
"plt.title(f'Courbe PR (AUC-PR = {average_precision:.2f})')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 25,
|
|||
|
"id": "7fb157b6-4e4e-4c7d-8a37-c3ac99323795",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkkAAAHFCAYAAADmGm0KAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAABfgUlEQVR4nO3deVxU5f4H8M/sLMKwyZaIoogLrrgvKbnn0nLLupppmmlukVv5K9M2LLulpalpJuaSrXq1a+RuGq4omkq4hFuCmOIgi8PAPL8/kKMjAzI6wxzk83695sWcc55zzvcZvPG55zznGYUQQoCIiIiILCidXQARERGRHDEkEREREVnBkERERERkBUMSERERkRUMSURERERWMCQRERERWcGQRERERGQFQxIRERGRFQxJRERERFYwJBFRuRw5cgQvvPACateuDRcXF1SrVg0tWrTArFmzcPXqVafUFBcXB4VCgQMHDjjl/Nu3b4dCocD27dvtcrwzZ85AoVDgP//5T5ntatWqhaFDh1qsO3ToEDp37gy9Xg+FQoE5c+Zgw4YNmDFjhl1qI6qK1M4ugIjkb/HixRg9ejQiIiIwefJkNGzYECaTCQcOHMDChQuxe/durFmzxtllVhlr1qyBp6enxbphw4YhJycHq1evhre3N2rVqoX33nsPn3/+OYMS0T1iSCKiMu3evRsvv/wyunfvjrVr10Kn00nbunfvjokTJyI+Pr5CazKZTFAoFBV6Tjlp3rx5iXVHjx7FiBEj0Lt3bydURPRg4u02IipTbGwsFAoFFi1aZBGQimm1WvTv319aNpvNmDVrFurXrw+dTgd/f388//zzuHDhgsV+1m4ZAUCXLl3QpUsXabn4ltby5csxceJEPPTQQ9DpdDh16pTUJjMzEy+88AJ8fHzg7u6Ofv364a+//ipx7M2bN6Nr167w9PSEm5sbOnTogC1btpTrc/jzzz/Rq1cvuLm5wc/PD6NGjcL169ettr2f85TH7Z9d8S3HgoICLFiwAAqFAgqFAkOHDsXnn38OANI6hUKBM2fO2K0OogcdQxIRlaqwsBBbt25FVFQUQkJCyrXPyy+/jNdeew3du3fHunXr8O677yI+Ph7t27fHP//8c8+1TJ06FefOncPChQuxfv16+Pv7S9uGDx8OpVKJVatWYc6cOdi3bx+6dOmCa9euSW1WrFiBHj16wNPTE8uWLcN3330HHx8f9OzZ864B5tKlS+jcuTOOHj2K+fPnY/ny5cjOzsbYsWNLtL2f89yLPn36YPfu3QCAp556Crt378bu3bsxbdo0PPXUUwAgrdu9ezeCgoLsXgPRA0sQEZUiPT1dABDPPvtsudonJycLAGL06NEW6/fu3SsAiP/7v/+T1oWGhoohQ4aUOEbnzp1F586dpeVt27YJAOLhhx8u0Xbp0qUCgHjiiScs1v/+++8CgHjvvfeEEELk5OQIHx8f0a9fP4t2hYWFomnTpqJ169Zl9uu1114TCoVCJCUlWazv3r27ACC2bdtml/OkpqYKAOKjjz4qs521zw6AGDNmjMW6MWPGCP5nnuje8UoSEdnNtm3bAKDEbbTWrVujQYMG93Ul5V//+lep2wYNGmSx3L59e4SGhkr1JCQk4OrVqxgyZAgKCgqkl9lsRq9evbB//37k5OSUevxt27ahUaNGaNq0qcX6gQMHWizf73mISF44cJuISuXn5wc3NzekpqaWq/2VK1cAwOotneDgYJw9e/aeaynrNlFgYKDVdcX1XLp0CQCk20/WXL16Fe7u7la3XblyBbVr177ree/3PEQkLwxJRFQqlUqFrl274pdffsGFCxdQo0aNMtv7+voCANLS0kq0vXjxIvz8/KRlFxcXGI3GEsf4559/LNoVK+tptvT0dKvr6tatCwDS8ebOnYu2bdtaPUZAQECpx/f19S31HLe73/MQkbzwdhsRlWnq1KkQQmDEiBHIz88vsd1kMmH9+vUAgEceeQ
RA0eDl2+3fvx/Jycno2rWrtK5WrVo4cuSIRbsTJ04gJSXF5hpXrlxpsZyQkICzZ89KT8l16NABXl5eOH78OFq2bGn1pdVqSz1+dHQ0jh07hsOHD1usX7VqlcXy/Z7H3oqfRszLy6uwcxI9SHgliYjK1K5dOyxYsACjR49GVFQUXn75ZTRq1AgmkwmHDh3CokWLEBkZiX79+iEiIgIvvfQS5s6dC6VSid69e+PMmTOYNm0aQkJC8Oqrr0rHHTx4MJ577jmMHj0a//rXv3D27FnMmjUL1atXt7nGAwcO4MUXX8TTTz+N8+fP44033sBDDz2E0aNHAwCqVauGuXPnYsiQIbh69Sqeeuop+Pv74/Llyzh8+DAuX76MBQsWlHr8mJgYfPXVV+jTpw/ee+89BAQEYOXKlfjzzz8t2t3veYr98ccf+OGHH0qsb9WqFUJDQ8v9uTRu3BgA8OGHH6J3795QqVRo0qRJhQY1okrN2SPHiahySEpKEkOGDBE1a9YUWq1WuLu7i+bNm4u33npLZGRkSO0KCwvFhx9+KOrVqyc0Go3w8/MTzz33nDh//rzF8cxms5g1a5YICwsTLi4uomXLlmLr1q2lPt32/fffl6ip+Om2jRs3isGDBwsvLy/h6uoqHn30UXHy5MkS7Xfs2CH69OkjfHx8hEajEQ899JDo06eP1WPf6fjx46J79+7CxcVF+Pj4iOHDh4v//ve/Fk+33e95ip9uK+21dOlSIUT5n24zGo3ixRdfFNWrVxcKhUIAEKmpqXftKxEVUQghhHPiGREREZF8cUwSERERkRUMSURERERWMCQRERERWcGQRERERGQFQxIRERGRFQxJRERERFZwMslyMpvNuHjxIjw8PMr8egQiIiKSDyEErl+/juDgYCiVtl0bYkgqp4sXLyIkJMTZZRAREdE9OH/+/F2/f/JODEnl5OHhAaDoQ/b09HRyNURERFQeWVlZCAkJkf6O24IhqZyKb7F5enoyJBEREVUy9zJUhgO3iYiIiKxgSCIiIiKygiGJiIiIyAqOSSIiItkoLCyEyWRydhlUiahUKqjVaodMz8OQREREspCdnY0LFy5ACOHsUqiScXNzQ1BQELRarV2Py5BEREROV1hYiAsXLsDNzQ3Vq1fnpL1ULkII5Ofn4/Lly0hNTUV4eLjNE0aWhSGJiIiczmQyQQiB6tWrw9XV1dnlUCXi6uoKjUaDs2fPIj8/Hy4uLnY7NgduExGRbPAKEt0Le149sjiuQ45aTr/99hv69euH4OBgKBQKrF27tkSb5ORk9O/fH3q9Hh4eHmjbti3OnTsnbTcajRg3bhz8/Pzg7u6O/v3748KFCxbHyMzMxODBg6HX66HX6zF48GBcu3bNwb0jIiKiysypISknJwdNmzbFvHnzrG4/ffo0OnbsiPr162P79u04fPgwpk2bZnEpLSYmBmvWrMHq1auxa9cuZGdno2/fvigsLJTaDBw4EElJSYiPj0d8fDySkpIwePBgh/ePiIiIKjEhEwDEmjVrLNY988wz4rnnnit1n2vXrgmNRiNWr14trfv777+FUqkU8fHxQgghjh8/LgCIPXv2SG12794tAIg///yz3PUZDAYBQBgMhnLvQ0RE5ZOXlyeOHz8u8vLynF1KpWPt76cjbNu2TQAQmZmZ93yM6dOni6ZNm0rLQ4YMEY899pi0bDabxYgRI4S3t7cAIA4dOlSu45b17+d+/n7LdkyS2WzG//73P9SrVw89e/aEv78/2rRpY3FLLjExESaTCT169JDWBQcHIzIyEgkJCQCA3bt3Q6/Xo02bNlKbtm3bQq/XS22sMRqNyMrKsngRERHdKT09HePGjUNYWBh0Oh1CQkLQr18/bNmyxdmlyd6nn36KuLg4aTk+Ph5xcXH4+eefkZaWhsjIyFKH41QE2YakjIwMZGdn44MPPkCvXr2wceNGPPHEE3jyySexY8cOAEX/MLVaLby9vS32DQgIQHp6utTG39+/xP
H9/f2lNtbMnDlTGsOk1+sREhJix97d8uXOvzDp+8NIPHvVIccnIiLHOXPmDKKiorB161bMmjULf/zxB+Lj4xEdHY0
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# utilisation d'une métrique plus adaptée aux modèles de marketing : courbe de lift\n",
|
|||
|
"\n",
|
|||
|
"# Tri des prédictions de probabilités et des vraies valeurs\n",
|
|||
|
"sorted_indices = np.argsort(y_pred_prob)[::-1]\n",
|
|||
|
"y_pred_prob_sorted = y_pred_prob[sorted_indices]\n",
|
|||
|
"y_test_sorted = y_test.iloc[sorted_indices]\n",
|
|||
|
"\n",
|
|||
|
"# Calcul du gain cumulatif\n",
|
|||
|
"cumulative_gain = np.cumsum(y_test_sorted) / np.sum(y_test_sorted)\n",
|
|||
|
"\n",
|
|||
|
"# Tracé de la courbe de lift\n",
|
|||
|
"plt.plot(np.linspace(0, 1, len(cumulative_gain))[:10000], (cumulative_gain/np.linspace(0, 1, len(cumulative_gain)))[:10000], label='Courbe de lift')\n",
|
|||
|
"plt.xlabel('Pourcentage des données')\n",
|
|||
|
"plt.ylabel('Gain cumulatif')\n",
|
|||
|
"plt.title('Courbe de Lift')\n",
|
|||
|
"plt.legend()\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "98b93d38-a5d7-4480-91e6-e79be5de18e7",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Random forest"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 31,
|
|||
|
"id": "771bee72-8b12-4ffb-b3ce-82f7e2ba6a8d",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Fitting 3 folds for each of 9 candidates, totalling 27 fits\n",
|
|||
|
"Best parameters: {'max_depth': 20, 'n_estimators': 100, 'random_state': 20}\n",
|
|||
|
"Best classification accuracy in train is: 0.3224906065485776\n",
|
|||
|
"Classification accuracy on test is: 0.31906614785992216\n",
|
|||
|
"------\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Define models and parameters for GridSearch\n",
|
|||
|
"params = {\n",
|
|||
|
" 'n_estimators': [100, 150, 200],\n",
|
|||
|
" 'max_depth': [5, 20, 30],\n",
|
|||
|
" 'random_state' : [20]\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"# define model and pipeline - no preprocessing\n",
|
|||
|
"clf = GridSearchCV(RandomForestClassifier(), params, cv=3, scoring=f1_scorer, verbose=True)\n",
|
|||
|
"clf.fit(X_train, y_train)\n",
|
|||
|
"\n",
|
|||
|
"print(f\"Best parameters: {clf.best_params_}\")\n",
|
|||
|
"print('Best classification accuracy in train is: {}'.format(clf.best_score_))\n",
|
|||
|
"print('Classification accuracy on test is: {}'.format(clf.score(X_test, y_test)))\n",
|
|||
|
"print(\"------\")"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 32,
|
|||
|
"id": "bf44a84d-607e-48c3-b8c6-28a07d1b1c14",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Accuracy: 0.99863492410178\n",
|
|||
|
"Confusion Matrix:\n",
|
|||
|
" [[127982 55]\n",
|
|||
|
" [ 120 41]]\n",
|
|||
|
"Classification Report:\n",
|
|||
|
" precision recall f1-score support\n",
|
|||
|
"\n",
|
|||
|
" 0.0 1.00 1.00 1.00 128037\n",
|
|||
|
" 1.0 0.43 0.25 0.32 161\n",
|
|||
|
"\n",
|
|||
|
" accuracy 1.00 128198\n",
|
|||
|
" macro avg 0.71 0.63 0.66 128198\n",
|
|||
|
"weighted avg 1.00 1.00 1.00 128198\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# visualisation des résultats \n",
|
|||
|
"\n",
|
|||
|
"y_pred = clf.predict(X_test)\n",
|
|||
|
"\n",
|
|||
|
"#Evaluation du modèle \n",
|
|||
|
"accuracy = accuracy_score(y_test, y_pred)\n",
|
|||
|
"conf_matrix = confusion_matrix(y_test, y_pred)\n",
|
|||
|
"class_report = classification_report(y_test, y_pred)\n",
|
|||
|
"\n",
|
|||
|
"print(\"Accuracy:\", accuracy)\n",
|
|||
|
"print(\"Confusion Matrix:\\n\", conf_matrix)\n",
|
|||
|
"print(\"Classification Report:\\n\", class_report)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 33,
|
|||
|
"id": "0fa2189c-5c0a-405b-b686-b9df3958c85c",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjYAAAHFCAYAAADhWLMfAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAABUH0lEQVR4nO3deVxVdf7H8dcV4YooNxQBMc2NSNJJw1K00nJXJGdq1CjSMjLXSFyicmsBt9HKvU0by2gxGytlNC0bU5RUyq3MRMkE0URURCA4vz/8eWeuYILdRa/v5zzO4+E953PP+dzrEB8/3+/3HJNhGAYiIiIibqCKqxMQERERsRcVNiIiIuI2VNiIiIiI21BhIyIiIm5DhY2IiIi4DRU2IiIi4jZU2IiIiIjbUGEjIiIibkOFjYiIiLgNFTbi1r7//nseeeQRGjVqRLVq1ahRowa33nor06ZN4/jx4w699vbt2+nQoQMWiwWTycTLL79s92uYTCYmTZpk9/NeyuLFizGZTJhMJr766qsyxw3DoGnTpphMJjp27HhZ15g3bx6LFy+u1Hu++uqri+YkIteGqq5OQMRRXn/9dYYOHUpoaChjxowhLCyM4uJivv32WxYsWMCmTZtYvny5w67/6KOPkp+fT3JyMn5+fjRs2NDu19i0aRPXX3+93c9bUTVr1uTNN98sU7ysX7+en3/+mZo1a172uefNm4e/vz8DBw6s8HtuvfVWNm3aRFhY2GVfV0SubipsxC1t2rSJIUOG0KVLFz755BPMZrP1WJcuXYiPjyclJcWhOezcuZPY2Fh69OjhsGu0bdvWYeeuiH79+vHuu+8yd+5cfH19rfvffPNNIiIiOHnypFPyKC4uxmQy4evr6/LvRERcS0NR4pYSExMxmUy89tprNkXNeV5eXkRFRVlfl5aWMm3aNG666SbMZjMBAQE8/PDDHDp0yOZ9HTt2pHnz5qSlpXHnnXdSvXp1GjduzJQpUygtLQX+O0zz+++/M3/+fOuQDcCkSZOsf/5f599z4MAB675169bRsWNHateujbe3Nw0aNOC+++7jzJkz1pjyhqJ27tzJvffei5+fH9WqVaNly5a8/fbbNjHnh2zee+89nn32WYKDg/H19aVz5878+OOPFfuSgQceeACA9957z7ovLy+PZcuW8eijj5b7nsmTJ9OmTRtq1aqFr68vt956K2+++Sb/+zzehg0bsmvXLtavX2/9/s53vM7nvmTJEuLj46lXrx5ms5l9+/aVGYo6duwY9evXp127dhQXF1vPv3v3bnx8fIiJianwZxWRq4MKG3E7JSUlrFu3jvDwcOrXr1+h9wwZMoRx48bRpUsXVqxYwQsvvEBKSgrt2rXj2LFjNrHZ2dk8+OCDPPTQQ6xYsYIePXqQkJDAO++8A0CvXr3YtGkTAPfffz+bNm2yvq6oAwcO0KtXL7y8vHjrrbdISUlhypQp+Pj4UFRUdNH3/fjjj7Rr145du3bx6quv8vHHHxMWFsbAgQOZNm1amfhnnnmGgwcP8sYbb/Daa6/x008/0bt3b0pKSiqUp6+vL/fffz9vvfWWdd97771HlSpV6Nev30U/2+DBg/nggw/4+OOP+dvf/saIESN44YUXrDHLly+ncePGtGrVyvr9XThsmJCQQGZmJgsWLODTTz8lICCgzLX8/f1JTk4mLS2NcePGAXDmzBn+/ve/06BBAxYsWFChzykiVxFDxM1kZ2cbgNG/f/8Kxe/Zs8cAjKFDh9rs37x5swEYzzzzjHVfhw4dDMDYvHmzTWxYWJjRrVs3m32AMWzYMJt9EydONMr7sVu0aJEBGBkZGYZhGMZHH31kAEZ6evof5g4YEydOtL7u37+/YTabjczMTJu4Hj16GNWrVzdOnDhhGIZhfPnllwZg9OzZ0ybugw8+MABj06ZNf3jd8/mmpaVZz7Vz507DMAzjtttuMwYOHGgYhmHcfPPNRocOHS56npKSEqO4uN
h4/vnnjdq1axulpaXWYxd77/nr3XXXXRc99uWXX9rsnzp1qgEYy5cvNwYMGGB4e3sb33///R9+RhG5OqljI9e8L7/8EqDMJNXbb7+dZs2asXbtWpv9QUFB3H777Tb7/vKXv3Dw4EG75dSyZUu8vLx4/PHHefvtt9m/f3+F3rdu3To6depUplM1cOBAzpw5U6Zz9L/DcXDucwCV+iwdOnSgSZMmvPXWW+zYsYO0tLSLDkOdz7Fz585YLBY8PDzw9PRkwoQJ/Pbbb+Tk5FT4uvfdd1+FY8eMGUOvXr144IEHePvtt5k9ezYtWrSo8PtF5Oqhwkbcjr+/P9WrVycjI6NC8b/99hsAdevWLXMsODjYevy82rVrl4kzm80UFBRcRrbla9KkCV988QUBAQEMGzaMJk2a0KRJE1555ZU/fN9vv/120c9x/vj/uvCznJ+PVJnPYjKZeOSRR3jnnXdYsGABN954I3feeWe5sVu2bKFr167AuVVr33zzDWlpaTz77LOVvm55n/OPchw4cCBnz54lKChIc2tE3JgKG3E7Hh4edOrUia1bt5aZ/Fue87/cs7Kyyhw7fPgw/v7+dsutWrVqABQWFtrsv3AeD8Cdd97Jp59+Sl5eHqmpqURERBAXF0dycvJFz1+7du2Lfg7Arp/lfw0cOJBjx46xYMECHnnkkYvGJScn4+npyWeffUbfvn1p164drVu3vqxrljcJ+2KysrIYNmwYLVu25LfffmP06NGXdU0RufKpsBG3lJCQgGEYxMbGljvZtri4mE8//RSAe+65B8A6+fe8tLQ09uzZQ6dOneyW1/mVPd9//73N/vO5lMfDw4M2bdowd+5cALZt23bR2E6dOrFu3TprIXPeP//5T6pXr+6wpdD16tVjzJgx9O7dmwEDBlw0zmQyUbVqVTw8PKz7CgoKWLJkSZlYe3XBSkpKeOCBBzCZTKxatYqkpCRmz57Nxx9//KfPLSJXHt3HRtxSREQE8+fPZ+jQoYSHhzNkyBBuvvlmiouL2b59O6+99hrNmzend+/ehIaG8vjjjzN79myqVKlCjx49OHDgAOPHj6d+/fo89dRTdsurZ8+e1KpVi0GDBvH8889TtWpVFi9ezC+//GITt2DBAtatW0evXr1o0KABZ8+eta486ty580XPP3HiRD777DPuvvtuJkyYQK1atXj33Xf5/PPPmTZtGhaLxW6f5UJTpky5ZEyvXr2YOXMm0dHRPP744/z222/MmDGj3CX5LVq0IDk5mffff5/GjRtTrVq1y5oXM3HiRP7zn/+wevVqgoKCiI+PZ/369QwaNIhWrVrRqFGjSp9TRK5cKmzEbcXGxnL77bcza9Yspk6dSnZ2Np6entx4441ER0czfPhwa+z8+fNp0qQJb775JnPnzsVisdC9e3eSkpLKnVNzuXx9fUlJSSEuLo6HHnqI6667jscee4wePXrw2GOPWeNatmzJ6tWrmThxItnZ2dSoUYPmzZuzYsUK6xyV8oSGhrJx40aeeeYZhg0bRkFBAc2aNWPRokWVuoOvo9xzzz289dZbTJ06ld69e1OvXj1iY2MJCAhg0KBBNrGTJ08mKyuL2NhYTp06xQ033GBzn5+KWLNmDUlJSYwfP96m87Z48WJatWpFv3792LBhA15eXvb4eCJyBTAZxv/cFUtERETkKqY5NiIiIuI2VNiIiIiI21BhIyIiIm5DhY2IiIi4DRU2IiIi4jZU2IiIiIjbUGEjIiIibsMtb9Dn3Wr4pYNErkG5aXNcnYLIFaeaE34T2uv3UsF2/Qxfijo2IiIi4jbcsmMjIiJyRTGpj+AsKmxEREQczWRydQbXDBU2IiIijqaOjdPomxYRERG3oY6NiIiIo2koymlU2IiIiDiahqKcRt+0iIiIuA11bERERBxNQ1FOo8JGRETE0TQU5TT6pkVERMRtqGMjIiLiaBqKchoVNiIiIo6moSin0TctIiIibkMdGxEREUfTUJTTqLARERFxNA
1FOY0KGxEREUdTx8ZpVEKKiIiI21BhIyIi4mimKvbZKunrr7+md+/eBAcHYzKZ+OSTT6zHiouLGTduHC1atMDHx4f
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 2 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# matrice de confusion\n",
|
|||
|
"\n",
|
|||
|
"sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])\n",
|
|||
|
"plt.xlabel('Predicted')\n",
|
|||
|
"plt.ylabel('Actual')\n",
|
|||
|
"plt.title('Confusion Matrix')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 34,
|
|||
|
"id": "311f0208-b79e-4e80-8016-075a98708f6e",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAArMAAAIhCAYAAABdSTJTAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAACFfElEQVR4nO3dd1hTZxsG8DuEjYCigKKI21K3WBWtW3FvBTfWvVer1bpHtVqt1rotat1YReuqinXvhVqFOnGDCiggG/J+f/B5NIJKgOSQcP+ui6vJm3NynhCKN0/e8x6FEEKAiIiIiEgPGcldABERERFRZjHMEhEREZHeYpglIiIiIr3FMEtEREREeothloiIiIj0FsMsEREREekthlkiIiIi0lsMs0RERESktxhmiYiIiEhvMcwS5XLXr1/HN998g+LFi8Pc3Bx58uRB1apVMW/ePERERMhS07p166BQKHDp0iWtHufBgwdQKBTSl5GREfLly4dGjRrh0KFDH93vwIEDaNmyJezt7WFmZgZnZ2d4e3sjMDDwo/ucPHkSnp6eKFy4MExNTWFra4tatWph+fLliImJ0cbLk83b7+u6devkLkUSERGBLl26wMHBAQqFAu3atZO7pDTOnDmDadOm4fXr13KXQqRXGGaJcrHVq1fDzc0NFy9exNixY3HgwAHs3LkTnTt3xooVK9C3b1+5S9SJ4cOH4+zZszh58iTmz5+PO3fuoEWLFjhx4kSabceNG4fmzZtDpVJh2bJl8Pf3x9SpU3Hx4kVUrVoVfn5+afaZOnUq6tati6dPn2LmzJnw9/fH1q1b0ahRI0ybNg2TJk3SxcvM1WbOnImdO3di4cKFOHv2LObNmyd3SWmcOXMG06dPZ5gl0pQgolzpzJkzQqlUimbNmon4+Pg0jyckJIi//vpLpzUlJiaKpKQksXbtWgFAXLx4UavHCw4OFgDEzz//rDZ+/PhxAUD06tVLbXzz5s0CgBg8eHCa53rz5o1wc3MTlpaW4t69e9L4tm3bBADRt29foVKp0uwXFRUlDh48mE2vKH0xMTFaff4Pvf2+rl27VqfH/ZTGjRsLV1fXbHs+lUolYmNjs+35hBDi559/FgBEcHBwtj4vkaFjZ5Yol5o9ezYUCgVWrVoFMzOzNI+bmpqiTZs20n2VSoV58+bhiy++gJmZGRwcHNCrVy88efJEbb9ixYqhd+/eaZ6vfv36qF+/vnT/2LFjUCgU2LBhA7799lsULlwYZmZmuHv3rrTNq1ev8M0338DOzg5WVlZo3bo17t+/n+a5Dx8+jEaNGsHGxgaWlpaoXbs2/vnnn0x8V1JVq1YNAPD8+XO18R9//BH58uXD/Pnz0+xjZWWF3377DbGxsVi4cKE0PmPGDOTLlw+LFy+GQqFIs5+1tTU8PDwyXeuH6tevj/Lly+PEiROoVasWLC0t0adPHwCAr68vPDw8UKhQIVhYWMDV1RXjx49PM82hd+/eyJMnD+7evYsWLVogT548cHZ2xrfffouEhAS1bZ89ewZPT09YW1vD1tYWXl5eCA0NTbe23bt3w93dHZaWlrC2tkaTJk1w9uxZtW2mTZsGhUKB69evo3PnzrC1tYWdnR3GjBmD5ORk3Lp1C82aNYO1tTWKFSv22Q7r2ykPhw8fRlBQkDSl5NixYwBSpx8MGTJEmv5RokQJTJw4Mc3rVCgUGDZsGFasWAFXV1eYmZnhjz/+AADcuXMH3bp1g4ODA8zMzODq6oqlS5eq7a9SqTBr1iyULVsWFhYWyJs3LypWrIhff/1Vet1jx44FABQvXjxNnUT0cQyzRLlQSkoKjhw5Ajc3Nzg7O2don8GDB+P7779HkyZNsHv3bsycORMHDhxArVq1EBYWlulaJkyYgEePHmHFihXYs2cPHBwcpMf69u0LIyMjbN68GYsWLcKFCxdQv359tY9hN27cCA8PD9jY2OCPP/7Atm3bYGdnh6ZNm2Y60AYHBwMAyp
QpI42FhITg5s2b8PDwgKWlZbr7ubu7w8HBAf7+/tI+N27c+OQ+GfE2+E+bNi1D24eEhKBHjx7o1q0b9u/fjyFDhgCANH3Cx8cHBw4cwKhRo7Bt2za0bt06zXMkJSWhTZs2aNSoEf766y/06dMHCxcuxNy5c6Vt4uLi0LhxYxw6dAhz5szBn3/+iYIFC8LLyyvN823evBlt27aFjY0NtmzZAh8fH7x69Qr169fHqVOn0mzv6emJSpUqYceOHejfvz8WLlyI0aNHo127dmjZsiV27tyJhg0b4vvvv093asdbhQoVwtmzZ1GlShWUKFECZ8+exdmzZ1G1alXEx8ejQYMGWL9+PcaMGYN9+/ahR48emDdvHjp06JDmuXbt2oXly5djypQpOHjwIOrUqYPAwEB89dVXuHHjBhYsWIC9e/eiZcuWGDFiBKZPny7tO2/ePEybNg1du3bFvn374Ovri759+0o/y/369cPw4cMBAH5+fmp1EtFnyN0aJiLdCw0NFQBEly5dMrR9UFCQACCGDBmiNn7+/HkBQPzwww/SmIuLi/D29k7zHPXq1RP16tWT7h89elQAEHXr1k2z7dtpBu3bt1cbP336tAAgZs2aJYRI/fjczs5OtG7dWm27lJQUUalSJVG9evVPvq63H4fPnTtXJCUlifj4eHH16lXh7u4uChUqpPZx77lz5wQAMX78+E8+Z40aNYSFhYVG+3zOsWPHhFKpFNOnT//stvXq1RMAxD///PPJ7VQqlUhKSpKmVFy7dk16zNvbWwAQ27ZtU9unRYsWomzZstL95cuXCwBppqP0799fbZpBSkqKcHJyEhUqVBApKSnSdtHR0cLBwUHUqlVLGps6daoAIBYsWKD2nJUrVxYAhJ+fnzSWlJQk7O3tRYcOHT7zXUn9vpQrV05tbMWKFem+zrlz5woA4tChQ9IYAGFraysiIiLUtm3atKkoUqSIiIyMVBsfNmyYMDc3l7Zv1aqVqFy58idr5DQDosxhZ5aIPuvo0aMAkGb6QPXq1eHq6pqlj/Q7duz40ce6d++udr9WrVpwcXGR6jlz5gwiIiLg7e2N5ORk6UulUqFZs2a4ePFihlYK+P7772FiYgJzc3NUrlwZN27cwJ49e1CsWDGNX48QIt3pBFlRr149JCcnY8qUKRnaPl++fGjYsGGa8fv376Nbt24oWLAglEolTExMUK9ePQBAUFCQ2rYKhSJNx7ZixYp4+PChdP/o0aOwtrZWm44CAN26dVO7f+vWLTx79gw9e/aEkdG7f3by5MmDjh074ty5c4iNjVXbp1WrVmr3XV1doVAo0Lx5c2nM2NgYpUqVUqtJE0eOHIGVlRU6deqkNv725/zDn+uGDRsiX7580v34+Hj8888/aN++PSwtLdV+Blu0aIH4+HicO3cOQOr/K9euXcOQIUNw8OBBREVFZapmIkrLWO4CiEj3ChQoAEtLS+nj9M8JDw8HkPqR7YecnJwyHSY+9pxvFSxYMN2xt/W8ndP6YRh5X0REBKysrD5Zw8iRI9GjRw8kJCTg3LlzmDRpEtq2bYtr164hf/78AICiRYsCwGe/Zw8fPpSmbmR0n+yW3vf0zZs3qFOnDszNzTFr1iyUKVMGlpaWePz4MTp06IC4uDi17S0tLWFubq42ZmZmhvj4eOl+eHg4HB0d0xzrw/ftcz8/KpUKr169UpuKYWdnp7adqalpujWZmppmOhiGh4ejYMGCaf74cHBwgLGxsVT3Wx/WHx4ejuTkZPz222/47bff0j3G2yk4EyZMgJWVFTZu3IgVK1ZAqVSibt26mDt3rjRHm4gyh2GWKBdSKpVo1KgR/v77bzx58gRFihT55PZvA11ISEiabZ89e4YCBQpI983NzdOcPAOk/qP+/nZvfaqLmd6JRKGhoShVqhQASM/322+/oWbNmuk+R3ph60NFihSRAkXt2rVRsGBB9OjRA1OnTsWSJUsApAaZcuXK4dChQ4iNjU13DuzZs2fx/P
lzdO7cWdqnQoUKn9xHG9L7nh45cgTPnj3DsWPHpG4sgCwtA5U/f35cuHAhzfiH79v7Pz8fevbsmbS+r67lz58f58+
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 800x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# on trace la courbe ROC\n",
|
|||
|
"\n",
|
|||
|
"# Prédictions sur l'ensemble de test\n",
|
|||
|
"y_pred_prob = clf.predict_proba(X_test)[:, 1]\n",
|
|||
|
"\n",
|
|||
|
"# Calcul des taux de faux positifs (FPR) et de vrais positifs (TPR)\n",
|
|||
|
"fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)\n",
|
|||
|
"\n",
|
|||
|
"# Calcul de l'aire sous la courbe ROC (AUC)\n",
|
|||
|
"roc_auc = auc(fpr, tpr)\n",
|
|||
|
"\n",
|
|||
|
"# Tracé de la courbe ROC\n",
|
|||
|
"plt.figure(figsize=(8, 6))\n",
|
|||
|
"plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'AUC = {roc_auc:.2f}')\n",
|
|||
|
"plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')\n",
|
|||
|
"plt.xlabel('Taux de faux positifs (FPR)')\n",
|
|||
|
"plt.ylabel('Taux de vrais positifs (TPR)')\n",
|
|||
|
"plt.title('Courbe ROC : random forest')\n",
|
|||
|
"plt.legend(loc='lower right')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 35,
|
|||
|
"id": "e20e9ac2-7232-4418-87f0-c7299a6d7de3",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAr4AAAIhCAYAAACot7njAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAABQI0lEQVR4nO3deXRUVb728acq80ASIRCmEECmAIoaQAGRUVS42N6rLY4MDSqODGoLToja0mpr29gCtoK0iorS6tXbiOAAIiBtGBQFAQkQpoBhSAKBTLXfP85bSSqpjFRSSc73s1YtUrv2OfWrHNAnO/vs7TDGGAEAAAANnNPfBQAAAAC1geALAAAAWyD4AgAAwBYIvgAAALAFgi8AAABsgeALAAAAWyD4AgAAwBYIvgAAALAFgi8AAABsgeALoNCPP/6ocePGqV27dgoNDVVkZKQuuugiPffcczp27Jhfalq4cKEcDoeSk5Nr9H327Nkjh8NR+HA6nWrSpImGDx+udevWefQt3s/hcCgqKkp9+/bVu+++W6X3XL16tUJCQrR3716vr1900UVyOBz6y1/+4vX1J554Qg6HQ+np6V5f7969uwYOHFiq/fDhw5o2bZrOO+88RUZGKjQ0VB07dtSkSZO0c+fOCut2XxP3IzAwUK1bt9a4ceN04MCBwn4rV6706BcQEKCmTZtq5MiRNX49y5KXl6eZM2eqbdu2CgkJUZcuXfTyyy9X6tjNmzdrxIgRatOmjcLCwtS4cWP16dNHb7/9tke/goICvfjii7ryyivVunVrhYeHKzExUdOmTdOJEyc8+u7YsUPBwcHauHGjrz4igHIE+rsAAHXDa6+9prvuukudO3fWgw8+qK5duyovL0/JycmaN2+e1q1bp48++sjfZda4e++9VzfddJMKCgr0888/a+bMmRo0aJDWrVunCy+8sLDfddddp/vvv1/GGO3evVvPPPOMbrrpJhljdNNNN1X4PsYYTZ48WbfddpsSEhJKvb5582Zt2rRJkjR//nw98MADPvl8//nPf/Rf//VfMsbonnvuUZ8+fRQcHKzt27fr7bffVu/evXX8+PFKneuNN95Qly5ddPr0aX3zzTeaNWuWVq1apS1btigiIqKw3zPPPKNBgwYpLy9PmzZt0syZMzVgwABt3rxZHTt29Mnnqqy77rpLb731lp566in16tVLn3/+uSZNmqSsrCw9/PDD5R574sQJxcfH68Ybb1SrVq106tQpLVq0SLfeeqv27NmjRx99VJJ0+vRpPfHEE7rxxhs1YcIExcbGauPGjXr66af16aefKjk5WWFhYZKkTp066eabb9aUKVO0atWqGv/8gO0ZALa3du1aExAQYK688kpz5syZUq/n5OSY//3f/63VmnJzc01eXp554403jCTz/fff1+j77d6920gyzz//vEf7l19+aSSZCRMmFLZJMnfffbdHvz179hhJ5rLLLqvU+y1dutRIMr/88ovX1++++24jyYwYMcJIMmvWrCnVZ8aMGUaS+e2337yeo1u3bmbAgAGFzzMyMkzz5s1NfHy82bdvn9djPvjggwprL+uaPPbYY0aSefvtt40xxnz99ddGUqlz/vOf/zSSzOOPP17he/nSTz/9ZBwOh3nmmWc82m+77TYTFhZmjh49Wq3zXnzxxSY+Pr7weX5+vklPTy/V74MPPjCSzFtvveXRnpycXOY1BuBbTHUAoGeeeUYOh0P/+Mc/FBISUur14OBgXX311YXPXS6XnnvuOXXp0kUhISFq1qyZRo8erf3793sc17ZtW40dO7bU+QYOHOjxK3j3r8Tfeust3X///WrVqpVCQkL066+/FvY5fvy4xo0bp8aNGysiIkIjR45USkpKqXN/8cUXGjJkiKKiohQeHq5+/frpyy+/rMZ3xXLJJZdIUpnTEdwSEhLUtGlTHT58uFLnnTt3rnr16qXOnTuXeu3MmTN65513lJSUpL/+9a+SpAULFl
Sx8tJee+01paWl6bnnnlPr1q299rnuuuuqff7Kfq969uwpSZX+XvnKxx9/LGOMxo0b59E+btw4nT59WsuWLavWeWNjYxUYWPQL1ICAADVp0qRUv969e0uS9u3b59GelJSkxMREzZs3r1rvD6DyCL6AzRUUFOirr75SUlKS4uPjK3XMnXfeqYceekiXX365PvnkEz311FNatmyZ+vbtW+Z808qYPn26UlNTNW/ePH366adq1qxZ4Wvjx4+X0+nUO++8o5deekn/+c9/NHDgQI85k2+//baGDRumqKgo/fOf/9T777+vxo0b64orrqh2+HWH76ZNm5bbLyMjQ8eOHVOnTp0qPGdubq6++OILDRo0yOvrH374oY4fP64//OEP6tixoy699FItXrxYJ0+erPoHKGb58uUKCAjQyJEjz+o8Zans92r37t2SVKnvlTFG+fn5lXpU5KefflLTpk3VvHlzj/bzzz+/8PXKcLlcys/P12+//aY5c+bo888/10MPPVThcV999ZUkqVu3bqVeGzhwoD777DMZYypVA4Bq8vOIMwA/S0tLM5LMDTfcUKn+27ZtM5LMXXfd5dG+fv16I8k8/PDDhW0JCQlmzJgxpc4xYMAAj1/Bu38l7m2agPvX6v/93//t0b5mzRojyTz99NPGGGNOnTplGjdubEaOHOnRr6CgwPTo0cP07t273M/lnurw7LPPmry8PHPmzBmzYcMG06tXLyPJ/Pvf/y7s6/78eXl5Jjc31+zYscNcffXVplGjRiY5Obnc9zGm6Hv13nvveX198ODBJjQ01Bw/ftzjezB//nyPflWd6tClSxfTvHnzCuuriLue7777zuTl5ZmsrCzzf//3f6Zp06amUaNGJi0tzRhTdF0XL15s8vLyTHZ2tlmzZo3p3Lmz6dq1a+HnK4/7HJV57N69u9xzXX755aZz585eXwsODja33357pT7/HXfcUfiewcHBZs6cORUes3//fhMXF2d69uxpCgoKSr3+2muvGUlm27ZtlaoBQPVwcxuAKvn6668lqdQUht69eysxMVFffvml/vSnP1Xr3Ndee22Zr918880ez/v27auEhAR9/fXXeuSRR7R27VodO3ZMY8aMKTX6d+WVV+q5557TqVOnPG668uahhx7yGL2Li4vTq6++quHDh3v0mzNnjubMmVP4PCgoSB999JGSkpIq/JwHDx6UJI8Rbbfdu3fr66+/1o033qiYmBhJ0u9//3vdd999WrBggf7whz9UeH5fMMaooKDAo634r/OloqkNbuedd57mzp2ruLg4j/ZRo0Z5PG/RooXWrl1b+PnKk5SUpO+//75SNbds2bLCPg6Ho1qvFffwww9rwoQJOnLkiD799FPdc889OnXqVJk3IB47dkzDhw+XMUaLFy+W01n6l63uvwsHDhxQly5dKlUHgKoj+AI2Fxsbq/Dw8MJfP1fk6NGjkqzwUlLLli0rnN9ZHm/ndCv562l3m7se93zR8uaoHjt2rMLgO2nSJN1yyy1yOp2KiYlRu3btvAai66+/Xg8++KDy8vK0ZcsWTZ8+XTfccIM2btxY4UoFp0+fliSFhoaWem3BggUyxui6667zmMZx9dVXa9GiRfrll18Kg5E7iJYMqG75+fkKCgoqfN6mTRvt3LmzUj8ArFq1qtRUjN27d6tt27aFz998800lJiYqMDBQcXFxZV6/Z599VoMHD1Z2draWL1+uWbNm6ZprrtH69eu9zikvLjIyUhdccEG5fdxKBvOSmjRpos2bN5dqP3XqlHJzc9W4ceNKvU+bNm3Upk0bSSr8gWj69OkaM2ZMqWkex48f1+WXX64DBw7oq6++Uvv27b2e0/13wf13A0DNIPgCNhcQEKAhQ4bos88+0/79+8u86cnNfdPOoUOHSvU9ePCgYmNjC5+HhoYqJyen1DnS09M9+rmVN+KWlpbmta1Dhw6SVHi+l19+udRIpFvJkUhvWrduXXjzVXmaNm1a2K9Pnz5KTE
zUgAEDNGXKFP3f//1fuce6ay25NrLL5dLChQslSf/zP//j9dgFCxboueeek1T0eQ4cOFDqsxljdOjQIY/PcsUVV2j
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 800x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Calcul des valeurs de précision et de rappel à différents seuils\n",
|
|||
|
"precision, recall, thresholds = precision_recall_curve(y_test, y_pred_prob)\n",
|
|||
|
"\n",
|
|||
|
"# Calcul de l'aire sous la courbe PR (AUC-PR)\n",
|
|||
|
"average_precision = average_precision_score(y_test, y_pred_prob)\n",
|
|||
|
"\n",
|
|||
|
"# Tracé de la courbe PR\n",
|
|||
|
"plt.figure(figsize=(8, 6))\n",
|
|||
|
"plt.step(recall, precision, color='b', alpha=0.2, where='post')\n",
|
|||
|
"plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')\n",
|
|||
|
"plt.xlabel('Rappel')\n",
|
|||
|
"plt.ylabel('Précision')\n",
|
|||
|
"plt.ylim([0.0, 1.05])\n",
|
|||
|
"plt.xlim([0.0, 1.0])\n",
|
|||
|
"plt.title(f'Courbe PR (AUC-PR = {average_precision:.2f})')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 36,
|
|||
|
"id": "0633df2d-686e-4f9d-823e-e54c23f983f8",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkAAAAHFCAYAAAAaD0bAAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAABg1klEQVR4nO3deVhUZf8G8HuYjX2XTRFBwQ1X3JfUBJdMMyvr1czKVrdIy7JNM8OyN7VNezUTc8lW+2kZuZOKpiK4Im6oaCAuyL4MM8/vD2BkBJXRmTkDc3+uay7mnPPMOd9nsJf7fc5zzpEJIQSIiIiIbIid1AUQERERWRoDEBEREdkcBiAiIiKyOQxAREREZHMYgIiIiMjmMAARERGRzWEAIiIiIpvDAEREREQ2hwGIiIiIbA4DEBHh0KFDeOaZZxAcHAx7e3s4OzujY8eOmDt3Lq5duyZJTbGxsZDJZNi/f78kx9++fTtkMhm2b99ukv2dPXsWMpkM//3vf2/brkmTJnj66acN1iUlJaFPnz5wc3ODTCbDggULsGHDBsycOdMktRHZIoXUBRCRtJYsWYLx48ejefPmeP3119GqVStoNBrs378fX3/9NXbv3o21a9dKXabNWLt2LVxdXQ3WPfvssygoKMCaNWvg4eGBJk2aYPbs2fjqq68YgojuEgMQkQ3bvXs3Xn75ZURFReG3336DWq3Wb4uKisLUqVMRFxdn0Zo0Gg1kMplFj2lNOnToUG3dkSNH8Pzzz2Pw4MESVERUP/EUGJENi4mJgUwmw+LFiw3CTyWVSoVhw4bpl3U6HebOnYsWLVpArVbDx8cHTz31FC5cuGDwuZpO4wBA37590bdvX/1y5WmmFStWYOrUqWjYsCHUajVOnTqlb5OdnY1nnnkGnp6ecHJywtChQ3HmzJlq+968eTP69+8PV1dXODo6omfPntiyZUutvofjx49j0KBBcHR0hLe3N1566SXk5eXV2PZejlMbVb+7ytOAZWVlWLRoEWQyGWQyGZ5++ml89dVXAKBfJ5PJcPbsWZPVQVTfMQAR2SitVoutW7ciIiICgYGBtfrMyy+/jDfeeANRUVFYt24dPvjgA8TFxaFHjx64cuXKXdcyffp0nD9/Hl9//TXWr18PHx8f/bZx48bBzs4Oq1evxoIFC7B371707dsX169f17dZuXIlBgwYAFdXVyxfvhw//vgjPD09MXDgwDuGk0uXLqFPnz44cuQIFi5ciBUrViA/Px8TJ06s1vZejnM3hgwZgt27dwMAHn30UezevRu7d+/Gu+++i0cffRQA9Ot2794Nf39/k9dAVG8JIrJJmZmZAoB44oknatU+JSVFABDjx483WP/PP/8IAOKtt97SrwsKChJjx46tto8+ffqIPn366Je3bdsmAIj77ruvWttly5YJAOLhhx82WL9r1y4BQMyePVsIIURBQYHw9PQUQ4cONWin1WpFu3btRJcuXW7brzfeeEPIZDKRnJxssD4qKkoAENu2bTPJcdLS0gQA8cknn9y2XU3fHQAxYcIEg3UTJkwQ/J9worvHESAiqpVt27YBQLVTW126dEHLli3vaQTkkUceueW20aNHGyz36NEDQUFB+noSEhJw7do1jB07FmVlZfqXTqfDoEGDsG/fPhQUFNxy/9u2bUPr1q3Rrl07g/WjRo0yWL7X4xCRdeEkaCIb5e3tDUdHR6SlpdWq/dWrVwGgxtMsAQEBOHfu3F3XcrtTN35+fjWuq6zn0qVLAKA/JVSTa9euwcnJqcZtV69eRXBw8B2Pe6/HISLrwgBEZKPkcjn69++PP//8ExcuXECjRo1u297LywsAkJGRUa3tv//+C29vb/2yvb09SkpKqu3jypUrBu0q3e6qr8zMzBrXNWvWDAD0+/viiy/QrVu3Gvfh6+t7y/17eXnd8hhV3etxiMi68BQYkQ2bPn06hBB4/vnnUVpaWm27RqPB+vXrAQD3338/gPKJwF
Xt27cPKSkp6N+/v35dkyZNcOjQIYN2J06cQGpqqtE1rlq1ymA5ISEB586d019N1rNnT7i7u+PYsWPo1KlTjS+VSnXL/ffr1w9Hjx7FwYMHDdavXr3aYPlej2NqlVftFRUVWeyYRPUJR4CIbFj37t2xaNEijB8/HhEREXj55ZfRunVraDQaJCUlYfHixQgPD8fQoUPRvHlzvPDCC/jiiy9gZ2eHwYMH4+zZs3j33XcRGBiIV199Vb/fMWPG4Mknn8T48ePxyCOP4Ny5c5g7dy4aNGhgdI379+/Hc889h8ceewzp6el4++230bBhQ4wfPx4A4OzsjC+++AJjx47FtWvX8Oijj8LHxweXL1/GwYMHcfnyZSxatOiW+4+Ojsa3336LIUOGYPbs2fD19cWqVatw/Phxg3b3epxKhw8fxs8//1xtfefOnREUFFTr76VNmzYAgI8//hiDBw+GXC5H27ZtLRrCiOo0qWdhE5H0kpOTxdixY0Xjxo2FSqUSTk5OokOHDuK9994TWVlZ+nZarVZ8/PHHIiwsTCiVSuHt7S2efPJJkZ6ebrA/nU4n5s6dK0JCQoS9vb3o1KmT2Lp16y2vAvvpp5+q1VR5FdjGjRvFmDFjhLu7u3BwcBAPPPCAOHnyZLX28fHxYsiQIcLT01MolUrRsGFDMWTIkBr3fbNjx46JqKgoYW9vLzw9PcW4cePE//3f/xlcBXavx6m8CuxWr2XLlgkhan8VWElJiXjuuedEgwYNhEwmEwBEWlraHftKROVkQgghTfQiIiIikgbnABEREZHNYQAiIiIim8MARERERDaHAYiIiIhsDgMQERER2RzJA9DFixfx5JNPwsvLC46Ojmjfvj0SExP124UQmDlzJgICAuDg4IC+ffvi6NGjBvsoKSnBpEmT4O3tDScnJwwbNgwXLlywdFeIiIiojpD0RojZ2dno2bMn+vXrhz///BM+Pj44ffo03N3d9W3mzp2LefPmITY2FmFhYZg9ezaioqKQmpoKFxcXAOU3Mlu/fj3WrFkDLy8vTJ06FQ8++CASExMhl8vvWIdOp8O///4LFxeX296Sn4iIiKyHEAJ5eXkICAiAnZ2RYzpS3oTojTfeEL169brldp1OJ/z8/MRHH32kX1dcXCzc3NzE119/LYQQ4vr160KpVIo1a9bo21y8eFHY2dmJuLi4WtWRnp5+2xuU8cUXX3zxxRdf1vu6+WastSHpCNC6deswcOBAPPbYY4iPj9ff3v75558HAKSlpSEzMxMDBgzQf0atVqNPnz5ISEjAiy++iMTERGg0GoM2AQEBCA8PR0JCAgYOHFjtuCUlJQYPahQV94JMT0+Hq6urubpLREREJpSbm4vAwED9GSFjSBqAzpw5g0WLFmHKlCl46623sHfvXkyePBlqtRpPPfWU/mnMNz9h2dfXF+fOnQNQ/sRmlUoFDw+Pam1qesIzAMyZMwfvv/9+tfWurq4MQERERHXM3UxfkXQStE6nQ8eOHRETE4MOHTrgxRdfxPPPP1/tgYI3d0wIccfO3q7N9OnTkZOTo3+lp6ffW0eIiIioTpE0APn7+6NVq1YG61q2bInz588DAPz8/ACg2khOVlaWflTIz88PpaWlyM7OvmWbm6nVav1oD0d9iIiIbI+kAahnz55ITU01WHfixAkEBQUBAIKDg+Hn54dNmzbpt5eWliI+Ph49evQAAERERECpVBq0ycjIwJEjR/RtiIiIiKqSdA7Qq6++ih49eiAmJgYjR47E3r17sXjxYixevBhA+amv6OhoxMTEIDQ0FKGhoYiJiYGjoyNGjRoFAHBzc8O4ceMwdepUeHl5wdPTE6+99hratGmDyMhIKbtHRERVaLVaaDQaqcugOkQul0OhUJjlFjWSBqDOnTtj7dq1mD59OmbNmoXg4GAsWLAAo0eP1reZNm0aioqKMH78eGRnZ6Nr167YuHGjwYzv+fPnQ6FQYOTIkSgqKkL//v0RGxtbq3sAERGR+e
Xn5+PChQv6q26JasvR0RH+/v5QqVQm3a9M8F8jcnNz4ebmhpycHM4HIiIyMa1Wi5MnT8LR0RENGjTgDWepVoQQKC0
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 640x480 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# utilisation d'une métrique plus adaptée aux modèles de marketing : courbe de lift\n",
|
|||
|
"\n",
|
|||
|
"# Tri des prédictions de probabilités et des vraies valeurs\n",
|
|||
|
"sorted_indices = np.argsort(y_pred_prob)[::-1]\n",
|
|||
|
"y_pred_prob_sorted = y_pred_prob[sorted_indices]\n",
|
|||
|
"y_test_sorted = y_test.iloc[sorted_indices]\n",
|
|||
|
"\n",
|
|||
|
"# Calcul du gain cumulatif\n",
|
|||
|
"cumulative_gain = np.cumsum(y_test_sorted) / np.sum(y_test_sorted)\n",
|
|||
|
"\n",
|
|||
|
"# Tracé de la courbe de lift\n",
|
|||
|
"plt.plot(np.linspace(0, 1, len(cumulative_gain))[:10000], (cumulative_gain/np.linspace(0, 1, len(cumulative_gain)))[:10000], label='Courbe de lift')\n",
|
|||
|
"plt.xlabel('Pourcentage des données')\n",
|
|||
|
"plt.ylabel('Gain cumulatif')\n",
|
|||
|
"plt.title('Courbe de Lift')\n",
|
|||
|
"plt.legend()\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 37,
|
|||
|
"id": "49dc4e25-a79e-44d7-a577-524468336b96",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"52512 0.000000\n",
|
|||
|
"87081 0.000000\n",
|
|||
|
"2695 0.000000\n",
|
|||
|
"51486 0.006211\n",
|
|||
|
"15 0.012422\n",
|
|||
|
" ... \n",
|
|||
|
"86959 1.000000\n",
|
|||
|
"86960 1.000000\n",
|
|||
|
"86961 1.000000\n",
|
|||
|
"86962 1.000000\n",
|
|||
|
"65836 1.000000\n",
|
|||
|
"Name: y_has_purchased, Length: 128198, dtype: float64"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 37,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"cumulative_gain"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"id": "5fde953b-4cce-4879-bb5e-1852511e7054",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Sauvegarde des résultats (à reprendre))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 40,
|
|||
|
"id": "7ac941bf-7994-4baf-8d9f-13b93eed73a9",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# sauvegarde\n",
|
|||
|
"\n",
|
|||
|
"with open('test_logit.pkl', 'wb') as file:\n",
|
|||
|
" pickle.dump(clf4, file)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 41,
|
|||
|
"id": "3ac3def3-00f2-4b31-b6f7-2cae5038b766",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# pour charger les paramètres \n",
|
|||
|
"\n",
|
|||
|
"# Chargement du modèle à partir du fichier\n",
|
|||
|
"with open('test_logit.pkl', 'rb') as file:\n",
|
|||
|
" loaded_logit = pickle.load(file)"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "Python 3 (ipykernel)",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.11.6"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 5
|
|||
|
}
|