BDC-team-1/Sport/Modelization/2_Modelization_sport.ipynb

904 lines
39 KiB
Plaintext
Raw Normal View History

2024-03-08 14:48:38 +01:00
{
"cells": [
{
"cell_type": "markdown",
"id": "3415114e-9577-4487-89eb-4931620ad9f0",
"metadata": {},
"source": [
"# Predict Sales"
]
},
{
"cell_type": "code",
2024-03-10 12:30:57 +01:00
"execution_count": 201,
2024-03-08 14:48:38 +01:00
"id": "f271eb45-1470-4764-8c2e-31374efa1fe5",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import os\n",
"import s3fs\n",
"import re\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.ensemble import RandomForestClassifier\n",
2024-03-10 12:30:57 +01:00
"from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score\n",
2024-03-10 11:09:53 +01:00
"from sklearn.utils import class_weight\n",
2024-03-08 14:48:38 +01:00
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler\n",
"from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score\n",
2024-03-10 11:09:53 +01:00
"from sklearn.exceptions import ConvergenceWarning, DataConversionWarning\n",
"\n",
2024-03-08 14:48:38 +01:00
"import pickle\n",
2024-03-10 11:09:53 +01:00
"import warnings\n",
2024-03-08 14:48:38 +01:00
"#import scikitplot as skplt"
]
},
2024-03-10 11:09:53 +01:00
{
"cell_type": "code",
2024-03-10 12:30:57 +01:00
"execution_count": 202,
2024-03-10 11:09:53 +01:00
"id": "3fecb606-22e5-4dee-8efa-f8dff0832299",
"metadata": {},
"outputs": [],
"source": [
"# Silence warnings for the rest of the notebook. The first filter ignores every warning\n",
"# category, so the two targeted filters below only matter if the blanket one is removed.\n",
"warnings.simplefilter(\"ignore\")\n",
"warnings.simplefilter(\"ignore\", category=ConvergenceWarning)\n",
"warnings.simplefilter(\"ignore\", category=DataConversionWarning)"
]
},
2024-03-08 14:48:38 +01:00
{
"cell_type": "markdown",
"id": "ae591854-3003-4c75-a0c7-5abf04246e81",
"metadata": {},
"source": [
"### Load Data"
]
},
{
"cell_type": "code",
2024-03-10 12:30:57 +01:00
"execution_count": 203,
2024-03-08 14:48:38 +01:00
"id": "59dd4694-a812-4923-b995-a2ee86c74f85",
"metadata": {},
"outputs": [],
"source": [
"# Build the S3 filesystem handle; the endpoint host is read from the AWS_S3_ENDPOINT\n",
"# environment variable (a KeyError is raised if it is not set)\n",
"S3_ENDPOINT_URL = f\"https://{os.environ['AWS_S3_ENDPOINT']}\"\n",
"fs = s3fs.S3FileSystem(\n",
"    client_kwargs={\"endpoint_url\": S3_ENDPOINT_URL},\n",
")"
]
},
{
"cell_type": "code",
2024-03-10 12:30:57 +01:00
"execution_count": 204,
2024-03-08 14:48:38 +01:00
"id": "017f7e9a-3ba0-40fa-bdc8-51b98cc1fdb3",
"metadata": {},
"outputs": [],
"source": [
"def load_train_test():\n",
"    \"\"\"Read the sport train and test CSV files from S3 and return them as DataFrames.\n",
"\n",
"    Relies on the module-level ``fs`` filesystem object. Missing values in the\n",
"    ``y_has_purchased`` column are replaced by 0 in both datasets.\n",
"\n",
"    Returns:\n",
"        tuple: ``(dataset_train, dataset_test)``.\n",
"    \"\"\"\n",
"    BUCKET = \"projet-bdc2324-team1/Generalization/sport\"\n",
"\n",
"    def read_labelled_csv(path):\n",
"        # Load one CSV from the bucket, then fill the missing targets with 0\n",
"        with fs.open(path, mode=\"rb\") as file_in:\n",
"            frame = pd.read_csv(file_in, sep=\",\")\n",
"        frame['y_has_purchased'] = frame['y_has_purchased'].fillna(0)\n",
"        return frame\n",
"\n",
"    dataset_train = read_labelled_csv(BUCKET + \"/Train_set/\" + \"dataset_train5.csv\")\n",
"    dataset_test = read_labelled_csv(BUCKET + \"/Test_set/\" + \"dataset_test5.csv\")\n",
"    return dataset_train, dataset_test"
]
},
{
"cell_type": "code",
2024-03-10 12:30:57 +01:00
"execution_count": 205,
2024-03-08 14:48:38 +01:00
"id": "825d14a3-6967-4733-bfd4-64bf61c2bd43",
"metadata": {},
"outputs": [],
"source": [
"def features_target_split(dataset_train, dataset_test):\n",
"    \"\"\"Split the train and test datasets into feature matrices and targets.\n",
"\n",
"    Args:\n",
"        dataset_train: DataFrame holding the feature columns and ``y_has_purchased``.\n",
"        dataset_test: DataFrame with the same columns, used for evaluation.\n",
"\n",
"    Returns:\n",
"        tuple: ``(X_train, X_test, y_train, y_test)``; both targets are\n",
"        single-column DataFrames holding ``y_has_purchased``.\n",
"    \"\"\"\n",
"    target_columns = ['y_has_purchased']\n",
"    features_l = [\n",
"        'nb_tickets', 'nb_purchases', 'total_amount',\n",
"        'nb_suppliers', 'nb_tickets_internet',\n",
"        'opt_in',\n",
"        'nb_campaigns', 'nb_campaigns_opened',\n",
"    ]\n",
"\n",
"    X_train, y_train = dataset_train[features_l], dataset_train[target_columns]\n",
"    X_test, y_test = dataset_test[features_l], dataset_test[target_columns]\n",
"    return X_train, X_test, y_train, y_test"
]
},
2024-03-10 11:09:53 +01:00
{
"cell_type": "code",
2024-03-10 12:30:57 +01:00
"execution_count": 206,
2024-03-10 11:09:53 +01:00
"id": "c479b230-b4bd-4cfb-b76b-d9faf6d95772",
"metadata": {},
"outputs": [],
"source": [
"dataset_train, dataset_test = load_train_test()"
]
},
{
"cell_type": "code",
2024-03-10 12:30:57 +01:00
"execution_count": 207,
2024-03-10 11:09:53 +01:00
"id": "69eaec12-b30f-4d30-a461-ea520d5cbf77",
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)"
]
},
2024-03-10 12:30:57 +01:00
{
"cell_type": "code",
"execution_count": 208,
"id": "d039f31d-0093-46c6-9743-ddec1381f758",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Shape train : (330117, 8)\n",
"Shape test : (141480, 8)\n"
]
}
],
"source": [
"print(\"Shape train : \", X_train.shape)\n",
"print(\"Shape test : \", X_test.shape)"
]
},
2024-03-08 14:48:38 +01:00
{
"cell_type": "markdown",
"id": "a1d6de94-4e11-481a-a0ce-412bf29f692c",
"metadata": {},
"source": [
"### Prepare preprocessing and Hyperparameters"
]
},
2024-03-10 11:09:53 +01:00
{
"cell_type": "code",
2024-03-10 12:30:57 +01:00
"execution_count": 209,
2024-03-10 11:09:53 +01:00
"id": "b808da43-c444-4e94-995a-7ec6ccd01e2d",
"metadata": {},
2024-03-10 12:30:57 +01:00
"outputs": [
{
"data": {
"text/plain": [
"{0.0: 0.5381774965030861, 1.0: 7.048360235716116}"
]
},
"execution_count": 209,
"metadata": {},
"output_type": "execute_result"
}
],
2024-03-10 11:09:53 +01:00
"source": [
"# 'balanced' class weights computed from the training target\n",
"target_train = y_train['y_has_purchased']\n",
"target_classes = np.unique(target_train)\n",
"weights = class_weight.compute_class_weight(class_weight='balanced', classes=target_classes,\n",
"                                            y=target_train)\n",
"\n",
"# Map each class label to its weight and display the mapping\n",
"weight_dict = dict(zip(target_classes, weights))\n",
"weight_dict"
]
},
2024-03-08 14:48:38 +01:00
{
"cell_type": "code",
2024-03-10 12:30:57 +01:00
"execution_count": 210,
2024-03-08 14:48:38 +01:00
"id": "b32a79ea-907f-4dfc-9832-6c74bef3200c",
"metadata": {},
"outputs": [],
"source": [
"# Columns handled by each branch of the preprocessor\n",
"numeric_features = [\n",
"    'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',\n",
"    'nb_tickets_internet', 'nb_campaigns', 'nb_campaigns_opened',\n",
"]\n",
"categorical_features = ['opt_in']\n",
"\n",
"# Numeric branch: standardisation only (no imputation step is applied)\n",
"numeric_transformer = Pipeline(steps=[(\"scaler\", StandardScaler())])\n",
"\n",
"# Categorical branch: dense one-hot encoding only (no imputation step is applied)\n",
"categorical_transformer = Pipeline(steps=[\n",
"    (\"onehot\", OneHotEncoder(handle_unknown='ignore', sparse_output=False)),\n",
"])\n",
"\n",
"# Route each group of columns to its transformer\n",
"preproc = ColumnTransformer(\n",
"    transformers=[\n",
"        (\"num\", numeric_transformer, numeric_features),\n",
"        (\"cat\", categorical_transformer, categorical_features),\n",
"    ]\n",
")"
2024-03-08 14:48:38 +01:00
]
},
{
"cell_type": "code",
2024-03-10 12:30:57 +01:00
"execution_count": 211,
2024-03-08 14:48:38 +01:00
"id": "9809a688-bfbc-4685-a77f-17a8b2b79ab3",
"metadata": {},
"outputs": [],
"source": [
"# Scorers used for model selection\n",
"\n",
"balanced_scorer = make_scorer(balanced_accuracy_score)\n",
"recall_scorer = make_scorer(recall_score)\n",
"# f1_scorer is the scorer handed to GridSearchCV in the cross-validation section;\n",
"# it was referenced there without ever being defined (NameError on a fresh kernel)\n",
"f1_scorer = make_scorer(f1_score)\n"
2024-03-08 14:48:38 +01:00
]
},
{
"cell_type": "code",
2024-03-10 12:30:57 +01:00
"execution_count": 212,
2024-03-08 14:48:38 +01:00
"id": "206d9a95-7c37-4506-949b-e77d225e42c5",
"metadata": {},
"outputs": [],
"source": [
"# Hyperparameter grid for the 'logreg' step of the pipeline\n",
"\n",
"param_grid = {\n",
"    # Inverse regularisation strength: 17 log-spaced values from 2**-10 to 2**6\n",
"    'logreg__C': np.logspace(-10, 6, 17, base=2),\n",
"    'logreg__penalty': ['l1', 'l2'],\n",
"    # weight_dict is itself computed with class_weight='balanced', so listing it next to\n",
"    # 'balanced' only duplicated every candidate and doubled the number of fits\n",
"    'logreg__class_weight': ['balanced'],\n",
"}"
]
},
{
"cell_type": "code",
2024-03-10 12:30:57 +01:00
"execution_count": 213,
2024-03-10 11:09:53 +01:00
"id": "7ff2f7bd-efc1-4f7c-a3c9-caa916aa2f2b",
"metadata": {},
2024-03-10 12:30:57 +01:00
"outputs": [
{
"data": {
"text/html": [
"<style>#sk-container-id-27 {\n",
" /* Definition of color scheme common for light and dark mode */\n",
" --sklearn-color-text: black;\n",
" --sklearn-color-line: gray;\n",
" /* Definition of color scheme for unfitted estimators */\n",
" --sklearn-color-unfitted-level-0: #fff5e6;\n",
" --sklearn-color-unfitted-level-1: #f6e4d2;\n",
" --sklearn-color-unfitted-level-2: #ffe0b3;\n",
" --sklearn-color-unfitted-level-3: chocolate;\n",
" /* Definition of color scheme for fitted estimators */\n",
" --sklearn-color-fitted-level-0: #f0f8ff;\n",
" --sklearn-color-fitted-level-1: #d4ebff;\n",
" --sklearn-color-fitted-level-2: #b3dbfd;\n",
" --sklearn-color-fitted-level-3: cornflowerblue;\n",
"\n",
" /* Specific color for light theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-icon: #696969;\n",
"\n",
" @media (prefers-color-scheme: dark) {\n",
" /* Redefinition of color scheme for dark theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-icon: #878787;\n",
" }\n",
"}\n",
"\n",
"#sk-container-id-27 {\n",
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"#sk-container-id-27 pre {\n",
" padding: 0;\n",
"}\n",
"\n",
"#sk-container-id-27 input.sk-hidden--visually {\n",
" border: 0;\n",
" clip: rect(1px 1px 1px 1px);\n",
" clip: rect(1px, 1px, 1px, 1px);\n",
" height: 1px;\n",
" margin: -1px;\n",
" overflow: hidden;\n",
" padding: 0;\n",
" position: absolute;\n",
" width: 1px;\n",
"}\n",
"\n",
"#sk-container-id-27 div.sk-dashed-wrapped {\n",
" border: 1px dashed var(--sklearn-color-line);\n",
" margin: 0 0.4em 0.5em 0.4em;\n",
" box-sizing: border-box;\n",
" padding-bottom: 0.4em;\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"#sk-container-id-27 div.sk-container {\n",
" /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
" but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
" so we also need the `!important` here to be able to override the\n",
" default hidden behavior on the sphinx rendered scikit-learn.org.\n",
" See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
" display: inline-block !important;\n",
" position: relative;\n",
"}\n",
"\n",
"#sk-container-id-27 div.sk-text-repr-fallback {\n",
" display: none;\n",
"}\n",
"\n",
"div.sk-parallel-item,\n",
"div.sk-serial,\n",
"div.sk-item {\n",
" /* draw centered vertical line to link estimators */\n",
" background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
" background-size: 2px 100%;\n",
" background-repeat: no-repeat;\n",
" background-position: center center;\n",
"}\n",
"\n",
"/* Parallel-specific style estimator block */\n",
"\n",
"#sk-container-id-27 div.sk-parallel-item::after {\n",
" content: \"\";\n",
" width: 100%;\n",
" border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
" flex-grow: 1;\n",
"}\n",
"\n",
"#sk-container-id-27 div.sk-parallel {\n",
" display: flex;\n",
" align-items: stretch;\n",
" justify-content: center;\n",
" background-color: var(--sklearn-color-background);\n",
" position: relative;\n",
"}\n",
"\n",
"#sk-container-id-27 div.sk-parallel-item {\n",
" display: flex;\n",
" flex-direction: column;\n",
"}\n",
"\n",
"#sk-container-id-27 div.sk-parallel-item:first-child::after {\n",
" align-self: flex-end;\n",
" width: 50%;\n",
"}\n",
"\n",
"#sk-container-id-27 div.sk-parallel-item:last-child::after {\n",
" align-self: flex-start;\n",
" width: 50%;\n",
"}\n",
"\n",
"#sk-container-id-27 div.sk-parallel-item:only-child::after {\n",
" width: 0;\n",
"}\n",
"\n",
"/* Serial-specific style estimator block */\n",
"\n",
"#sk-container-id-27 div.sk-serial {\n",
" display: flex;\n",
" flex-direction: column;\n",
" align-items: center;\n",
" background-color: var(--sklearn-color-background);\n",
" padding-right: 1em;\n",
" padding-left: 1em;\n",
"}\n",
"\n",
"\n",
"/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
"clickable and can be expanded/collapsed.\n",
"- Pipeline and ColumnTransformer use this feature and define the default style\n",
"- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
"*/\n",
"\n",
"/* Pipeline and ColumnTransformer style (default) */\n",
"\n",
"#sk-container-id-27 div.sk-toggleable {\n",
" /* Default theme specific background. It is overwritten whether we have a\n",
" specific estimator or a Pipeline/ColumnTransformer */\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"/* Toggleable label */\n",
"#sk-container-id-27 label.sk-toggleable__label {\n",
" cursor: pointer;\n",
" display: block;\n",
" width: 100%;\n",
" margin-bottom: 0;\n",
" padding: 0.5em;\n",
" box-sizing: border-box;\n",
" text-align: center;\n",
"}\n",
"\n",
"#sk-container-id-27 label.sk-toggleable__label-arrow:before {\n",
" /* Arrow on the left of the label */\n",
" content: \"▸\";\n",
" float: left;\n",
" margin-right: 0.25em;\n",
" color: var(--sklearn-color-icon);\n",
"}\n",
"\n",
"#sk-container-id-27 label.sk-toggleable__label-arrow:hover:before {\n",
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"/* Toggleable content - dropdown */\n",
"\n",
"#sk-container-id-27 div.sk-toggleable__content {\n",
" max-height: 0;\n",
" max-width: 0;\n",
" overflow: hidden;\n",
" text-align: left;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-27 div.sk-toggleable__content.fitted {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-27 div.sk-toggleable__content pre {\n",
" margin: 0.2em;\n",
" border-radius: 0.25em;\n",
" color: var(--sklearn-color-text);\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-27 div.sk-toggleable__content.fitted pre {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-27 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
" /* Expand drop-down */\n",
" max-height: 200px;\n",
" max-width: 100%;\n",
" overflow: auto;\n",
"}\n",
"\n",
"#sk-container-id-27 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
" content: \"▾\";\n",
"}\n",
"\n",
"/* Pipeline/ColumnTransformer-specific style */\n",
"\n",
"#sk-container-id-27 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-27 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator-specific style */\n",
"\n",
"/* Colorize estimator box */\n",
"#sk-container-id-27 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-27 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-27 div.sk-label label.sk-toggleable__label,\n",
"#sk-container-id-27 div.sk-label label {\n",
" /* The background is the default theme color */\n",
" color: var(--sklearn-color-text-on-default-background);\n",
"}\n",
"\n",
"/* On hover, darken the color of the background */\n",
"#sk-container-id-27 div.sk-label:hover label.sk-toggleable__label {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"/* Label box, darken color on hover, fitted */\n",
"#sk-container-id-27 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator label */\n",
"\n",
"#sk-container-id-27 div.sk-label label {\n",
" font-family: monospace;\n",
" font-weight: bold;\n",
" display: inline-block;\n",
" line-height: 1.2em;\n",
"}\n",
"\n",
"#sk-container-id-27 div.sk-label-container {\n",
" text-align: center;\n",
"}\n",
"\n",
"/* Estimator-specific */\n",
"#sk-container-id-27 div.sk-estimator {\n",
" font-family: monospace;\n",
" border: 1px dotted var(--sklearn-color-border-box);\n",
" border-radius: 0.25em;\n",
" box-sizing: border-box;\n",
" margin-bottom: 0.5em;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-27 div.sk-estimator.fitted {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"/* on hover */\n",
"#sk-container-id-27 div.sk-estimator:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-27 div.sk-estimator.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
"\n",
"/* Common style for \"i\" and \"?\" */\n",
"\n",
".sk-estimator-doc-link,\n",
"a:link.sk-estimator-doc-link,\n",
"a:visited.sk-estimator-doc-link {\n",
" float: right;\n",
" font-size: smaller;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1em;\n",
" height: 1em;\n",
" width: 1em;\n",
" text-decoration: none !important;\n",
" margin-left: 1ex;\n",
" /* unfitted */\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted,\n",
"a:link.sk-estimator-doc-link.fitted,\n",
"a:visited.sk-estimator-doc-link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"/* Span, style for the box shown on hovering the info icon */\n",
".sk-estimator-doc-link span {\n",
" display: none;\n",
" z-index: 9999;\n",
" position: relative;\n",
" font-weight: normal;\n",
" right: .2ex;\n",
" padding: .5ex;\n",
" margin: .5ex;\n",
" width: min-content;\n",
" min-width: 20ex;\n",
" max-width: 50ex;\n",
" color: var(--sklearn-color-text);\n",
" box-shadow: 2pt 2pt 4pt #999;\n",
" /* unfitted */\n",
" background: var(--sklearn-color-unfitted-level-0);\n",
" border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted span {\n",
" /* fitted */\n",
" background: var(--sklearn-color-fitted-level-0);\n",
" border: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link:hover span {\n",
" display: block;\n",
"}\n",
"\n",
"/* \"?\"-specific style due to the `<a>` HTML tag */\n",
"\n",
"#sk-container-id-27 a.estimator_doc_link {\n",
" float: right;\n",
" font-size: 1rem;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1rem;\n",
" height: 1rem;\n",
" width: 1rem;\n",
" text-decoration: none;\n",
" /* unfitted */\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
"}\n",
"\n",
"#sk-container-id-27 a.estimator_doc_link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"#sk-container-id-27 a.estimator_doc_link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"#sk-container-id-27 a.estimator_doc_link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"</style><div id=\"sk-container-id-27\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>Pipeline(steps=[(&#x27;preprocessor&#x27;,\n",
" ColumnTransformer(transformers=[(&#x27;num&#x27;,\n",
" Pipeline(steps=[(&#x27;scaler&#x27;,\n",
" StandardScaler())]),\n",
" [&#x27;nb_tickets&#x27;, &#x27;nb_purchases&#x27;,\n",
" &#x27;total_amount&#x27;,\n",
" &#x27;nb_suppliers&#x27;,\n",
" &#x27;nb_tickets_internet&#x27;,\n",
" &#x27;nb_campaigns&#x27;,\n",
" &#x27;nb_campaigns_opened&#x27;]),\n",
" (&#x27;cat&#x27;,\n",
" Pipeline(steps=[(&#x27;onehot&#x27;,\n",
" OneHotEncoder(handle_unknown=&#x27;ignore&#x27;,\n",
" sparse_output=False))]),\n",
" [&#x27;opt_in&#x27;])])),\n",
" (&#x27;logreg&#x27;,\n",
" LogisticRegression(class_weight={0.0: 0.5381774965030861,\n",
" 1.0: 7.048360235716116},\n",
" max_iter=5000, solver=&#x27;saga&#x27;))])</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-184\" type=\"checkbox\" ><label for=\"sk-estimator-id-184\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">&nbsp;&nbsp;Pipeline<a class=\"sk-estimator-doc-link \" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.pipeline.Pipeline.html\">?<span>Documentation for Pipeline</span></a><span class=\"sk-estimator-doc-link \">i<span>Not fitted</span></span></label><div class=\"sk-toggleable__content \"><pre>Pipeline(steps=[(&#x27;preprocessor&#x27;,\n",
" ColumnTransformer(transformers=[(&#x27;num&#x27;,\n",
" Pipeline(steps=[(&#x27;scaler&#x27;,\n",
" StandardScaler())]),\n",
" [&#x27;nb_tickets&#x27;, &#x27;nb_purchases&#x27;,\n",
" &#x27;total_amount&#x27;,\n",
" &#x27;nb_suppliers&#x27;,\n",
" &#x27;nb_tickets_internet&#x27;,\n",
" &#x27;nb_campaigns&#x27;,\n",
" &#x27;nb_campaigns_opened&#x27;]),\n",
" (&#x27;cat&#x27;,\n",
" Pipeline(steps=[(&#x27;onehot&#x27;,\n",
" OneHotEncoder(handle_unknown=&#x27;ignore&#x27;,\n",
" sparse_output=False))]),\n",
" [&#x27;opt_in&#x27;])])),\n",
" (&#x27;logreg&#x27;,\n",
" LogisticRegression(class_weight={0.0: 0.5381774965030861,\n",
" 1.0: 7.048360235716116},\n",
" max_iter=5000, solver=&#x27;saga&#x27;))])</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-185\" type=\"checkbox\" ><label for=\"sk-estimator-id-185\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">&nbsp;preprocessor: ColumnTransformer<a class=\"sk-estimator-doc-link \" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.compose.ColumnTransformer.html\">?<span>Documentation for preprocessor: ColumnTransformer</span></a></label><div class=\"sk-toggleable__content \"><pre>ColumnTransformer(transformers=[(&#x27;num&#x27;,\n",
" Pipeline(steps=[(&#x27;scaler&#x27;, StandardScaler())]),\n",
" [&#x27;nb_tickets&#x27;, &#x27;nb_purchases&#x27;, &#x27;total_amount&#x27;,\n",
" &#x27;nb_suppliers&#x27;, &#x27;nb_tickets_internet&#x27;,\n",
" &#x27;nb_campaigns&#x27;, &#x27;nb_campaigns_opened&#x27;]),\n",
" (&#x27;cat&#x27;,\n",
" Pipeline(steps=[(&#x27;onehot&#x27;,\n",
" OneHotEncoder(handle_unknown=&#x27;ignore&#x27;,\n",
" sparse_output=False))]),\n",
" [&#x27;opt_in&#x27;])])</pre></div> </div></div><div class=\"sk-parallel\"><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-186\" type=\"checkbox\" ><label for=\"sk-estimator-id-186\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">num</label><div class=\"sk-toggleable__content \"><pre>[&#x27;nb_tickets&#x27;, &#x27;nb_purchases&#x27;, &#x27;total_amount&#x27;, &#x27;nb_suppliers&#x27;, &#x27;nb_tickets_internet&#x27;, &#x27;nb_campaigns&#x27;, &#x27;nb_campaigns_opened&#x27;]</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-187\" type=\"checkbox\" ><label for=\"sk-estimator-id-187\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">&nbsp;StandardScaler<a class=\"sk-estimator-doc-link \" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.preprocessing.StandardScaler.html\">?<span>Documentation for StandardScaler</span></a></label><div class=\"sk-toggleable__content \"><pre>StandardScaler()</pre></div> </div></div></div></div></div></div></div><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-188\" type=\"checkbox\" ><label for=\"sk-estimator-id-188\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">cat</label><div class=\"sk-toggleable__content \"><pre>[&#x27;opt_in&#x27;]</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control 
sk-hidden--visually\" id=\"sk-estimator-id-189\" type=\"checkbox\" ><label for=\"sk-estimator-id-189\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">&nbsp;OneHotEncoder<a class=\"sk-estimator-doc-link \" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.preprocessing.OneHotEncoder.html\">?<span>Documentation for OneHotEncoder</span></a></label><div class=\"sk-toggleable__content \"><pre>OneHotEncoder(handle_unknown=&#x27;ignore&#x27;, sparse_output=False)</pre></div> </div></div></div></div></div></div></div></div></div><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-190\" type=\"checkbox\" ><label for=\"sk-estimator-id-190\" class=\"sk-toggleable__label sk-toggleable__label-arrow \">&nbsp;LogisticRegression<a class=\"sk-estimator-doc-link \" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.4/modules/generated/sklearn.linear_model.LogisticRegression.html\">?<span>Documentation for LogisticRegression</span></a></label><div class=\"sk-toggleable__content \"><pre>LogisticRegression(class_weight={0.0: 0.5381774965030861,\n",
" 1.0: 7.048360235716116},\n",
" max_iter=5000, solver=&#x27;saga&#x27;)</pre></div> </div></div></div></div></div></div>"
],
"text/plain": [
"Pipeline(steps=[('preprocessor',\n",
" ColumnTransformer(transformers=[('num',\n",
" Pipeline(steps=[('scaler',\n",
" StandardScaler())]),\n",
" ['nb_tickets', 'nb_purchases',\n",
" 'total_amount',\n",
" 'nb_suppliers',\n",
" 'nb_tickets_internet',\n",
" 'nb_campaigns',\n",
" 'nb_campaigns_opened']),\n",
" ('cat',\n",
" Pipeline(steps=[('onehot',\n",
" OneHotEncoder(handle_unknown='ignore',\n",
" sparse_output=False))]),\n",
" ['opt_in'])])),\n",
" ('logreg',\n",
" LogisticRegression(class_weight={0.0: 0.5381774965030861,\n",
" 1.0: 7.048360235716116},\n",
" max_iter=5000, solver='saga'))])"
]
},
"execution_count": 213,
"metadata": {},
"output_type": "execute_result"
}
],
2024-03-10 11:09:53 +01:00
"source": [
"# Pipeline: preprocessing followed by a class-weighted logistic regression\n",
"\n",
"pipeline = Pipeline(steps=[\n",
"    ('preprocessor', preproc),\n",
"    # The saga solver shuffles the data, so the seed is fixed to make fits reproducible\n",
"    ('logreg', LogisticRegression(solver='saga', class_weight=weight_dict,\n",
"                                  max_iter=5000, random_state=42))\n",
"])\n",
"\n",
"# Make the transformers return pandas DataFrames; the call's return value is displayed\n",
"pipeline.set_output(transform=\"pandas\")"
2024-03-08 14:48:38 +01:00
]
2024-03-10 11:09:53 +01:00
},
2024-03-10 12:30:57 +01:00
{
"cell_type": "markdown",
"id": "ed415f60-9663-4179-877b-233faf6e1645",
"metadata": {},
"source": [
"## Baseline"
]
},
2024-03-10 11:09:53 +01:00
{
"cell_type": "code",
"execution_count": null,
"id": "2b467511-2ae5-4a16-a502-397c3460471d",
"metadata": {},
"outputs": [],
"source": [
"# y_train is a one-column DataFrame at this point; flatten it to the 1-D target that\n",
"# estimators expect (np.ravel also works if y_train has already been turned into a Series)\n",
"pipeline.fit(X_train, np.ravel(y_train))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6356e870-0dfc-4e60-9e48-e2de5e7f9f87",
"metadata": {},
"outputs": [],
"source": [
"# Hard class predictions of the baseline model on the test set\n",
"y_pred = pipeline.predict(X_test)\n",
"\n",
"# Compute accuracy, F1 and recall first, then report them\n",
"acc = accuracy_score(y_test, y_pred)\n",
"f1 = f1_score(y_test, y_pred)\n",
"recall = recall_score(y_test, y_pred)\n",
"\n",
"print(f\"Accuracy Score: {acc}\")\n",
"print(f\"F1 Score: {f1}\")\n",
"print(f\"Recall Score: {recall}\")"
2024-03-10 11:09:53 +01:00
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "09387a09-0d53-4c54-baac-f3c2a57a629a",
"metadata": {},
"outputs": [],
"source": [
"# Confusion matrix of the test-set predictions, drawn as an annotated heatmap\n",
"class_labels = ['Class 0', 'Class 1']\n",
"conf_matrix = confusion_matrix(y_test, y_pred)\n",
"\n",
"ax = sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',\n",
"                 xticklabels=class_labels, yticklabels=class_labels)\n",
"ax.set_xlabel('Predicted')\n",
"ax.set_ylabel('Actual')\n",
"ax.set_title('Confusion Matrix')\n",
"plt.show()"
]
},
2024-03-10 12:30:57 +01:00
{
"cell_type": "code",
"execution_count": null,
"id": "580b58d7-596f-4207-8c99-4365aba2bc9f",
"metadata": {},
"outputs": [],
"source": [
"# Predicted probabilities taken from the second column of predict_proba\n",
"y_pred_prob = pipeline.predict_proba(X_test)[:, 1]\n",
"\n",
"# False positive rate (FPR) and true positive rate (TPR) for each threshold\n",
"fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)\n",
"\n",
"# Area under the ROC curve (AUC)\n",
"roc_auc = auc(fpr, tpr)\n",
"\n",
"# Model curve plus the diagonal of a random classifier\n",
"fig, ax = plt.subplots(figsize=(14, 8))\n",
"ax.plot(fpr, tpr, label=\"ROC curve(area = %0.3f)\" % roc_auc)\n",
"ax.plot([0, 1], [0, 1], color=\"red\", label=\"Random Baseline\", linestyle=\"--\")\n",
"ax.grid(color='gray', linestyle='--', linewidth=0.5)\n",
"ax.set_xlabel('Taux de faux positifs (FPR)')\n",
"ax.set_ylabel('Taux de vrais positifs (TPR)')\n",
"ax.set_title('Courbe ROC : modèle logistique')\n",
"ax.legend(loc=\"lower right\")\n",
"plt.show()"
]
},
2024-03-10 11:09:53 +01:00
{
"cell_type": "markdown",
"id": "ae8e9bd3-0f6a-4f82-bb4c-470cbdc8d6bb",
"metadata": {},
"source": [
"## Cross Validation"
]
},
2024-03-10 12:30:57 +01:00
{
"cell_type": "code",
"execution_count": null,
"id": "7f0535de-34f1-4e97-b993-b429ecf0a554",
"metadata": {},
"outputs": [],
"source": [
"# Take the 1-D target straight from the loaded dataset instead of re-slicing y_train:\n",
"# the values are the same, and the cell no longer fails when it is run a second time\n",
"# (y_train is then already a Series and has no 'y_has_purchased' key)\n",
"y_train = dataset_train['y_has_purchased']"
]
},
2024-03-10 11:09:53 +01:00
{
"cell_type": "code",
"execution_count": null,
"id": "f7fca463-d7d6-493b-8329-fdfa92457f78",
"metadata": {},
"outputs": [],
"source": [
"# Grid search over param_grid with 3-fold cross-validation, using all available cores\n",
"\n",
"grid_search = GridSearchCV(\n",
"    estimator=pipeline,\n",
"    param_grid=param_grid,\n",
"    scoring=f1_scorer,\n",
"    cv=3,\n",
"    n_jobs=-1,\n",
"    error_score='raise',  # fail fast instead of scoring a broken candidate as NaN\n",
")\n",
"grid_search.fit(X_train, y_train)\n",
"\n",
"# Report the selected hyperparameters and their mean cross-validated score\n",
"print(\"Best parameters found: \", grid_search.best_params_)\n",
"print(\"Best cross-validation score: {:.2f}\".format(grid_search.best_score_))\n",
"\n",
"# Score the refitted best estimator on the held-out test set\n",
"test_score = grid_search.score(X_test, y_test)\n",
"print(\"Test set score: {:.2f}\".format(test_score))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "56bd7828-4de1-4166-bea0-5d5e152b9d38",
"metadata": {},
"outputs": [],
"source": []
2024-03-08 14:48:38 +01:00
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}