From bb684633d766907652dc8def6a902e75385acdce Mon Sep 17 00:00:00 2001
From: arevelle-ensae
Date: Fri, 8 Mar 2024 13:26:27 +0000
Subject: [PATCH 1/4] work on stat desc

---
 Sport/Descriptive_statistics/stat_desc_sport.ipynb | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/Sport/Descriptive_statistics/stat_desc_sport.ipynb b/Sport/Descriptive_statistics/stat_desc_sport.ipynb
index 0745887..1c74b81 100644
--- a/Sport/Descriptive_statistics/stat_desc_sport.ipynb
+++ b/Sport/Descriptive_statistics/stat_desc_sport.ipynb
@@ -146,6 +146,17 @@
     "  "
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "866a137c-7385-4f12-9349-b0202c71dff3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Construct dataset restricted to customers after the start date\n",
+    "\n"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "62922029-8071-402e-8115-c145a2874a2f",

From d3fa9f6870c09737689e0a01b3287bdc9bb9cfae Mon Sep 17 00:00:00 2001
From: arevelle-ensae
Date: Fri, 8 Mar 2024 13:48:38 +0000
Subject: [PATCH 2/4] prepare Pipeline

---
 Sport/Modelization/2_Modelization_sport.ipynb | 173 ++++++++++++++++++
 1 file changed, 173 insertions(+)
 create mode 100644 Sport/Modelization/2_Modelization_sport.ipynb

diff --git a/Sport/Modelization/2_Modelization_sport.ipynb b/Sport/Modelization/2_Modelization_sport.ipynb
new file mode 100644
index 0000000..ece151c
--- /dev/null
+++ b/Sport/Modelization/2_Modelization_sport.ipynb
@@ -0,0 +1,173 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "3415114e-9577-4487-89eb-4931620ad9f0",
+   "metadata": {},
+   "source": [
+    "# Predict Sales"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "f271eb45-1470-4764-8c2e-31374efa1fe5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import os\n",
+    "import s3fs\n",
+    "import re\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n",
+    "from sklearn.neighbors import KNeighborsClassifier\n",
+    "from sklearn.pipeline import Pipeline\n",
+    "from sklearn.compose import ColumnTransformer\n",
+    "from sklearn.preprocessing import OneHotEncoder\n",
+    "from sklearn.impute import SimpleImputer\n",
+    "from sklearn.model_selection import GridSearchCV\n",
+    "from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler\n",
+    "from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score\n",
+    "import seaborn as sns\n",
+    "import matplotlib.pyplot as plt\n",
+    "from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score\n",
+    "import pickle\n",
+    "#import scikitplot as skplt"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ae591854-3003-4c75-a0c7-5abf04246e81",
+   "metadata": {},
+   "source": [
+    "### Load Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "59dd4694-a812-4923-b995-a2ee86c74f85",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create filesystem object\n",
+    "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
+    "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "017f7e9a-3ba0-40fa-bdc8-51b98cc1fdb3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def load_train_test():\n",
+    "    BUCKET = \"projet-bdc2324-team1/Generalization/sport/\"\n",
+    "    File_path_train = BUCKET + \"/\" + 
\"dataset_train.csv\"\n", + " File_path_test = BUCKET + \"/\" + \"dataset_train.csv\"\n", + " \n", + " with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", + " dataset_train = pd.read_csv(file_in, sep=\",\")\n", + " dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)\n", + "\n", + " with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", + " dataset_test = pd.read_csv(file_in, sep=\",\")\n", + " dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)\n", + " \n", + " return dataset_train, dataset_test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "825d14a3-6967-4733-bfd4-64bf61c2bd43", + "metadata": {}, + "outputs": [], + "source": [ + "def features_target_split(dataset_train, dataset_test):\n", + " X_train = dataset_train[]\n", + " y_train = dataset_train['y_has_purchased']\n", + "\n", + " X_test = dataset_test[]\n", + " y_test = dataset_test['y_has_purchased']\n", + " return X_train, X_test, y_train, y_test" + ] + }, + { + "cell_type": "markdown", + "id": "a1d6de94-4e11-481a-a0ce-412bf29f692c", + "metadata": {}, + "source": [ + "### Prepare preprocessing and Hyperparameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b32a79ea-907f-4dfc-9832-6c74bef3200c", + "metadata": {}, + "outputs": [], + "source": [ + "numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',\n", + " 'nb_tickets_internet', 'fidelity', 'nb_campaigns', 'nb_campaigns_opened']\n", + "\n", + "numeric_transformer = Pipeline(steps=[\n", + " # (\"imputer\", SimpleImputer(strategy=\"mean\")), # NaN remplacés par la moyenne, mais peu importe car on a supprimé les valeurs manquantes\n", + " (\"scaler\", StandardScaler())])\n", + "\n", + "preproc = ColumnTransformer(transformers=[(\"num\", numeric_transformer, numeric_features)])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9809a688-bfbc-4685-a77f-17a8b2b79ab3", + "metadata": {}, + "outputs": [], + "source": [ + "# Set loss\n", + "\n", + "balanced_scorer = make_scorer(balanced_accuracy_score)\n", + "f1_scorer = make_scorer(f1_score)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "206d9a95-7c37-4506-949b-e77d225e42c5", + "metadata": {}, + "outputs": [], + "source": [ + "# Hyperparameter\n", + "\n", + "parameters4 = {'logreg__C': np.logspace(-10, 6, 17, base=2),\n", + " 'logreg__class_weight': ['balanced']} " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 58c7cac17f35421186affc8b66da7dbdac14760f Mon Sep 17 00:00:00 2001 From: arevelle-ensae Date: Sun, 10 Mar 2024 10:09:53 +0000 Subject: [PATCH 3/4] work on pipeline --- Sport/Modelization/2_Modelization_sport.ipynb | 203 ++++++++++++++++-- 1 file changed, 184 insertions(+), 19 deletions(-) diff --git a/Sport/Modelization/2_Modelization_sport.ipynb b/Sport/Modelization/2_Modelization_sport.ipynb index ece151c..a3d0476 100644 --- a/Sport/Modelization/2_Modelization_sport.ipynb +++ b/Sport/Modelization/2_Modelization_sport.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 106, "id": 
"f271eb45-1470-4764-8c2e-31374efa1fe5", "metadata": {}, "outputs": [], @@ -23,6 +23,7 @@ "from sklearn.linear_model import LogisticRegression\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n", + "from sklearn.utils import class_weight\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.compose import ColumnTransformer\n", @@ -34,10 +35,25 @@ "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score\n", + "from sklearn.exceptions import ConvergenceWarning, DataConversionWarning\n", + "\n", "import pickle\n", + "import warnings\n", "#import scikitplot as skplt" ] }, + { + "cell_type": "code", + "execution_count": 107, + "id": "3fecb606-22e5-4dee-8efa-f8dff0832299", + "metadata": {}, + "outputs": [], + "source": [ + "warnings.filterwarnings('ignore')\n", + "warnings.filterwarnings(\"ignore\", category=ConvergenceWarning)\n", + "warnings.filterwarnings(\"ignore\", category=DataConversionWarning)" + ] + }, { "cell_type": "markdown", "id": "ae591854-3003-4c75-a0c7-5abf04246e81", @@ -48,7 +64,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 108, "id": "59dd4694-a812-4923-b995-a2ee86c74f85", "metadata": {}, "outputs": [], @@ -60,21 +76,21 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 109, "id": "017f7e9a-3ba0-40fa-bdc8-51b98cc1fdb3", "metadata": {}, "outputs": [], "source": [ "def load_train_test():\n", - " BUCKET = \"projet-bdc2324-team1/Generalization/sport/\"\n", - " File_path_train = BUCKET + \"/\" + \"dataset_train.csv\"\n", - " File_path_test = BUCKET + \"/\" + \"dataset_train.csv\"\n", + " BUCKET = \"projet-bdc2324-team1/Generalization/sport\"\n", + " File_path_train = BUCKET + \"/\" + \"Train_set.csv\"\n", + " File_path_test = BUCKET + \"/\" + \"Test_set.csv\"\n", " \n", - " with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", + " with fs.open( File_path_train, mode=\"rb\") as file_in:\n", " dataset_train = pd.read_csv(file_in, sep=\",\")\n", " dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)\n", "\n", - " with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", + " with fs.open(File_path_test, mode=\"rb\") as file_in:\n", " dataset_test = pd.read_csv(file_in, sep=\",\")\n", " dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)\n", " \n", @@ -83,20 +99,44 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 110, "id": "825d14a3-6967-4733-bfd4-64bf61c2bd43", "metadata": {}, "outputs": [], "source": [ "def features_target_split(dataset_train, dataset_test):\n", - " X_train = dataset_train[]\n", - " y_train = dataset_train['y_has_purchased']\n", + " features_l = ['nb_tickets', 'nb_purchases', 'total_amount',\n", + " 'nb_suppliers', 'nb_tickets_internet',\n", + " 'opt_in',\n", + " 'nb_campaigns', 'nb_campaigns_opened']\n", + " X_train = dataset_train[features_l]\n", + " y_train = dataset_train[['y_has_purchased']]\n", "\n", - " X_test = dataset_test[]\n", - " y_test = dataset_test['y_has_purchased']\n", + " X_test = dataset_test[features_l]\n", + " y_test = dataset_test[['y_has_purchased']]\n", " return X_train, X_test, y_train, y_test" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "c479b230-b4bd-4cfb-b76b-d9faf6d95772", + "metadata": {}, + "outputs": [], + "source": [ + 
"dataset_train, dataset_test = load_train_test()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "69eaec12-b30f-4d30-a461-ea520d5cbf77", + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)" + ] + }, { "cell_type": "markdown", "id": "a1d6de94-4e11-481a-a0ce-412bf29f692c", @@ -105,6 +145,21 @@ "### Prepare preprocessing and Hyperparameters" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "b808da43-c444-4e94-995a-7ec6ccd01e2d", + "metadata": {}, + "outputs": [], + "source": [ + "# Compute Weights\n", + "weights = class_weight.compute_class_weight(class_weight = 'balanced', classes = np.unique(y_train['y_has_purchased']),\n", + " y = y_train['y_has_purchased'])\n", + "\n", + "weight_dict = {np.unique(y_train['y_has_purchased'])[i]: weights[i] for i in range(len(np.unique(y_train['y_has_purchased'])))}\n", + "weight_dict" + ] + }, { "cell_type": "code", "execution_count": null, @@ -113,13 +168,27 @@ "outputs": [], "source": [ "numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',\n", - " 'nb_tickets_internet', 'fidelity', 'nb_campaigns', 'nb_campaigns_opened']\n", + " 'nb_tickets_internet', 'nb_campaigns', 'nb_campaigns_opened']\n", "\n", "numeric_transformer = Pipeline(steps=[\n", - " # (\"imputer\", SimpleImputer(strategy=\"mean\")), # NaN remplacés par la moyenne, mais peu importe car on a supprimé les valeurs manquantes\n", - " (\"scaler\", StandardScaler())])\n", + " #(\"imputer\", SimpleImputer(strategy=\"mean\")), \n", + " (\"scaler\", StandardScaler()) \n", + "])\n", "\n", - "preproc = ColumnTransformer(transformers=[(\"num\", numeric_transformer, numeric_features)])" + "categorical_features = ['opt_in'] \n", + "\n", + "# Transformer for the categorical features\n", + "categorical_transformer = Pipeline(steps=[\n", + " #(\"imputer\", SimpleImputer(strategy=\"most_frequent\")), # Impute missing values with the most frequent\n", + " (\"onehot\", OneHotEncoder(handle_unknown='ignore', sparse_output=False))\n", + "])\n", + "\n", + "preproc = ColumnTransformer(\n", + " transformers=[\n", + " (\"num\", numeric_transformer, numeric_features),\n", + " (\"cat\", categorical_transformer, categorical_features)\n", + " ]\n", + ")" ] }, { @@ -144,9 +213,105 @@ "source": [ "# Hyperparameter\n", "\n", - "parameters4 = {'logreg__C': np.logspace(-10, 6, 17, base=2),\n", - " 'logreg__class_weight': ['balanced']} " + "param_grid = {'logreg__C': np.logspace(-10, 6, 17, base=2),\n", + " 'logreg__penalty': ['l2', 'L1'],\n", + " 'logreg__class_weight': ['balanced', weight_dict]} " ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ff2f7bd-efc1-4f7c-a3c9-caa916aa2f2b", + "metadata": {}, + "outputs": [], + "source": [ + "# Pipeline\n", + "\n", + "pipeline = Pipeline(steps=[\n", + " ('preprocessor', preproc),\n", + " ('logreg', LogisticRegression(solver='saga', max_iter=1000)) \n", + "])\n", + "\n", + "pipeline.set_output(transform=\"pandas\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2b467511-2ae5-4a16-a502-397c3460471d", + "metadata": {}, + "outputs": [], + "source": [ + "pipeline.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6356e870-0dfc-4e60-9e48-e2de5e7f9f87", + "metadata": {}, + "outputs": [], + "source": [ + "y_pred = pipeline.predict(X_test)\n", + "\n", + "# Calculate the F1 score\n", + "f1 = f1_score(y_test, y_pred)\n", + "print(f\"F1 Score: {f1}\")" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09387a09-0d53-4c54-baac-f3c2a57a629a", + "metadata": {}, + "outputs": [], + "source": [ + "conf_matrix = confusion_matrix(y_test, y_pred)\n", + "sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])\n", + "plt.xlabel('Predicted')\n", + "plt.ylabel('Actual')\n", + "plt.title('Confusion Matrix')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "ae8e9bd3-0f6a-4f82-bb4c-470cbdc8d6bb", + "metadata": {}, + "source": [ + "## Cross Validation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7fca463-d7d6-493b-8329-fdfa92457f78", + "metadata": {}, + "outputs": [], + "source": [ + "# Cross validation\n", + "y_train = y_train['y_has_purchased']\n", + "grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring=f1_scorer, error_score='raise',\n", + " n_jobs=-1)\n", + "\n", + "grid_search.fit(X_train, y_train)\n", + "\n", + "# Print the best parameters and the best score\n", + "print(\"Best parameters found: \", grid_search.best_params_)\n", + "print(\"Best cross-validation score: {:.2f}\".format(grid_search.best_score_))\n", + "\n", + "# Evaluate the best model on the test set\n", + "test_score = grid_search.score(X_test, y_test)\n", + "print(\"Test set score: {:.2f}\".format(test_score))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56bd7828-4de1-4166-bea0-5d5e152b9d38", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From adc1da3e49df519c0e4020d1d04c9f8e6c1ec95e Mon Sep 17 00:00:00 2001 From: arevelle-ensae Date: Sun, 10 Mar 2024 11:30:57 +0000 Subject: [PATCH 4/4] adjust pipeline --- Sport/Modelization/2_Modelization_sport.ipynb | 611 +++++++++++++++++- 1 file changed, 588 insertions(+), 23 deletions(-) diff --git a/Sport/Modelization/2_Modelization_sport.ipynb b/Sport/Modelization/2_Modelization_sport.ipynb index a3d0476..2922b21 100644 --- a/Sport/Modelization/2_Modelization_sport.ipynb +++ b/Sport/Modelization/2_Modelization_sport.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 106, + "execution_count": 201, "id": "f271eb45-1470-4764-8c2e-31374efa1fe5", "metadata": {}, "outputs": [], @@ -22,7 +22,7 @@ "import re\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.ensemble import RandomForestClassifier\n", - "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n", + "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score\n", "from sklearn.utils import class_weight\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.pipeline import Pipeline\n", @@ -44,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": 107, + "execution_count": 202, "id": "3fecb606-22e5-4dee-8efa-f8dff0832299", "metadata": {}, "outputs": [], @@ -64,7 +64,7 @@ }, { "cell_type": "code", - "execution_count": 108, + "execution_count": 203, "id": "59dd4694-a812-4923-b995-a2ee86c74f85", "metadata": {}, "outputs": [], @@ -76,15 +76,15 @@ }, { "cell_type": "code", - "execution_count": 109, + "execution_count": 204, "id": "017f7e9a-3ba0-40fa-bdc8-51b98cc1fdb3", "metadata": {}, "outputs": [], "source": [ "def load_train_test():\n", " BUCKET = \"projet-bdc2324-team1/Generalization/sport\"\n", - " File_path_train = BUCKET + \"/\" + \"Train_set.csv\"\n", - " File_path_test = BUCKET + \"/\" + \"Test_set.csv\"\n", + " File_path_train = BUCKET + \"/Train_set/\" 
+ \"dataset_train5.csv\"\n", + " File_path_test = BUCKET + \"/Test_set/\" + \"dataset_test5.csv\"\n", " \n", " with fs.open( File_path_train, mode=\"rb\") as file_in:\n", " dataset_train = pd.read_csv(file_in, sep=\",\")\n", @@ -99,7 +99,7 @@ }, { "cell_type": "code", - "execution_count": 110, + "execution_count": 205, "id": "825d14a3-6967-4733-bfd4-64bf61c2bd43", "metadata": {}, "outputs": [], @@ -119,7 +119,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 206, "id": "c479b230-b4bd-4cfb-b76b-d9faf6d95772", "metadata": {}, "outputs": [], @@ -129,7 +129,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 207, "id": "69eaec12-b30f-4d30-a461-ea520d5cbf77", "metadata": {}, "outputs": [], @@ -137,6 +137,26 @@ "X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)" ] }, + { + "cell_type": "code", + "execution_count": 208, + "id": "d039f31d-0093-46c6-9743-ddec1381f758", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Shape train : (330117, 8)\n", + "Shape test : (141480, 8)\n" + ] + } + ], + "source": [ + "print(\"Shape train : \", X_train.shape)\n", + "print(\"Shape test : \", X_test.shape)" + ] + }, { "cell_type": "markdown", "id": "a1d6de94-4e11-481a-a0ce-412bf29f692c", @@ -147,10 +167,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 209, "id": "b808da43-c444-4e94-995a-7ec6ccd01e2d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{0.0: 0.5381774965030861, 1.0: 7.048360235716116}" + ] + }, + "execution_count": 209, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Compute Weights\n", "weights = class_weight.compute_class_weight(class_weight = 'balanced', classes = np.unique(y_train['y_has_purchased']),\n", @@ -162,7 +193,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 210, "id": "b32a79ea-907f-4dfc-9832-6c74bef3200c", "metadata": {}, "outputs": [], @@ -193,7 +224,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 211, "id": "9809a688-bfbc-4685-a77f-17a8b2b79ab3", "metadata": {}, "outputs": [], @@ -201,12 +232,12 @@ "# Set loss\n", "\n", "balanced_scorer = make_scorer(balanced_accuracy_score)\n", - "f1_scorer = make_scorer(f1_score)\n" + "recall_scorer = make_scorer(recall_score)\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 212, "id": "206d9a95-7c37-4506-949b-e77d225e42c5", "metadata": {}, "outputs": [], @@ -214,27 +245,519 @@ "# Hyperparameter\n", "\n", "param_grid = {'logreg__C': np.logspace(-10, 6, 17, base=2),\n", - " 'logreg__penalty': ['l2', 'L1'],\n", + " 'logreg__penalty': ['l1', 'l2'],\n", " 'logreg__class_weight': ['balanced', weight_dict]} " ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 213, "id": "7ff2f7bd-efc1-4f7c-a3c9-caa916aa2f2b", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "Pipeline(steps=[('preprocessor',\n", + " ColumnTransformer(transformers=[('num',\n", + " Pipeline(steps=[('scaler',\n", + " StandardScaler())]),\n", + " ['nb_tickets', 'nb_purchases',\n", + " 'total_amount',\n", + " 'nb_suppliers',\n", + " 'nb_tickets_internet',\n", + " 'nb_campaigns',\n", + " 'nb_campaigns_opened']),\n", + " ('cat',\n", + " Pipeline(steps=[('onehot',\n", + " OneHotEncoder(handle_unknown='ignore',\n", + " sparse_output=False))]),\n", + " ['opt_in'])])),\n", + " ('logreg',\n", + " LogisticRegression(class_weight={0.0: 0.5381774965030861,\n", + " 1.0: 7.048360235716116},\n", + " max_iter=5000, solver='saga'))])" + ] + }, + "execution_count": 213, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Pipeline\n", "\n", "pipeline = Pipeline(steps=[\n", " ('preprocessor', preproc),\n", - " ('logreg', LogisticRegression(solver='saga', max_iter=1000)) \n", + " ('logreg', LogisticRegression(solver='saga', class_weight = weight_dict,\n", + " max_iter=5000)) \n", "])\n", "\n", "pipeline.set_output(transform=\"pandas\")" ] }, + { + "cell_type": "markdown", + "id": "ed415f60-9663-4179-877b-233faf6e1645", + "metadata": {}, + "source": [ + "## Baseline" + ] + }, { "cell_type": "code", "execution_count": null, @@ -255,8 +778,14 @@ "y_pred = pipeline.predict(X_test)\n", "\n", "# Calculate the F1 score\n", + "acc = accuracy_score(y_test, y_pred)\n", + "print(f\"Accuracy Score: {acc}\")\n", + "\n", "f1 = f1_score(y_test, y_pred)\n", - "print(f\"F1 Score: {f1}\")" + "print(f\"F1 Score: {f1}\")\n", + "\n", + "recall = recall_score(y_test, y_pred)\n", + "print(f\"Recall Score: {recall}\")" ] }, { @@ -274,6 +803,32 @@ "plt.show()" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "580b58d7-596f-4207-8c99-4365aba2bc9f", + "metadata": {}, + "outputs": [], + "source": [ + "y_pred_prob = pipeline.predict_proba(X_test)[:, 1]\n", + "\n", + "# Calcul des taux de faux positifs (FPR) et de vrais positifs (TPR)\n", + "fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)\n", + "\n", + "# Calcul de l'aire sous la courbe ROC (AUC)\n", + "roc_auc = auc(fpr, tpr)\n", + "\n", + "plt.figure(figsize = (14, 8))\n", + "plt.plot(fpr, tpr, label=\"ROC curve(area = %0.3f)\" % roc_auc)\n", + "plt.plot([0, 1], [0, 1], color=\"red\",label=\"Random Baseline\", linestyle=\"--\")\n", + "plt.grid(color='gray', linestyle='--', linewidth=0.5)\n", + "plt.xlabel('Taux de faux positifs (FPR)')\n", + "plt.ylabel('Taux de vrais positifs (TPR)')\n", + "plt.title('Courbe ROC : modèle logistique')\n", + "plt.legend(loc=\"lower right\")\n", + "plt.show()" + ] + }, { "cell_type": "markdown", "id": "ae8e9bd3-0f6a-4f82-bb4c-470cbdc8d6bb", @@ -282,6 +837,16 @@ "## Cross Validation" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f0535de-34f1-4e97-b993-b429ecf0a554", + "metadata": {}, + "outputs": [], + "source": [ + "y_train = y_train['y_has_purchased']" + ] + }, { "cell_type": "code", "execution_count": null, @@ -290,8 +855,8 @@ "outputs": [], "source": [ "# Cross validation\n", - "y_train = y_train['y_has_purchased']\n", - "grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring=f1_scorer, error_score='raise',\n", + "\n", + "grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring=f1_scorer, error_score='raise',\n", " n_jobs=-1)\n", "\n", "grid_search.fit(X_train, y_train)\n",