{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "3415114e-9577-4487-89eb-4931620ad9f0",
   "metadata": {},
   "source": [
    "# Predict Sales"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 201,
   "id": "f271eb45-1470-4764-8c2e-31374efa1fe5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "import s3fs\n",
    "import re\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score\n",
    "from sklearn.utils import class_weight\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.preprocessing import OneHotEncoder\n",
    "from sklearn.impute import SimpleImputer\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler\n",
    "from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score\n",
    "from sklearn.exceptions import ConvergenceWarning, DataConversionWarning\n",
    "\n",
    "import pickle\n",
    "import warnings\n",
    "#import scikitplot as skplt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 202,
   "id": "3fecb606-22e5-4dee-8efa-f8dff0832299",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Silence every warning for the rest of the session.\n",
    "# NOTE: the blanket filter already covers the two category-specific filters\n",
    "# below; they only spell out the scikit-learn warning classes by name.\n",
    "warnings.filterwarnings(action='ignore')\n",
    "warnings.filterwarnings(action='ignore', category=ConvergenceWarning)\n",
    "warnings.filterwarnings(action='ignore', category=DataConversionWarning)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ae591854-3003-4c75-a0c7-5abf04246e81",
   "metadata": {},
   "source": [
    "### Load Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 203,
   "id": "59dd4694-a812-4923-b995-a2ee86c74f85",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 
Create filesystem object\n",
    "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
    "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 204,
   "id": "017f7e9a-3ba0-40fa-bdc8-51b98cc1fdb3",
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_train_test(bucket=\"projet-bdc2324-team1/Generalization/sport\",\n",
    "                    train_file=\"dataset_train5.csv\",\n",
    "                    test_file=\"dataset_test5.csv\"):\n",
    "    \"\"\"Load the train and test datasets from S3.\n",
    "\n",
    "    Reads `<bucket>/Train_set/<train_file>` and `<bucket>/Test_set/<test_file>`\n",
    "    as comma-separated files through the notebook-level `fs` filesystem, and\n",
    "    replaces missing values of `y_has_purchased` with 0 in both frames.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    bucket : str\n",
    "        Path prefix containing the `Train_set` and `Test_set` folders.\n",
    "    train_file, test_file : str\n",
    "        CSV file names inside `Train_set` and `Test_set` respectively.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    tuple of pandas.DataFrame\n",
    "        `(dataset_train, dataset_test)`.\n",
    "    \"\"\"\n",
    "\n",
    "    def read_dataset(path):\n",
    "        # Same loading and target clean-up for both splits.\n",
    "        with fs.open(path, mode=\"rb\") as file_in:\n",
    "            dataset = pd.read_csv(file_in, sep=\",\")\n",
    "        dataset['y_has_purchased'] = dataset['y_has_purchased'].fillna(0)\n",
    "        return dataset\n",
    "\n",
    "    dataset_train = read_dataset(bucket + \"/Train_set/\" + train_file)\n",
    "    dataset_test = read_dataset(bucket + \"/Test_set/\" + test_file)\n",
    "\n",
    "    return dataset_train, dataset_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 205,
   "id": "825d14a3-6967-4733-bfd4-64bf61c2bd43",
   "metadata": {},
   "outputs": [],
   "source": [
    "def features_target_split(dataset_train, dataset_test, features=None,\n",
    "                          target='y_has_purchased'):\n",
    "    \"\"\"Split the train and test datasets into features and target.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    dataset_train, dataset_test : pandas.DataFrame\n",
    "        Frames containing the feature columns and the target column.\n",
    "    features : list of str, optional\n",
    "        Feature columns to keep. Defaults to the eight columns used in this\n",
    "        notebook.\n",
    "    target : str\n",
    "        Name of the target column.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    X_train, X_test, y_train, y_test : pandas.DataFrame\n",
    "        The targets are single-column DataFrames (not Series): later cells\n",
    "        select the target from them by column name.\n",
    "    \"\"\"\n",
    "    if features is None:\n",
    "        features = ['nb_tickets', 'nb_purchases', 'total_amount',\n",
    "                    'nb_suppliers', 'nb_tickets_internet',\n",
    "                    'opt_in',\n",
    "                    'nb_campaigns', 'nb_campaigns_opened']\n",
    "    features = list(features)\n",
    "\n",
    "    X_train = dataset_train[features]\n",
    "    y_train = dataset_train[[target]]\n",
    "\n",
    "    X_test = dataset_test[features]\n",
    "    y_test = dataset_test[[target]]\n",
    "    return X_train, X_test, y_train, y_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 206,
   "id": "c479b230-b4bd-4cfb-b76b-d9faf6d95772",
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_train, dataset_test = load_train_test()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 207,
   "id": "69eaec12-b30f-4d30-a461-ea520d5cbf77",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = 
features_target_split(dataset_train, dataset_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 208,
   "id": "d039f31d-0093-46c6-9743-ddec1381f758",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Shape train : (330117, 8)\n",
      "Shape test : (141480, 8)\n"
     ]
    }
   ],
   "source": [
    "print(\"Shape train : \", X_train.shape)\n",
    "print(\"Shape test : \", X_test.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a1d6de94-4e11-481a-a0ce-412bf29f692c",
   "metadata": {},
   "source": [
    "### Prepare preprocessing and Hyperparameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 209,
   "id": "b808da43-c444-4e94-995a-7ec6ccd01e2d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{0.0: 0.5381774965030861, 1.0: 7.048360235716116}"
      ]
     },
     "execution_count": 209,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Compute balanced class weights and map each class label to its weight.\n",
    "# np.ravel gives a 1-D target whether y_train is still a one-column DataFrame\n",
    "# or has already been reduced to a Series, so this cell can be re-run safely.\n",
    "target_train = np.ravel(y_train)\n",
    "classes = np.unique(target_train)  # computed once and reused below\n",
    "weights = class_weight.compute_class_weight(class_weight='balanced', classes=classes,\n",
    "                                            y=target_train)\n",
    "\n",
    "weight_dict = dict(zip(classes, weights))\n",
    "weight_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 210,
   "id": "b32a79ea-907f-4dfc-9832-6c74bef3200c",
   "metadata": {},
   "outputs": [],
   "source": [
    "numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',\n",
    "                    'nb_tickets_internet', 'nb_campaigns', 'nb_campaigns_opened']\n",
    "\n",
    "# Transformer for the numeric features (imputation step currently disabled)\n",
    "numeric_transformer = Pipeline(steps=[\n",
    "    #(\"imputer\", SimpleImputer(strategy=\"mean\")),\n",
    "    (\"scaler\", StandardScaler())\n",
    "])\n",
    "\n",
    "categorical_features = ['opt_in']\n",
    "\n",
    "# Transformer for the categorical features (imputation step currently disabled)\n",
    "categorical_transformer = Pipeline(steps=[\n",
    "    #(\"imputer\", SimpleImputer(strategy=\"most_frequent\")), # Impute missing values with the most frequent\n",
    "    (\"onehot\", OneHotEncoder(handle_unknown='ignore', sparse_output=False))\n",
    "])\n",
    "\n",
    "preproc = ColumnTransformer(\n",
    "    transformers=[\n",
    "        (\"num\", numeric_transformer, numeric_features),\n",
    "        (\"cat\", categorical_transformer, categorical_features)\n",
    "    ]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 211,
   "id": "9809a688-bfbc-4685-a77f-17a8b2b79ab3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scorers (evaluation metrics, not training losses)\n",
    "\n",
    "balanced_scorer = make_scorer(balanced_accuracy_score)\n",
    "recall_scorer = make_scorer(recall_score)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 212,
   "id": "206d9a95-7c37-4506-949b-e77d225e42c5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hyperparameter grid for the 'logreg' step of the pipeline\n",
    "\n",
    "param_grid = {'logreg__C': np.logspace(-10, 6, 17, base=2),\n",
    "              'logreg__penalty': ['l1', 'l2'],\n",
    "              'logreg__class_weight': ['balanced', weight_dict]}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 213,
   "id": "7ff2f7bd-efc1-4f7c-a3c9-caa916aa2f2b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
Pipeline(steps=[('preprocessor',\n",
       "                 ColumnTransformer(transformers=[('num',\n",
       "                                                  Pipeline(steps=[('scaler',\n",
       "                                                                   StandardScaler())]),\n",
       "                                                  ['nb_tickets', 'nb_purchases',\n",
       "                                                   'total_amount',\n",
       "                                                   'nb_suppliers',\n",
       "                                                   'nb_tickets_internet',\n",
       "                                                   'nb_campaigns',\n",
       "                                                   'nb_campaigns_opened']),\n",
       "                                                 ('cat',\n",
       "                                                  Pipeline(steps=[('onehot',\n",
       "                                                                   OneHotEncoder(handle_unknown='ignore',\n",
       "                                                                                 sparse_output=False))]),\n",
       "                                                  ['opt_in'])])),\n",
       "                ('logreg',\n",
       "                 LogisticRegression(class_weight={0.0: 0.5381774965030861,\n",
       "                                                  1.0: 7.048360235716116},\n",
       "                                    max_iter=5000, solver='saga'))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
"
      ],
      "text/plain": [
       "Pipeline(steps=[('preprocessor',\n",
       " ColumnTransformer(transformers=[('num',\n",
       " Pipeline(steps=[('scaler',\n",
       " StandardScaler())]),\n",
       " ['nb_tickets', 'nb_purchases',\n",
       " 'total_amount',\n",
       " 'nb_suppliers',\n",
       " 'nb_tickets_internet',\n",
       " 'nb_campaigns',\n",
       " 'nb_campaigns_opened']),\n",
       " ('cat',\n",
       " Pipeline(steps=[('onehot',\n",
       " OneHotEncoder(handle_unknown='ignore',\n",
       " sparse_output=False))]),\n",
       " ['opt_in'])])),\n",
       " ('logreg',\n",
       " LogisticRegression(class_weight={0.0: 0.5381774965030861,\n",
       " 1.0: 7.048360235716116},\n",
       " max_iter=5000, solver='saga'))])"
      ]
     },
     "execution_count": 213,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Pipeline: preprocessing followed by a class-weighted logistic regression\n",
    "\n",
    "pipeline = Pipeline(steps=[\n",
    "    ('preprocessor', preproc),\n",
    "    ('logreg', LogisticRegression(solver='saga', class_weight=weight_dict,\n",
    "                                  max_iter=5000))\n",
    "])\n",
    "\n",
    "pipeline.set_output(transform=\"pandas\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ed415f60-9663-4179-877b-233faf6e1645",
   "metadata": {},
   "source": [
    "## Baseline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2b467511-2ae5-4a16-a502-397c3460471d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit on a 1-D target: passing the one-column DataFrame makes scikit-learn\n",
    "# emit a DataConversionWarning. np.ravel also works if y_train is already 1-D.\n",
    "pipeline.fit(X_train, np.ravel(y_train))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6356e870-0dfc-4e60-9e48-e2de5e7f9f87",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = pipeline.predict(X_test)\n",
    "\n",
    "# Baseline metrics on the test set: accuracy, F1 and recall\n",
    "acc = accuracy_score(y_test, y_pred)\n",
    "print(f\"Accuracy Score: {acc}\")\n",
    "\n",
    "f1 = f1_score(y_test, y_pred)\n",
    "print(f\"F1 Score: {f1}\")\n",
    "\n",
    "recall = recall_score(y_test, y_pred)\n",
    "print(f\"Recall Score: {recall}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "09387a09-0d53-4c54-baac-f3c2a57a629a",
   "metadata": {},
   "outputs": [],
   "source": [
    "conf_matrix = confusion_matrix(y_test, y_pred)\n",
    "sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 
'Class 1'], yticklabels=['Class 0', 'Class 1'])\n",
    "plt.xlabel('Predicted')\n",
    "plt.ylabel('Actual')\n",
    "plt.title('Confusion Matrix')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "580b58d7-596f-4207-8c99-4365aba2bc9f",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred_prob = pipeline.predict_proba(X_test)[:, 1]\n",
    "\n",
    "# Compute the false positive rate (FPR) and true positive rate (TPR)\n",
    "fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)\n",
    "\n",
    "# Compute the area under the ROC curve (AUC)\n",
    "roc_auc = auc(fpr, tpr)\n",
    "\n",
    "plt.figure(figsize = (14, 8))\n",
    "plt.plot(fpr, tpr, label=\"ROC curve(area = %0.3f)\" % roc_auc)\n",
    "plt.plot([0, 1], [0, 1], color=\"red\",label=\"Random Baseline\", linestyle=\"--\")\n",
    "plt.grid(color='gray', linestyle='--', linewidth=0.5)\n",
    "plt.xlabel('Taux de faux positifs (FPR)')\n",
    "plt.ylabel('Taux de vrais positifs (TPR)')\n",
    "plt.title('Courbe ROC : modèle logistique')\n",
    "plt.legend(loc=\"lower right\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ae8e9bd3-0f6a-4f82-bb4c-470cbdc8d6bb",
   "metadata": {},
   "source": [
    "## Cross Validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7f0535de-34f1-4e97-b993-b429ecf0a554",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reduce the one-column target frame to a Series. squeeze() leaves a Series\n",
    "# unchanged, so re-running this cell no longer raises a KeyError.\n",
    "y_train = y_train.squeeze()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f7fca463-d7d6-493b-8329-fdfa92457f78",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cross validation\n",
    "# 'f1' is scikit-learn's built-in F1 scorer; the previously referenced\n",
    "# `f1_scorer` was never defined in this notebook and raised a NameError.\n",
    "\n",
    "grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='f1', error_score='raise',\n",
    "                           n_jobs=-1)\n",
    "\n",
    "grid_search.fit(X_train, y_train)\n",
    "\n",
    "# Print the best parameters and the best score\n",
    "print(\"Best parameters found: \", grid_search.best_params_)\n",
    "print(\"Best cross-validation score: {:.2f}\".format(grid_search.best_score_))\n",
    "\n",
    "# Evaluate the best model on the test set (same F1 scorer as the search)\n",
    "test_score = grid_search.score(X_test, y_test)\n",
    "print(\"Test set score: {:.2f}\".format(test_score))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "56bd7828-4de1-4166-bea0-5d5e152b9d38",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}