diff --git a/Sport/Descriptive_statistics/stat_desc_sport.ipynb b/Sport/Descriptive_statistics/stat_desc_sport.ipynb
index 0745887..1c74b81 100644
--- a/Sport/Descriptive_statistics/stat_desc_sport.ipynb
+++ b/Sport/Descriptive_statistics/stat_desc_sport.ipynb
@@ -146,6 +146,17 @@
     " "
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "866a137c-7385-4f12-9349-b0202c71dff3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Construct dataset concerning only customers after start date\n",
+    "\n"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "62922029-8071-402e-8115-c145a2874a2f",
diff --git a/Sport/Modelization/2_Modelization_sport.ipynb b/Sport/Modelization/2_Modelization_sport.ipynb
new file mode 100644
index 0000000..2922b21
--- /dev/null
+++ b/Sport/Modelization/2_Modelization_sport.ipynb
@@ -0,0 +1,903 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "3415114e-9577-4487-89eb-4931620ad9f0",
+   "metadata": {},
+   "source": [
+    "# Predict Sales"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 201,
+   "id": "f271eb45-1470-4764-8c2e-31374efa1fe5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import os\n",
+    "import s3fs\n",
+    "import re\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score\n",
+    "from sklearn.utils import class_weight\n",
+    "from sklearn.neighbors import KNeighborsClassifier\n",
+    "from sklearn.pipeline import Pipeline\n",
+    "from sklearn.compose import ColumnTransformer\n",
+    "from sklearn.preprocessing import OneHotEncoder\n",
+    "from sklearn.impute import SimpleImputer\n",
+    "from sklearn.model_selection import GridSearchCV\n",
+    "from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler\n",
+    "from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score\n",
+    "import seaborn as sns\n",
+    "import matplotlib.pyplot as plt\n",
+    "from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score\n",
+    "from sklearn.exceptions import ConvergenceWarning, DataConversionWarning\n",
+    "\n",
+    "import pickle\n",
+    "import warnings\n",
+    "#import scikitplot as skplt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 202,
+   "id": "3fecb606-22e5-4dee-8efa-f8dff0832299",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "warnings.filterwarnings('ignore')\n",
+    "warnings.filterwarnings(\"ignore\", category=ConvergenceWarning)\n",
+    "warnings.filterwarnings(\"ignore\", category=DataConversionWarning)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ae591854-3003-4c75-a0c7-5abf04246e81",
+   "metadata": {},
+   "source": [
+    "### Load Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 203,
+   "id": "59dd4694-a812-4923-b995-a2ee86c74f85",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create filesystem object\n",
+    "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
+    "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 204,
+   "id": "017f7e9a-3ba0-40fa-bdc8-51b98cc1fdb3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def load_train_test(bucket=\"projet-bdc2324-team1/Generalization/sport\",\n",
+    "                    train_file=\"dataset_train5.csv\",\n",
+    "                    test_file=\"dataset_test5.csv\"):\n",
+    "    \"\"\"Load the train and test datasets from S3.\n",
+    "\n",
+    "    Reads `<bucket>/Train_set/<train_file>` and `<bucket>/Test_set/<test_file>`\n",
+    "    as CSV through the notebook-level `fs` filesystem object, and replaces\n",
+    "    missing values of `y_has_purchased` with 0 in both datasets.\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    dataset_train, dataset_test : pandas.DataFrame\n",
+    "    \"\"\"\n",
+    "    def _read_dataset(file_path):\n",
+    "        # Read one CSV file and fill the missing target values with 0\n",
+    "        with fs.open(file_path, mode=\"rb\") as file_in:\n",
+    "            dataset = pd.read_csv(file_in, sep=\",\")\n",
+    "        dataset['y_has_purchased'] = dataset['y_has_purchased'].fillna(0)\n",
+    "        return dataset\n",
+    "\n",
+    "    dataset_train = _read_dataset(bucket + \"/Train_set/\" + train_file)\n",
+    "    dataset_test = _read_dataset(bucket + \"/Test_set/\" + test_file)\n",
+    "\n",
+    "    return 
dataset_train, dataset_test"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 205,
+   "id": "825d14a3-6967-4733-bfd4-64bf61c2bd43",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def features_target_split(dataset_train, dataset_test):\n",
+    "    \"\"\"Split the train and test datasets into features and target.\n",
+    "\n",
+    "    The same eight feature columns are selected from both datasets.\n",
+    "    The targets are returned as single-column DataFrames holding\n",
+    "    `y_has_purchased` (not as Series).\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    X_train, X_test, y_train, y_test : pandas.DataFrame\n",
+    "    \"\"\"\n",
+    "    features_l = ['nb_tickets', 'nb_purchases', 'total_amount',\n",
+    "                  'nb_suppliers', 'nb_tickets_internet',\n",
+    "                  'opt_in',\n",
+    "                  'nb_campaigns', 'nb_campaigns_opened']\n",
+    "    X_train = dataset_train[features_l]\n",
+    "    y_train = dataset_train[['y_has_purchased']]\n",
+    "\n",
+    "    X_test = dataset_test[features_l]\n",
+    "    y_test = dataset_test[['y_has_purchased']]\n",
+    "    return X_train, X_test, y_train, y_test"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 206,
+   "id": "c479b230-b4bd-4cfb-b76b-d9faf6d95772",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset_train, dataset_test = load_train_test()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 207,
+   "id": "69eaec12-b30f-4d30-a461-ea520d5cbf77",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 208,
+   "id": "d039f31d-0093-46c6-9743-ddec1381f758",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Shape train : (330117, 8)\n",
+      "Shape test : (141480, 8)\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Shape train : \", X_train.shape)\n",
+    "print(\"Shape test : \", X_test.shape)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a1d6de94-4e11-481a-a0ce-412bf29f692c",
+   "metadata": {},
+   "source": [
+    "### Prepare preprocessing and Hyperparameters"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 209,
+   "id": "b808da43-c444-4e94-995a-7ec6ccd01e2d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{0.0: 0.5381774965030861, 1.0: 7.048360235716116}"
+      ]
+     },
+     "execution_count": 209,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Compute balanced class weights for the target\n",
+    "target_classes = np.unique(y_train['y_has_purchased'])\n",
+    "weights = class_weight.compute_class_weight(class_weight='balanced', classes=target_classes,\n",
+    "                                            y=y_train['y_has_purchased'])\n",
+    "\n",
+    "# Map each class label to its weight\n",
+    "weight_dict = dict(zip(target_classes, weights))\n",
+    "weight_dict"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 210,
+   "id": "b32a79ea-907f-4dfc-9832-6c74bef3200c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',\n",
+    "                    'nb_tickets_internet', 'nb_campaigns', 'nb_campaigns_opened']\n",
+    "\n",
+    "# Transformer for the numeric features\n",
+    "numeric_transformer = Pipeline(steps=[\n",
+    "    #(\"imputer\", SimpleImputer(strategy=\"mean\")),\n",
+    "    (\"scaler\", StandardScaler())\n",
+    "])\n",
+    "\n",
+    "categorical_features = ['opt_in']\n",
+    "\n",
+    "# Transformer for the categorical features\n",
+    "categorical_transformer = Pipeline(steps=[\n",
+    "    #(\"imputer\", SimpleImputer(strategy=\"most_frequent\")), # Impute missing values with the most frequent\n",
+    "    (\"onehot\", OneHotEncoder(handle_unknown='ignore', sparse_output=False))\n",
+    "])\n",
+    "\n",
+    "preproc = ColumnTransformer(\n",
+    "    transformers=[\n",
+    "        (\"num\", numeric_transformer, numeric_features),\n",
+    "        (\"cat\", categorical_transformer, categorical_features)\n",
+    "    ]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 211,
+   "id": "9809a688-bfbc-4685-a77f-17a8b2b79ab3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Scorers (evaluation metrics, not training losses)\n",
+    "\n",
+    "balanced_scorer = make_scorer(balanced_accuracy_score)\n",
+    "recall_scorer = make_scorer(recall_score)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 212,
+   "id": "206d9a95-7c37-4506-949b-e77d225e42c5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hyperparameter grid for the 'logreg' pipeline step\n",
+    "# NOTE: weight_dict holds the 'balanced' weights computed once on the full y_train,\n",
+    "# while the string 'balanced' lets scikit-learn recompute them on whatever data is fitted.\n",
+    "\n",
+    "param_grid = {'logreg__C': np.logspace(-10, 6, 17, base=2),\n",
+    "              'logreg__penalty': ['l1', 'l2'],\n",
+    "              'logreg__class_weight': ['balanced', weight_dict]}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 213,
+   "id": "7ff2f7bd-efc1-4f7c-a3c9-caa916aa2f2b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "
Pipeline(steps=[('preprocessor',\n", + " ColumnTransformer(transformers=[('num',\n", + " Pipeline(steps=[('scaler',\n", + " StandardScaler())]),\n", + " ['nb_tickets', 'nb_purchases',\n", + " 'total_amount',\n", + " 'nb_suppliers',\n", + " 'nb_tickets_internet',\n", + " 'nb_campaigns',\n", + " 'nb_campaigns_opened']),\n", + " ('cat',\n", + " Pipeline(steps=[('onehot',\n", + " OneHotEncoder(handle_unknown='ignore',\n", + " sparse_output=False))]),\n", + " ['opt_in'])])),\n", + " ('logreg',\n", + " LogisticRegression(class_weight={0.0: 0.5381774965030861,\n", + " 1.0: 7.048360235716116},\n", + " max_iter=5000, solver='saga'))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('preprocessor',\n", + " ColumnTransformer(transformers=[('num',\n", + " Pipeline(steps=[('scaler',\n", + " StandardScaler())]),\n", + " ['nb_tickets', 'nb_purchases',\n", + " 'total_amount',\n", + " 'nb_suppliers',\n", + " 'nb_tickets_internet',\n", + " 'nb_campaigns',\n", + " 'nb_campaigns_opened']),\n", + " ('cat',\n", + " Pipeline(steps=[('onehot',\n", + " OneHotEncoder(handle_unknown='ignore',\n", + " sparse_output=False))]),\n", + " ['opt_in'])])),\n", + " ('logreg',\n", + " LogisticRegression(class_weight={0.0: 0.5381774965030861,\n", + " 1.0: 7.048360235716116},\n", + " max_iter=5000, solver='saga'))])
ColumnTransformer(transformers=[('num',\n", + " Pipeline(steps=[('scaler', StandardScaler())]),\n", + " ['nb_tickets', 'nb_purchases', 'total_amount',\n", + " 'nb_suppliers', 'nb_tickets_internet',\n", + " 'nb_campaigns', 'nb_campaigns_opened']),\n", + " ('cat',\n", + " Pipeline(steps=[('onehot',\n", + " OneHotEncoder(handle_unknown='ignore',\n", + " sparse_output=False))]),\n", + " ['opt_in'])])
['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'nb_tickets_internet', 'nb_campaigns', 'nb_campaigns_opened']
StandardScaler()
['opt_in']
OneHotEncoder(handle_unknown='ignore', sparse_output=False)
LogisticRegression(class_weight={0.0: 0.5381774965030861,\n", + " 1.0: 7.048360235716116},\n", + " max_iter=5000, solver='saga')