From d3fa9f6870c09737689e0a01b3287bdc9bb9cfae Mon Sep 17 00:00:00 2001 From: arevelle-ensae Date: Fri, 8 Mar 2024 13:48:38 +0000 Subject: [PATCH] prepare Pipeline --- Sport/Modelization/2_Modelization_sport.ipynb | 173 ++++++++++++++++++ 1 file changed, 173 insertions(+) create mode 100644 Sport/Modelization/2_Modelization_sport.ipynb diff --git a/Sport/Modelization/2_Modelization_sport.ipynb b/Sport/Modelization/2_Modelization_sport.ipynb new file mode 100644 index 0000000..ece151c --- /dev/null +++ b/Sport/Modelization/2_Modelization_sport.ipynb @@ -0,0 +1,173 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "3415114e-9577-4487-89eb-4931620ad9f0", + "metadata": {}, + "source": [ + "# Predict Sales" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "f271eb45-1470-4764-8c2e-31374efa1fe5", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import os\n", + "import s3fs\n", + "import re\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "from sklearn.impute import SimpleImputer\n", + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler\n", + "from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score\n", + "import pickle\n", + "#import scikitplot as skplt" + ] + }, + { + "cell_type": "markdown", + "id": "ae591854-3003-4c75-a0c7-5abf04246e81", + "metadata": {}, + "source": [ + "### Load Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59dd4694-a812-4923-b995-a2ee86c74f85", + "metadata": {}, + "outputs": [], + "source": [ + "# Create filesystem object\n", + "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", + "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "017f7e9a-3ba0-40fa-bdc8-51b98cc1fdb3", + "metadata": {}, + "outputs": [], + "source": [ + "def load_train_test():\n", + " BUCKET = \"projet-bdc2324-team1/Generalization/sport/\"\n", + " File_path_train = BUCKET + \"/\" + \"dataset_train.csv\"\n", + " File_path_test = BUCKET + \"/\" + \"dataset_train.csv\"\n", + " \n", + " with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", + " dataset_train = pd.read_csv(file_in, sep=\",\")\n", + " dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)\n", + "\n", + " with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", + " dataset_test = pd.read_csv(file_in, sep=\",\")\n", + " dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)\n", + " \n", + " return dataset_train, dataset_test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "825d14a3-6967-4733-bfd4-64bf61c2bd43", + "metadata": {}, + "outputs": [], + "source": [ + "def features_target_split(dataset_train, dataset_test):\n", + " X_train = dataset_train[]\n", + " y_train = dataset_train['y_has_purchased']\n", + "\n", + " X_test = dataset_test[]\n", + " y_test = dataset_test['y_has_purchased']\n", + " return X_train, X_test, y_train, y_test" + ] + }, + { + "cell_type": "markdown", + "id": "a1d6de94-4e11-481a-a0ce-412bf29f692c", + "metadata": {}, + "source": [ + "### Prepare preprocessing and Hyperparameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b32a79ea-907f-4dfc-9832-6c74bef3200c", + "metadata": {}, + "outputs": [], + "source": [ + "numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',\n", + " 'nb_tickets_internet', 'fidelity', 'nb_campaigns', 'nb_campaigns_opened']\n", + "\n", + "numeric_transformer = Pipeline(steps=[\n", + " # (\"imputer\", SimpleImputer(strategy=\"mean\")), # NaN remplacés par la moyenne, mais peu importe car on a supprimé les valeurs manquantes\n", + " (\"scaler\", StandardScaler())])\n", + "\n", + "preproc = ColumnTransformer(transformers=[(\"num\", numeric_transformer, numeric_features)])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9809a688-bfbc-4685-a77f-17a8b2b79ab3", + "metadata": {}, + "outputs": [], + "source": [ + "# Set loss\n", + "\n", + "balanced_scorer = make_scorer(balanced_accuracy_score)\n", + "f1_scorer = make_scorer(f1_score)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "206d9a95-7c37-4506-949b-e77d225e42c5", + "metadata": {}, + "outputs": [], + "source": [ + "# Hyperparameter\n", + "\n", + "parameters4 = {'logreg__C': np.logspace(-10, 6, 17, base=2),\n", + " 'logreg__class_weight': ['balanced']} " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}