generalization #8
173
Sport/Modelization/2_Modelization_sport.ipynb
Normal file
173
Sport/Modelization/2_Modelization_sport.ipynb
Normal file
|
@ -0,0 +1,173 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3415114e-9577-4487-89eb-4931620ad9f0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Predict Sales"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "f271eb45-1470-4764-8c2e-31374efa1fe5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"import os\n",
|
||||
"import s3fs\n",
|
||||
"import re\n",
|
||||
"from sklearn.linear_model import LogisticRegression\n",
|
||||
"from sklearn.ensemble import RandomForestClassifier\n",
|
||||
"from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n",
|
||||
"from sklearn.neighbors import KNeighborsClassifier\n",
|
||||
"from sklearn.pipeline import Pipeline\n",
|
||||
"from sklearn.compose import ColumnTransformer\n",
|
||||
"from sklearn.preprocessing import OneHotEncoder\n",
|
||||
"from sklearn.impute import SimpleImputer\n",
|
||||
"from sklearn.model_selection import GridSearchCV\n",
|
||||
"from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler\n",
|
||||
"from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score\n",
|
||||
"import seaborn as sns\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score\n",
|
||||
"import pickle\n",
|
||||
"#import scikitplot as skplt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ae591854-3003-4c75-a0c7-5abf04246e81",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Load Data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "59dd4694-a812-4923-b995-a2ee86c74f85",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Create filesystem object\n",
|
||||
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
|
||||
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "017f7e9a-3ba0-40fa-bdc8-51b98cc1fdb3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def load_train_test():\n",
|
||||
" BUCKET = \"projet-bdc2324-team1/Generalization/sport/\"\n",
|
||||
" File_path_train = BUCKET + \"/\" + \"dataset_train.csv\"\n",
|
||||
" File_path_test = BUCKET + \"/\" + \"dataset_train.csv\"\n",
|
||||
" \n",
|
||||
" with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
||||
" dataset_train = pd.read_csv(file_in, sep=\",\")\n",
|
||||
" dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)\n",
|
||||
"\n",
|
||||
" with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
||||
" dataset_test = pd.read_csv(file_in, sep=\",\")\n",
|
||||
" dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)\n",
|
||||
" \n",
|
||||
" return dataset_train, dataset_test"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "825d14a3-6967-4733-bfd4-64bf61c2bd43",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def features_target_split(dataset_train, dataset_test):\n",
|
||||
" X_train = dataset_train[]\n",
|
||||
" y_train = dataset_train['y_has_purchased']\n",
|
||||
"\n",
|
||||
" X_test = dataset_test[]\n",
|
||||
" y_test = dataset_test['y_has_purchased']\n",
|
||||
" return X_train, X_test, y_train, y_test"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a1d6de94-4e11-481a-a0ce-412bf29f692c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Prepare preprocessing and Hyperparameters"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b32a79ea-907f-4dfc-9832-6c74bef3200c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',\n",
|
||||
" 'nb_tickets_internet', 'fidelity', 'nb_campaigns', 'nb_campaigns_opened']\n",
|
||||
"\n",
|
||||
"numeric_transformer = Pipeline(steps=[\n",
|
||||
" # (\"imputer\", SimpleImputer(strategy=\"mean\")), # NaN remplacés par la moyenne, mais peu importe car on a supprimé les valeurs manquantes\n",
|
||||
" (\"scaler\", StandardScaler())])\n",
|
||||
"\n",
|
||||
"preproc = ColumnTransformer(transformers=[(\"num\", numeric_transformer, numeric_features)])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9809a688-bfbc-4685-a77f-17a8b2b79ab3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Set loss\n",
|
||||
"\n",
|
||||
"balanced_scorer = make_scorer(balanced_accuracy_score)\n",
|
||||
"f1_scorer = make_scorer(f1_score)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "206d9a95-7c37-4506-949b-e77d225e42c5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Hyperparameter\n",
|
||||
"\n",
|
||||
"parameters4 = {'logreg__C': np.logspace(-10, 6, 17, base=2),\n",
|
||||
" 'logreg__class_weight': ['balanced']} "
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
Loading…
Reference in New Issue
Block a user