# Predict sales - sports companies

# Importations
import pandas as pd
import numpy as np
import os
import s3fs
import re
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
from sklearn.utils import class_weight
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.exceptions import ConvergenceWarning, DataConversionWarning

import pickle
import warnings

# Load data

# S3 filesystem handle. The endpoint host is read from the AWS_S3_ENDPOINT
# environment variable (a KeyError is raised when it is not set).
S3_ENDPOINT_URL = f"https://{os.environ['AWS_S3_ENDPOINT']}"
fs = s3fs.S3FileSystem(client_kwargs=dict(endpoint_url=S3_ENDPOINT_URL))
def _read_csv_from_s3(path):
    """Read one comma-separated file from the S3 filesystem into a DataFrame.

    Uses the notebook-level ``fs`` S3 filesystem object.
    """
    with fs.open(path, mode="rb") as file_in:
        # low_memory=False parses each column in a single pass, so dtype
        # inference is consistent over the whole file. The default chunked
        # parsing raised "DtypeWarning: Columns (38) have mixed types".
        return pd.read_csv(file_in, sep=",", low_memory=False)


def load_train_test(bucket="projet-bdc2324-team1/Generalization/sport"):
    """Load the train and test sets from S3.

    Parameters
    ----------
    bucket : str, optional
        S3 prefix that contains ``Train_set.csv`` and ``Test_set.csv``.
        Defaults to the location used by this notebook.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(dataset_train, dataset_test)``, in that order.

    Notes
    -----
    Relies on the notebook-level ``fs`` S3 filesystem object, which must be
    created before this function is called.
    """
    dataset_train = _read_csv_from_s3(bucket + "/Train_set.csv")
    dataset_test = _read_csv_from_s3(bucket + "/Test_set.csv")
    return dataset_train, dataset_test
dataset_train, dataset_test = load_train_test()
dataset_train.isna().sum()


def features_target_split(dataset_train, dataset_test):
    """Split the train and test sets into feature matrices and targets.

    ``fidelity``, ``time_between_purchase`` and ``gender_other`` are left out
    of the features on purpose (collinearity issue).

    Parameters
    ----------
    dataset_train, dataset_test : pandas.DataFrame
        Frames containing every column listed in ``features_l`` plus the
        ``y_has_purchased`` target column.

    Returns
    -------
    X_train, X_test, y_train, y_test : pandas.DataFrame
        Feature frames restricted to ``features_l`` and single-column target
        frames (column ``y_has_purchased``), in that order.
    """
    features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max',
                  'purchase_date_min', 'purchase_date_max', 'nb_tickets_internet', 'is_email_true',
                  'opt_in', 'gender_female', 'gender_male', 'nb_campaigns', 'nb_campaigns_opened']
    # Targets are kept as one-column DataFrames (not Series) because the
    # class-weight computation below indexes them by column name.
    target_l = ['y_has_purchased']

    X_train = dataset_train[features_l]
    y_train = dataset_train[target_l]

    X_test = dataset_test[features_l]
    y_test = dataset_test[target_l]
    return X_train, X_test, y_train, y_test


X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)
print("Shape train : ", X_train.shape)
print("Shape test : ", X_test.shape)

# Prepare preprocessing and Hyperparameters

# Compute Weights
# 'balanced' weights computed from the training target; the class labels are
# extracted once and reused for both the weights and the dictionary keys.
target_classes = np.unique(y_train['y_has_purchased'])
weights = class_weight.compute_class_weight(class_weight='balanced', classes=target_classes,
                                            y=y_train['y_has_purchased'])

# Map each class label to its weight.
weight_dict = dict(zip(target_classes, weights))
weight_dict
\n", " | nb_tickets | \n", "nb_purchases | \n", "total_amount | \n", "nb_suppliers | \n", "vente_internet_max | \n", "purchase_date_min | \n", "purchase_date_max | \n", "nb_tickets_internet | \n", "is_email_true | \n", "opt_in | \n", "gender_female | \n", "gender_male | \n", "nb_campaigns | \n", "nb_campaigns_opened | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "2.0 | \n", "1.0 | \n", "60.00 | \n", "1.0 | \n", "0.0 | \n", "355.268981 | \n", "355.268981 | \n", "0.0 | \n", "True | \n", "False | \n", "0 | \n", "1 | \n", "0.0 | \n", "0.0 | \n", "
1 | \n", "8.0 | \n", "3.0 | \n", "140.00 | \n", "1.0 | \n", "0.0 | \n", "373.540289 | \n", "219.262269 | \n", "0.0 | \n", "True | \n", "False | \n", "0 | \n", "1 | \n", "0.0 | \n", "0.0 | \n", "
2 | \n", "2.0 | \n", "1.0 | \n", "50.00 | \n", "1.0 | \n", "0.0 | \n", "5.202442 | \n", "5.202442 | \n", "0.0 | \n", "True | \n", "False | \n", "0 | \n", "1 | \n", "0.0 | \n", "0.0 | \n", "
3 | \n", "3.0 | \n", "1.0 | \n", "90.00 | \n", "1.0 | \n", "0.0 | \n", "5.178958 | \n", "5.178958 | \n", "0.0 | \n", "True | \n", "False | \n", "0 | \n", "1 | \n", "0.0 | \n", "0.0 | \n", "
4 | \n", "2.0 | \n", "1.0 | \n", "78.00 | \n", "1.0 | \n", "0.0 | \n", "5.174039 | \n", "5.174039 | \n", "0.0 | \n", "True | \n", "False | \n", "1 | \n", "0 | \n", "0.0 | \n", "0.0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
224208 | \n", "0.0 | \n", "0.0 | \n", "0.00 | \n", "0.0 | \n", "0.0 | \n", "550.000000 | \n", "550.000000 | \n", "0.0 | \n", "True | \n", "False | \n", "0 | \n", "1 | \n", "34.0 | \n", "3.0 | \n", "
224209 | \n", "1.0 | \n", "1.0 | \n", "20.00 | \n", "1.0 | \n", "1.0 | \n", "392.501030 | \n", "392.501030 | \n", "1.0 | \n", "True | \n", "False | \n", "0 | \n", "1 | \n", "23.0 | \n", "6.0 | \n", "
224210 | \n", "0.0 | \n", "0.0 | \n", "0.00 | \n", "0.0 | \n", "0.0 | \n", "550.000000 | \n", "550.000000 | \n", "0.0 | \n", "True | \n", "True | \n", "0 | \n", "1 | \n", "8.0 | \n", "4.0 | \n", "
224211 | \n", "1.0 | \n", "1.0 | \n", "97.11 | \n", "1.0 | \n", "1.0 | \n", "172.334074 | \n", "172.334074 | \n", "1.0 | \n", "True | \n", "False | \n", "0 | \n", "1 | \n", "13.0 | \n", "5.0 | \n", "
224212 | \n", "0.0 | \n", "0.0 | \n", "0.00 | \n", "0.0 | \n", "0.0 | \n", "550.000000 | \n", "550.000000 | \n", "0.0 | \n", "True | \n", "False | \n", "0 | \n", "1 | \n", "4.0 | \n", "4.0 | \n", "
224213 rows × 14 columns
\n", "Pipeline(steps=[('preprocessor',\n", " ColumnTransformer(transformers=[('num',\n", " Pipeline(steps=[('scaler',\n", " StandardScaler())]),\n", " ['nb_tickets', 'nb_purchases',\n", " 'total_amount',\n", " 'nb_suppliers',\n", " 'vente_internet_max',\n", " 'purchase_date_min',\n", " 'purchase_date_max',\n", " 'nb_tickets_internet',\n", " 'nb_campaigns',\n", " 'nb_campaigns_opened']),\n", " ('cat',\n", " Pipeline(steps=[('onehot',\n", " OneHotEncoder(handle_unknown='ignore',\n", " sparse_output=False))]),\n", " ['opt_in',\n", " 'is_email_true'])])),\n", " ('logreg',\n", " LogisticRegression(class_weight={0.0: 0.5837086520288036,\n", " 1.0: 3.486549107420539},\n", " max_iter=5000, solver='saga'))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('preprocessor',\n", " ColumnTransformer(transformers=[('num',\n", " Pipeline(steps=[('scaler',\n", " StandardScaler())]),\n", " ['nb_tickets', 'nb_purchases',\n", " 'total_amount',\n", " 'nb_suppliers',\n", " 'vente_internet_max',\n", " 'purchase_date_min',\n", " 'purchase_date_max',\n", " 'nb_tickets_internet',\n", " 'nb_campaigns',\n", " 'nb_campaigns_opened']),\n", " ('cat',\n", " Pipeline(steps=[('onehot',\n", " OneHotEncoder(handle_unknown='ignore',\n", " sparse_output=False))]),\n", " ['opt_in',\n", " 'is_email_true'])])),\n", " ('logreg',\n", " LogisticRegression(class_weight={0.0: 0.5837086520288036,\n", " 1.0: 3.486549107420539},\n", " max_iter=5000, solver='saga'))])
ColumnTransformer(transformers=[('num',\n", " Pipeline(steps=[('scaler', StandardScaler())]),\n", " ['nb_tickets', 'nb_purchases', 'total_amount',\n", " 'nb_suppliers', 'vente_internet_max',\n", " 'purchase_date_min', 'purchase_date_max',\n", " 'nb_tickets_internet', 'nb_campaigns',\n", " 'nb_campaigns_opened']),\n", " ('cat',\n", " Pipeline(steps=[('onehot',\n", " OneHotEncoder(handle_unknown='ignore',\n", " sparse_output=False))]),\n", " ['opt_in', 'is_email_true'])])
['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max', 'nb_tickets_internet', 'nb_campaigns', 'nb_campaigns_opened']
StandardScaler()
['opt_in', 'is_email_true']
OneHotEncoder(handle_unknown='ignore', sparse_output=False)
LogisticRegression(class_weight={0.0: 0.5837086520288036,\n", " 1.0: 3.486549107420539},\n", " max_iter=5000, solver='saga')
\n", " | nb_tickets | \n", "nb_purchases | \n", "total_amount | \n", "nb_suppliers | \n", "vente_internet_max | \n", "purchase_date_min | \n", "purchase_date_max | \n", "nb_tickets_internet | \n", "is_email_true | \n", "opt_in | \n", "gender_female | \n", "gender_male | \n", "nb_campaigns | \n", "nb_campaigns_opened | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
64466 | \n", "11.0 | \n", "4.0 | \n", "281.4 | \n", "1.0 | \n", "1.0 | \n", "238.330591 | \n", "30.285040 | \n", "11.0 | \n", "True | \n", "False | \n", "1 | \n", "0 | \n", "0.0 | \n", "0.0 | \n", "
141327 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "550.000000 | \n", "550.000000 | \n", "0.0 | \n", "True | \n", "True | \n", "0 | \n", "0 | \n", "10.0 | \n", "0.0 | \n", "
59999 | \n", "2.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "1.0 | \n", "350.288926 | \n", "350.288926 | \n", "2.0 | \n", "True | \n", "False | \n", "1 | \n", "0 | \n", "0.0 | \n", "0.0 | \n", "
26882 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "550.000000 | \n", "550.000000 | \n", "0.0 | \n", "True | \n", "False | \n", "1 | \n", "0 | \n", "4.0 | \n", "1.0 | \n", "
62952 | \n", "11.0 | \n", "3.0 | \n", "325.0 | \n", "1.0 | \n", "1.0 | \n", "424.486781 | \n", "237.282262 | \n", "11.0 | \n", "True | \n", "False | \n", "0 | \n", "0 | \n", "0.0 | \n", "0.0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
141318 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "550.000000 | \n", "550.000000 | \n", "0.0 | \n", "True | \n", "True | \n", "0 | \n", "0 | \n", "16.0 | \n", "1.0 | \n", "
113838 | \n", "3.0 | \n", "2.0 | \n", "15.0 | \n", "1.0 | \n", "1.0 | \n", "153.152945 | \n", "90.277099 | \n", "3.0 | \n", "True | \n", "True | \n", "0 | \n", "1 | \n", "31.0 | \n", "14.0 | \n", "
184926 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "550.000000 | \n", "550.000000 | \n", "0.0 | \n", "True | \n", "True | \n", "0 | \n", "1 | \n", "18.0 | \n", "0.0 | \n", "
14617 | \n", "1.0 | \n", "1.0 | \n", "20.0 | \n", "1.0 | \n", "0.0 | \n", "239.258970 | \n", "239.258970 | \n", "0.0 | \n", "True | \n", "True | \n", "0 | \n", "1 | \n", "0.0 | \n", "0.0 | \n", "
21685 | \n", "4.0 | \n", "1.0 | \n", "88.0 | \n", "1.0 | \n", "0.0 | \n", "240.355162 | \n", "240.355162 | \n", "0.0 | \n", "True | \n", "True | \n", "0 | \n", "1 | \n", "0.0 | \n", "0.0 | \n", "
10000 rows × 14 columns
\n", "\n", " | y_has_purchased | \n", "
---|---|
64466 | \n", "0.0 | \n", "
141327 | \n", "0.0 | \n", "
59999 | \n", "0.0 | \n", "
26882 | \n", "0.0 | \n", "
62952 | \n", "0.0 | \n", "
... | \n", "... | \n", "
141318 | \n", "0.0 | \n", "
113838 | \n", "0.0 | \n", "
184926 | \n", "0.0 | \n", "
14617 | \n", "0.0 | \n", "
21685 | \n", "0.0 | \n", "
10000 rows × 1 columns
\n", "Pipeline(steps=[('preprocessor',\n", " ColumnTransformer(transformers=[('num',\n", " Pipeline(steps=[('scaler',\n", " StandardScaler())]),\n", " ['nb_tickets', 'nb_purchases',\n", " 'total_amount',\n", " 'nb_suppliers',\n", " 'vente_internet_max',\n", " 'purchase_date_min',\n", " 'purchase_date_max',\n", " 'nb_tickets_internetnb_campaigns',\n", " 'nb_campaigns_opened']),\n", " ('cat',\n", " Pipeline(steps=[('onehot',\n", " OneHotEncoder(handle_unknown='ignore',\n", " sparse_output=False))]),\n", " ['opt_in',\n", " 'is_email_true'])])),\n", " ('logreg',\n", " LogisticRegression(class_weight={0.0: 0.5837086520288036,\n", " 1.0: 3.486549107420539},\n", " max_iter=5000, solver='saga'))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('preprocessor',\n", " ColumnTransformer(transformers=[('num',\n", " Pipeline(steps=[('scaler',\n", " StandardScaler())]),\n", " ['nb_tickets', 'nb_purchases',\n", " 'total_amount',\n", " 'nb_suppliers',\n", " 'vente_internet_max',\n", " 'purchase_date_min',\n", " 'purchase_date_max',\n", " 'nb_tickets_internetnb_campaigns',\n", " 'nb_campaigns_opened']),\n", " ('cat',\n", " Pipeline(steps=[('onehot',\n", " OneHotEncoder(handle_unknown='ignore',\n", " sparse_output=False))]),\n", " ['opt_in',\n", " 'is_email_true'])])),\n", " ('logreg',\n", " LogisticRegression(class_weight={0.0: 0.5837086520288036,\n", " 1.0: 3.486549107420539},\n", " max_iter=5000, solver='saga'))])
ColumnTransformer(transformers=[('num',\n", " Pipeline(steps=[('scaler', StandardScaler())]),\n", " ['nb_tickets', 'nb_purchases', 'total_amount',\n", " 'nb_suppliers', 'vente_internet_max',\n", " 'purchase_date_min', 'purchase_date_max',\n", " 'nb_tickets_internetnb_campaigns',\n", " 'nb_campaigns_opened']),\n", " ('cat',\n", " Pipeline(steps=[('onehot',\n", " OneHotEncoder(handle_unknown='ignore',\n", " sparse_output=False))]),\n", " ['opt_in', 'is_email_true'])])
['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max', 'nb_tickets_internetnb_campaigns', 'nb_campaigns_opened']
StandardScaler()
['opt_in', 'is_email_true']
OneHotEncoder(handle_unknown='ignore', sparse_output=False)
LogisticRegression(class_weight={0.0: 0.5837086520288036,\n", " 1.0: 3.486549107420539},\n", " max_iter=5000, solver='saga')
\n", " | nb_tickets | \n", "nb_purchases | \n", "total_amount | \n", "nb_suppliers | \n", "vente_internet_max | \n", "purchase_date_min | \n", "purchase_date_max | \n", "nb_tickets_internet | \n", "is_email_true | \n", "opt_in | \n", "gender_female | \n", "gender_male | \n", "nb_campaigns | \n", "nb_campaigns_opened | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
64466 | \n", "11.0 | \n", "4.0 | \n", "281.4 | \n", "1.0 | \n", "1.0 | \n", "238.330591 | \n", "30.285040 | \n", "11.0 | \n", "True | \n", "False | \n", "1 | \n", "0 | \n", "0.0 | \n", "0.0 | \n", "
141327 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "550.000000 | \n", "550.000000 | \n", "0.0 | \n", "True | \n", "True | \n", "0 | \n", "0 | \n", "10.0 | \n", "0.0 | \n", "
59999 | \n", "2.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "1.0 | \n", "350.288926 | \n", "350.288926 | \n", "2.0 | \n", "True | \n", "False | \n", "1 | \n", "0 | \n", "0.0 | \n", "0.0 | \n", "
26882 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "550.000000 | \n", "550.000000 | \n", "0.0 | \n", "True | \n", "False | \n", "1 | \n", "0 | \n", "4.0 | \n", "1.0 | \n", "
62952 | \n", "11.0 | \n", "3.0 | \n", "325.0 | \n", "1.0 | \n", "1.0 | \n", "424.486781 | \n", "237.282262 | \n", "11.0 | \n", "True | \n", "False | \n", "0 | \n", "0 | \n", "0.0 | \n", "0.0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
141318 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "550.000000 | \n", "550.000000 | \n", "0.0 | \n", "True | \n", "True | \n", "0 | \n", "0 | \n", "16.0 | \n", "1.0 | \n", "
113838 | \n", "3.0 | \n", "2.0 | \n", "15.0 | \n", "1.0 | \n", "1.0 | \n", "153.152945 | \n", "90.277099 | \n", "3.0 | \n", "True | \n", "True | \n", "0 | \n", "1 | \n", "31.0 | \n", "14.0 | \n", "
184926 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "550.000000 | \n", "550.000000 | \n", "0.0 | \n", "True | \n", "True | \n", "0 | \n", "1 | \n", "18.0 | \n", "0.0 | \n", "
14617 | \n", "1.0 | \n", "1.0 | \n", "20.0 | \n", "1.0 | \n", "0.0 | \n", "239.258970 | \n", "239.258970 | \n", "0.0 | \n", "True | \n", "True | \n", "0 | \n", "1 | \n", "0.0 | \n", "0.0 | \n", "
21685 | \n", "4.0 | \n", "1.0 | \n", "88.0 | \n", "1.0 | \n", "0.0 | \n", "240.355162 | \n", "240.355162 | \n", "0.0 | \n", "True | \n", "True | \n", "0 | \n", "1 | \n", "0.0 | \n", "0.0 | \n", "
10000 rows × 14 columns
\n", "GridSearchCV(cv=3,\n", " estimator=Pipeline(steps=[('preprocessor',\n", " ColumnTransformer(transformers=[('num',\n", " Pipeline(steps=[('scaler',\n", " StandardScaler())]),\n", " ['nb_tickets',\n", " 'nb_purchases',\n", " 'total_amount',\n", " 'nb_suppliers',\n", " 'vente_internet_max',\n", " 'purchase_date_min',\n", " 'purchase_date_max',\n", " 'nb_tickets_internet',\n", " 'nb_campaigns',\n", " 'nb_campaigns_opened']),\n", " ('cat',\n", " Pipeline(steps=[(...\n", " 1.0: 3.486549107420539},\n", " max_iter=5000,\n", " solver='saga'))]),\n", " param_grid={'logreg__C': array([9.765625e-04, 1.953125e-03, 3.906250e-03, 7.812500e-03,\n", " 1.562500e-02, 3.125000e-02, 6.250000e-02, 1.250000e-01,\n", " 2.500000e-01, 5.000000e-01, 1.000000e+00, 2.000000e+00,\n", " 4.000000e+00, 8.000000e+00, 1.600000e+01]),\n", " 'logreg__penalty': ['l1']},\n", " scoring=make_scorer(f1_score, response_method='predict'))In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
GridSearchCV(cv=3,\n", " estimator=Pipeline(steps=[('preprocessor',\n", " ColumnTransformer(transformers=[('num',\n", " Pipeline(steps=[('scaler',\n", " StandardScaler())]),\n", " ['nb_tickets',\n", " 'nb_purchases',\n", " 'total_amount',\n", " 'nb_suppliers',\n", " 'vente_internet_max',\n", " 'purchase_date_min',\n", " 'purchase_date_max',\n", " 'nb_tickets_internet',\n", " 'nb_campaigns',\n", " 'nb_campaigns_opened']),\n", " ('cat',\n", " Pipeline(steps=[(...\n", " 1.0: 3.486549107420539},\n", " max_iter=5000,\n", " solver='saga'))]),\n", " param_grid={'logreg__C': array([9.765625e-04, 1.953125e-03, 3.906250e-03, 7.812500e-03,\n", " 1.562500e-02, 3.125000e-02, 6.250000e-02, 1.250000e-01,\n", " 2.500000e-01, 5.000000e-01, 1.000000e+00, 2.000000e+00,\n", " 4.000000e+00, 8.000000e+00, 1.600000e+01]),\n", " 'logreg__penalty': ['l1']},\n", " scoring=make_scorer(f1_score, response_method='predict'))
Pipeline(steps=[('preprocessor',\n", " ColumnTransformer(transformers=[('num',\n", " Pipeline(steps=[('scaler',\n", " StandardScaler())]),\n", " ['nb_tickets', 'nb_purchases',\n", " 'total_amount',\n", " 'nb_suppliers',\n", " 'vente_internet_max',\n", " 'purchase_date_min',\n", " 'purchase_date_max',\n", " 'nb_tickets_internet',\n", " 'nb_campaigns',\n", " 'nb_campaigns_opened']),\n", " ('cat',\n", " Pipeline(steps=[('onehot',\n", " OneHotEncoder(handle_unknown='ignore',\n", " sparse_output=False))]),\n", " ['opt_in',\n", " 'is_email_true'])])),\n", " ('logreg',\n", " LogisticRegression(class_weight={0.0: 0.5837086520288036,\n", " 1.0: 3.486549107420539},\n", " max_iter=5000, solver='saga'))])
ColumnTransformer(transformers=[('num',\n", " Pipeline(steps=[('scaler', StandardScaler())]),\n", " ['nb_tickets', 'nb_purchases', 'total_amount',\n", " 'nb_suppliers', 'vente_internet_max',\n", " 'purchase_date_min', 'purchase_date_max',\n", " 'nb_tickets_internet', 'nb_campaigns',\n", " 'nb_campaigns_opened']),\n", " ('cat',\n", " Pipeline(steps=[('onehot',\n", " OneHotEncoder(handle_unknown='ignore',\n", " sparse_output=False))]),\n", " ['opt_in', 'is_email_true'])])
['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max', 'nb_tickets_internet', 'nb_campaigns', 'nb_campaigns_opened']
StandardScaler()
['opt_in', 'is_email_true']
OneHotEncoder(handle_unknown='ignore', sparse_output=False)
LogisticRegression(class_weight={0.0: 0.5837086520288036,\n", " 1.0: 3.486549107420539},\n", " max_iter=5000, solver='saga')
GridSearchCV(cv=3,\n", " estimator=Pipeline(steps=[('preprocessor',\n", " ColumnTransformer(transformers=[('num',\n", " Pipeline(steps=[('scaler',\n", " StandardScaler())]),\n", " ['nb_tickets',\n", " 'nb_purchases',\n", " 'total_amount',\n", " 'nb_suppliers',\n", " 'vente_internet_max',\n", " 'purchase_date_min',\n", " 'purchase_date_max',\n", " 'nb_tickets_internet',\n", " 'nb_campaigns',\n", " 'nb_campaigns_opened']),\n", " ('cat',\n", " Pipeline(steps=[(...\n", " 1.0: 3.486549107420539},\n", " max_iter=5000,\n", " solver='saga'))]),\n", " param_grid={'logreg__C': array([9.765625e-04, 1.953125e-03, 3.906250e-03, 7.812500e-03,\n", " 1.562500e-02, 3.125000e-02, 6.250000e-02, 1.250000e-01,\n", " 2.500000e-01, 5.000000e-01, 1.000000e+00, 2.000000e+00,\n", " 4.000000e+00, 8.000000e+00, 1.600000e+01]),\n", " 'logreg__penalty': ['l1']},\n", " scoring=make_scorer(f1_score, response_method='predict'))In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
GridSearchCV(cv=3,\n", " estimator=Pipeline(steps=[('preprocessor',\n", " ColumnTransformer(transformers=[('num',\n", " Pipeline(steps=[('scaler',\n", " StandardScaler())]),\n", " ['nb_tickets',\n", " 'nb_purchases',\n", " 'total_amount',\n", " 'nb_suppliers',\n", " 'vente_internet_max',\n", " 'purchase_date_min',\n", " 'purchase_date_max',\n", " 'nb_tickets_internet',\n", " 'nb_campaigns',\n", " 'nb_campaigns_opened']),\n", " ('cat',\n", " Pipeline(steps=[(...\n", " 1.0: 3.486549107420539},\n", " max_iter=5000,\n", " solver='saga'))]),\n", " param_grid={'logreg__C': array([9.765625e-04, 1.953125e-03, 3.906250e-03, 7.812500e-03,\n", " 1.562500e-02, 3.125000e-02, 6.250000e-02, 1.250000e-01,\n", " 2.500000e-01, 5.000000e-01, 1.000000e+00, 2.000000e+00,\n", " 4.000000e+00, 8.000000e+00, 1.600000e+01]),\n", " 'logreg__penalty': ['l1']},\n", " scoring=make_scorer(f1_score, response_method='predict'))
Pipeline(steps=[('preprocessor',\n", " ColumnTransformer(transformers=[('num',\n", " Pipeline(steps=[('scaler',\n", " StandardScaler())]),\n", " ['nb_tickets', 'nb_purchases',\n", " 'total_amount',\n", " 'nb_suppliers',\n", " 'vente_internet_max',\n", " 'purchase_date_min',\n", " 'purchase_date_max',\n", " 'nb_tickets_internet',\n", " 'nb_campaigns',\n", " 'nb_campaigns_opened']),\n", " ('cat',\n", " Pipeline(steps=[('onehot',\n", " OneHotEncoder(handle_unknown='ignore',\n", " sparse_output=False))]),\n", " ['opt_in',\n", " 'is_email_true'])])),\n", " ('logreg',\n", " LogisticRegression(class_weight={0.0: 0.5837086520288036,\n", " 1.0: 3.486549107420539},\n", " max_iter=5000, solver='saga'))])
ColumnTransformer(transformers=[('num',\n", " Pipeline(steps=[('scaler', StandardScaler())]),\n", " ['nb_tickets', 'nb_purchases', 'total_amount',\n", " 'nb_suppliers', 'vente_internet_max',\n", " 'purchase_date_min', 'purchase_date_max',\n", " 'nb_tickets_internet', 'nb_campaigns',\n", " 'nb_campaigns_opened']),\n", " ('cat',\n", " Pipeline(steps=[('onehot',\n", " OneHotEncoder(handle_unknown='ignore',\n", " sparse_output=False))]),\n", " ['opt_in', 'is_email_true'])])
['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max', 'nb_tickets_internet', 'nb_campaigns', 'nb_campaigns_opened']
StandardScaler()
['opt_in', 'is_email_true']
OneHotEncoder(handle_unknown='ignore', sparse_output=False)
LogisticRegression(class_weight={0.0: 0.5837086520288036,\n", " 1.0: 3.486549107420539},\n", " max_iter=5000, solver='saga')
\n", " | nb_tickets | \n", "nb_purchases | \n", "total_amount | \n", "nb_suppliers | \n", "vente_internet_max | \n", "purchase_date_min | \n", "purchase_date_max | \n", "nb_tickets_internet | \n", "is_email_true | \n", "opt_in | \n", "gender_female | \n", "gender_male | \n", "nb_campaigns | \n", "nb_campaigns_opened | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "2.0 | \n", "1.0 | \n", "60.00 | \n", "1.0 | \n", "0.0 | \n", "355.268981 | \n", "355.268981 | \n", "0.0 | \n", "True | \n", "False | \n", "0 | \n", "1 | \n", "0.0 | \n", "0.0 | \n", "
1 | \n", "8.0 | \n", "3.0 | \n", "140.00 | \n", "1.0 | \n", "0.0 | \n", "373.540289 | \n", "219.262269 | \n", "0.0 | \n", "True | \n", "False | \n", "0 | \n", "1 | \n", "0.0 | \n", "0.0 | \n", "
2 | \n", "2.0 | \n", "1.0 | \n", "50.00 | \n", "1.0 | \n", "0.0 | \n", "5.202442 | \n", "5.202442 | \n", "0.0 | \n", "True | \n", "False | \n", "0 | \n", "1 | \n", "0.0 | \n", "0.0 | \n", "
3 | \n", "3.0 | \n", "1.0 | \n", "90.00 | \n", "1.0 | \n", "0.0 | \n", "5.178958 | \n", "5.178958 | \n", "0.0 | \n", "True | \n", "False | \n", "0 | \n", "1 | \n", "0.0 | \n", "0.0 | \n", "
4 | \n", "2.0 | \n", "1.0 | \n", "78.00 | \n", "1.0 | \n", "0.0 | \n", "5.174039 | \n", "5.174039 | \n", "0.0 | \n", "True | \n", "False | \n", "1 | \n", "0 | \n", "0.0 | \n", "0.0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
224208 | \n", "0.0 | \n", "0.0 | \n", "0.00 | \n", "0.0 | \n", "0.0 | \n", "550.000000 | \n", "550.000000 | \n", "0.0 | \n", "True | \n", "False | \n", "0 | \n", "1 | \n", "34.0 | \n", "3.0 | \n", "
224209 | \n", "1.0 | \n", "1.0 | \n", "20.00 | \n", "1.0 | \n", "1.0 | \n", "392.501030 | \n", "392.501030 | \n", "1.0 | \n", "True | \n", "False | \n", "0 | \n", "1 | \n", "23.0 | \n", "6.0 | \n", "
224210 | \n", "0.0 | \n", "0.0 | \n", "0.00 | \n", "0.0 | \n", "0.0 | \n", "550.000000 | \n", "550.000000 | \n", "0.0 | \n", "True | \n", "True | \n", "0 | \n", "1 | \n", "8.0 | \n", "4.0 | \n", "
224211 | \n", "1.0 | \n", "1.0 | \n", "97.11 | \n", "1.0 | \n", "1.0 | \n", "172.334074 | \n", "172.334074 | \n", "1.0 | \n", "True | \n", "False | \n", "0 | \n", "1 | \n", "13.0 | \n", "5.0 | \n", "
224212 | \n", "0.0 | \n", "0.0 | \n", "0.00 | \n", "0.0 | \n", "0.0 | \n", "550.000000 | \n", "550.000000 | \n", "0.0 | \n", "True | \n", "False | \n", "0 | \n", "1 | \n", "4.0 | \n", "4.0 | \n", "
224213 rows × 14 columns
\n", "