2867 lines
336 KiB
Plaintext
2867 lines
336 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "56949d8f-4eaf-4685-9989-ba0b4b1945b7",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Baseline logit on spectacle companies with statmodels"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "eae443dc-6c28-401a-a30e-e02f5f4da2df",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Importation des packages et des données"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"id": "72480e84-2ccc-481a-9353-1199e4358d62",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"import numpy as np\n",
|
||
"import os\n",
|
||
"import s3fs\n",
|
||
"import re\n",
|
||
"from sklearn.linear_model import LogisticRegression\n",
|
||
"from sklearn.ensemble import RandomForestClassifier\n",
|
||
"from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score\n",
|
||
"from sklearn.utils import class_weight\n",
|
||
"from sklearn.neighbors import KNeighborsClassifier\n",
|
||
"from sklearn.pipeline import Pipeline\n",
|
||
"from sklearn.compose import ColumnTransformer\n",
|
||
"from sklearn.preprocessing import OneHotEncoder\n",
|
||
"from sklearn.impute import SimpleImputer\n",
|
||
"from sklearn.model_selection import GridSearchCV\n",
|
||
"from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler\n",
|
||
"from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score\n",
|
||
"import seaborn as sns\n",
|
||
"import matplotlib.pyplot as plt\n",
|
||
"from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score\n",
|
||
"from sklearn.exceptions import ConvergenceWarning, DataConversionWarning\n",
|
||
"\n",
|
||
"import statsmodels.api as sm\n",
|
||
"\n",
|
||
"import pickle\n",
|
||
"import warnings"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"id": "7090dc21-7889-4776-a0a4-f7c6a5416d53",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Create filesystem object\n",
|
||
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
|
||
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"id": "2f0d08c9-5b26-4eff-9c89-4a46f427dbf7",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def load_train_test():\n",
|
||
" BUCKET = \"projet-bdc2324-team1/Generalization/musique\"\n",
|
||
" File_path_train = BUCKET + \"/Train_set.csv\"\n",
|
||
" File_path_test = BUCKET + \"/Test_set.csv\"\n",
|
||
" \n",
|
||
" with fs.open( File_path_train, mode=\"rb\") as file_in:\n",
|
||
" dataset_train = pd.read_csv(file_in, sep=\",\")\n",
|
||
" # dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)\n",
|
||
"\n",
|
||
" with fs.open(File_path_test, mode=\"rb\") as file_in:\n",
|
||
" dataset_test = pd.read_csv(file_in, sep=\",\")\n",
|
||
" # dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)\n",
|
||
" \n",
|
||
" return dataset_train, dataset_test"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"id": "438d0138-a254-464c-9e94-f7436576c1d5",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def features_target_split(dataset_train, dataset_test):\n",
|
||
" features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max', \n",
|
||
" 'time_between_purchase', 'nb_tickets_internet', 'fidelity', 'is_email_true', 'opt_in', #'is_partner',\n",
|
||
" 'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']\n",
|
||
" X_train = dataset_train[features_l]\n",
|
||
" y_train = dataset_train[['y_has_purchased']]\n",
|
||
"\n",
|
||
" X_test = dataset_test[features_l]\n",
|
||
" y_test = dataset_test[['y_has_purchased']]\n",
|
||
" return X_train, X_test, y_train, y_test"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"id": "ebe9a887-61a4-4a5e-ac64-231307dd7647",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/tmp/ipykernel_426/3642896088.py:7: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n",
|
||
" dataset_train = pd.read_csv(file_in, sep=\",\")\n",
|
||
"/tmp/ipykernel_426/3642896088.py:11: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n",
|
||
" dataset_test = pd.read_csv(file_in, sep=\",\")\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"dataset_train, dataset_test = load_train_test()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"id": "b21fdea2-02c4-4222-b4e0-635e423f91c2",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"customer_id 0\n",
|
||
"nb_tickets 0\n",
|
||
"nb_purchases 0\n",
|
||
"total_amount 0\n",
|
||
"nb_suppliers 0\n",
|
||
"vente_internet_max 0\n",
|
||
"purchase_date_min 0\n",
|
||
"purchase_date_max 0\n",
|
||
"time_between_purchase 0\n",
|
||
"nb_tickets_internet 0\n",
|
||
"street_id 0\n",
|
||
"structure_id 327067\n",
|
||
"mcp_contact_id 135224\n",
|
||
"fidelity 0\n",
|
||
"tenant_id 0\n",
|
||
"is_partner 0\n",
|
||
"deleted_at 354365\n",
|
||
"gender 0\n",
|
||
"is_email_true 0\n",
|
||
"opt_in 0\n",
|
||
"last_buying_date 119201\n",
|
||
"max_price 119201\n",
|
||
"ticket_sum 0\n",
|
||
"average_price 115193\n",
|
||
"average_purchase_delay 119203\n",
|
||
"average_price_basket 119203\n",
|
||
"average_ticket_basket 119203\n",
|
||
"total_price 4008\n",
|
||
"purchase_count 0\n",
|
||
"first_buying_date 119201\n",
|
||
"country 56856\n",
|
||
"gender_label 0\n",
|
||
"gender_female 0\n",
|
||
"gender_male 0\n",
|
||
"gender_other 0\n",
|
||
"country_fr 56856\n",
|
||
"nb_campaigns 0\n",
|
||
"nb_campaigns_opened 0\n",
|
||
"time_to_open 224310\n",
|
||
"y_has_purchased 0\n",
|
||
"dtype: int64"
|
||
]
|
||
},
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"dataset_train.isna().sum()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"id": "42c4d034-8bc1-4ebb-a1ff-60c0a86f8f7c",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"id": "94b4498d-6ae8-4c96-adbc-7ba1b8348160",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Shape train : (354365, 17)\n",
|
||
"Shape test : (151874, 17)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"print(\"Shape train : \", X_train.shape)\n",
|
||
"print(\"Shape test : \", X_test.shape)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "29206597-bce8-41e0-9b68-9b9a2843787a",
|
||
"metadata": {},
|
||
"source": [
|
||
"## optionnel : calcul des poids\n",
|
||
"On pourrait utiliser les poids pour gérer le déséquilibre de classe, mais dans une optique exploratoire, c'est pas indispensable et ça a pas été utilisé ici !"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"id": "6224fd31-c190-4168-b395-e0bf5806d79d",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"{0.0: 0.5481283836040216, 1.0: 5.694439980716696}"
|
||
]
|
||
},
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Compute Weights\n",
|
||
"weights = class_weight.compute_class_weight(class_weight = 'balanced', classes = np.unique(y_train['y_has_purchased']),\n",
|
||
" y = y_train['y_has_purchased'])\n",
|
||
"\n",
|
||
"weight_dict = {np.unique(y_train['y_has_purchased'])[i]: weights[i] for i in range(len(np.unique(y_train['y_has_purchased'])))}\n",
|
||
"weight_dict"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"id": "4680f202-979e-483f-89b8-9df877203bcf",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"array([0.54812838, 0.54812838, 0.54812838, ..., 5.69443998, 0.54812838,\n",
|
||
" 0.54812838])"
|
||
]
|
||
},
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Calcul des poids inverses à la fréquence des classes\n",
|
||
"class_counts = np.bincount(y_train['y_has_purchased'])\n",
|
||
"class_weights = len(y_train['y_has_purchased']) / (2 * class_counts)\n",
|
||
"\n",
|
||
"# Sélection des poids correspondants à chaque observation\n",
|
||
"weights = class_weights[y_train['y_has_purchased'].values.astype(int)]\n",
|
||
"weights"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"id": "5f747be4-e70b-491c-8f0a-46cb278a2dee",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"[354365. 354365. 354365. ... 354365. 354365. 354365.]\n",
|
||
"354365\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# verif\n",
|
||
"print(2 * weights * class_counts[y_train['y_has_purchased'].values.astype(int)])\n",
|
||
"print(len(y_train['y_has_purchased']))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "bd1f7d9d-1aff-49e4-81ca-038f732b1595",
|
||
"metadata": {},
|
||
"source": [
|
||
"## définition des variables X et y"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"id": "ab25a901-28da-4504-a7d1-bf41fa5068bc",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>nb_tickets</th>\n",
|
||
" <th>nb_purchases</th>\n",
|
||
" <th>total_amount</th>\n",
|
||
" <th>nb_suppliers</th>\n",
|
||
" <th>vente_internet_max</th>\n",
|
||
" <th>purchase_date_min</th>\n",
|
||
" <th>purchase_date_max</th>\n",
|
||
" <th>time_between_purchase</th>\n",
|
||
" <th>nb_tickets_internet</th>\n",
|
||
" <th>fidelity</th>\n",
|
||
" <th>is_email_true</th>\n",
|
||
" <th>opt_in</th>\n",
|
||
" <th>gender_female</th>\n",
|
||
" <th>gender_male</th>\n",
|
||
" <th>gender_other</th>\n",
|
||
" <th>nb_campaigns</th>\n",
|
||
" <th>nb_campaigns_opened</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>550.000000</td>\n",
|
||
" <td>550.000000</td>\n",
|
||
" <td>-1.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>13.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>550.000000</td>\n",
|
||
" <td>550.000000</td>\n",
|
||
" <td>-1.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>10.0</td>\n",
|
||
" <td>9.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>550.000000</td>\n",
|
||
" <td>550.000000</td>\n",
|
||
" <td>-1.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>550.000000</td>\n",
|
||
" <td>550.000000</td>\n",
|
||
" <td>-1.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>9.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>550.000000</td>\n",
|
||
" <td>550.000000</td>\n",
|
||
" <td>-1.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>354360</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>550.000000</td>\n",
|
||
" <td>550.000000</td>\n",
|
||
" <td>-1.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>7.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>354361</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>550.000000</td>\n",
|
||
" <td>550.000000</td>\n",
|
||
" <td>-1.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>11.0</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>354362</th>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>50.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>91.030556</td>\n",
|
||
" <td>91.020139</td>\n",
|
||
" <td>0.010417</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>6.0</td>\n",
|
||
" <td>6.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>354363</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>55.0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>52.284028</td>\n",
|
||
" <td>52.284028</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>354364</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>550.000000</td>\n",
|
||
" <td>550.000000</td>\n",
|
||
" <td>-1.000000</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>7.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>354365 rows × 17 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" nb_tickets nb_purchases total_amount nb_suppliers \\\n",
|
||
"0 0.0 0.0 0.0 0.0 \n",
|
||
"1 0.0 0.0 0.0 0.0 \n",
|
||
"2 0.0 0.0 0.0 0.0 \n",
|
||
"3 0.0 0.0 0.0 0.0 \n",
|
||
"4 0.0 0.0 0.0 0.0 \n",
|
||
"... ... ... ... ... \n",
|
||
"354360 0.0 0.0 0.0 0.0 \n",
|
||
"354361 0.0 0.0 0.0 0.0 \n",
|
||
"354362 2.0 2.0 50.0 1.0 \n",
|
||
"354363 1.0 1.0 55.0 1.0 \n",
|
||
"354364 0.0 0.0 0.0 0.0 \n",
|
||
"\n",
|
||
" vente_internet_max purchase_date_min purchase_date_max \\\n",
|
||
"0 0.0 550.000000 550.000000 \n",
|
||
"1 0.0 550.000000 550.000000 \n",
|
||
"2 0.0 550.000000 550.000000 \n",
|
||
"3 0.0 550.000000 550.000000 \n",
|
||
"4 0.0 550.000000 550.000000 \n",
|
||
"... ... ... ... \n",
|
||
"354360 0.0 550.000000 550.000000 \n",
|
||
"354361 0.0 550.000000 550.000000 \n",
|
||
"354362 0.0 91.030556 91.020139 \n",
|
||
"354363 0.0 52.284028 52.284028 \n",
|
||
"354364 0.0 550.000000 550.000000 \n",
|
||
"\n",
|
||
" time_between_purchase nb_tickets_internet fidelity is_email_true \\\n",
|
||
"0 -1.000000 0.0 1 True \n",
|
||
"1 -1.000000 0.0 0 True \n",
|
||
"2 -1.000000 0.0 1 True \n",
|
||
"3 -1.000000 0.0 0 True \n",
|
||
"4 -1.000000 0.0 0 True \n",
|
||
"... ... ... ... ... \n",
|
||
"354360 -1.000000 0.0 0 True \n",
|
||
"354361 -1.000000 0.0 0 True \n",
|
||
"354362 0.010417 0.0 4 True \n",
|
||
"354363 0.000000 0.0 1 True \n",
|
||
"354364 -1.000000 0.0 0 True \n",
|
||
"\n",
|
||
" opt_in gender_female gender_male gender_other nb_campaigns \\\n",
|
||
"0 True 1 0 0 13.0 \n",
|
||
"1 True 0 0 1 10.0 \n",
|
||
"2 True 0 1 0 14.0 \n",
|
||
"3 False 0 0 1 9.0 \n",
|
||
"4 False 0 0 1 4.0 \n",
|
||
"... ... ... ... ... ... \n",
|
||
"354360 False 0 0 1 7.0 \n",
|
||
"354361 True 0 1 0 11.0 \n",
|
||
"354362 False 1 0 0 6.0 \n",
|
||
"354363 True 0 1 0 3.0 \n",
|
||
"354364 False 0 1 0 7.0 \n",
|
||
"\n",
|
||
" nb_campaigns_opened \n",
|
||
"0 4.0 \n",
|
||
"1 9.0 \n",
|
||
"2 0.0 \n",
|
||
"3 0.0 \n",
|
||
"4 0.0 \n",
|
||
"... ... \n",
|
||
"354360 0.0 \n",
|
||
"354361 2.0 \n",
|
||
"354362 6.0 \n",
|
||
"354363 0.0 \n",
|
||
"354364 0.0 \n",
|
||
"\n",
|
||
"[354365 rows x 17 columns]"
|
||
]
|
||
},
|
||
"execution_count": 12,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# visu de X_train\n",
|
||
"X_train"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"id": "648fb542-0186-493d-b274-be2c26a11967",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# model logit\n",
|
||
"X = X_train.astype(int)\n",
|
||
"# X = sm.add_constant(X.drop(\"gender_other\", axis=1))\n",
|
||
"y = y_train['y_has_purchased'].values\n",
|
||
"\n",
|
||
"# print(X,y)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"id": "978b9ebc-aa97-41d7-a48f-d1f79c1ed482",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>nb_tickets</th>\n",
|
||
" <th>nb_purchases</th>\n",
|
||
" <th>total_amount</th>\n",
|
||
" <th>nb_suppliers</th>\n",
|
||
" <th>vente_internet_max</th>\n",
|
||
" <th>purchase_date_min</th>\n",
|
||
" <th>purchase_date_max</th>\n",
|
||
" <th>time_between_purchase</th>\n",
|
||
" <th>nb_tickets_internet</th>\n",
|
||
" <th>fidelity</th>\n",
|
||
" <th>is_email_true</th>\n",
|
||
" <th>opt_in</th>\n",
|
||
" <th>gender_female</th>\n",
|
||
" <th>gender_male</th>\n",
|
||
" <th>gender_other</th>\n",
|
||
" <th>nb_campaigns</th>\n",
|
||
" <th>nb_campaigns_opened</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>550</td>\n",
|
||
" <td>550</td>\n",
|
||
" <td>-1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>13</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>550</td>\n",
|
||
" <td>550</td>\n",
|
||
" <td>-1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>10</td>\n",
|
||
" <td>9</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>550</td>\n",
|
||
" <td>550</td>\n",
|
||
" <td>-1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>14</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>550</td>\n",
|
||
" <td>550</td>\n",
|
||
" <td>-1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>9</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>550</td>\n",
|
||
" <td>550</td>\n",
|
||
" <td>-1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>354360</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>550</td>\n",
|
||
" <td>550</td>\n",
|
||
" <td>-1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>354361</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>550</td>\n",
|
||
" <td>550</td>\n",
|
||
" <td>-1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>11</td>\n",
|
||
" <td>2</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>354362</th>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>50</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>91</td>\n",
|
||
" <td>91</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>6</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>354363</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>55</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>52</td>\n",
|
||
" <td>52</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>354364</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>550</td>\n",
|
||
" <td>550</td>\n",
|
||
" <td>-1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>354365 rows × 17 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" nb_tickets nb_purchases total_amount nb_suppliers \\\n",
|
||
"0 0 0 0 0 \n",
|
||
"1 0 0 0 0 \n",
|
||
"2 0 0 0 0 \n",
|
||
"3 0 0 0 0 \n",
|
||
"4 0 0 0 0 \n",
|
||
"... ... ... ... ... \n",
|
||
"354360 0 0 0 0 \n",
|
||
"354361 0 0 0 0 \n",
|
||
"354362 2 2 50 1 \n",
|
||
"354363 1 1 55 1 \n",
|
||
"354364 0 0 0 0 \n",
|
||
"\n",
|
||
" vente_internet_max purchase_date_min purchase_date_max \\\n",
|
||
"0 0 550 550 \n",
|
||
"1 0 550 550 \n",
|
||
"2 0 550 550 \n",
|
||
"3 0 550 550 \n",
|
||
"4 0 550 550 \n",
|
||
"... ... ... ... \n",
|
||
"354360 0 550 550 \n",
|
||
"354361 0 550 550 \n",
|
||
"354362 0 91 91 \n",
|
||
"354363 0 52 52 \n",
|
||
"354364 0 550 550 \n",
|
||
"\n",
|
||
" time_between_purchase nb_tickets_internet fidelity is_email_true \\\n",
|
||
"0 -1 0 1 1 \n",
|
||
"1 -1 0 0 1 \n",
|
||
"2 -1 0 1 1 \n",
|
||
"3 -1 0 0 1 \n",
|
||
"4 -1 0 0 1 \n",
|
||
"... ... ... ... ... \n",
|
||
"354360 -1 0 0 1 \n",
|
||
"354361 -1 0 0 1 \n",
|
||
"354362 0 0 4 1 \n",
|
||
"354363 0 0 1 1 \n",
|
||
"354364 -1 0 0 1 \n",
|
||
"\n",
|
||
" opt_in gender_female gender_male gender_other nb_campaigns \\\n",
|
||
"0 1 1 0 0 13 \n",
|
||
"1 1 0 0 1 10 \n",
|
||
"2 1 0 1 0 14 \n",
|
||
"3 0 0 0 1 9 \n",
|
||
"4 0 0 0 1 4 \n",
|
||
"... ... ... ... ... ... \n",
|
||
"354360 0 0 0 1 7 \n",
|
||
"354361 1 0 1 0 11 \n",
|
||
"354362 0 1 0 0 6 \n",
|
||
"354363 1 0 1 0 3 \n",
|
||
"354364 0 0 1 0 7 \n",
|
||
"\n",
|
||
" nb_campaigns_opened \n",
|
||
"0 4 \n",
|
||
"1 9 \n",
|
||
"2 0 \n",
|
||
"3 0 \n",
|
||
"4 0 \n",
|
||
"... ... \n",
|
||
"354360 0 \n",
|
||
"354361 2 \n",
|
||
"354362 6 \n",
|
||
"354363 0 \n",
|
||
"354364 0 \n",
|
||
"\n",
|
||
"[354365 rows x 17 columns]"
|
||
]
|
||
},
|
||
"execution_count": 14,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"X"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 138,
|
||
"id": "81b38ceb-5005-417d-a9a6-b2dac181a8fb",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>purchase_date_min</th>\n",
|
||
" <th>purchase_date_max</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>count</th>\n",
|
||
" <td>354365.000000</td>\n",
|
||
" <td>354365.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>mean</th>\n",
|
||
" <td>406.981861</td>\n",
|
||
" <td>396.551502</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>std</th>\n",
|
||
" <td>189.343612</td>\n",
|
||
" <td>195.881681</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>min</th>\n",
|
||
" <td>0.009640</td>\n",
|
||
" <td>0.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>25%</th>\n",
|
||
" <td>188.475293</td>\n",
|
||
" <td>153.457966</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>50%</th>\n",
|
||
" <td>550.000000</td>\n",
|
||
" <td>550.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>75%</th>\n",
|
||
" <td>550.000000</td>\n",
|
||
" <td>550.000000</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>max</th>\n",
|
||
" <td>550.000000</td>\n",
|
||
" <td>550.000000</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" purchase_date_min purchase_date_max\n",
|
||
"count 354365.000000 354365.000000\n",
|
||
"mean 406.981861 396.551502\n",
|
||
"std 189.343612 195.881681\n",
|
||
"min 0.009640 0.000000\n",
|
||
"25% 188.475293 153.457966\n",
|
||
"50% 550.000000 550.000000\n",
|
||
"75% 550.000000 550.000000\n",
|
||
"max 550.000000 550.000000"
|
||
]
|
||
},
|
||
"execution_count": 138,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"X_train[[\"purchase_date_min\", \"purchase_date_max\"]].describe()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 143,
|
||
"id": "60effd66-2914-4cf9-aa0c-4e2f9dd13895",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"count 354365.000000\n",
|
||
"mean 10.430360\n",
|
||
"std 56.442718\n",
|
||
"min 0.000000\n",
|
||
"25% 0.000000\n",
|
||
"50% 0.000000\n",
|
||
"75% 0.000000\n",
|
||
"max 547.443350\n",
|
||
"dtype: float64"
|
||
]
|
||
},
|
||
"execution_count": 143,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"(X_train[\"purchase_date_min\"] - X_train[\"purchase_date_max\"]).describe()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 145,
|
||
"id": "7a99e480-9e11-448d-806e-3b71925a19db",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>nb_tickets</th>\n",
|
||
" <th>nb_purchases</th>\n",
|
||
" <th>total_amount</th>\n",
|
||
" <th>nb_suppliers</th>\n",
|
||
" <th>vente_internet_max</th>\n",
|
||
" <th>purchase_date_min</th>\n",
|
||
" <th>purchase_date_max</th>\n",
|
||
" <th>time_between_purchase</th>\n",
|
||
" <th>nb_tickets_internet</th>\n",
|
||
" <th>fidelity</th>\n",
|
||
" <th>is_email_true</th>\n",
|
||
" <th>opt_in</th>\n",
|
||
" <th>gender_female</th>\n",
|
||
" <th>gender_male</th>\n",
|
||
" <th>gender_other</th>\n",
|
||
" <th>nb_campaigns</th>\n",
|
||
" <th>nb_campaigns_opened</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>550.0</td>\n",
|
||
" <td>550.0</td>\n",
|
||
" <td>-1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>13.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>550.0</td>\n",
|
||
" <td>550.0</td>\n",
|
||
" <td>-1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>10.0</td>\n",
|
||
" <td>9.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>550.0</td>\n",
|
||
" <td>550.0</td>\n",
|
||
" <td>-1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>14.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>550.0</td>\n",
|
||
" <td>550.0</td>\n",
|
||
" <td>-1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>9.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>550.0</td>\n",
|
||
" <td>550.0</td>\n",
|
||
" <td>-1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>354358</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>550.0</td>\n",
|
||
" <td>550.0</td>\n",
|
||
" <td>-1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>354359</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>550.0</td>\n",
|
||
" <td>550.0</td>\n",
|
||
" <td>-1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>12.0</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>354360</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>550.0</td>\n",
|
||
" <td>550.0</td>\n",
|
||
" <td>-1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>7.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>354361</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>550.0</td>\n",
|
||
" <td>550.0</td>\n",
|
||
" <td>-1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>11.0</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>354364</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>550.0</td>\n",
|
||
" <td>550.0</td>\n",
|
||
" <td>-1.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>7.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>179675 rows × 17 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" nb_tickets nb_purchases total_amount nb_suppliers \\\n",
|
||
"0 0.0 0.0 0.0 0.0 \n",
|
||
"1 0.0 0.0 0.0 0.0 \n",
|
||
"2 0.0 0.0 0.0 0.0 \n",
|
||
"3 0.0 0.0 0.0 0.0 \n",
|
||
"4 0.0 0.0 0.0 0.0 \n",
|
||
"... ... ... ... ... \n",
|
||
"354358 0.0 0.0 0.0 0.0 \n",
|
||
"354359 0.0 0.0 0.0 0.0 \n",
|
||
"354360 0.0 0.0 0.0 0.0 \n",
|
||
"354361 0.0 0.0 0.0 0.0 \n",
|
||
"354364 0.0 0.0 0.0 0.0 \n",
|
||
"\n",
|
||
" vente_internet_max purchase_date_min purchase_date_max \\\n",
|
||
"0 0.0 550.0 550.0 \n",
|
||
"1 0.0 550.0 550.0 \n",
|
||
"2 0.0 550.0 550.0 \n",
|
||
"3 0.0 550.0 550.0 \n",
|
||
"4 0.0 550.0 550.0 \n",
|
||
"... ... ... ... \n",
|
||
"354358 0.0 550.0 550.0 \n",
|
||
"354359 0.0 550.0 550.0 \n",
|
||
"354360 0.0 550.0 550.0 \n",
|
||
"354361 0.0 550.0 550.0 \n",
|
||
"354364 0.0 550.0 550.0 \n",
|
||
"\n",
|
||
" time_between_purchase nb_tickets_internet fidelity is_email_true \\\n",
|
||
"0 -1.0 0.0 1 True \n",
|
||
"1 -1.0 0.0 0 True \n",
|
||
"2 -1.0 0.0 1 True \n",
|
||
"3 -1.0 0.0 0 True \n",
|
||
"4 -1.0 0.0 0 True \n",
|
||
"... ... ... ... ... \n",
|
||
"354358 -1.0 0.0 0 True \n",
|
||
"354359 -1.0 0.0 0 True \n",
|
||
"354360 -1.0 0.0 0 True \n",
|
||
"354361 -1.0 0.0 0 True \n",
|
||
"354364 -1.0 0.0 0 True \n",
|
||
"\n",
|
||
" opt_in gender_female gender_male gender_other nb_campaigns \\\n",
|
||
"0 True 1 0 0 13.0 \n",
|
||
"1 True 0 0 1 10.0 \n",
|
||
"2 True 0 1 0 14.0 \n",
|
||
"3 False 0 0 1 9.0 \n",
|
||
"4 False 0 0 1 4.0 \n",
|
||
"... ... ... ... ... ... \n",
|
||
"354358 False 1 0 0 1.0 \n",
|
||
"354359 True 0 1 0 12.0 \n",
|
||
"354360 False 0 0 1 7.0 \n",
|
||
"354361 True 0 1 0 11.0 \n",
|
||
"354364 False 0 1 0 7.0 \n",
|
||
"\n",
|
||
" nb_campaigns_opened \n",
|
||
"0 4.0 \n",
|
||
"1 9.0 \n",
|
||
"2 0.0 \n",
|
||
"3 0.0 \n",
|
||
"4 0.0 \n",
|
||
"... ... \n",
|
||
"354358 0.0 \n",
|
||
"354359 2.0 \n",
|
||
"354360 0.0 \n",
|
||
"354361 2.0 \n",
|
||
"354364 0.0 \n",
|
||
"\n",
|
||
"[179675 rows x 17 columns]"
|
||
]
|
||
},
|
||
"execution_count": 145,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"X_train[X_train[\"time_between_purchase\"]==-1]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "a022e8c3-93e7-4530-85a4-da8812d82737",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Prétraitement des données + modèle\n",
|
||
"\n",
|
||
"- variables à retirer : fidelity (valeurs trop grandes dont l'exp -> +inf, autre problème : st basé sur des infos qu'on a pas sur la période étudiée mais slt sur période d'évaluation), time between purchase (revoir sa construction), gender_other (colinéarité avec les autres var de genre)\n",
|
||
"- ajouter un intercept\n",
|
||
"- pas besoin de standardiser pour le moment, mais à faire quand on passera au modèle LASSO\n",
|
||
"\n",
|
||
"#### A recopier dans la pipeline -> section 2 bis"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 15,
|
||
"id": "e6c8ccc7-6ab8-4e3c-af28-e71d17c07bcb",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>const</th>\n",
|
||
" <th>nb_tickets</th>\n",
|
||
" <th>nb_purchases</th>\n",
|
||
" <th>total_amount</th>\n",
|
||
" <th>nb_suppliers</th>\n",
|
||
" <th>vente_internet_max</th>\n",
|
||
" <th>purchase_date_min</th>\n",
|
||
" <th>purchase_date_max</th>\n",
|
||
" <th>nb_tickets_internet</th>\n",
|
||
" <th>is_email_true</th>\n",
|
||
" <th>opt_in</th>\n",
|
||
" <th>gender_female</th>\n",
|
||
" <th>gender_male</th>\n",
|
||
" <th>nb_campaigns</th>\n",
|
||
" <th>nb_campaigns_opened</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>550</td>\n",
|
||
" <td>550</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>13</td>\n",
|
||
" <td>4</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>550</td>\n",
|
||
" <td>550</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>10</td>\n",
|
||
" <td>9</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>550</td>\n",
|
||
" <td>550</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>14</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>550</td>\n",
|
||
" <td>550</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>9</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>550</td>\n",
|
||
" <td>550</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>354360</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>550</td>\n",
|
||
" <td>550</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>354361</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>550</td>\n",
|
||
" <td>550</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>11</td>\n",
|
||
" <td>2</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>354362</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>2</td>\n",
|
||
" <td>50</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>91</td>\n",
|
||
" <td>91</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>6</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>354363</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>55</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>52</td>\n",
|
||
" <td>52</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>354364</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>550</td>\n",
|
||
" <td>550</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>7</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>354365 rows × 15 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" const nb_tickets nb_purchases total_amount nb_suppliers \\\n",
|
||
"0 1.0 0 0 0 0 \n",
|
||
"1 1.0 0 0 0 0 \n",
|
||
"2 1.0 0 0 0 0 \n",
|
||
"3 1.0 0 0 0 0 \n",
|
||
"4 1.0 0 0 0 0 \n",
|
||
"... ... ... ... ... ... \n",
|
||
"354360 1.0 0 0 0 0 \n",
|
||
"354361 1.0 0 0 0 0 \n",
|
||
"354362 1.0 2 2 50 1 \n",
|
||
"354363 1.0 1 1 55 1 \n",
|
||
"354364 1.0 0 0 0 0 \n",
|
||
"\n",
|
||
" vente_internet_max purchase_date_min purchase_date_max \\\n",
|
||
"0 0 550 550 \n",
|
||
"1 0 550 550 \n",
|
||
"2 0 550 550 \n",
|
||
"3 0 550 550 \n",
|
||
"4 0 550 550 \n",
|
||
"... ... ... ... \n",
|
||
"354360 0 550 550 \n",
|
||
"354361 0 550 550 \n",
|
||
"354362 0 91 91 \n",
|
||
"354363 0 52 52 \n",
|
||
"354364 0 550 550 \n",
|
||
"\n",
|
||
" nb_tickets_internet is_email_true opt_in gender_female \\\n",
|
||
"0 0 1 1 1 \n",
|
||
"1 0 1 1 0 \n",
|
||
"2 0 1 1 0 \n",
|
||
"3 0 1 0 0 \n",
|
||
"4 0 1 0 0 \n",
|
||
"... ... ... ... ... \n",
|
||
"354360 0 1 0 0 \n",
|
||
"354361 0 1 1 0 \n",
|
||
"354362 0 1 0 1 \n",
|
||
"354363 0 1 1 0 \n",
|
||
"354364 0 1 0 0 \n",
|
||
"\n",
|
||
" gender_male nb_campaigns nb_campaigns_opened \n",
|
||
"0 0 13 4 \n",
|
||
"1 0 10 9 \n",
|
||
"2 1 14 0 \n",
|
||
"3 0 9 0 \n",
|
||
"4 0 4 0 \n",
|
||
"... ... ... ... \n",
|
||
"354360 0 7 0 \n",
|
||
"354361 1 11 2 \n",
|
||
"354362 0 6 6 \n",
|
||
"354363 1 3 0 \n",
|
||
"354364 1 7 0 \n",
|
||
"\n",
|
||
"[354365 rows x 15 columns]"
|
||
]
|
||
},
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# 0. on retire les variables citées ci-dessus et on ajoute l'intercept\n",
|
||
"\n",
|
||
"X = sm.add_constant(X.drop([\"fidelity\", \"time_between_purchase\", \"gender_other\"], axis=1))\n",
|
||
"X"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 16,
|
||
"id": "0e968aa1-fbec-47db-b570-4730ef7eebf2",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Optimization terminated successfully.\n",
|
||
" Current function value: 0.234602\n",
|
||
" Iterations 8\n",
|
||
" Logit Regression Results \n",
|
||
"==============================================================================\n",
|
||
"Dep. Variable: y No. Observations: 354365\n",
|
||
"Model: Logit Df Residuals: 354350\n",
|
||
"Method: MLE Df Model: 14\n",
|
||
"Date: Thu, 21 Mar 2024 Pseudo R-squ.: 0.2112\n",
|
||
"Time: 07:57:46 Log-Likelihood: -83135.\n",
|
||
"converged: True LL-Null: -1.0540e+05\n",
|
||
"Covariance Type: nonrobust LLR p-value: 0.000\n",
|
||
"=======================================================================================\n",
|
||
" coef std err z P>|z| [0.025 0.975]\n",
|
||
"---------------------------------------------------------------------------------------\n",
|
||
"const -1.9633 0.093 -21.101 0.000 -2.146 -1.781\n",
|
||
"nb_tickets -0.0003 0.000 -2.191 0.028 -0.001 -2.85e-05\n",
|
||
"nb_purchases -0.0037 0.001 -3.609 0.000 -0.006 -0.002\n",
|
||
"total_amount 6.267e-05 1.63e-05 3.841 0.000 3.07e-05 9.46e-05\n",
|
||
"nb_suppliers 0.3368 0.019 17.662 0.000 0.299 0.374\n",
|
||
"vente_internet_max -1.9874 0.024 -82.965 0.000 -2.034 -1.940\n",
|
||
"purchase_date_min 0.0031 7.77e-05 39.936 0.000 0.003 0.003\n",
|
||
"purchase_date_max -0.0072 8.08e-05 -89.592 0.000 -0.007 -0.007\n",
|
||
"nb_tickets_internet 0.0938 0.004 22.652 0.000 0.086 0.102\n",
|
||
"is_email_true 0.8651 0.088 9.797 0.000 0.692 1.038\n",
|
||
"opt_in -1.9976 0.019 -107.305 0.000 -2.034 -1.961\n",
|
||
"gender_female 0.7032 0.024 29.395 0.000 0.656 0.750\n",
|
||
"gender_male 0.8071 0.024 33.201 0.000 0.759 0.855\n",
|
||
"nb_campaigns 0.0287 0.001 30.633 0.000 0.027 0.031\n",
|
||
"nb_campaigns_opened 0.0486 0.002 28.245 0.000 0.045 0.052\n",
|
||
"=======================================================================================\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# 1. Premier modèle de régression logistique sans standardisation (permet une interprétation des coeffs)\n",
|
||
"\n",
|
||
"model_logit = sm.Logit(y, X)\n",
|
||
"\n",
|
||
"# Ajustement du modèle aux données\n",
|
||
"result = model_logit.fit()\n",
|
||
"\n",
|
||
"# Affichage des résultats - toutes les var sont significatives avec des p-valeurs de 0, et de 0.28 pour nbre tickets\n",
|
||
"print(result.summary())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 17,
|
||
"id": "2475f2fe-3d1f-4845-9ede-0416dac83271",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 2. Modèle logit avec données standardisées\n",
|
||
"\n",
|
||
"# Colonnes à standardiser\n",
|
||
"\n",
|
||
"\n",
|
||
"var_num = ['nb_tickets', 'nb_purchases', \"total_amount\", \"nb_suppliers\", \"vente_internet_max\",\n",
|
||
" \"purchase_date_min\", \"purchase_date_max\", \"nb_tickets_internet\",\n",
|
||
" \"nb_campaigns\", \"nb_campaigns_opened\"]\n",
|
||
"\n",
|
||
"# Standardisation des colonnes sélectionnées\n",
|
||
"scaler = StandardScaler()\n",
|
||
"X[var_num] = scaler.fit_transform(X[var_num])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 18,
|
||
"id": "696fcc04-e5df-45dc-a1b9-57c30d4d671d",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>const</th>\n",
|
||
" <th>nb_tickets</th>\n",
|
||
" <th>nb_purchases</th>\n",
|
||
" <th>total_amount</th>\n",
|
||
" <th>nb_suppliers</th>\n",
|
||
" <th>vente_internet_max</th>\n",
|
||
" <th>purchase_date_min</th>\n",
|
||
" <th>purchase_date_max</th>\n",
|
||
" <th>nb_tickets_internet</th>\n",
|
||
" <th>is_email_true</th>\n",
|
||
" <th>opt_in</th>\n",
|
||
" <th>gender_female</th>\n",
|
||
" <th>gender_male</th>\n",
|
||
" <th>nb_campaigns</th>\n",
|
||
" <th>nb_campaigns_opened</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>-0.024425</td>\n",
|
||
" <td>-0.050722</td>\n",
|
||
" <td>-0.048383</td>\n",
|
||
" <td>-0.768294</td>\n",
|
||
" <td>-0.599511</td>\n",
|
||
" <td>0.755994</td>\n",
|
||
" <td>0.783940</td>\n",
|
||
" <td>-0.264693</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0.607945</td>\n",
|
||
" <td>0.522567</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>-0.024425</td>\n",
|
||
" <td>-0.050722</td>\n",
|
||
" <td>-0.048383</td>\n",
|
||
" <td>-0.768294</td>\n",
|
||
" <td>-0.599511</td>\n",
|
||
" <td>0.755994</td>\n",
|
||
" <td>0.783940</td>\n",
|
||
" <td>-0.264693</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0.306155</td>\n",
|
||
" <td>1.701843</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>-0.024425</td>\n",
|
||
" <td>-0.050722</td>\n",
|
||
" <td>-0.048383</td>\n",
|
||
" <td>-0.768294</td>\n",
|
||
" <td>-0.599511</td>\n",
|
||
" <td>0.755994</td>\n",
|
||
" <td>0.783940</td>\n",
|
||
" <td>-0.264693</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0.708542</td>\n",
|
||
" <td>-0.420854</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>-0.024425</td>\n",
|
||
" <td>-0.050722</td>\n",
|
||
" <td>-0.048383</td>\n",
|
||
" <td>-0.768294</td>\n",
|
||
" <td>-0.599511</td>\n",
|
||
" <td>0.755994</td>\n",
|
||
" <td>0.783940</td>\n",
|
||
" <td>-0.264693</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0.205558</td>\n",
|
||
" <td>-0.420854</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>-0.024425</td>\n",
|
||
" <td>-0.050722</td>\n",
|
||
" <td>-0.048383</td>\n",
|
||
" <td>-0.768294</td>\n",
|
||
" <td>-0.599511</td>\n",
|
||
" <td>0.755994</td>\n",
|
||
" <td>0.783940</td>\n",
|
||
" <td>-0.264693</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>-0.297426</td>\n",
|
||
" <td>-0.420854</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>354360</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>-0.024425</td>\n",
|
||
" <td>-0.050722</td>\n",
|
||
" <td>-0.048383</td>\n",
|
||
" <td>-0.768294</td>\n",
|
||
" <td>-0.599511</td>\n",
|
||
" <td>0.755994</td>\n",
|
||
" <td>0.783940</td>\n",
|
||
" <td>-0.264693</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0.004365</td>\n",
|
||
" <td>-0.420854</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>354361</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>-0.024425</td>\n",
|
||
" <td>-0.050722</td>\n",
|
||
" <td>-0.048383</td>\n",
|
||
" <td>-0.768294</td>\n",
|
||
" <td>-0.599511</td>\n",
|
||
" <td>0.755994</td>\n",
|
||
" <td>0.783940</td>\n",
|
||
" <td>-0.264693</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0.406752</td>\n",
|
||
" <td>0.050856</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>354362</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>-0.000838</td>\n",
|
||
" <td>0.092966</td>\n",
|
||
" <td>-0.009150</td>\n",
|
||
" <td>1.219633</td>\n",
|
||
" <td>-0.599511</td>\n",
|
||
" <td>-1.665887</td>\n",
|
||
" <td>-1.557073</td>\n",
|
||
" <td>-0.264693</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>-0.096232</td>\n",
|
||
" <td>0.994277</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>354363</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>-0.012631</td>\n",
|
||
" <td>0.021122</td>\n",
|
||
" <td>-0.005227</td>\n",
|
||
" <td>1.219633</td>\n",
|
||
" <td>-0.599511</td>\n",
|
||
" <td>-1.871668</td>\n",
|
||
" <td>-1.755983</td>\n",
|
||
" <td>-0.264693</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>-0.398023</td>\n",
|
||
" <td>-0.420854</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>354364</th>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>-0.024425</td>\n",
|
||
" <td>-0.050722</td>\n",
|
||
" <td>-0.048383</td>\n",
|
||
" <td>-0.768294</td>\n",
|
||
" <td>-0.599511</td>\n",
|
||
" <td>0.755994</td>\n",
|
||
" <td>0.783940</td>\n",
|
||
" <td>-0.264693</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0.004365</td>\n",
|
||
" <td>-0.420854</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>354365 rows × 15 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" const nb_tickets nb_purchases total_amount nb_suppliers \\\n",
|
||
"0 1.0 -0.024425 -0.050722 -0.048383 -0.768294 \n",
|
||
"1 1.0 -0.024425 -0.050722 -0.048383 -0.768294 \n",
|
||
"2 1.0 -0.024425 -0.050722 -0.048383 -0.768294 \n",
|
||
"3 1.0 -0.024425 -0.050722 -0.048383 -0.768294 \n",
|
||
"4 1.0 -0.024425 -0.050722 -0.048383 -0.768294 \n",
|
||
"... ... ... ... ... ... \n",
|
||
"354360 1.0 -0.024425 -0.050722 -0.048383 -0.768294 \n",
|
||
"354361 1.0 -0.024425 -0.050722 -0.048383 -0.768294 \n",
|
||
"354362 1.0 -0.000838 0.092966 -0.009150 1.219633 \n",
|
||
"354363 1.0 -0.012631 0.021122 -0.005227 1.219633 \n",
|
||
"354364 1.0 -0.024425 -0.050722 -0.048383 -0.768294 \n",
|
||
"\n",
|
||
" vente_internet_max purchase_date_min purchase_date_max \\\n",
|
||
"0 -0.599511 0.755994 0.783940 \n",
|
||
"1 -0.599511 0.755994 0.783940 \n",
|
||
"2 -0.599511 0.755994 0.783940 \n",
|
||
"3 -0.599511 0.755994 0.783940 \n",
|
||
"4 -0.599511 0.755994 0.783940 \n",
|
||
"... ... ... ... \n",
|
||
"354360 -0.599511 0.755994 0.783940 \n",
|
||
"354361 -0.599511 0.755994 0.783940 \n",
|
||
"354362 -0.599511 -1.665887 -1.557073 \n",
|
||
"354363 -0.599511 -1.871668 -1.755983 \n",
|
||
"354364 -0.599511 0.755994 0.783940 \n",
|
||
"\n",
|
||
" nb_tickets_internet is_email_true opt_in gender_female \\\n",
|
||
"0 -0.264693 1 1 1 \n",
|
||
"1 -0.264693 1 1 0 \n",
|
||
"2 -0.264693 1 1 0 \n",
|
||
"3 -0.264693 1 0 0 \n",
|
||
"4 -0.264693 1 0 0 \n",
|
||
"... ... ... ... ... \n",
|
||
"354360 -0.264693 1 0 0 \n",
|
||
"354361 -0.264693 1 1 0 \n",
|
||
"354362 -0.264693 1 0 1 \n",
|
||
"354363 -0.264693 1 1 0 \n",
|
||
"354364 -0.264693 1 0 0 \n",
|
||
"\n",
|
||
" gender_male nb_campaigns nb_campaigns_opened \n",
|
||
"0 0 0.607945 0.522567 \n",
|
||
"1 0 0.306155 1.701843 \n",
|
||
"2 1 0.708542 -0.420854 \n",
|
||
"3 0 0.205558 -0.420854 \n",
|
||
"4 0 -0.297426 -0.420854 \n",
|
||
"... ... ... ... \n",
|
||
"354360 0 0.004365 -0.420854 \n",
|
||
"354361 1 0.406752 0.050856 \n",
|
||
"354362 0 -0.096232 0.994277 \n",
|
||
"354363 1 -0.398023 -0.420854 \n",
|
||
"354364 1 0.004365 -0.420854 \n",
|
||
"\n",
|
||
"[354365 rows x 15 columns]"
|
||
]
|
||
},
|
||
"execution_count": 18,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"X"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 19,
|
||
"id": "54421677-640f-4f37-9a0d-d9a2cc3572b0",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Optimization terminated successfully.\n",
|
||
" Current function value: 0.234602\n",
|
||
" Iterations 8\n",
|
||
" Logit Regression Results \n",
|
||
"==============================================================================\n",
|
||
"Dep. Variable: y No. Observations: 354365\n",
|
||
"Model: Logit Df Residuals: 354350\n",
|
||
"Method: MLE Df Model: 14\n",
|
||
"Date: Thu, 21 Mar 2024 Pseudo R-squ.: 0.2112\n",
|
||
"Time: 07:58:13 Log-Likelihood: -83135.\n",
|
||
"converged: True LL-Null: -1.0540e+05\n",
|
||
"Covariance Type: nonrobust LLR p-value: 0.000\n",
|
||
"=======================================================================================\n",
|
||
" coef std err z P>|z| [0.025 0.975]\n",
|
||
"---------------------------------------------------------------------------------------\n",
|
||
"const -3.6025 0.091 -39.755 0.000 -3.780 -3.425\n",
|
||
"nb_tickets -0.0230 0.010 -2.191 0.028 -0.044 -0.002\n",
|
||
"nb_purchases -0.0519 0.014 -3.609 0.000 -0.080 -0.024\n",
|
||
"total_amount 0.0799 0.021 3.841 0.000 0.039 0.121\n",
|
||
"nb_suppliers 0.1694 0.010 17.662 0.000 0.151 0.188\n",
|
||
"vente_internet_max -0.8764 0.011 -82.965 0.000 -0.897 -0.856\n",
|
||
"purchase_date_min 0.5881 0.015 39.936 0.000 0.559 0.617\n",
|
||
"purchase_date_max -1.4197 0.016 -89.592 0.000 -1.451 -1.389\n",
|
||
"nb_tickets_internet 0.2895 0.013 22.652 0.000 0.264 0.315\n",
|
||
"is_email_true 0.8651 0.088 9.797 0.000 0.692 1.038\n",
|
||
"opt_in -1.9976 0.019 -107.305 0.000 -2.034 -1.961\n",
|
||
"gender_female 0.7032 0.024 29.395 0.000 0.656 0.750\n",
|
||
"gender_male 0.8071 0.024 33.201 0.000 0.759 0.855\n",
|
||
"nb_campaigns 0.2850 0.009 30.633 0.000 0.267 0.303\n",
|
||
"nb_campaigns_opened 0.2061 0.007 28.245 0.000 0.192 0.220\n",
|
||
"=======================================================================================\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# 2. modele avec var standardisées (permet de mieux jauger l'importance réelle de chaque variable)\n",
|
||
"\n",
|
||
"model_logit = sm.Logit(y, X)\n",
|
||
"# model_logit = sm.Logit(y, X)\n",
|
||
"\n",
|
||
"# Ajustement du modèle aux données\n",
|
||
"result = model_logit.fit()\n",
|
||
"\n",
|
||
"# Affichage des résultats\n",
|
||
"print(result.summary())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 48,
|
||
"id": "13cc3362-7bb2-46fa-8bd8-e5a8e53260b8",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Optimization terminated successfully (Exit mode 0)\n",
|
||
" Current function value: 0.23562928627877766\n",
|
||
" Iterations: 240\n",
|
||
" Function evaluations: 243\n",
|
||
" Gradient evaluations: 240\n",
|
||
"const 0.000000e+00\n",
|
||
"nb_tickets 2.477006e-01\n",
|
||
"nb_purchases 1.636902e-03\n",
|
||
"total_amount 8.839088e-04\n",
|
||
"nb_suppliers 1.906550e-65\n",
|
||
"vente_internet_max 0.000000e+00\n",
|
||
"purchase_date_min 0.000000e+00\n",
|
||
"purchase_date_max 0.000000e+00\n",
|
||
"nb_tickets_internet 7.232680e-112\n",
|
||
"is_email_true 8.202187e-08\n",
|
||
"opt_in 0.000000e+00\n",
|
||
"gender_female 1.624424e-170\n",
|
||
"gender_male 4.961315e-220\n",
|
||
"nb_campaigns 6.276733e-205\n",
|
||
"nb_campaigns_opened 2.228531e-176\n",
|
||
"dtype: float64\n",
|
||
" Logit Regression Results \n",
|
||
"==============================================================================\n",
|
||
"Dep. Variable: y No. Observations: 354365\n",
|
||
"Model: Logit Df Residuals: 354350\n",
|
||
"Method: MLE Df Model: 14\n",
|
||
"Date: Thu, 21 Mar 2024 Pseudo R-squ.: 0.2111\n",
|
||
"Time: 10:45:37 Log-Likelihood: -83152.\n",
|
||
"converged: True LL-Null: -1.0540e+05\n",
|
||
"Covariance Type: nonrobust LLR p-value: 0.000\n",
|
||
"=======================================================================================\n",
|
||
" coef std err z P>|z| [0.025 0.975]\n",
|
||
"---------------------------------------------------------------------------------------\n",
|
||
"const -3.1162 0.081 -38.383 0.000 -3.275 -2.957\n",
|
||
"nb_tickets -0.0136 0.012 -1.156 0.248 -0.037 0.009\n",
|
||
"nb_purchases -0.0385 0.012 -3.149 0.002 -0.063 -0.015\n",
|
||
"total_amount 0.0588 0.018 3.325 0.001 0.024 0.094\n",
|
||
"nb_suppliers 0.1638 0.010 17.085 0.000 0.145 0.183\n",
|
||
"vente_internet_max -0.8651 0.011 -82.182 0.000 -0.886 -0.844\n",
|
||
"purchase_date_min 0.5790 0.015 39.391 0.000 0.550 0.608\n",
|
||
"purchase_date_max -1.4088 0.016 -89.101 0.000 -1.440 -1.378\n",
|
||
"nb_tickets_internet 0.2857 0.013 22.475 0.000 0.261 0.311\n",
|
||
"is_email_true 0.4224 0.079 5.363 0.000 0.268 0.577\n",
|
||
"opt_in -1.9818 0.019 -106.856 0.000 -2.018 -1.945\n",
|
||
"gender_female 0.6553 0.024 27.835 0.000 0.609 0.701\n",
|
||
"gender_male 0.7578 0.024 31.663 0.000 0.711 0.805\n",
|
||
"nb_campaigns 0.2835 0.009 30.547 0.000 0.265 0.302\n",
|
||
"nb_campaigns_opened 0.2061 0.007 28.315 0.000 0.192 0.220\n",
|
||
"=======================================================================================\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# 2.bis on fait de même pour un modèle logit avec pénalité \n",
|
||
"# pas besoin de redefinir le modèle, il faut faire un fit_regularized\n",
|
||
"\n",
|
||
"# sans spécification, alpha vaut 0 par défaut (aucune pénalité) : statsmodels ne détermine pas alpha par validation croisée\n",
|
||
"# remplacer alpha=32 par la valeur optimale trouvée par validation croisée dans la pipeline (attribut .best_params_)\n",
|
||
"# attention, dans scikit-learn, l'hyperparamètre est C = 1/alpha : ne pas oublier de prendre l'inverse de ce C optimal\n",
|
||
"\n",
|
||
"result = model_logit.fit_regularized(method='l1', alpha = 32)\n",
|
||
"\n",
|
||
"print(result.pvalues)\n",
|
||
"print(result.summary())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "8c3dec50-7b9d-40f6-83b6-6cae26962cf8",
|
||
"metadata": {},
|
||
"source": [
|
||
"### Other method: take the weights into account! Problem: with this method, no penalty is allowed"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 247,
|
||
"id": "2e3ca381-54e3-445b-bb37-d7ce953cb856",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# define a function to generate summaries of logit model\n",
|
||
"\n",
|
||
"def model_logit(X, y, weight_dict, add_constant=False) :\n",
|
||
" # Generate sample weights based on class weights computed earlier\n",
|
||
" sample_weights = np.array([weight_dict[class_] for class_ in y])\n",
|
||
"\n",
|
||
" if add_constant :\n",
|
||
" X_const = sm.add_constant(X)\n",
|
||
" else :\n",
|
||
" X_const = X\n",
|
||
" \n",
|
||
" # Use GLM from statsmodels with Binomial family for logistic regression\n",
|
||
" model = sm.GLM(y, X_const, family=sm.families.Binomial(), freq_weights=sample_weights)\n",
|
||
" \n",
|
||
" # fit without penalty\n",
|
||
" result = model.fit()\n",
|
||
"\n",
|
||
" result_summary = result.summary()\n",
|
||
" \n",
|
||
" return result_summary"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 248,
|
||
"id": "4cd424a0-7c55-47ff-840e-1354e8dcf863",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
" Generalized Linear Model Regression Results \n",
|
||
"==============================================================================\n",
|
||
"Dep. Variable: y No. Observations: 354365\n",
|
||
"Model: GLM Df Residuals: 354350\n",
|
||
"Model Family: Binomial Df Model: 14\n",
|
||
"Link Function: Logit Scale: 1.0000\n",
|
||
"Method: IRLS Log-Likelihood: -1.8693e+05\n",
|
||
"Date: Thu, 21 Mar 2024 Deviance: 3.7387e+05\n",
|
||
"Time: 13:19:33 Pearson chi2: 1.97e+16\n",
|
||
"No. Iterations: 100 Pseudo R-squ. (CS): 0.2820\n",
|
||
"Covariance Type: nonrobust \n",
|
||
"=======================================================================================\n",
|
||
" coef std err z P>|z| [0.025 0.975]\n",
|
||
"---------------------------------------------------------------------------------------\n",
|
||
"const -1.3943 0.062 -22.456 0.000 -1.516 -1.273\n",
|
||
"nb_tickets -0.3312 0.016 -20.967 0.000 -0.362 -0.300\n",
|
||
"nb_purchases 0.9258 0.098 9.491 0.000 0.735 1.117\n",
|
||
"total_amount 0.8922 0.042 21.393 0.000 0.810 0.974\n",
|
||
"nb_suppliers 0.2238 0.007 32.137 0.000 0.210 0.237\n",
|
||
"vente_internet_max -0.7453 0.007 -100.473 0.000 -0.760 -0.731\n",
|
||
"purchase_date_min 0.7123 0.015 46.063 0.000 0.682 0.743\n",
|
||
"purchase_date_max -1.3328 0.017 -79.297 0.000 -1.366 -1.300\n",
|
||
"nb_tickets_internet 0.1784 0.011 16.366 0.000 0.157 0.200\n",
|
||
"is_email_true 0.8635 0.061 14.086 0.000 0.743 0.984\n",
|
||
"opt_in -1.7487 0.010 -174.737 0.000 -1.768 -1.729\n",
|
||
"gender_female 0.8084 0.013 60.803 0.000 0.782 0.835\n",
|
||
"gender_male 0.8731 0.014 64.332 0.000 0.846 0.900\n",
|
||
"nb_campaigns 0.1751 0.006 31.101 0.000 0.164 0.186\n",
|
||
"nb_campaigns_opened 0.2962 0.005 54.145 0.000 0.285 0.307\n",
|
||
"=======================================================================================\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# with the function\n",
|
||
"\n",
|
||
"# 1. logit with weights\n",
|
||
"results_logit_weight = model_logit(X,y,weight_dict=weight_dict)\n",
|
||
"print(results_logit_weight)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 252,
|
||
"id": "84dd6242-a9c3-4dee-a58b-abc5f1c6f8fa",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
" Generalized Linear Model Regression Results \n",
|
||
"==============================================================================\n",
|
||
"Dep. Variable: y No. Observations: 354365\n",
|
||
"Model: GLM Df Residuals: 354350\n",
|
||
"Model Family: Binomial Df Model: 14\n",
|
||
"Link Function: Logit Scale: 1.0000\n",
|
||
"Method: IRLS Log-Likelihood: -83141.\n",
|
||
"Date: Thu, 21 Mar 2024 Deviance: 1.6628e+05\n",
|
||
"Time: 13:20:06 Pearson chi2: 4.52e+15\n",
|
||
"No. Iterations: 8 Pseudo R-squ. (CS): 0.1180\n",
|
||
"Covariance Type: nonrobust \n",
|
||
"=======================================================================================\n",
|
||
" coef std err z P>|z| [0.025 0.975]\n",
|
||
"---------------------------------------------------------------------------------------\n",
|
||
"const -3.6025 0.091 -39.755 0.000 -3.780 -3.425\n",
|
||
"nb_tickets -0.0230 0.010 -2.191 0.028 -0.044 -0.002\n",
|
||
"nb_purchases -0.0519 0.014 -3.609 0.000 -0.080 -0.024\n",
|
||
"total_amount 0.0799 0.021 3.841 0.000 0.039 0.121\n",
|
||
"nb_suppliers 0.1694 0.010 17.662 0.000 0.151 0.188\n",
|
||
"vente_internet_max -0.8764 0.011 -82.965 0.000 -0.897 -0.856\n",
|
||
"purchase_date_min 0.5881 0.015 39.936 0.000 0.559 0.617\n",
|
||
"purchase_date_max -1.4197 0.016 -89.592 0.000 -1.451 -1.389\n",
|
||
"nb_tickets_internet 0.2895 0.013 22.652 0.000 0.264 0.315\n",
|
||
"is_email_true 0.8651 0.088 9.797 0.000 0.692 1.038\n",
|
||
"opt_in -1.9976 0.019 -107.305 0.000 -2.034 -1.961\n",
|
||
"gender_female 0.7032 0.024 29.395 0.000 0.656 0.750\n",
|
||
"gender_male 0.8071 0.024 33.201 0.000 0.759 0.855\n",
|
||
"nb_campaigns 0.2850 0.009 30.633 0.000 0.267 0.303\n",
|
||
"nb_campaigns_opened 0.2061 0.007 28.245 0.000 0.192 0.220\n",
|
||
"=======================================================================================\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# 2. logit without weights\n",
|
||
"\n",
|
||
"results_logit = model_logit(X.drop(\"const\", axis=1),y,weight_dict={0:1, 1:1}, add_constant=True)\n",
|
||
"print(results_logit)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "36c5e770-72b3-4482-ad61-45b511a11f06",
|
||
"metadata": {},
|
||
"source": [
|
||
"## graphique LASSO - quelles variables sont importantes dans le modèle ? "
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 313,
|
||
"id": "af208fdf-b4c2-4acd-b29e-c5b67bec3a4d",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"results for solver lbfgs\n",
|
||
"intercept : -3.617357317895187\n",
|
||
"coefficients : [[-0.03114285 -0.06607353 0.10099873 0.16977395 -0.87625108 0.58870838\n",
|
||
" -1.42022841 0.28837776 0.87461022 -2.00037064 0.70874574 0.8136523\n",
|
||
" 0.2850802 0.20640785]]\n",
|
||
"\n",
|
||
"\n",
|
||
"results for solver newton-cg\n",
|
||
"intercept : -3.5774790840156467\n",
|
||
"coefficients : [[-0.0224498 -0.05092757 0.07842438 0.16941048 -0.87645255 0.58801191\n",
|
||
" -1.41953483 0.28961165 0.84037075 -1.99757163 0.70302619 0.8068438\n",
|
||
" 0.2849652 0.20613618]]\n",
|
||
"\n",
|
||
"\n",
|
||
"results for solver newton-cholesky\n",
|
||
"intercept : -3.602198310216717\n",
|
||
"coefficients : [[-0.02297134 -0.05187501 0.07986323 0.1693883 -0.87639043 0.58815512\n",
|
||
" -1.41963236 0.28949836 0.86505556 -1.99695897 0.70307973 0.80688729\n",
|
||
" 0.2849131 0.20610117]]\n",
|
||
"\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/opt/mamba/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
|
||
" warnings.warn(\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"results for solver sag\n",
|
||
"intercept : -1.251116606796448\n",
|
||
"coefficients : [[-0.02952178 -0.05691972 0.08940743 0.18616406 -0.85908081 0.46577384\n",
|
||
" -1.26014292 0.32512459 -1.00339802 -1.84528471 0.15832219 0.24753693\n",
|
||
" 0.26318328 0.21288782]]\n",
|
||
"\n",
|
||
"\n",
|
||
"results for solver saga\n",
|
||
"intercept : -1.112341737293756\n",
|
||
"coefficients : [[-0.03349226 -0.02298918 0.09611619 0.23784438 -0.80928967 0.28520739\n",
|
||
" -1.01029862 0.30172469 -0.99503611 -1.53140972 -0.04449765 0.02363137\n",
|
||
" 0.20352875 0.22580284]]\n",
|
||
"\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/opt/mamba/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
|
||
" warnings.warn(\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# différence entre les solveurs (les résultats de statsmodels s'approchent de ceux de newton-cholesky)\n",
|
||
"\n",
|
||
"for solver in [\"lbfgs\", \"newton-cg\", \"newton-cholesky\", \"sag\", \"saga\"] :\n",
|
||
" modele_logit = LogisticRegression(penalty=None, solver=solver)\n",
|
||
" modele_logit.fit(X.drop(\"const\", axis=1), y)\n",
|
||
" print(f\"results for solver {solver}\")\n",
|
||
" print(f\"intercept : {modele_logit.intercept_[0]}\")\n",
|
||
" print(f\"coefficients : {modele_logit.coef_}\")\n",
|
||
" print(\"\\n\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "e65ab8d9-54e5-4092-ad75-ac1909cb1f60",
|
||
"metadata": {},
|
||
"source": [
|
||
"on passe au graphique\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 449,
|
||
"id": "f0006351-9b43-449e-81a7-b4510dd55366",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])"
|
||
]
|
||
},
|
||
"execution_count": 449,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# il faut environ alpha = 25k pour annuler tous les coeffs\n",
|
||
"# on n'utilise pas de balance pour les classes pour le moment car les résultats de statsmodels n'équilibrent \n",
|
||
"# pas les classes - on utilisera cette option pour la validation croisée\n",
|
||
"\n",
|
||
"modele_logit = LogisticRegression(penalty=\"l1\", C=1/25000, # class_weight=\"balanced\", \n",
|
||
" solver=\"liblinear\" )\n",
|
||
"modele_logit.fit(X.drop(\"const\", axis=1),y)\n",
|
||
"modele_logit.coef_"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 370,
|
||
"id": "24083a2f-e520-4229-a510-09e352b25cbd",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"params = np.logspace(-5, 5, 11, 10)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 371,
|
||
"id": "9c1c8efe-27e9-4307-82bd-ea356f219ebf",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"results=[]\n",
|
||
"for param in params :\n",
|
||
" modele_logit = LogisticRegression(penalty=\"l1\", C=param, # class_weight=\"balanced\", \n",
|
||
" solver=\"liblinear\" )\n",
|
||
" modele_logit.fit(X.drop(\"const\", axis=1),y)\n",
|
||
" results.append(modele_logit.coef_)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 383,
|
||
"id": "ceaec969-e72e-4520-afaf-7bcf5dad8365",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"results.reverse()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 384,
|
||
"id": "5b7c8d26-d1f8-441f-ab1d-89845e3e1ea3",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"[array([[-0.02299412, -0.05192013, 0.0799274 , 0.16931227, -0.87633381,\n",
|
||
" 0.58813399, -1.41967385, 0.28951886, 0.85509191, -1.99754475,\n",
|
||
" 0.70287087, 0.80669243, 0.28498239, 0.2061286 ]]),\n",
|
||
" array([[-0.02299201, -0.05191491, 0.07992075, 0.16931139, -0.87634243,\n",
|
||
" 0.58813708, -1.41968623, 0.28952223, 0.85577021, -1.99756453,\n",
|
||
" 0.70288563, 0.80669012, 0.28498258, 0.20612949]]),\n",
|
||
" array([[-0.02299764, -0.05192605, 0.07993569, 0.16930528, -0.87632586,\n",
|
||
" 0.58811345, -1.41964512, 0.28952983, 0.85374762, -1.99754811,\n",
|
||
" 0.70282334, 0.80664228, 0.28498228, 0.20613025]]),\n",
|
||
" array([[-0.02298949, -0.05191449, 0.07991828, 0.16931317, -0.87634417,\n",
|
||
" 0.58812319, -1.4196808 , 0.2895181 , 0.85546622, -1.99754003,\n",
|
||
" 0.70302758, 0.80684757, 0.28498265, 0.20613162]]),\n",
|
||
" array([[-0.02296458, -0.05187503, 0.07985942, 0.16928133, -0.87628414,\n",
|
||
" 0.5880753 , -1.41959837, 0.28951824, 0.85207105, -1.99743532,\n",
|
||
" 0.70275613, 0.80657079, 0.28497271, 0.20612744]]),\n",
|
||
" array([[-0.02266765, -0.05140588, 0.07913905, 0.16914597, -0.8759943 ,\n",
|
||
" 0.58782322, -1.41931263, 0.28941107, 0.84058764, -1.99706383,\n",
|
||
" 0.70135753, 0.805146 , 0.2849354 , 0.20613043]]),\n",
|
||
" array([[-0.01986108, -0.04710671, 0.07249967, 0.16755623, -0.8727931 ,\n",
|
||
" 0.58521605, -1.41621509, 0.28835319, 0.7063547 , -1.99262169,\n",
|
||
" 0.68764121, 0.79104559, 0.28452484, 0.20613349]]),\n",
|
||
" array([[ 0. , -0.02274081, 0.03249772, 0.15656967, -0.84560728,\n",
|
||
" 0.5601391 , -1.38630664, 0.27683263, 0. , -1.95240872,\n",
|
||
" 0.55820164, 0.65806397, 0.27970382, 0.20620792]]),\n",
|
||
" array([[ 0.00000000e+00, 0.00000000e+00, 1.55329481e-03,\n",
|
||
" 1.30027639e-01, -6.87367967e-01, 3.13022684e-01,\n",
|
||
" -1.08971896e+00, 1.74908692e-01, 0.00000000e+00,\n",
|
||
" -1.67160475e+00, 0.00000000e+00, 0.00000000e+00,\n",
|
||
" 2.21231437e-01, 2.08973175e-01]]),\n",
|
||
" array([[ 0. , 0. , 0. , 0. , 0. ,\n",
|
||
" 0. , -0.2624159 , 0. , -0.01813001, -0.22665172,\n",
|
||
" 0. , 0. , 0. , 0.01487092]]),\n",
|
||
" array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])]"
|
||
]
|
||
},
|
||
"execution_count": 384,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"results"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 392,
|
||
"id": "9f6e6532-c593-4f3a-a718-5f4593749eb4",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,\n",
|
||
" 1.e+03, 1.e+04, 1.e+05])"
|
||
]
|
||
},
|
||
"execution_count": 392,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# le paramètre C est l'inverse de alpha. On préfère donc afficher les valeurs de alpha qui sont plus parlantes\n",
|
||
"# un alpha grand correspond à une plus grande pénalité \n",
|
||
"# et on utilise flip pour inverser le vecteur, et classer les alphas par ordre croissant\n",
|
||
"# par souci de coherence et de lisibilité, on inverse donc aussi l'ordre des resultats\n",
|
||
"\n",
|
||
"alphas_sorted = np.flip(1/params)\n",
|
||
"alphas_sorted"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 447,
|
||
"id": "1de056b5-e37c-4272-9acb-a197bdb5ea3b",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Index(['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',\n",
|
||
" 'vente_internet_max', 'purchase_date_min', 'purchase_date_max',\n",
|
||
" 'nb_tickets_internet', 'is_email_true', 'opt_in', 'gender_female',\n",
|
||
" 'gender_male', 'nb_campaigns', 'nb_campaigns_opened'],\n",
|
||
" dtype='object')"
|
||
]
|
||
},
|
||
"execution_count": 447,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"X_colnames = X.drop(\"const\", axis=1).columns\n",
|
||
"X_colnames"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 448,
|
||
"id": "4436abe2-ac0f-480d-aa12-491c059f906a",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 1320x880 with 1 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"# graphique\n",
|
||
"\n",
|
||
"plt.figure(figsize=[12,8], dpi=110)\n",
|
||
"\n",
|
||
"for i in range(len(X_colnames)) :\n",
|
||
" var_name = X_colnames[i]\n",
|
||
" plt.plot(alphas_sorted, [results[p][0][i] for p in range(len(results))], label = var_name)\n",
|
||
"\n",
|
||
"plt.legend()\n",
|
||
"plt.title(\"Evolution de la valeur des coefficients du logit LASSO en fonction du paramètre de pénalité alpha\")\n",
|
||
"plt.xlabel(\"alpha\")\n",
|
||
"plt.ylabel(\"valeur du coefficient\")\n",
|
||
"plt.show()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 444,
|
||
"id": "4771b91f-baff-493b-a6f7-ddce02164333",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 1320x880 with 1 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"# hide right part of the graphic\n",
|
||
"# some coefficients are still non-zero even for alpha = 10k, which makes the plot quite confusing\n",
|
||
"# alternative syntax\n",
|
||
"\n",
|
||
"endpoint = 9\n",
|
||
"\n",
|
||
"fig, ax = plt.subplots(figsize=[12,8], dpi=110)\n",
|
||
"\n",
|
||
"for i in range(len(X_colnames)) :\n",
|
||
" var_name = X_colnames[i]\n",
|
||
" ax.plot(alphas_sorted[:endpoint], [results[p][0][i] for p in range(len(results[:endpoint]))], label=var_name)\n",
|
||
" \n",
|
||
"ax.set(xlabel=\"alpha\",\n",
|
||
" ylabel=\"valeur du coefficient\",\n",
|
||
" title = \"Evolution de la valeur des coefficients du logit LASSO en fonction du paramètre de pénalité alpha\")\n",
|
||
"ax.legend()\n",
|
||
"plt.show()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "c3c9bb8c-5d8b-47a6-b0b5-273217ff2664",
|
||
"metadata": {},
|
||
"source": [
|
||
"A retenir : \\\n",
|
||
"D'après le premier tableau de résultats, toutes les variables sont significatives au seuil de 5%, et, à l'exception de `nb_tickets`, elles sont même significatives au seuil de 0.1%. \\\n",
|
||
"Le graphique ci-dessus confirme que `opt_in`, `purchase_date_max` et `vente_internet_max` sont très importantes dans le modèle (on l'avait déjà remarqué car les valeurs absolues de leurs coefficients étaient élevées). \\\n",
|
||
"Au contraire, des variables qui avaient un fort coefficient comme `is_email_true` (0.87) se trouvent finalement fortement pénalisées et tombent plus vite à 0 que les autres. "
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.11.6"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|