{ "cells": [ { "cell_type": "markdown", "id": "56949d8f-4eaf-4685-9989-ba0b4b1945b7", "metadata": {}, "source": [ "# Baseline logit on spectacle companies with statmodels" ] }, { "cell_type": "code", "execution_count": 1, "id": "72480e84-2ccc-481a-9353-1199e4358d62", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import os\n", "import s3fs\n", "import re\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score\n", "from sklearn.utils import class_weight\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.compose import ColumnTransformer\n", "from sklearn.preprocessing import OneHotEncoder\n", "from sklearn.impute import SimpleImputer\n", "from sklearn.model_selection import GridSearchCV\n", "from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler\n", "from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score\n", "from sklearn.exceptions import ConvergenceWarning, DataConversionWarning\n", "\n", "import statsmodels.api as sm\n", "\n", "import pickle\n", "import warnings" ] }, { "cell_type": "code", "execution_count": 2, "id": "7090dc21-7889-4776-a0a4-f7c6a5416d53", "metadata": {}, "outputs": [], "source": [ "# Create filesystem object\n", "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})" ] }, { "cell_type": "code", "execution_count": 9, "id": "2f0d08c9-5b26-4eff-9c89-4a46f427dbf7", "metadata": {}, "outputs": [], "source": [ "def load_train_test():\n", " BUCKET = \"projet-bdc2324-team1/Generalization/musique\"\n", " File_path_train = BUCKET + \"/Train_set.csv\"\n", " File_path_test = BUCKET + \"/Test_set.csv\"\n", " \n", " with fs.open( File_path_train, mode=\"rb\") as file_in:\n", " dataset_train = pd.read_csv(file_in, sep=\",\")\n", " # dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)\n", "\n", " with fs.open(File_path_test, mode=\"rb\") as file_in:\n", " dataset_test = pd.read_csv(file_in, sep=\",\")\n", " # dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)\n", " \n", " return dataset_train, dataset_test" ] }, { "cell_type": "code", "execution_count": 4, "id": "438d0138-a254-464c-9e94-f7436576c1d5", "metadata": {}, "outputs": [], "source": [ "def features_target_split(dataset_train, dataset_test):\n", " features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max', \n", " 'time_between_purchase', 'nb_tickets_internet', 'fidelity', 'is_email_true', 'opt_in', #'is_partner',\n", " 'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']\n", " X_train = dataset_train[features_l]\n", " y_train = dataset_train[['y_has_purchased']]\n", "\n", " X_test = dataset_test[features_l]\n", " y_test = dataset_test[['y_has_purchased']]\n", " return X_train, X_test, y_train, y_test" ] }, { "cell_type": "code", "execution_count": 5, "id": "ebe9a887-61a4-4a5e-ac64-231307dd7647", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_570/3642896088.py:7: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n", " dataset_train = pd.read_csv(file_in, sep=\",\")\n", "/tmp/ipykernel_570/3642896088.py:11: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n", " dataset_test = pd.read_csv(file_in, sep=\",\")\n" ] } ], "source": [ "dataset_train, dataset_test = load_train_test()" ] }, { "cell_type": "code", "execution_count": 6, "id": "b21fdea2-02c4-4222-b4e0-635e423f91c2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "customer_id 0\n", "nb_tickets 0\n", "nb_purchases 0\n", "total_amount 0\n", "nb_suppliers 0\n", "vente_internet_max 0\n", "purchase_date_min 0\n", "purchase_date_max 0\n", "time_between_purchase 0\n", "nb_tickets_internet 0\n", "street_id 0\n", "structure_id 327067\n", "mcp_contact_id 135224\n", "fidelity 0\n", "tenant_id 0\n", "is_partner 0\n", "deleted_at 354365\n", "gender 0\n", "is_email_true 0\n", "opt_in 0\n", "last_buying_date 119201\n", "max_price 119201\n", "ticket_sum 0\n", "average_price 115193\n", "average_purchase_delay 119203\n", "average_price_basket 119203\n", "average_ticket_basket 119203\n", "total_price 4008\n", "purchase_count 0\n", "first_buying_date 119201\n", "country 56856\n", "gender_label 0\n", "gender_female 0\n", "gender_male 0\n", "gender_other 0\n", "country_fr 56856\n", "nb_campaigns 0\n", "nb_campaigns_opened 0\n", "time_to_open 224310\n", "y_has_purchased 0\n", "dtype: int64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset_train.isna().sum()" ] }, { "cell_type": "code", "execution_count": 7, "id": "42c4d034-8bc1-4ebb-a1ff-60c0a86f8f7c", "metadata": {}, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)" ] }, { "cell_type": "code", "execution_count": 8, "id": "94b4498d-6ae8-4c96-adbc-7ba1b8348160", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Shape train : (354365, 17)\n", "Shape test : (151874, 17)\n" ] } ], "source": [ "print(\"Shape train : \", X_train.shape)\n", "print(\"Shape test : \", X_test.shape)" ] }, { "cell_type": "code", "execution_count": 10, "id": "6224fd31-c190-4168-b395-e0bf5806d79d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{0.0: 0.5481283836040216, 1.0: 5.694439980716696}" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Compute Weights\n", "weights = class_weight.compute_class_weight(class_weight = 'balanced', classes = np.unique(y_train['y_has_purchased']),\n", " y = y_train['y_has_purchased'])\n", "\n", "weight_dict = {np.unique(y_train['y_has_purchased'])[i]: weights[i] for i in range(len(np.unique(y_train['y_has_purchased'])))}\n", "weight_dict" ] }, { "cell_type": "code", "execution_count": 58, "id": "4680f202-979e-483f-89b8-9df877203bcf", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0.54812838, 0.54812838, 0.54812838, ..., 5.69443998, 0.54812838,\n", " 0.54812838])" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Calcul des poids inverses à la fréquence des classes\n", "class_counts = np.bincount(y_train['y_has_purchased'])\n", "class_weights = len(y_train['y_has_purchased']) / (2 * class_counts)\n", "\n", "# Sélection des poids correspondants à chaque observation\n", "weights = class_weights[y_train['y_has_purchased'].values.astype(int)]\n", "weights" ] }, { "cell_type": "code", "execution_count": 65, "id": "5f747be4-e70b-491c-8f0a-46cb278a2dee", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[354365. 354365. 354365. ... 354365. 354365. 354365.]\n", "354365\n" ] } ], "source": [ "print(2 * weights * class_counts[y_train['y_has_purchased'].values.astype(int)])\n", "print(len(y_train['y_has_purchased']))" ] }, { "cell_type": "code", "execution_count": 124, "id": "648fb542-0186-493d-b274-be2c26a11967", "metadata": {}, "outputs": [], "source": [ "# model logit\n", "X = X_train.astype(int)\n", "# X = sm.add_constant(X.drop(\"gender_other\", axis=1))\n", "y = y_train['y_has_purchased'].values\n", "\n", "# print(X,y)" ] }, { "cell_type": "code", "execution_count": 125, "id": "978b9ebc-aa97-41d7-a48f-d1f79c1ed482", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nb_ticketsnb_purchasestotal_amountnb_suppliersvente_internet_maxpurchase_date_minpurchase_date_maxtime_between_purchasenb_tickets_internetfidelityis_email_trueopt_ingender_femalegender_malegender_othernb_campaignsnb_campaigns_opened
000000550550-10111100134
100000550550-10011001109
200000550550-10111010140
300000550550-1001000190
400000550550-1001000140
......................................................
35436000000550550-1001000170
35436100000550550-10011010112
35436222501091910041010066
35436311551052520011101030
35436400000550550-1001001070
\n", "

354365 rows × 17 columns

\n", "
" ], "text/plain": [ " nb_tickets nb_purchases total_amount nb_suppliers \\\n", "0 0 0 0 0 \n", "1 0 0 0 0 \n", "2 0 0 0 0 \n", "3 0 0 0 0 \n", "4 0 0 0 0 \n", "... ... ... ... ... \n", "354360 0 0 0 0 \n", "354361 0 0 0 0 \n", "354362 2 2 50 1 \n", "354363 1 1 55 1 \n", "354364 0 0 0 0 \n", "\n", " vente_internet_max purchase_date_min purchase_date_max \\\n", "0 0 550 550 \n", "1 0 550 550 \n", "2 0 550 550 \n", "3 0 550 550 \n", "4 0 550 550 \n", "... ... ... ... \n", "354360 0 550 550 \n", "354361 0 550 550 \n", "354362 0 91 91 \n", "354363 0 52 52 \n", "354364 0 550 550 \n", "\n", " time_between_purchase nb_tickets_internet fidelity is_email_true \\\n", "0 -1 0 1 1 \n", "1 -1 0 0 1 \n", "2 -1 0 1 1 \n", "3 -1 0 0 1 \n", "4 -1 0 0 1 \n", "... ... ... ... ... \n", "354360 -1 0 0 1 \n", "354361 -1 0 0 1 \n", "354362 0 0 4 1 \n", "354363 0 0 1 1 \n", "354364 -1 0 0 1 \n", "\n", " opt_in gender_female gender_male gender_other nb_campaigns \\\n", "0 1 1 0 0 13 \n", "1 1 0 0 1 10 \n", "2 1 0 1 0 14 \n", "3 0 0 0 1 9 \n", "4 0 0 0 1 4 \n", "... ... ... ... ... ... \n", "354360 0 0 0 1 7 \n", "354361 1 0 1 0 11 \n", "354362 0 1 0 0 6 \n", "354363 1 0 1 0 3 \n", "354364 0 0 1 0 7 \n", "\n", " nb_campaigns_opened \n", "0 4 \n", "1 9 \n", "2 0 \n", "3 0 \n", "4 0 \n", "... ... \n", "354360 0 \n", "354361 2 \n", "354362 6 \n", "354363 0 \n", "354364 0 \n", "\n", "[354365 rows x 17 columns]" ] }, "execution_count": 125, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X" ] }, { "cell_type": "code", "execution_count": 138, "id": "81b38ceb-5005-417d-a9a6-b2dac181a8fb", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
purchase_date_minpurchase_date_max
count354365.000000354365.000000
mean406.981861396.551502
std189.343612195.881681
min0.0096400.000000
25%188.475293153.457966
50%550.000000550.000000
75%550.000000550.000000
max550.000000550.000000
\n", "
" ], "text/plain": [ " purchase_date_min purchase_date_max\n", "count 354365.000000 354365.000000\n", "mean 406.981861 396.551502\n", "std 189.343612 195.881681\n", "min 0.009640 0.000000\n", "25% 188.475293 153.457966\n", "50% 550.000000 550.000000\n", "75% 550.000000 550.000000\n", "max 550.000000 550.000000" ] }, "execution_count": 138, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train[[\"purchase_date_min\", \"purchase_date_max\"]].describe()" ] }, { "cell_type": "code", "execution_count": 143, "id": "60effd66-2914-4cf9-aa0c-4e2f9dd13895", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 354365.000000\n", "mean 10.430360\n", "std 56.442718\n", "min 0.000000\n", "25% 0.000000\n", "50% 0.000000\n", "75% 0.000000\n", "max 547.443350\n", "dtype: float64" ] }, "execution_count": 143, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(X_train[\"purchase_date_min\"] - X_train[\"purchase_date_max\"]).describe()" ] }, { "cell_type": "code", "execution_count": 145, "id": "7a99e480-9e11-448d-806e-3b71925a19db", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nb_ticketsnb_purchasestotal_amountnb_suppliersvente_internet_maxpurchase_date_minpurchase_date_maxtime_between_purchasenb_tickets_internetfidelityis_email_trueopt_ingender_femalegender_malegender_othernb_campaignsnb_campaigns_opened
00.00.00.00.00.0550.0550.0-1.00.01TrueTrue10013.04.0
10.00.00.00.00.0550.0550.0-1.00.00TrueTrue00110.09.0
20.00.00.00.00.0550.0550.0-1.00.01TrueTrue01014.00.0
30.00.00.00.00.0550.0550.0-1.00.00TrueFalse0019.00.0
40.00.00.00.00.0550.0550.0-1.00.00TrueFalse0014.00.0
......................................................
3543580.00.00.00.00.0550.0550.0-1.00.00TrueFalse1001.00.0
3543590.00.00.00.00.0550.0550.0-1.00.00TrueTrue01012.02.0
3543600.00.00.00.00.0550.0550.0-1.00.00TrueFalse0017.00.0
3543610.00.00.00.00.0550.0550.0-1.00.00TrueTrue01011.02.0
3543640.00.00.00.00.0550.0550.0-1.00.00TrueFalse0107.00.0
\n", "

179675 rows × 17 columns

\n", "
" ], "text/plain": [ " nb_tickets nb_purchases total_amount nb_suppliers \\\n", "0 0.0 0.0 0.0 0.0 \n", "1 0.0 0.0 0.0 0.0 \n", "2 0.0 0.0 0.0 0.0 \n", "3 0.0 0.0 0.0 0.0 \n", "4 0.0 0.0 0.0 0.0 \n", "... ... ... ... ... \n", "354358 0.0 0.0 0.0 0.0 \n", "354359 0.0 0.0 0.0 0.0 \n", "354360 0.0 0.0 0.0 0.0 \n", "354361 0.0 0.0 0.0 0.0 \n", "354364 0.0 0.0 0.0 0.0 \n", "\n", " vente_internet_max purchase_date_min purchase_date_max \\\n", "0 0.0 550.0 550.0 \n", "1 0.0 550.0 550.0 \n", "2 0.0 550.0 550.0 \n", "3 0.0 550.0 550.0 \n", "4 0.0 550.0 550.0 \n", "... ... ... ... \n", "354358 0.0 550.0 550.0 \n", "354359 0.0 550.0 550.0 \n", "354360 0.0 550.0 550.0 \n", "354361 0.0 550.0 550.0 \n", "354364 0.0 550.0 550.0 \n", "\n", " time_between_purchase nb_tickets_internet fidelity is_email_true \\\n", "0 -1.0 0.0 1 True \n", "1 -1.0 0.0 0 True \n", "2 -1.0 0.0 1 True \n", "3 -1.0 0.0 0 True \n", "4 -1.0 0.0 0 True \n", "... ... ... ... ... \n", "354358 -1.0 0.0 0 True \n", "354359 -1.0 0.0 0 True \n", "354360 -1.0 0.0 0 True \n", "354361 -1.0 0.0 0 True \n", "354364 -1.0 0.0 0 True \n", "\n", " opt_in gender_female gender_male gender_other nb_campaigns \\\n", "0 True 1 0 0 13.0 \n", "1 True 0 0 1 10.0 \n", "2 True 0 1 0 14.0 \n", "3 False 0 0 1 9.0 \n", "4 False 0 0 1 4.0 \n", "... ... ... ... ... ... \n", "354358 False 1 0 0 1.0 \n", "354359 True 0 1 0 12.0 \n", "354360 False 0 0 1 7.0 \n", "354361 True 0 1 0 11.0 \n", "354364 False 0 1 0 7.0 \n", "\n", " nb_campaigns_opened \n", "0 4.0 \n", "1 9.0 \n", "2 0.0 \n", "3 0.0 \n", "4 0.0 \n", "... ... \n", "354358 0.0 \n", "354359 2.0 \n", "354360 0.0 \n", "354361 2.0 \n", "354364 0.0 \n", "\n", "[179675 rows x 17 columns]" ] }, "execution_count": 145, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train[X_train[\"time_between_purchase\"]==-1]" ] }, { "cell_type": "code", "execution_count": 126, "id": "2475f2fe-3d1f-4845-9ede-0416dac83271", "metadata": {}, "outputs": [], "source": [ "# Colonnes à standardiser\n", "\n", "\"\"\"\n", "var_num = ['nb_tickets', 'nb_purchases', \"total_amount\", \"nb_suppliers\", \"vente_internet_max\",\n", " \"purchase_date_min\", \"purchase_date_max\", \"time_between_purchase\", \"nb_tickets_internet\",\n", " \"fidelity\", \"nb_campaigns\", \"nb_campaigns_opened\"]\n", " \"\"\"\n", "\n", "numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max', \n", " 'time_between_purchase', 'nb_tickets_internet', 'fidelity', 'is_email_true', 'opt_in', #'is_partner',\n", " 'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']\n", "\n", "# Standardisation des colonnes sélectionnées\n", "scaler = StandardScaler()\n", "X[var_num] = scaler.fit_transform(X[var_num])\n", "X[numeric_features] = scaler.fit_transform(X[numeric_features])\n", "\n" ] }, { "cell_type": "code", "execution_count": 128, "id": "1763bad4-36b5-4ebb-9702-b77ba19fb30e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nb_ticketsnb_purchasestotal_amountnb_suppliersvente_internet_maxpurchase_date_minpurchase_date_maxtime_between_purchasenb_tickets_internetfidelityis_email_trueopt_ingender_femalegender_malegender_othernb_campaignsnb_campaigns_opened
0-0.024425-0.050722-0.048383-0.768294-0.5995110.7559940.783940-0.192978-0.264693-0.0043160.0581931.1511861.071079-0.775306-0.4345680.6079450.522567
1-0.024425-0.050722-0.048383-0.768294-0.5995110.7559940.783940-0.192978-0.264693-0.0290710.0581931.151186-0.933638-0.7753062.3011370.3061551.701843
2-0.024425-0.050722-0.048383-0.768294-0.5995110.7559940.783940-0.192978-0.264693-0.0043160.0581931.151186-0.9336381.289813-0.4345680.708542-0.420854
3-0.024425-0.050722-0.048383-0.768294-0.5995110.7559940.783940-0.192978-0.264693-0.0290710.058193-0.868669-0.933638-0.7753062.3011370.205558-0.420854
4-0.024425-0.050722-0.048383-0.768294-0.5995110.7559940.783940-0.192978-0.264693-0.0290710.058193-0.868669-0.933638-0.7753062.301137-0.297426-0.420854
......................................................
354360-0.024425-0.050722-0.048383-0.768294-0.5995110.7559940.783940-0.192978-0.264693-0.0290710.058193-0.868669-0.933638-0.7753062.3011370.004365-0.420854
354361-0.024425-0.050722-0.048383-0.768294-0.5995110.7559940.783940-0.192978-0.264693-0.0290710.0581931.151186-0.9336381.289813-0.4345680.4067520.050856
354362-0.0008380.092966-0.0091501.219633-0.599511-1.665887-1.557073-0.175269-0.2646930.0699490.058193-0.8686691.071079-0.775306-0.434568-0.0962320.994277
354363-0.0126310.021122-0.0052271.219633-0.599511-1.871668-1.755983-0.175269-0.264693-0.0043160.0581931.151186-0.9336381.289813-0.434568-0.398023-0.420854
354364-0.024425-0.050722-0.048383-0.768294-0.5995110.7559940.783940-0.192978-0.264693-0.0290710.058193-0.868669-0.9336381.289813-0.4345680.004365-0.420854
\n", "

354365 rows × 17 columns

\n", "
" ], "text/plain": [ " nb_tickets nb_purchases total_amount nb_suppliers \\\n", "0 -0.024425 -0.050722 -0.048383 -0.768294 \n", "1 -0.024425 -0.050722 -0.048383 -0.768294 \n", "2 -0.024425 -0.050722 -0.048383 -0.768294 \n", "3 -0.024425 -0.050722 -0.048383 -0.768294 \n", "4 -0.024425 -0.050722 -0.048383 -0.768294 \n", "... ... ... ... ... \n", "354360 -0.024425 -0.050722 -0.048383 -0.768294 \n", "354361 -0.024425 -0.050722 -0.048383 -0.768294 \n", "354362 -0.000838 0.092966 -0.009150 1.219633 \n", "354363 -0.012631 0.021122 -0.005227 1.219633 \n", "354364 -0.024425 -0.050722 -0.048383 -0.768294 \n", "\n", " vente_internet_max purchase_date_min purchase_date_max \\\n", "0 -0.599511 0.755994 0.783940 \n", "1 -0.599511 0.755994 0.783940 \n", "2 -0.599511 0.755994 0.783940 \n", "3 -0.599511 0.755994 0.783940 \n", "4 -0.599511 0.755994 0.783940 \n", "... ... ... ... \n", "354360 -0.599511 0.755994 0.783940 \n", "354361 -0.599511 0.755994 0.783940 \n", "354362 -0.599511 -1.665887 -1.557073 \n", "354363 -0.599511 -1.871668 -1.755983 \n", "354364 -0.599511 0.755994 0.783940 \n", "\n", " time_between_purchase nb_tickets_internet fidelity is_email_true \\\n", "0 -0.192978 -0.264693 -0.004316 0.058193 \n", "1 -0.192978 -0.264693 -0.029071 0.058193 \n", "2 -0.192978 -0.264693 -0.004316 0.058193 \n", "3 -0.192978 -0.264693 -0.029071 0.058193 \n", "4 -0.192978 -0.264693 -0.029071 0.058193 \n", "... ... ... ... ... \n", "354360 -0.192978 -0.264693 -0.029071 0.058193 \n", "354361 -0.192978 -0.264693 -0.029071 0.058193 \n", "354362 -0.175269 -0.264693 0.069949 0.058193 \n", "354363 -0.175269 -0.264693 -0.004316 0.058193 \n", "354364 -0.192978 -0.264693 -0.029071 0.058193 \n", "\n", " opt_in gender_female gender_male gender_other nb_campaigns \\\n", "0 1.151186 1.071079 -0.775306 -0.434568 0.607945 \n", "1 1.151186 -0.933638 -0.775306 2.301137 0.306155 \n", "2 1.151186 -0.933638 1.289813 -0.434568 0.708542 \n", "3 -0.868669 -0.933638 -0.775306 2.301137 0.205558 \n", "4 -0.868669 -0.933638 -0.775306 2.301137 -0.297426 \n", "... ... ... ... ... ... \n", "354360 -0.868669 -0.933638 -0.775306 2.301137 0.004365 \n", "354361 1.151186 -0.933638 1.289813 -0.434568 0.406752 \n", "354362 -0.868669 1.071079 -0.775306 -0.434568 -0.096232 \n", "354363 1.151186 -0.933638 1.289813 -0.434568 -0.398023 \n", "354364 -0.868669 -0.933638 1.289813 -0.434568 0.004365 \n", "\n", " nb_campaigns_opened \n", "0 0.522567 \n", "1 1.701843 \n", "2 -0.420854 \n", "3 -0.420854 \n", "4 -0.420854 \n", "... ... \n", "354360 -0.420854 \n", "354361 0.050856 \n", "354362 0.994277 \n", "354363 -0.420854 \n", "354364 -0.420854 \n", "\n", "[354365 rows x 17 columns]" ] }, "execution_count": 128, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X" ] }, { "cell_type": "code", "execution_count": 122, "id": "23d6c06c-8708-4714-906b-a1ed664377bb", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
constnb_ticketsnb_purchasestotal_amountnb_suppliersvente_internet_maxpurchase_date_minpurchase_date_maxtime_between_purchasenb_tickets_internetfidelityis_email_trueopt_ingender_femalegender_malenb_campaignsnb_campaigns_opened
01.0-0.024425-0.050722-0.048383-0.768294-0.5995110.7559940.783940-0.192978-0.264693-0.00431611100.6079450.522567
11.0-0.024425-0.050722-0.048383-0.768294-0.5995110.7559940.783940-0.192978-0.264693-0.02907111000.3061551.701843
21.0-0.024425-0.050722-0.048383-0.768294-0.5995110.7559940.783940-0.192978-0.264693-0.00431611010.708542-0.420854
31.0-0.024425-0.050722-0.048383-0.768294-0.5995110.7559940.783940-0.192978-0.264693-0.02907110000.205558-0.420854
41.0-0.024425-0.050722-0.048383-0.768294-0.5995110.7559940.783940-0.192978-0.264693-0.0290711000-0.297426-0.420854
......................................................
3543601.0-0.024425-0.050722-0.048383-0.768294-0.5995110.7559940.783940-0.192978-0.264693-0.02907110000.004365-0.420854
3543611.0-0.024425-0.050722-0.048383-0.768294-0.5995110.7559940.783940-0.192978-0.264693-0.02907111010.4067520.050856
3543621.0-0.0008380.092966-0.0091501.219633-0.599511-1.665887-1.557073-0.175269-0.2646930.0699491010-0.0962320.994277
3543631.0-0.0126310.021122-0.0052271.219633-0.599511-1.871668-1.755983-0.175269-0.264693-0.0043161101-0.398023-0.420854
3543641.0-0.024425-0.050722-0.048383-0.768294-0.5995110.7559940.783940-0.192978-0.264693-0.02907110010.004365-0.420854
\n", "

354365 rows × 17 columns

\n", "
" ], "text/plain": [ " const nb_tickets nb_purchases total_amount nb_suppliers \\\n", "0 1.0 -0.024425 -0.050722 -0.048383 -0.768294 \n", "1 1.0 -0.024425 -0.050722 -0.048383 -0.768294 \n", "2 1.0 -0.024425 -0.050722 -0.048383 -0.768294 \n", "3 1.0 -0.024425 -0.050722 -0.048383 -0.768294 \n", "4 1.0 -0.024425 -0.050722 -0.048383 -0.768294 \n", "... ... ... ... ... ... \n", "354360 1.0 -0.024425 -0.050722 -0.048383 -0.768294 \n", "354361 1.0 -0.024425 -0.050722 -0.048383 -0.768294 \n", "354362 1.0 -0.000838 0.092966 -0.009150 1.219633 \n", "354363 1.0 -0.012631 0.021122 -0.005227 1.219633 \n", "354364 1.0 -0.024425 -0.050722 -0.048383 -0.768294 \n", "\n", " vente_internet_max purchase_date_min purchase_date_max \\\n", "0 -0.599511 0.755994 0.783940 \n", "1 -0.599511 0.755994 0.783940 \n", "2 -0.599511 0.755994 0.783940 \n", "3 -0.599511 0.755994 0.783940 \n", "4 -0.599511 0.755994 0.783940 \n", "... ... ... ... \n", "354360 -0.599511 0.755994 0.783940 \n", "354361 -0.599511 0.755994 0.783940 \n", "354362 -0.599511 -1.665887 -1.557073 \n", "354363 -0.599511 -1.871668 -1.755983 \n", "354364 -0.599511 0.755994 0.783940 \n", "\n", " time_between_purchase nb_tickets_internet fidelity is_email_true \\\n", "0 -0.192978 -0.264693 -0.004316 1 \n", "1 -0.192978 -0.264693 -0.029071 1 \n", "2 -0.192978 -0.264693 -0.004316 1 \n", "3 -0.192978 -0.264693 -0.029071 1 \n", "4 -0.192978 -0.264693 -0.029071 1 \n", "... ... ... ... ... \n", "354360 -0.192978 -0.264693 -0.029071 1 \n", "354361 -0.192978 -0.264693 -0.029071 1 \n", "354362 -0.175269 -0.264693 0.069949 1 \n", "354363 -0.175269 -0.264693 -0.004316 1 \n", "354364 -0.192978 -0.264693 -0.029071 1 \n", "\n", " opt_in gender_female gender_male nb_campaigns nb_campaigns_opened \n", "0 1 1 0 0.607945 0.522567 \n", "1 1 0 0 0.306155 1.701843 \n", "2 1 0 1 0.708542 -0.420854 \n", "3 0 0 0 0.205558 -0.420854 \n", "4 0 0 0 -0.297426 -0.420854 \n", "... ... ... ... ... ... \n", "354360 0 0 0 0.004365 -0.420854 \n", "354361 1 0 1 0.406752 0.050856 \n", "354362 0 1 0 -0.096232 0.994277 \n", "354363 1 0 1 -0.398023 -0.420854 \n", "354364 0 0 1 0.004365 -0.420854 \n", "\n", "[354365 rows x 17 columns]" ] }, "execution_count": 122, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X" ] }, { "cell_type": "code", "execution_count": 133, "id": "0e968aa1-fbec-47db-b570-4730ef7eebf2", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/opt/mamba/lib/python3.11/site-packages/statsmodels/discrete/discrete_model.py:2385: RuntimeWarning: overflow encountered in exp\n", " return 1/(1+np.exp(-X))\n", "/opt/mamba/lib/python3.11/site-packages/statsmodels/discrete/discrete_model.py:2443: RuntimeWarning: divide by zero encountered in log\n", " return np.sum(np.log(self.cdf(q * linpred)))\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Warning: Maximum number of iterations has been exceeded.\n", " Current function value: inf\n", " Iterations: 35\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/opt/mamba/lib/python3.11/site-packages/statsmodels/base/model.py:607: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals\n", " warnings.warn(\"Maximum Likelihood optimization failed to \"\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " Logit Regression Results \n", "==============================================================================\n", "Dep. Variable: y No. Observations: 354365\n", "Model: Logit Df Residuals: 354349\n", "Method: MLE Df Model: 15\n", "Date: Thu, 14 Mar 2024 Pseudo R-squ.: -inf\n", "Time: 10:47:16 Log-Likelihood: -inf\n", "converged: False LL-Null: -1.0540e+05\n", "Covariance Type: nonrobust LLR p-value: 1.000\n", "=========================================================================================\n", " coef std err z P>|z| [0.025 0.975]\n", "-----------------------------------------------------------------------------------------\n", "nb_tickets 4.9213 0.267 18.448 0.000 4.398 5.444\n", "nb_purchases -7.9446 0.140 -56.905 0.000 -8.218 -7.671\n", "total_amount 0.3039 0.061 4.945 0.000 0.183 0.424\n", "nb_suppliers 0.1067 0.008 13.678 0.000 0.091 0.122\n", "vente_internet_max -0.2784 0.008 -34.612 0.000 -0.294 -0.263\n", "purchase_date_min -41.9693 2.640 -15.895 0.000 -47.144 -36.794\n", "purchase_date_max 43.2793 2.734 15.829 0.000 37.920 48.638\n", "time_between_purchase 12.7237 0.789 16.132 0.000 11.178 14.270\n", "nb_tickets_internet -0.0212 0.014 -1.510 0.131 -0.049 0.006\n", "fidelity 22.0749 0.222 99.561 0.000 21.640 22.509\n", "is_email_true 0.0225 0.004 6.145 0.000 0.015 0.030\n", "opt_in -0.1245 0.004 -30.646 0.000 -0.133 -0.117\n", "gender_female 0.0018 nan nan nan nan nan\n", "gender_male 0.0118 nan nan nan nan nan\n", "gender_other -0.0182 nan nan nan nan nan\n", "nb_campaigns -0.0049 0.005 -0.961 0.336 -0.015 0.005\n", "nb_campaigns_opened 0.0867 0.005 18.211 0.000 0.077 0.096\n", "=========================================================================================\n" ] } ], "source": [ "# Création du modèle de régression logistique avec poids équilibrés\n", "# model_logit = sm.Logit(y, X, weights=weights)\n", "model_logit = sm.Logit(y, X)\n", "\n", "# Ajustement du modèle aux données\n", "result = model_logit.fit()\n", "\n", "# Affichage des résultats\n", "print(result.summary())" ] }, { "cell_type": "code", "execution_count": 130, "id": "d1660ef9-438f-4427-ac2d-aa8179614e40", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 1.07107945, -0.93363755])" ] }, "execution_count": 130, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X[\"gender_female\"].unique()" ] }, { "cell_type": "code", "execution_count": 131, "id": "2079bae6-bce3-4de7-bf49-180177c31a55", "metadata": {}, "outputs": [], "source": [ "numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max', \n", " 'time_between_purchase', 'nb_tickets_internet', 'fidelity', 'is_email_true', 'opt_in', #'is_partner',\n", " 'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']\n", "\n", "numeric_transformer = Pipeline(steps=[\n", " #(\"imputer\", SimpleImputer(strategy=\"mean\")), \n", " (\"scaler\", StandardScaler()) \n", "])\n", "\n", "categorical_features = ['opt_in'] \n", "\n", "# Transformer for the categorical features\n", "categorical_transformer = Pipeline(steps=[\n", " #(\"imputer\", SimpleImputer(strategy=\"most_frequent\")), # Impute missing values with the most frequent\n", " (\"onehot\", OneHotEncoder(handle_unknown='ignore', sparse_output=False))\n", "])\n", "\n", "preproc = ColumnTransformer(\n", " transformers=[\n", " (\"num\", numeric_transformer, numeric_features),\n", " (\"cat\", categorical_transformer, categorical_features)\n", " ]\n", ")" ] }, { "cell_type": "code", "execution_count": 105, "id": "a9fe1c60-0732-426f-b176-9c95718e546f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
constgender_other
01.00
11.01
21.00
31.01
41.01
.........
3543601.01
3543611.00
3543621.00
3543631.00
3543641.00
\n", "

354365 rows × 2 columns

\n", "
" ], "text/plain": [ " const gender_other\n", "0 1.0 0\n", "1 1.0 1\n", "2 1.0 0\n", "3 1.0 1\n", "4 1.0 1\n", "... ... ...\n", "354360 1.0 1\n", "354361 1.0 0\n", "354362 1.0 0\n", "354363 1.0 0\n", "354364 1.0 0\n", "\n", "[354365 rows x 2 columns]" ] }, "execution_count": 105, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sm.add_constant(X[\"gender_other\"])" ] }, { "cell_type": "code", "execution_count": 106, "id": "b8c92b7c-1df0-4384-82e7-1e8cc0d333fa", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nb_ticketsnb_purchasestotal_amountnb_suppliersvente_internet_maxpurchase_date_minpurchase_date_maxtime_between_purchasenb_tickets_internetfidelityis_email_trueopt_ingender_femalegender_malenb_campaignsnb_campaigns_opened
000000550550-1011110134
100000550550-1001100109
200000550550-1011101140
300000550550-100100090
400000550550-100100040
...................................................
35436000000550550-100100070
35436100000550550-1001101112
3543622250109191004101066
3543631155105252001110130
35436400000550550-100100170
\n", "

354365 rows × 16 columns

\n", "
" ], "text/plain": [ " nb_tickets nb_purchases total_amount nb_suppliers \\\n", "0 0 0 0 0 \n", "1 0 0 0 0 \n", "2 0 0 0 0 \n", "3 0 0 0 0 \n", "4 0 0 0 0 \n", "... ... ... ... ... \n", "354360 0 0 0 0 \n", "354361 0 0 0 0 \n", "354362 2 2 50 1 \n", "354363 1 1 55 1 \n", "354364 0 0 0 0 \n", "\n", " vente_internet_max purchase_date_min purchase_date_max \\\n", "0 0 550 550 \n", "1 0 550 550 \n", "2 0 550 550 \n", "3 0 550 550 \n", "4 0 550 550 \n", "... ... ... ... \n", "354360 0 550 550 \n", "354361 0 550 550 \n", "354362 0 91 91 \n", "354363 0 52 52 \n", "354364 0 550 550 \n", "\n", " time_between_purchase nb_tickets_internet fidelity is_email_true \\\n", "0 -1 0 1 1 \n", "1 -1 0 0 1 \n", "2 -1 0 1 1 \n", "3 -1 0 0 1 \n", "4 -1 0 0 1 \n", "... ... ... ... ... \n", "354360 -1 0 0 1 \n", "354361 -1 0 0 1 \n", "354362 0 0 4 1 \n", "354363 0 0 1 1 \n", "354364 -1 0 0 1 \n", "\n", " opt_in gender_female gender_male nb_campaigns nb_campaigns_opened \n", "0 1 1 0 13 4 \n", "1 1 0 0 10 9 \n", "2 1 0 1 14 0 \n", "3 0 0 0 9 0 \n", "4 0 0 0 4 0 \n", "... ... ... ... ... ... \n", "354360 0 0 0 7 0 \n", "354361 1 0 1 11 2 \n", "354362 0 1 0 6 6 \n", "354363 1 0 1 3 0 \n", "354364 0 0 1 7 0 \n", "\n", "[354365 rows x 16 columns]" ] }, "execution_count": 106, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X.drop(\"gender_other\", axis=1)" ] }, { "cell_type": "code", "execution_count": null, "id": "061dcabd-383d-4b76-a9f0-8647daed2c9e", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 107, "id": "fc4ffbf6-ab7e-47cf-a717-c25477d92493", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nb_ticketsnb_purchasestotal_amountnb_suppliersvente_internet_maxpurchase_date_minpurchase_date_maxtime_between_purchasenb_tickets_internetfidelityis_email_trueopt_ingender_femalegender_malegender_othernb_campaignsnb_campaigns_opened
000000550550-10111100134
100000550550-10011001109
200000550550-10111010140
300000550550-1001000190
400000550550-1001000140
......................................................
35436000000550550-1001000170
35436100000550550-10011010112
35436222501091910041010066
35436311551052520011101030
35436400000550550-1001001070
\n", "

354365 rows × 17 columns

\n", "
" ], "text/plain": [ " nb_tickets nb_purchases total_amount nb_suppliers \\\n", "0 0 0 0 0 \n", "1 0 0 0 0 \n", "2 0 0 0 0 \n", "3 0 0 0 0 \n", "4 0 0 0 0 \n", "... ... ... ... ... \n", "354360 0 0 0 0 \n", "354361 0 0 0 0 \n", "354362 2 2 50 1 \n", "354363 1 1 55 1 \n", "354364 0 0 0 0 \n", "\n", " vente_internet_max purchase_date_min purchase_date_max \\\n", "0 0 550 550 \n", "1 0 550 550 \n", "2 0 550 550 \n", "3 0 550 550 \n", "4 0 550 550 \n", "... ... ... ... \n", "354360 0 550 550 \n", "354361 0 550 550 \n", "354362 0 91 91 \n", "354363 0 52 52 \n", "354364 0 550 550 \n", "\n", " time_between_purchase nb_tickets_internet fidelity is_email_true \\\n", "0 -1 0 1 1 \n", "1 -1 0 0 1 \n", "2 -1 0 1 1 \n", "3 -1 0 0 1 \n", "4 -1 0 0 1 \n", "... ... ... ... ... \n", "354360 -1 0 0 1 \n", "354361 -1 0 0 1 \n", "354362 0 0 4 1 \n", "354363 0 0 1 1 \n", "354364 -1 0 0 1 \n", "\n", " opt_in gender_female gender_male gender_other nb_campaigns \\\n", "0 1 1 0 0 13 \n", "1 1 0 0 1 10 \n", "2 1 0 1 0 14 \n", "3 0 0 0 1 9 \n", "4 0 0 0 1 4 \n", "... ... ... ... ... ... \n", "354360 0 0 0 1 7 \n", "354361 1 0 1 0 11 \n", "354362 0 1 0 0 6 \n", "354363 1 0 1 0 3 \n", "354364 0 0 1 0 7 \n", "\n", " nb_campaigns_opened \n", "0 4 \n", "1 9 \n", "2 0 \n", "3 0 \n", "4 0 \n", "... ... \n", "354360 0 \n", "354361 2 \n", "354362 6 \n", "354363 0 \n", "354364 0 \n", "\n", "[354365 rows x 17 columns]" ] }, "execution_count": 107, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X" ] }, { "cell_type": "code", "execution_count": 73, "id": "f15b0d69-8470-4a36-bd25-9536a36c4756", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(354365,)" ] }, "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "source": [ "weights.shape" ] }, { "cell_type": "code", "execution_count": 74, "id": "e97e26f6-b854-41e3-bbdf-318065b03254", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(354365, 17)" ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X.shape" ] }, { "cell_type": "code", "execution_count": 75, "id": "49621874-1e8c-4cb5-84a9-a5c9715f3b06", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(354365,)" ] }, "execution_count": 75, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y.shape" ] }, { "cell_type": "code", "execution_count": 76, "id": "8072cd81-d63f-430e-b0b2-c0589cf18871", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "nb_tickets 0\n", "nb_purchases 0\n", "total_amount 0\n", "nb_suppliers 0\n", "vente_internet_max 0\n", "purchase_date_min 0\n", "purchase_date_max 0\n", "time_between_purchase 0\n", "nb_tickets_internet 0\n", "fidelity 0\n", "is_email_true 0\n", "opt_in 0\n", "gender_female 0\n", "gender_male 0\n", "gender_other 0\n", "nb_campaigns 0\n", "nb_campaigns_opened 0\n", "dtype: int64" ] }, "execution_count": 76, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train.isna().sum()" ] }, { "cell_type": "code", "execution_count": 80, "id": "6f07a66f-5a46-4409-b0b6-ff5e212296f0", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0., 1.])" ] }, "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_train[\"y_has_purchased\"].unique()" ] }, { "cell_type": "code", "execution_count": 134, "id": "4587c36f-94bf-458b-b819-60250eb17c59", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nb_ticketsnb_purchasestotal_amountnb_suppliersvente_internet_maxpurchase_date_minpurchase_date_maxtime_between_purchasenb_tickets_internetfidelityis_email_trueopt_ingender_femalegender_malegender_othernb_campaignsnb_campaigns_opened
00.00.00.00.00.0550.000000550.000000-1.0000000.01TrueTrue10013.04.0
10.00.00.00.00.0550.000000550.000000-1.0000000.00TrueTrue00110.09.0
20.00.00.00.00.0550.000000550.000000-1.0000000.01TrueTrue01014.00.0
30.00.00.00.00.0550.000000550.000000-1.0000000.00TrueFalse0019.00.0
40.00.00.00.00.0550.000000550.000000-1.0000000.00TrueFalse0014.00.0
......................................................
3543600.00.00.00.00.0550.000000550.000000-1.0000000.00TrueFalse0017.00.0
3543610.00.00.00.00.0550.000000550.000000-1.0000000.00TrueTrue01011.02.0
3543622.02.050.01.00.091.03055691.0201390.0104170.04TrueFalse1006.06.0
3543631.01.055.01.00.052.28402852.2840280.0000000.01TrueTrue0103.00.0
3543640.00.00.00.00.0550.000000550.000000-1.0000000.00TrueFalse0107.00.0
\n", "

354365 rows × 17 columns

\n", "
" ], "text/plain": [ " nb_tickets nb_purchases total_amount nb_suppliers \\\n", "0 0.0 0.0 0.0 0.0 \n", "1 0.0 0.0 0.0 0.0 \n", "2 0.0 0.0 0.0 0.0 \n", "3 0.0 0.0 0.0 0.0 \n", "4 0.0 0.0 0.0 0.0 \n", "... ... ... ... ... \n", "354360 0.0 0.0 0.0 0.0 \n", "354361 0.0 0.0 0.0 0.0 \n", "354362 2.0 2.0 50.0 1.0 \n", "354363 1.0 1.0 55.0 1.0 \n", "354364 0.0 0.0 0.0 0.0 \n", "\n", " vente_internet_max purchase_date_min purchase_date_max \\\n", "0 0.0 550.000000 550.000000 \n", "1 0.0 550.000000 550.000000 \n", "2 0.0 550.000000 550.000000 \n", "3 0.0 550.000000 550.000000 \n", "4 0.0 550.000000 550.000000 \n", "... ... ... ... \n", "354360 0.0 550.000000 550.000000 \n", "354361 0.0 550.000000 550.000000 \n", "354362 0.0 91.030556 91.020139 \n", "354363 0.0 52.284028 52.284028 \n", "354364 0.0 550.000000 550.000000 \n", "\n", " time_between_purchase nb_tickets_internet fidelity is_email_true \\\n", "0 -1.000000 0.0 1 True \n", "1 -1.000000 0.0 0 True \n", "2 -1.000000 0.0 1 True \n", "3 -1.000000 0.0 0 True \n", "4 -1.000000 0.0 0 True \n", "... ... ... ... ... \n", "354360 -1.000000 0.0 0 True \n", "354361 -1.000000 0.0 0 True \n", "354362 0.010417 0.0 4 True \n", "354363 0.000000 0.0 1 True \n", "354364 -1.000000 0.0 0 True \n", "\n", " opt_in gender_female gender_male gender_other nb_campaigns \\\n", "0 True 1 0 0 13.0 \n", "1 True 0 0 1 10.0 \n", "2 True 0 1 0 14.0 \n", "3 False 0 0 1 9.0 \n", "4 False 0 0 1 4.0 \n", "... ... ... ... ... ... \n", "354360 False 0 0 1 7.0 \n", "354361 True 0 1 0 11.0 \n", "354362 False 1 0 0 6.0 \n", "354363 True 0 1 0 3.0 \n", "354364 False 0 1 0 7.0 \n", "\n", " nb_campaigns_opened \n", "0 4.0 \n", "1 9.0 \n", "2 0.0 \n", "3 0.0 \n", "4 0.0 \n", "... ... \n", "354360 0.0 \n", "354361 2.0 \n", "354362 6.0 \n", "354363 0.0 \n", "354364 0.0 \n", "\n", "[354365 rows x 17 columns]" ] }, "execution_count": 134, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.6" } }, "nbformat": 4, "nbformat_minor": 5 }