{ "cells": [ { "cell_type": "markdown", "id": "ac01a6ea-bef6-4ace-89ff-1dc03a4215c2", "metadata": {}, "source": [ "# Segmentation des clients par régression logistique" ] }, { "cell_type": "code", "execution_count": 1, "id": "bca785be-39f7-4583-9bd8-67c1134ae275", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import os\n", "import s3fs\n", "import re\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n", "from sklearn.preprocessing import StandardScaler\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": 2, "id": "3bf57816-b023-4e84-9450-095620bddebc", "metadata": {}, "outputs": [], "source": [ "# Create filesystem object\n", "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})" ] }, { "cell_type": "code", "execution_count": 3, "id": "27002f2f-a78a-414c-8e4f-b15bf6dd9e40", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_543/1677066092.py:7: DtypeWarning: Columns (40) have mixed types. Specify dtype option on import or set low_memory=False.\n", " dataset_train = pd.read_csv(file_in, sep=\",\")\n", "/tmp/ipykernel_543/1677066092.py:12: DtypeWarning: Columns (40) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", " dataset_test = pd.read_csv(file_in, sep=\",\")\n" ] } ], "source": [ "# Load the train and test datasets from S3\n", "BUCKET = \"projet-bdc2324-team1/1_Output/Logistique Regression databases - First approach\"\n", "\n", "FILE_PATH_S3 = BUCKET + \"/\" + \"dataset_train.csv\"\n", "\n", "# low_memory=False makes pandas infer each column dtype from the whole file instead of\n", "# chunk by chunk, which avoids the mixed-type DtypeWarning raised on column 40\n", "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", "    dataset_train = pd.read_csv(file_in, sep=\",\", low_memory=False)\n", "\n", "FILE_PATH_S3 = BUCKET + \"/\" + \"dataset_test.csv\"\n", "\n", "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", "    dataset_test = pd.read_csv(file_in, sep=\",\", low_memory=False)\n" ] }, { "cell_type": "code", "execution_count": 4, "id": "c3928b55-8821-46da-b3b5-a036efd6d2cf", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
event_type_idname_event_types
02.0offre muséale individuel
14.0spectacle vivant
25.0offre muséale groupe
3NaNNaN
\n", "
" ], "text/plain": [ " event_type_id name_event_types\n", "0 2.0 offre muséale individuel\n", "1 4.0 spectacle vivant\n", "2 5.0 offre muséale groupe\n", "3 NaN NaN" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset_train[['event_type_id', 'name_event_types']].drop_duplicates()" ] }, { "cell_type": "code", "execution_count": 5, "id": "7e8a9d4d-7e55-4173-a7f4-8b8baa9610d2", "metadata": {}, "outputs": [], "source": [ "#Choose type of event \n", "type_event_choosed = 5\n", "\n", "dataset_test = dataset_test[(dataset_test['event_type_id'] == type_event_choosed) | np.isnan(dataset_test['event_type_id'])]\n", "dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)\n", "dataset_train = dataset_train[(dataset_train['event_type_id'] == type_event_choosed) | np.isnan(dataset_train['event_type_id'])]\n", "dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)" ] }, { "cell_type": "code", "execution_count": 6, "id": "e20ced8f-df1c-43bb-8d15-79f414c8225c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "customer_id 0.000000\n", "event_type_id 0.950522\n", "nb_tickets 0.000000\n", "nb_purchases 0.000000\n", "total_amount 0.000000\n", "nb_suppliers 0.000000\n", "vente_internet_max 0.000000\n", "purchase_date_min 0.950522\n", "purchase_date_max 0.950522\n", "time_between_purchase 0.950522\n", "nb_tickets_internet 0.000000\n", "name_event_types 0.950522\n", "avg_amount 0.950522\n", "street_id 0.000000\n", "is_partner 0.000000\n", "gender 0.000000\n", "is_email_true 0.000000\n", "opt_in 0.000000\n", "structure_id 0.863048\n", "mcp_contact_id 0.297275\n", "last_buying_date 0.611718\n", "max_price 0.611718\n", "ticket_sum 0.000000\n", "average_price 0.102225\n", "fidelity 0.000000\n", "average_purchase_delay 0.611718\n", "average_price_basket 0.611718\n", "average_ticket_basket 0.611718\n", "total_price 0.509493\n", "purchase_count 0.000000\n", "first_buying_date 0.611718\n", 
"country 0.063488\n", "tenant_id 0.000000\n", "gender_label 0.000000\n", "gender_female 0.000000\n", "gender_male 0.000000\n", "gender_other 0.000000\n", "country_fr 0.063488\n", "nb_campaigns 0.000000\n", "nb_campaigns_opened 0.000000\n", "time_to_open 0.543355\n", "y_has_purchased 0.000000\n", "dtype: float64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset_train.isna().sum()/len(dataset_train)" ] }, { "cell_type": "code", "execution_count": 22, "id": "05e29adb-7eef-416f-8f7b-248229eee0fe", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "nb_tickets 0\n", "nb_purchases 0\n", "total_amount 0\n", "nb_suppliers 0\n", "vente_internet_max 0\n", "nb_tickets_internet 0\n", "opt_in 0\n", "fidelity 0\n", "nb_campaigns 0\n", "nb_campaigns_opened 0\n", "dtype: int64" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset_train[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet', 'opt_in', 'fidelity', 'nb_campaigns', 'nb_campaigns_opened']].isna().sum()\n", "# pas de NaN, OK !" 
] }, { "cell_type": "code", "execution_count": 7, "id": "2ce94258-e2d1-472a-81fc-fc11e247b423", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "161.0" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset_train['y_has_purchased'].sum()" ] }, { "cell_type": "code", "execution_count": 8, "id": "34bae3f7-d579-4f80-a38d-a83eb5ea8a7b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.9985491193310349\n", "Confusion Matrix:\n", " [[127988 49]\n", " [ 137 24]]\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " 0.0 1.00 1.00 1.00 128037\n", " 1.0 0.33 0.15 0.21 161\n", "\n", " accuracy 1.00 128198\n", " macro avg 0.66 0.57 0.60 128198\n", "weighted avg 1.00 1.00 1.00 128198\n", "\n" ] } ], "source": [ "\n", "reg_columns = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet', 'opt_in', 'fidelity', 'nb_campaigns', 'nb_campaigns_opened']\n", "\n", "X_train = dataset_train[reg_columns]\n", "y_train = dataset_train['y_has_purchased']\n", "X_test = dataset_test[reg_columns]\n", "y_test = dataset_test['y_has_purchased']\n", "\n", "# Standardise the features; the scaler statistics are learned on the training data only\n", "scaler = StandardScaler()\n", "X_train_scaled = scaler.fit_transform(X_train)\n", "\n", "# Apply the training-set scaling to the test data: calling fit_transform here would\n", "# re-fit the scaler on the test set and feed the model differently scaled inputs\n", "X_test_scaled = scaler.transform(X_test)\n", "\n", "# Create and fit the L1-penalised logistic regression model\n", "logit_model = LogisticRegression(penalty='l1', solver='liblinear', C=1.0)\n", "logit_model.fit(X_train_scaled, y_train)\n", "\n", "y_pred = logit_model.predict(X_test_scaled)\n", "\n", "# Model evaluation\n", "accuracy = accuracy_score(y_test, y_pred)\n", "conf_matrix = confusion_matrix(y_test, y_pred)\n", "class_report = classification_report(y_test, y_pred)\n", "\n", "print(\"Accuracy:\", accuracy)\n", "print(\"Confusion Matrix:\\n\", conf_matrix)\n", 
"print(\"Classification Report:\\n\", class_report)" ] }, { "cell_type": "code", "execution_count": 9, "id": "ccc78c36-3287-46e6-89ac-7494c1a7106a", "metadata": { "scrolled": true }, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])\n", "plt.xlabel('Predicted')\n", "plt.ylabel('Actual')\n", "plt.title('Confusion Matrix')\n", "plt.show()" ] }, { "cell_type": "markdown", "id": "fe6e14d2-001d-4585-9344-f240b84ce4af", "metadata": {}, "source": [ "## Ajout TP : test d'une nouvelle pipeline" ] }, { "cell_type": "code", "execution_count": 54, "id": "9d19f8c0-ed31-46cd-8879-47810fa099d6", "metadata": {}, "outputs": [], "source": [ "# definition des variables utilisées\n", "\n", "numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'nb_tickets_internet', 'fidelity', 'nb_campaigns', 'nb_campaigns_opened']\n", "# categorical_features = [\"opt_in\"]\n", "encoded_features = [\"opt_in\", \"vente_internet_max\"]\n", "features = numeric_features + encoded_features\n", "X_train = dataset_train[features]\n", "y_train = dataset_train['y_has_purchased']\n", "X_test = dataset_test[features]\n", "y_test = dataset_test['y_has_purchased']" ] }, { "cell_type": "code", "execution_count": 55, "id": "412ddfad-3d20-4fa0-afaa-79ec87b3122d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 128198.000000\n", "mean 2.924687\n", "std 923.990506\n", "min 0.000000\n", "25% 0.000000\n", "50% 0.000000\n", "75% 1.000000\n", "max 330831.000000\n", "Name: fidelity, dtype: float64" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "### variable fidelity\n", "\n", "X_train[\"fidelity\"].describe() # sûrement un problème d'outlier pour fidelity\n", "# X_train[\"total_amount\"].describe()" ] }, { "cell_type": "code", "execution_count": 56, "id": "97e1cd25-0961-45dd-af7f-78ab1d8088ee", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nb_ticketsnb_purchasestotal_amountnb_suppliersnb_tickets_internetfidelitynb_campaignsnb_campaigns_openedopt_invente_internet_max
2122983.065082.0878762.55.09.03308310.00.001.0
30.00.00.00.00.01732.00.010.0
152773.081.032338.02.02.094126.050.011.0
240.00.00.00.00.022421.00.010.0
28282.015.00.02.053.03432.013.011.0
.................................
14490514.08.0136.02.00.091.00.010.0
144906190.0120.01757.02.00.01201.00.010.0
144950761.0418.05406.52.00.09071.00.010.0
14495911.08.0103.01.00.081.00.010.0
1506860.00.00.00.00.060.00.010.0
\n", "

287 rows × 10 columns

\n", "
" ], "text/plain": [ " nb_tickets nb_purchases total_amount nb_suppliers \\\n", "2 122983.0 65082.0 878762.5 5.0 \n", "3 0.0 0.0 0.0 0.0 \n", "15 2773.0 81.0 32338.0 2.0 \n", "24 0.0 0.0 0.0 0.0 \n", "28 282.0 15.0 0.0 2.0 \n", "... ... ... ... ... \n", "144905 14.0 8.0 136.0 2.0 \n", "144906 190.0 120.0 1757.0 2.0 \n", "144950 761.0 418.0 5406.5 2.0 \n", "144959 11.0 8.0 103.0 1.0 \n", "150686 0.0 0.0 0.0 0.0 \n", "\n", " nb_tickets_internet fidelity nb_campaigns nb_campaigns_opened \\\n", "2 9.0 330831 0.0 0.0 \n", "3 0.0 173 2.0 0.0 \n", "15 2.0 94 126.0 50.0 \n", "24 0.0 224 21.0 0.0 \n", "28 53.0 34 32.0 13.0 \n", "... ... ... ... ... \n", "144905 0.0 9 1.0 0.0 \n", "144906 0.0 120 1.0 0.0 \n", "144950 0.0 907 1.0 0.0 \n", "144959 0.0 8 1.0 0.0 \n", "150686 0.0 6 0.0 0.0 \n", "\n", " opt_in vente_internet_max \n", "2 0 1.0 \n", "3 1 0.0 \n", "15 1 1.0 \n", "24 1 0.0 \n", "28 1 1.0 \n", "... ... ... \n", "144905 1 0.0 \n", "144906 1 0.0 \n", "144950 1 0.0 \n", "144959 1 0.0 \n", "150686 1 0.0 \n", "\n", "[287 rows x 10 columns]" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train[X_train[\"fidelity\"]>5]" ] }, { "cell_type": "code", "execution_count": 57, "id": "fc17957e-b684-41cd-880f-049a4ffcc7dc", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
customer_idevent_type_idnb_ticketsnb_purchasestotal_amountnb_suppliersvente_internet_maxpurchase_date_minpurchase_date_maxtime_between_purchase...tenant_idgender_labelgender_femalegender_malegender_othercountry_frnb_campaignsnb_campaigns_openedtime_to_openy_has_purchased
215.0122983.065082.0878762.55.01.0267.4377890.23066267.20713...1311other0011.00.00.0NaN1.0
32NaN0.00.00.00.00.0NaNNaNNaN...1311male0101.02.00.0NaN0.0
65NaN0.00.00.00.00.0NaNNaNNaN...1311male0101.02.00.0NaN0.0
76NaN0.00.00.00.00.0NaNNaNNaN...1311male0101.012.00.0NaN0.0
87NaN0.00.00.00.00.0NaNNaNNaN...1311female1001.024.010.05 days 11:58:520.0
..................................................................
1526451256102NaN0.00.00.00.00.0NaNNaNNaN...1311female1001.00.00.0NaN0.0
1526461256103NaN0.00.00.00.00.0NaNNaNNaN...1311other001NaN0.00.0NaN0.0
1526471256104NaN0.00.00.00.00.0NaNNaNNaN...1311other001NaN0.00.0NaN0.0
1526481256105NaN0.00.00.00.00.0NaNNaNNaN...1311other001NaN0.00.0NaN0.0
1526491256106NaN0.00.00.00.00.0NaNNaNNaN...1311other001NaN0.00.0NaN0.0
\n", "

128198 rows × 42 columns

\n", "
" ], "text/plain": [ " customer_id event_type_id nb_tickets nb_purchases total_amount \\\n", "2 1 5.0 122983.0 65082.0 878762.5 \n", "3 2 NaN 0.0 0.0 0.0 \n", "6 5 NaN 0.0 0.0 0.0 \n", "7 6 NaN 0.0 0.0 0.0 \n", "8 7 NaN 0.0 0.0 0.0 \n", "... ... ... ... ... ... \n", "152645 1256102 NaN 0.0 0.0 0.0 \n", "152646 1256103 NaN 0.0 0.0 0.0 \n", "152647 1256104 NaN 0.0 0.0 0.0 \n", "152648 1256105 NaN 0.0 0.0 0.0 \n", "152649 1256106 NaN 0.0 0.0 0.0 \n", "\n", " nb_suppliers vente_internet_max purchase_date_min \\\n", "2 5.0 1.0 267.437789 \n", "3 0.0 0.0 NaN \n", "6 0.0 0.0 NaN \n", "7 0.0 0.0 NaN \n", "8 0.0 0.0 NaN \n", "... ... ... ... \n", "152645 0.0 0.0 NaN \n", "152646 0.0 0.0 NaN \n", "152647 0.0 0.0 NaN \n", "152648 0.0 0.0 NaN \n", "152649 0.0 0.0 NaN \n", "\n", " purchase_date_max time_between_purchase ... tenant_id gender_label \\\n", "2 0.23066 267.20713 ... 1311 other \n", "3 NaN NaN ... 1311 male \n", "6 NaN NaN ... 1311 male \n", "7 NaN NaN ... 1311 male \n", "8 NaN NaN ... 1311 female \n", "... ... ... ... ... ... \n", "152645 NaN NaN ... 1311 female \n", "152646 NaN NaN ... 1311 other \n", "152647 NaN NaN ... 1311 other \n", "152648 NaN NaN ... 1311 other \n", "152649 NaN NaN ... 1311 other \n", "\n", " gender_female gender_male gender_other country_fr nb_campaigns \\\n", "2 0 0 1 1.0 0.0 \n", "3 0 1 0 1.0 2.0 \n", "6 0 1 0 1.0 2.0 \n", "7 0 1 0 1.0 12.0 \n", "8 1 0 0 1.0 24.0 \n", "... ... ... ... ... ... \n", "152645 1 0 0 1.0 0.0 \n", "152646 0 0 1 NaN 0.0 \n", "152647 0 0 1 NaN 0.0 \n", "152648 0 0 1 NaN 0.0 \n", "152649 0 0 1 NaN 0.0 \n", "\n", " nb_campaigns_opened time_to_open y_has_purchased \n", "2 0.0 NaN 1.0 \n", "3 0.0 NaN 0.0 \n", "6 0.0 NaN 0.0 \n", "7 0.0 NaN 0.0 \n", "8 10.0 5 days 11:58:52 0.0 \n", "... ... ... ... 
\n", "152645 0.0 NaN 0.0 \n", "152646 0.0 NaN 0.0 \n", "152647 0.0 NaN 0.0 \n", "152648 0.0 NaN 0.0 \n", "152649 0.0 NaN 0.0 \n", "\n", "[128198 rows x 42 columns]" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Turn opt_in into a 0/1 integer indicator.\n", "# The cast is applied to both frames so that train and test go through the same preprocessing.\n", "\n", "dataset_train[\"opt_in\"] = dataset_train[\"opt_in\"].astype(int)\n", "dataset_test[\"opt_in\"] = dataset_test[\"opt_in\"].astype(int)\n", "dataset_train" ] }, { "cell_type": "code", "execution_count": 58, "id": "8ad69b5d-e2e2-4d70-b8f0-ea0d37f7fe0c", "metadata": {}, "outputs": [], "source": [ "# Define the variables used by the model (rebuilt here so X_train / X_test pick up the opt_in cast)\n", "\n", "numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'nb_tickets_internet', 'fidelity', 'nb_campaigns', 'nb_campaigns_opened']\n", "# categorical_features = [\"opt_in\"]\n", "encoded_features = [\"opt_in\", \"vente_internet_max\"]\n", "features = numeric_features + encoded_features\n", "X_train = dataset_train[features]\n", "y_train = dataset_train['y_has_purchased']\n", "X_test = dataset_test[features]\n", "y_test = dataset_test['y_has_purchased']" ] }, { "cell_type": "code", "execution_count": 52, "id": "f4772d69-4f92-434a-a617-b7df4dcd106e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nb_ticketsnb_purchasestotal_amountnb_suppliersnb_tickets_internetfidelitynb_campaignsnb_campaigns_openedopt_invente_internet_max
2122983.065082.0878762.55.09.03308310.00.001.0
30.00.00.00.00.01732.00.010.0
60.00.00.00.00.012.00.010.0
70.00.00.00.00.0112.00.010.0
80.00.00.00.00.0124.010.010.0
.................................
1526450.00.00.00.00.000.00.000.0
1526460.00.00.00.00.000.00.000.0
1526470.00.00.00.00.000.00.000.0
1526480.00.00.00.00.000.00.000.0
1526490.00.00.00.00.000.00.000.0
\n", "

128198 rows × 10 columns

\n", "
" ], "text/plain": [ " nb_tickets nb_purchases total_amount nb_suppliers \\\n", "2 122983.0 65082.0 878762.5 5.0 \n", "3 0.0 0.0 0.0 0.0 \n", "6 0.0 0.0 0.0 0.0 \n", "7 0.0 0.0 0.0 0.0 \n", "8 0.0 0.0 0.0 0.0 \n", "... ... ... ... ... \n", "152645 0.0 0.0 0.0 0.0 \n", "152646 0.0 0.0 0.0 0.0 \n", "152647 0.0 0.0 0.0 0.0 \n", "152648 0.0 0.0 0.0 0.0 \n", "152649 0.0 0.0 0.0 0.0 \n", "\n", " nb_tickets_internet fidelity nb_campaigns nb_campaigns_opened \\\n", "2 9.0 330831 0.0 0.0 \n", "3 0.0 173 2.0 0.0 \n", "6 0.0 1 2.0 0.0 \n", "7 0.0 1 12.0 0.0 \n", "8 0.0 1 24.0 10.0 \n", "... ... ... ... ... \n", "152645 0.0 0 0.0 0.0 \n", "152646 0.0 0 0.0 0.0 \n", "152647 0.0 0 0.0 0.0 \n", "152648 0.0 0 0.0 0.0 \n", "152649 0.0 0 0.0 0.0 \n", "\n", " opt_in vente_internet_max \n", "2 0 1.0 \n", "3 1 0.0 \n", "6 1 0.0 \n", "7 1 0.0 \n", "8 1 0.0 \n", "... ... ... \n", "152645 0 0.0 \n", "152646 0 0.0 \n", "152647 0 0.0 \n", "152648 0 0.0 \n", "152649 0 0.0 \n", "\n", "[128198 rows x 10 columns]" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train[\"vente_internet_max\"].unique() # indicatrice ?\n", "X_train" ] }, { "cell_type": "markdown", "id": "3ed647a6-db9a-4737-b819-57cb81691ea2", "metadata": {}, "source": [ "### Autre ajout : travail de preprocessing des données - étude des outliers" ] }, { "cell_type": "code", "execution_count": 60, "id": "3771eeb1-5221-44e5-a5cd-15475fbe4858", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 128198.000000\n", "mean 0.582536\n", "std 181.774597\n", "min 0.000000\n", "25% 0.000000\n", "50% 0.000000\n", "75% 0.000000\n", "max 65082.000000\n", "Name: nb_purchases, dtype: float64" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 1. 
number of purchases\n", "\n", "X_train[\"nb_purchases\"].describe()" ] }, { "cell_type": "code", "execution_count": 84, "id": "63c44b80-88cd-4339-91b9-3764e2690316", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nb_ticketsnb_purchasestotal_amountnb_suppliersnb_tickets_internetfidelitynb_campaignsnb_campaigns_openedopt_invente_internet_max
2122983.065082.0878762.55.09.03308310.00.001.0
152773.081.032338.02.02.094126.050.011.0
28282.015.00.02.053.03432.013.011.0
2940.02.00.01.00.0424.017.010.0
3152.02.00.01.00.0522.06.010.0
.................................
1471554.02.044.01.04.020.00.001.0
1472423.02.040.01.03.020.00.001.0
14741412.02.0132.01.012.020.00.001.0
14763615.02.0165.01.015.020.00.001.0
1479502.02.029.01.02.020.00.001.0
\n", "

747 rows × 10 columns

\n", "
" ], "text/plain": [ " nb_tickets nb_purchases total_amount nb_suppliers \\\n", "2 122983.0 65082.0 878762.5 5.0 \n", "15 2773.0 81.0 32338.0 2.0 \n", "28 282.0 15.0 0.0 2.0 \n", "29 40.0 2.0 0.0 1.0 \n", "31 52.0 2.0 0.0 1.0 \n", "... ... ... ... ... \n", "147155 4.0 2.0 44.0 1.0 \n", "147242 3.0 2.0 40.0 1.0 \n", "147414 12.0 2.0 132.0 1.0 \n", "147636 15.0 2.0 165.0 1.0 \n", "147950 2.0 2.0 29.0 1.0 \n", "\n", " nb_tickets_internet fidelity nb_campaigns nb_campaigns_opened \\\n", "2 9.0 330831 0.0 0.0 \n", "15 2.0 94 126.0 50.0 \n", "28 53.0 34 32.0 13.0 \n", "29 0.0 4 24.0 17.0 \n", "31 0.0 5 22.0 6.0 \n", "... ... ... ... ... \n", "147155 4.0 2 0.0 0.0 \n", "147242 3.0 2 0.0 0.0 \n", "147414 12.0 2 0.0 0.0 \n", "147636 15.0 2 0.0 0.0 \n", "147950 2.0 2 0.0 0.0 \n", "\n", " opt_in vente_internet_max \n", "2 0 1.0 \n", "15 1 1.0 \n", "28 1 1.0 \n", "29 1 0.0 \n", "31 1 0.0 \n", "... ... ... \n", "147155 0 1.0 \n", "147242 0 1.0 \n", "147414 0 1.0 \n", "147636 0 1.0 \n", "147950 0 1.0 \n", "\n", "[747 rows x 10 columns]" ] }, "execution_count": 84, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train[X_train[\"nb_purchases\"]>1]" ] }, { "cell_type": "code", "execution_count": 65, "id": "032fbc5a-9044-41bd-b992-78077a6c8432", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1.0" ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.quantile(X_train[\"nb_purchases\"], 0.99)" ] }, { "cell_type": "code", "execution_count": 70, "id": "cad9f7cb-8b71-49a6-874b-e15cb9d7a204", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "count 128198.000000\n", "mean 1.946941\n", "std 343.940117\n", "min 0.000000\n", "25% 0.000000\n", "50% 0.000000\n", "75% 0.000000\n", "max 122983.000000\n", "Name: nb_tickets, dtype: float64\n" ] }, { "data": { "text/plain": [ "23.0" ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "### 2. 
nb tickets\n", "\n", "print(X_train[\"nb_tickets\"].describe())\n", "np.quantile(X_train[\"nb_tickets\"], 0.99)" ] }, { "cell_type": "code", "execution_count": 73, "id": "6bb0c86d-eb61-473d-a29b-c59e7e5af489", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "count 128198.000000\n", "mean 10.496193\n", "std 2457.094272\n", "min 0.000000\n", "25% 0.000000\n", "50% 0.000000\n", "75% 0.000000\n", "max 878762.500000\n", "Name: total_amount, dtype: float64\n" ] }, { "data": { "text/plain": [ "44.0" ] }, "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 3. total amount\n", "\n", "print(X_train[\"total_amount\"].describe())\n", "np.quantile(X_train[\"total_amount\"], 0.99)" ] }, { "cell_type": "code", "execution_count": 76, "id": "ab6fded3-d8a5-4bb4-8f2d-472ea0e5e755", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "count 128198.000000\n", "mean 2.924687\n", "std 923.990506\n", "min 0.000000\n", "25% 0.000000\n", "50% 0.000000\n", "75% 1.000000\n", "max 330831.000000\n", "Name: fidelity, dtype: float64\n" ] }, { "data": { "text/plain": [ "2.0" ] }, "execution_count": 76, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 4. fidelity\n", "\n", "print(X_train[\"fidelity\"].describe())\n", "np.quantile(X_train[\"fidelity\"], 0.99)" ] }, { "cell_type": "code", "execution_count": 79, "id": "c1f0ac75-71a4-43fb-844b-e006acf5927b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "count 128198.000000\n", "mean 24.276463\n", "std 37.899868\n", "min 0.000000\n", "25% 1.000000\n", "50% 4.000000\n", "75% 28.000000\n", "max 299.000000\n", "Name: nb_campaigns, dtype: float64\n" ] }, { "data": { "text/plain": [ "133.0" ] }, "execution_count": 79, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 5. 
nb campaigns - does not look aberrant despite the high variance\n", "\n", "print(X_train[\"nb_campaigns\"].describe())\n", "np.quantile(X_train[\"nb_campaigns\"], 0.99)" ] }, { "cell_type": "code", "execution_count": 80, "id": "8bb01064-1c23-4100-ace8-56f155e0b4ab", "metadata": {}, "outputs": [], "source": [ "### Outlier removal - 99th-percentile thresholds for: nb purchases, nb tickets, total amount, fidelity\n", "\n", "p99_nb_purchases = np.quantile(X_train[\"nb_purchases\"], 0.99)\n", "p99_nb_tickets = np.quantile(X_train[\"nb_tickets\"], 0.99)\n", "p99_total_amount = np.quantile(X_train[\"total_amount\"], 0.99)\n", "p99_fidelity = np.quantile(X_train[\"fidelity\"], 0.99)\n", "\n" ] }, { "cell_type": "code", "execution_count": 98, "id": "b2b43ab6-16aa-41bc-9a62-47ab769c5bf2", "metadata": {}, "outputs": [], "source": [ "# Filter - drop the rows above the 99th percentile on the problematic variables (removes about 2% of the rows in total)\n", "\n", "X_train = X_train.loc[(X_train[\"nb_purchases\"] <= p99_nb_purchases) &\n", "                      (X_train[\"nb_tickets\"] <= p99_nb_tickets) &\n", "                      (X_train[\"total_amount\"] <= p99_total_amount) &\n", "                      (X_train[\"fidelity\"] <= p99_fidelity)]\n", "\n", "# Keep the target aligned with the filtered features: without this, X_train and y_train\n", "# no longer contain the same rows and cannot be passed together to a model\n", "y_train = y_train.loc[X_train.index]" ] }, { "cell_type": "code", "execution_count": 99, "id": "b254a671-9e57-4123-ae65-55c852eb64cd", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nb_ticketsnb_purchasestotal_amountnb_suppliersnb_tickets_internetfidelitynb_campaignsnb_campaigns_openedopt_invente_internet_max
60.00.00.00.00.012.00.010.0
70.00.00.00.00.0112.00.010.0
80.00.00.00.00.0124.010.010.0
90.00.00.00.00.0114.07.010.0
100.00.00.00.00.0123.011.010.0
\n", "
" ], "text/plain": [ " nb_tickets nb_purchases total_amount nb_suppliers nb_tickets_internet \\\n", "6 0.0 0.0 0.0 0.0 0.0 \n", "7 0.0 0.0 0.0 0.0 0.0 \n", "8 0.0 0.0 0.0 0.0 0.0 \n", "9 0.0 0.0 0.0 0.0 0.0 \n", "10 0.0 0.0 0.0 0.0 0.0 \n", "\n", " fidelity nb_campaigns nb_campaigns_opened opt_in vente_internet_max \n", "6 1 2.0 0.0 1 0.0 \n", "7 1 12.0 0.0 1 0.0 \n", "8 1 24.0 10.0 1 0.0 \n", "9 1 14.0 7.0 1 0.0 \n", "10 1 23.0 11.0 1 0.0 " ] }, "execution_count": 99, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train.head()" ] }, { "cell_type": "code", "execution_count": 101, "id": "86d90380-6ad2-4c6b-a103-53e4c1fa59e0", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
customer_idevent_type_idnb_ticketsnb_purchasestotal_amountnb_suppliersvente_internet_maxpurchase_date_minpurchase_date_maxtime_between_purchase...tenant_idgender_labelgender_femalegender_malegender_othercountry_frnb_campaignsnb_campaigns_openedtime_to_openy_has_purchased
65NaN0.00.00.00.00.0NaNNaNNaN...1311male0101.02.00.0NaN0.0
76NaN0.00.00.00.00.0NaNNaNNaN...1311male0101.012.00.0NaN0.0
87NaN0.00.00.00.00.0NaNNaNNaN...1311female1001.024.010.05 days 11:58:520.0
98NaN0.00.00.00.00.0NaNNaNNaN...1311female1001.014.07.00 days 13:29:25.7142857140.0
109NaN0.00.00.00.00.0NaNNaNNaN...1311female1001.023.011.00 days 17:17:44.0909090900.0
..................................................................
1526451256102NaN0.00.00.00.00.0NaNNaNNaN...1311female1001.00.00.0NaN0.0
1526461256103NaN0.00.00.00.00.0NaNNaNNaN...1311other001NaN0.00.0NaN0.0
1526471256104NaN0.00.00.00.00.0NaNNaNNaN...1311other001NaN0.00.0NaN0.0
1526481256105NaN0.00.00.00.00.0NaNNaNNaN...1311other001NaN0.00.0NaN0.0
1526491256106NaN0.00.00.00.00.0NaNNaNNaN...1311other001NaN0.00.0NaN0.0
\n", "

125792 rows × 42 columns

\n", "
" ], "text/plain": [ " customer_id event_type_id nb_tickets nb_purchases total_amount \\\n", "6 5 NaN 0.0 0.0 0.0 \n", "7 6 NaN 0.0 0.0 0.0 \n", "8 7 NaN 0.0 0.0 0.0 \n", "9 8 NaN 0.0 0.0 0.0 \n", "10 9 NaN 0.0 0.0 0.0 \n", "... ... ... ... ... ... \n", "152645 1256102 NaN 0.0 0.0 0.0 \n", "152646 1256103 NaN 0.0 0.0 0.0 \n", "152647 1256104 NaN 0.0 0.0 0.0 \n", "152648 1256105 NaN 0.0 0.0 0.0 \n", "152649 1256106 NaN 0.0 0.0 0.0 \n", "\n", " nb_suppliers vente_internet_max purchase_date_min \\\n", "6 0.0 0.0 NaN \n", "7 0.0 0.0 NaN \n", "8 0.0 0.0 NaN \n", "9 0.0 0.0 NaN \n", "10 0.0 0.0 NaN \n", "... ... ... ... \n", "152645 0.0 0.0 NaN \n", "152646 0.0 0.0 NaN \n", "152647 0.0 0.0 NaN \n", "152648 0.0 0.0 NaN \n", "152649 0.0 0.0 NaN \n", "\n", " purchase_date_max time_between_purchase ... tenant_id gender_label \\\n", "6 NaN NaN ... 1311 male \n", "7 NaN NaN ... 1311 male \n", "8 NaN NaN ... 1311 female \n", "9 NaN NaN ... 1311 female \n", "10 NaN NaN ... 1311 female \n", "... ... ... ... ... ... \n", "152645 NaN NaN ... 1311 female \n", "152646 NaN NaN ... 1311 other \n", "152647 NaN NaN ... 1311 other \n", "152648 NaN NaN ... 1311 other \n", "152649 NaN NaN ... 1311 other \n", "\n", " gender_female gender_male gender_other country_fr nb_campaigns \\\n", "6 0 1 0 1.0 2.0 \n", "7 0 1 0 1.0 12.0 \n", "8 1 0 0 1.0 24.0 \n", "9 1 0 0 1.0 14.0 \n", "10 1 0 0 1.0 23.0 \n", "... ... ... ... ... ... \n", "152645 1 0 0 1.0 0.0 \n", "152646 0 0 1 NaN 0.0 \n", "152647 0 0 1 NaN 0.0 \n", "152648 0 0 1 NaN 0.0 \n", "152649 0 0 1 NaN 0.0 \n", "\n", " nb_campaigns_opened time_to_open y_has_purchased \n", "6 0.0 NaN 0.0 \n", "7 0.0 NaN 0.0 \n", "8 10.0 5 days 11:58:52 0.0 \n", "9 7.0 0 days 13:29:25.714285714 0.0 \n", "10 11.0 0 days 17:17:44.090909090 0.0 \n", "... ... ... ... 
\n", "152645 0.0 NaN 0.0 \n", "152646 0.0 NaN 0.0 \n", "152647 0.0 NaN 0.0 \n", "152648 0.0 NaN 0.0 \n", "152649 0.0 NaN 0.0 \n", "\n", "[125792 rows x 42 columns]" ] }, "execution_count": 101, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Keep only the rows at or below every p99_* threshold\n",
 "# (presumably 99th percentiles -- they are defined outside this cell, TODO confirm).\n",
 "# The result is bound to a new name so that dataset_train itself is left untouched:\n",
 "# as the note below this cell records, this filter removes most of the customers who purchased again.\n",
 "within_p99 = (\n",
 "    (dataset_train[\"nb_purchases\"] <= p99_nb_purchases)\n",
 "    & (dataset_train[\"nb_tickets\"] <= p99_nb_tickets)\n",
 "    & (dataset_train[\"total_amount\"] <= p99_total_amount)\n",
 "    & (dataset_train[\"fidelity\"] <= p99_fidelity)\n",
 ")\n",
 "dataset_train_no_outliers = dataset_train.loc[within_p99]\n",
 "\n",
 "dataset_train_no_outliers" ] }, { "cell_type": "markdown", "id": "f9487c48-b973-4d9e-abb9-902800ab778f", "metadata": {}, "source": [ "En enlevant les outliers, on supprime la plupart des clients ayant acheté à nouveau ... Il faut trouver un autre moyen de preprocessing qui ne dégrade pas le dataset" ] }, { "cell_type": "code", "execution_count": 102, "id": "9fe7513b-f23b-4bee-957d-f98919d6eb30", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "19.0" ] }, "execution_count": 102, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Buyers left after the filter: the count drops from 161 to 19, which is the problem noted above.\n",
 "dataset_train_no_outliers[\"y_has_purchased\"].sum()"
] }, { "cell_type": "markdown", "id": "b531aebb-3b2f-4c62-ae01-84bdf8e45f49", "metadata": {}, "source": [ "### Construction de la pipeline pour le modèle de régression logistique et résultats" ] }, { "cell_type": "code", "execution_count": 28, "id": "1476da0d-cbb5-46ac-9f97-10855eec0108", "metadata": {}, "outputs": [], "source": [
 "# imports needed to build the pipeline\n",
 "\n",
 "from sklearn.neighbors import KNeighborsClassifier\n",
 "from sklearn.pipeline import Pipeline\n",
 "from sklearn.compose import ColumnTransformer\n",
 "from sklearn.preprocessing import OneHotEncoder\n",
 "from sklearn.impute import SimpleImputer\n",
 "from sklearn.linear_model import LogisticRegression\n",
 "from sklearn.model_selection import GridSearchCV\n",
 "from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler\n",
 "from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score" ] }, { "cell_type": "code", "execution_count": 37, "id": "f905cb6f-b0be-4a47-ac8d-7b3e16ff1dce", "metadata": {}, "outputs": [], "source": [
 "# Start of the pipeline: preprocessing.\n",
 "# Numeric branch: missing values are replaced by the constant 0, then the columns are standardized.\n",
 "numeric_transformer = Pipeline(\n",
 "    steps=[\n",
 "        (\"imputer\", SimpleImputer(strategy=\"constant\", fill_value=0)),\n",
 "        (\"scaler\", StandardScaler()),\n",
 "    ]\n",
 ")\n",
 "\n",
 "# Categorical branch, currently disabled and not part of preproc (kept for reference):\n",
 "# categorical_transformer = Pipeline(steps=[\n",
 "#     (\"imputer\", SimpleImputer(strategy=\"constant\", fill_value=\"Not defined\")),  # to deal with missing categorical data\n",
 "#     (\"onehot\", OneHotEncoder(handle_unknown='ignore'))])\n",
 "\n",
 "# Only the columns listed in numeric_features are passed on; with ColumnTransformer's default\n",
 "# remainder='drop', every other column is discarded.\n",
 "preproc = ColumnTransformer(\n",
 "    transformers=[(\"num\", numeric_transformer, numeric_features)]\n",
 ")" ] }, { "cell_type": "code", "execution_count": 47, "id": "d322fb8f-1e97-4a44-96ca-c0f5d7ebd383", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'mean_fit_time': array([0.96252505, 0.94368339, 0.86111466, 0.95700479, 1.13914712,\n", " 1.10024587, 1.36208344, 1.40060536, 1.56624715, 1.76191751,\n", " 1.80110041, 1.67103759, 
1.67045991, 1.62652612, 1.55767004,\n", " 2.12057273, 1.5302736 ]), 'std_fit_time': array([0.29823822, 0.30278155, 0.12007493, 0.33517543, 0.09216138,\n", " 0.2062314 , 0.04610438, 0.07753108, 0.39466125, 0.2932071 ,\n", " 0.43120394, 0.50535675, 0.33981082, 0.32312159, 0.20155337,\n", " 0.30868145, 0.39882192]), 'mean_score_time': array([0.23762226, 0.23680433, 0.20498077, 0.17563709, 0.19579411,\n", " 0.19613473, 0.17243052, 0.19963646, 0.19995451, 0.23090204,\n", " 0.20571589, 0.22864676, 0.23558458, 0.20300301, 0.20324389,\n", " 0.20974334, 0.20976925]), 'std_score_time': array([0.03395174, 0.04038772, 0.00520132, 0.03606735, 0.00459383,\n", " 0.00303405, 0.03295904, 0.00485894, 0.00429703, 0.0379754 ,\n", " 0.06829149, 0.03898822, 0.04453358, 0.00527175, 0.00196754,\n", " 0.0057154 , 0.00853897]), 'param_logreg__C': masked_array(data=[0.0009765625, 0.001953125, 0.00390625, 0.0078125,\n", " 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5, 1.0, 2.0,\n", " 4.0, 8.0, 16.0, 32.0, 64.0],\n", " mask=[False, False, False, False, False, False, False, False,\n", " False, False, False, False, False, False, False, False,\n", " False],\n", " fill_value='?',\n", " dtype=object), 'param_logreg__class_weight': masked_array(data=['balanced', 'balanced', 'balanced', 'balanced',\n", " 'balanced', 'balanced', 'balanced', 'balanced',\n", " 'balanced', 'balanced', 'balanced', 'balanced',\n", " 'balanced', 'balanced', 'balanced', 'balanced',\n", " 'balanced'],\n", " mask=[False, False, False, False, False, False, False, False,\n", " False, False, False, False, False, False, False, False,\n", " False],\n", " fill_value='?',\n", " dtype=object), 'params': [{'logreg__C': 0.0009765625, 'logreg__class_weight': 'balanced'}, {'logreg__C': 0.001953125, 'logreg__class_weight': 'balanced'}, {'logreg__C': 0.00390625, 'logreg__class_weight': 'balanced'}, {'logreg__C': 0.0078125, 'logreg__class_weight': 'balanced'}, {'logreg__C': 0.015625, 'logreg__class_weight': 'balanced'}, {'logreg__C': 0.03125, 
'logreg__class_weight': 'balanced'}, {'logreg__C': 0.0625, 'logreg__class_weight': 'balanced'}, {'logreg__C': 0.125, 'logreg__class_weight': 'balanced'}, {'logreg__C': 0.25, 'logreg__class_weight': 'balanced'}, {'logreg__C': 0.5, 'logreg__class_weight': 'balanced'}, {'logreg__C': 1.0, 'logreg__class_weight': 'balanced'}, {'logreg__C': 2.0, 'logreg__class_weight': 'balanced'}, {'logreg__C': 4.0, 'logreg__class_weight': 'balanced'}, {'logreg__C': 8.0, 'logreg__class_weight': 'balanced'}, {'logreg__C': 16.0, 'logreg__class_weight': 'balanced'}, {'logreg__C': 32.0, 'logreg__class_weight': 'balanced'}, {'logreg__C': 64.0, 'logreg__class_weight': 'balanced'}], 'split0_test_score': array([0.99373228, 0.99368542, 0.9935917 , 0.99363856, 0.9935917 ,\n", " 0.99339253, 0.99260761, 0.99278334, 0.99386115, 0.99476323,\n", " 0.99500926, 0.99506783, 0.99511469, 0.99512641, 0.99518499,\n", " 0.99513812, 0.9951967 ]), 'split1_test_score': array([0.996872, 0.996872, 0.996872, 0.996872, 0.996872, 0.996872,\n", " 0.996872, 0.996872, 0.996872, 0.996872, 0.996872, 0.996872,\n", " 0.996872, 0.996872, 0.996872, 0.996872, 0.996872]), 'split2_test_score': array([0.91805656, 0.91806827, 0.91806827, 0.91806827, 0.91806827,\n", " 0.91806827, 0.91806827, 0.91806827, 0.91806827, 0.91806827,\n", " 0.91806827, 0.91806827, 0.91806827, 0.91806827, 0.91806827,\n", " 0.91806827, 0.91806827]), 'mean_test_score': array([0.96955361, 0.9695419 , 0.96951066, 0.96952628, 0.96951066,\n", " 0.96944427, 0.96918263, 0.9692412 , 0.96960047, 0.96990117,\n", " 0.96998318, 0.9700027 , 0.97001832, 0.97002223, 0.97004175,\n", " 0.97002613, 0.97004566]), 'std_test_score': array([0.03643647, 0.03642059, 0.0363999 , 0.03641024, 0.0363999 ,\n", " 0.03635608, 0.03618521, 0.0362232 , 0.03645949, 0.0366615 ,\n", " 0.03671726, 0.03673057, 0.03674124, 0.0367439 , 0.03675725,\n", " 0.03674657, 0.03675992]), 'rank_test_score': array([10, 11, 13, 12, 13, 15, 17, 16, 9, 8, 7, 6, 5, 4, 2, 3, 1],\n", " dtype=int32)}\n", "Returned 
hyperparameter: {'logreg__C': 64.0, 'logreg__class_weight': 'balanced'}\n", "Best classification accuracy in train is: 0.9700456574978843\n", "Classification accuracy on test is: 0.9758585408905238\n" ] } ], "source": [
 "# Plain accuracy is misleading on an imbalanced target, so candidates are ranked by balanced accuracy.\n",
 "balanced_scorer = make_scorer(balanced_accuracy_score)\n",
 "\n",
 "# 17 values of C on a base-2 log grid, from 2**-10 to 2**6\n",
 "parameter_space = np.logspace(-10, 6, 17, base=2)\n",
 "\n",
 "pipe = Pipeline([('preprocessor', preproc), ('logreg', LogisticRegression(max_iter=5000))])\n",
 "# class_weight='balanced' puts more weight on the rare class (helps with the dataset imbalance)\n",
 "parameters4 = {'logreg__C': parameter_space, 'logreg__class_weight': ['balanced']}\n",
 "clf4 = GridSearchCV(pipe, parameters4, cv=3, scoring=balanced_scorer)\n",
 "clf4.fit(X_train, y_train)\n",
 "\n",
 "# print results\n",
 "# print(clf4.cv_results_)  # full per-candidate table, very verbose\n",
 "# best_score_ is the mean cross-validated score of the best candidate, and score() applies the\n",
 "# same scorer, so both figures below are balanced accuracy rather than plain accuracy.\n",
 "print('Returned hyperparameter: {}'.format(clf4.best_params_))\n",
 "print('Best cross-validated balanced accuracy on train is: {}'.format(clf4.best_score_))\n",
 "print('Balanced accuracy on test is: {}'.format(clf4.score(X_test, y_test)))" ] }, { "cell_type": "code", "execution_count": 48, "id": "b32bb668-c816-4055-b786-e548eb71f318", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.9517777188411676\n", "Confusion Matrix:\n", " [[121855 6182]\n", " [ 0 161]]\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " 0.0 1.00 0.95 0.98 128037\n", " 1.0 0.03 1.00 0.05 161\n", "\n", " accuracy 0.95 128198\n", " macro avg 0.51 0.98 0.51 128198\n", "weighted avg 1.00 0.95 0.97 128198\n", "\n" ] } ], "source": [
 "# Inspect the predictions of the selected model on the test set\n",
 "\n",
 "y_pred = clf4.predict(X_test)\n",
 "\n",
 "# Model evaluation (plain accuracy here, unlike the balanced accuracy used for model selection)\n",
 "accuracy = accuracy_score(y_test, y_pred)\n",
 "conf_matrix = confusion_matrix(y_test, y_pred)\n",
 "class_report = classification_report(y_test, y_pred)\n",
 "\n",
 "print(\"Accuracy:\", accuracy)\n",
 "print(\"Confusion Matrix:\\n\", conf_matrix)\n",
 "print(\"Classification Report:\\n\", class_report)" ] }, { "cell_type": "code", "execution_count": 49, "id": "faebbecb-3f85-4181-8005-2f52180fa37e", "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
 "# Confusion matrix as a heatmap, drawn on an explicit Axes\n",
 "\n",
 "class_labels = ['Class 0', 'Class 1']\n",
 "fig, ax = plt.subplots()\n",
 "sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels, ax=ax)\n",
 "ax.set_xlabel('Predicted')\n",
 "ax.set_ylabel('Actual')\n",
 "ax.set_title('Confusion Matrix')\n",
 "plt.show()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.6" } }, "nbformat": 4, "nbformat_minor": 5 }