375 lines
39 KiB
Plaintext
375 lines
39 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "ac01a6ea-bef6-4ace-89ff-1dc03a4215c2",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Segmentation des clients par régression logistique"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "bca785be-39f7-4583-9bd8-67c1134ae275",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"import os\n",
|
|
"import s3fs\n",
|
|
"import re\n",
|
|
"from sklearn.linear_model import LogisticRegression\n",
|
|
"from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n",
|
|
"from sklearn.preprocessing import StandardScaler\n",
|
|
"import seaborn as sns\n",
|
|
"import matplotlib.pyplot as plt"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "3bf57816-b023-4e84-9450-095620bddebc",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Create filesystem object\n",
|
|
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
|
|
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "27002f2f-a78a-414c-8e4f-b15bf6dd9e40",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/tmp/ipykernel_23374/1677066092.py:7: DtypeWarning: Columns (11,40) have mixed types. Specify dtype option on import or set low_memory=False.\n",
|
|
" dataset_train = pd.read_csv(file_in, sep=\",\")\n",
|
|
"/tmp/ipykernel_23374/1677066092.py:12: DtypeWarning: Columns (40) have mixed types. Specify dtype option on import or set low_memory=False.\n",
|
|
" dataset_test = pd.read_csv(file_in, sep=\",\")\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Importation des données\n",
|
|
"BUCKET = \"projet-bdc2324-team1/1_Output/Logistique Regression databases - First approach\"\n",
|
|
"\n",
|
|
"FILE_PATH_S3 = BUCKET + \"/\" + \"dataset_train.csv\"\n",
|
|
"\n",
|
|
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
" dataset_train = pd.read_csv(file_in, sep=\",\")\n",
|
|
"\n",
|
|
"FILE_PATH_S3 = BUCKET + \"/\" + \"dataset_test.csv\"\n",
|
|
"\n",
|
|
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
|
" dataset_test = pd.read_csv(file_in, sep=\",\")\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"id": "c3928b55-8821-46da-b3b5-a036efd6d2cf",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>event_type_id</th>\n",
|
|
" <th>name_event_types</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>2.0</td>\n",
|
|
" <td>offre muséale individuel</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>4.0</td>\n",
|
|
" <td>spectacle vivant</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>5.0</td>\n",
|
|
" <td>offre muséale groupe</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" event_type_id name_event_types\n",
|
|
"0 2.0 offre muséale individuel\n",
|
|
"1 4.0 spectacle vivant\n",
|
|
"2 5.0 offre muséale groupe\n",
|
|
"3 NaN NaN"
|
|
]
|
|
},
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"dataset_train[['event_type_id', 'name_event_types']].drop_duplicates()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"id": "7e8a9d4d-7e55-4173-a7f4-8b8baa9610d2",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#Choose type of event \n",
|
|
"type_event_choosed = 5\n",
|
|
"\n",
|
|
"dataset_test = dataset_test[(dataset_test['event_type_id'] == type_event_choosed) | np.isnan(dataset_test['event_type_id'])]\n",
|
|
"dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)\n",
|
|
"dataset_train = dataset_train[(dataset_train['event_type_id'] == type_event_choosed) | np.isnan(dataset_train['event_type_id'])]\n",
|
|
"dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"id": "b4078b8e-2172-47e6-9f92-106dc3015fc9",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"228.0"
|
|
]
|
|
},
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"dataset_train['y_has_purchased'].sum()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"id": "e20ced8f-df1c-43bb-8d15-79f414c8225c",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"customer_id 0.000000\n",
|
|
"event_type_id 0.967882\n",
|
|
"nb_tickets 0.000000\n",
|
|
"nb_purchases 0.000000\n",
|
|
"total_amount 0.000000\n",
|
|
"nb_suppliers 0.000000\n",
|
|
"vente_internet_max 0.000000\n",
|
|
"purchase_date_min 0.967882\n",
|
|
"purchase_date_max 0.967882\n",
|
|
"time_between_purchase 0.967882\n",
|
|
"nb_tickets_internet 0.000000\n",
|
|
"name_event_types 0.967882\n",
|
|
"avg_amount 0.967882\n",
|
|
"street_id 0.000000\n",
|
|
"is_partner 0.000000\n",
|
|
"gender 0.000000\n",
|
|
"is_email_true 0.000000\n",
|
|
"opt_in 0.000000\n",
|
|
"structure_id 0.856471\n",
|
|
"mcp_contact_id 0.297844\n",
|
|
"last_buying_date 0.642312\n",
|
|
"max_price 0.642312\n",
|
|
"ticket_sum 0.000000\n",
|
|
"average_price 0.107403\n",
|
|
"fidelity 0.000000\n",
|
|
"average_purchase_delay 0.642312\n",
|
|
"average_price_basket 0.642312\n",
|
|
"average_ticket_basket 0.642312\n",
|
|
"total_price 0.534909\n",
|
|
"purchase_count 0.000000\n",
|
|
"first_buying_date 0.642312\n",
|
|
"country 0.066622\n",
|
|
"tenant_id 0.000000\n",
|
|
"gender_label 0.000000\n",
|
|
"gender_female 0.000000\n",
|
|
"gender_male 0.000000\n",
|
|
"gender_other 0.000000\n",
|
|
"country_fr 0.066622\n",
|
|
"nb_campaigns 0.000000\n",
|
|
"nb_campaigns_opened 0.000000\n",
|
|
"time_to_open 0.553988\n",
|
|
"y_has_purchased 0.000000\n",
|
|
"dtype: float64"
|
|
]
|
|
},
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"dataset_train.isna().mean()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"id": "2ce94258-e2d1-472a-81fc-fc11e247b423",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"121789.0"
|
|
]
|
|
},
|
|
"execution_count": 11,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"len(dataset_train) - dataset_train['y_has_purchased'].sum()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"id": "34bae3f7-d579-4f80-a38d-a83eb5ea8a7b",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Accuracy: 0.9986037223669636\n",
|
|
"Confusion Matrix:\n",
|
|
" [[128000 37]\n",
|
|
" [ 142 19]]\n",
|
|
"Classification Report:\n",
|
|
" precision recall f1-score support\n",
|
|
"\n",
|
|
" 0.0 1.00 1.00 1.00 128037\n",
|
|
" 1.0 0.34 0.12 0.18 161\n",
|
|
"\n",
|
|
" accuracy 1.00 128198\n",
|
|
" macro avg 0.67 0.56 0.59 128198\n",
|
|
"weighted avg 1.00 1.00 1.00 128198\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"\n",
|
|
"reg_columns = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet', 'opt_in', 'fidelity', 'nb_campaigns', 'nb_campaigns_opened']\n",
|
|
"\n",
|
|
"X_train = dataset_train[reg_columns]\n",
|
|
"y_train = dataset_train['y_has_purchased']\n",
|
|
"X_test = dataset_test[reg_columns]\n",
|
|
"y_test = dataset_test['y_has_purchased']\n",
|
|
"\n",
|
|
"# Fit and transform the scaler on the training data\n",
|
|
"scaler = StandardScaler()\n",
|
|
"\n",
|
|
"# Transform the test data using the same scaler\n",
|
|
"X_train_scaled = scaler.fit_transform(X_train)\n",
|
|
"X_test_scaled = scaler.fit_transform(X_test)\n",
|
|
"\n",
|
|
"# Create and fit the linear regression model\n",
|
|
"logit_model = LogisticRegression(penalty='l1', solver='liblinear', C=1.0)\n",
|
|
"logit_model.fit(X_train_scaled, y_train)\n",
|
|
"\n",
|
|
"y_pred = logit_model.predict(X_test_scaled)\n",
|
|
"\n",
|
|
"#Evaluation du modèle \n",
|
|
"accuracy = accuracy_score(y_test, y_pred)\n",
|
|
"conf_matrix = confusion_matrix(y_test, y_pred)\n",
|
|
"class_report = classification_report(y_test, y_pred)\n",
|
|
"\n",
|
|
"print(\"Accuracy:\", accuracy)\n",
|
|
"print(\"Confusion Matrix:\\n\", conf_matrix)\n",
|
|
"print(\"Classification Report:\\n\", class_report)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"id": "ccc78c36-3287-46e6-89ac-7494c1a7106a",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"image/png": "",
|
|
"text/plain": [
|
|
"<Figure size 640x480 with 2 Axes>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])\n",
|
|
"plt.xlabel('Predicted')\n",
|
|
"plt.ylabel('Actual')\n",
|
|
"plt.title('Confusion Matrix')\n",
|
|
"plt.show()"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.11.6"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|