# %% [markdown]
# # Customer segmentation with logistic regression

# %%
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import s3fs
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

# %%
# TODO: a disabled draft cell used to live here. It worked on a frame named
# `df1_customer_product` (not defined anywhere in this notebook) and:
#   - mapped `gender` codes 0/1/2 to 'female'/'male'/'other' in `gender_label`,
#   - built `country_fr` (1 when `country == "fr"`, NaN when country is missing),
#   - appended integer dummy columns for `gender_label` (prefix `gender`).
# Reinstate it as a function applied to the loaded datasets if these features are needed.

# %%
# S3 filesystem handle; the endpoint host comes from the AWS_S3_ENDPOINT
# environment variable (a KeyError is raised if it is not set).
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

# %%
# Location of the train / test extracts on S3.
BUCKET = "projet-bdc2324-team1/1_Output/Logistique Regression databases - First approach"


def read_s3_csv(file_name: str) -> pd.DataFrame:
    """Read one comma-separated file stored under ``BUCKET`` into a DataFrame.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside ``BUCKET`` (e.g. ``"dataset_train.csv"``).

    Returns
    -------
    pandas.DataFrame
        The parsed file.

    Notes
    -----
    ``low_memory=False`` makes pandas infer each column's dtype from the whole
    file rather than chunk by chunk. The previously recorded run of this cell
    emitted ``DtypeWarning: Columns (21,39) have mixed types`` for both files;
    this option is the remedy suggested by that warning.
    """
    with fs.open(BUCKET + "/" + file_name, mode="rb") as file_in:
        return pd.read_csv(file_in, sep=",", low_memory=False)


# %%
# Data import
dataset_train = read_s3_csv("dataset_train.csv")
dataset_test = read_s3_csv("dataset_test.csv")
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
# %%
# Distinct (event_type_id, name_event_types) pairs present in the training data.
dataset_train[['event_type_id', 'name_event_types']].drop_duplicates()

# %%
def select_event_type(dataset: pd.DataFrame, event_type_id: float) -> pd.DataFrame:
    """Keep the rows of one event type plus the rows with no event type.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding at least the columns ``event_type_id`` and
        ``y_has_purchased``.
    event_type_id : float
        Event type to keep.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) containing the rows whose
        ``event_type_id`` equals ``event_type_id`` or is missing, with missing
        ``y_has_purchased`` values replaced by 0.
    """
    event_ids = dataset['event_type_id']
    # .isna() also works on non-float dtypes, unlike np.isnan.
    keep = event_ids.eq(event_type_id) | event_ids.isna()
    # .loc + .assign builds a fresh frame, so no column is written on a slice
    # of the original (which is what raises SettingWithCopyWarning).
    return dataset.loc[keep].assign(
        y_has_purchased=lambda frame: frame['y_has_purchased'].fillna(0)
    )


# %%
# Choose type of event
type_event_choosed = 5

# NOTE: the loaded frames are overwritten by their filtered version, so the
# loading cell must be re-run before selecting a different event type.
dataset_test = select_event_type(dataset_test, type_event_choosed)
dataset_train = select_event_type(dataset_train, type_event_choosed)
# %%
# Share of missing values per column in the training data.
dataset_train.isna().mean()

# %%
# Seed for the liblinear solver so that reruns give the same fitted model.
SEED = 42

reg_columns = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet', 'opt_in', 'fidelity', 'nb_campaigns', 'nb_campaigns_opened']

X_train = dataset_train[reg_columns]
y_train = dataset_train['y_has_purchased']
X_test = dataset_test[reg_columns]
y_test = dataset_test['y_has_purchased']

# Fit the scaler on the training data only...
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# ...and reuse the training mean / standard deviation on the test data.
# Calling fit_transform here would re-fit the scaler on the test set, so the
# test features would be standardised differently from what the model saw.
X_test_scaled = scaler.transform(X_test)

# Create and fit the L1-penalised logistic regression model
logit_model = LogisticRegression(penalty='l1', solver='liblinear', C=1.0, random_state=SEED)
logit_model.fit(X_train_scaled, y_train)

y_pred = logit_model.predict(X_test_scaled)

# Model evaluation
# NOTE: in the previously recorded run the positive class was 161 of 128198
# test rows, so accuracy is dominated by class 0; read the class-1 precision
# and recall in the report rather than the accuracy alone.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

# %%
# Confusion matrix as an annotated heatmap (rows: actual class, columns: predicted class).
class_labels = ['Class 0', 'Class 1']
ax = sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()