BDC-team-1/2_Regression_logistique.ipynb

358 lines
40 KiB
Plaintext
Raw Normal View History

{
"cells": [
{
"cell_type": "markdown",
"id": "ac01a6ea-bef6-4ace-89ff-1dc03a4215c2",
"metadata": {},
"source": [
"# Segmentation des clients par régression logistique"
]
2024-02-12 23:49:13 +01:00
},
{
"cell_type": "code",
"execution_count": 1,
"id": "bca785be-39f7-4583-9bd8-67c1134ae275",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import os\n",
"import s3fs\n",
"import re\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n",
"from sklearn.preprocessing import StandardScaler\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "59ce5096-4e2c-45c1-be78-43e14db4142c",
"metadata": {},
"outputs": [],
"source": [
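"# NOTE(review): every line of this cell is commented out, and it refers to `df1_customer_product`, which no cell of this notebook defines. Delete the cell or move it to wherever that dataframe is built.\n",
"# # modification des variables catégorielles\n",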
" \n",
"# ### variable gender\n",
"# df1_customer_product[\"gender_label\"] = df1_customer_product[\"gender\"].map({\n",
"# 0: 'female',\n",
"# 1: 'male',\n",
"# 2: 'other'\n",
"# })\n",
" \n",
"# ### variable country -> on indique si le pays est france\n",
"# df1_customer_product[\"country_fr\"] = df1_customer_product[\"country\"].apply(lambda x : int(x==\"fr\") if pd.notna(x) else np.nan)\n",
"\n",
"# # Création des indicatrices de gender\n",
"# gender_dummies = pd.get_dummies(df1_customer_product[\"gender_label\"], prefix='gender').astype(int)\n",
" \n",
"# # Concaténation des indicatrices avec le dataframe d'origine\n",
"# df1_customer_product = pd.concat([df1_customer_product, gender_dummies], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "3bf57816-b023-4e84-9450-095620bddebc",
"metadata": {},
"outputs": [],
"source": [
"# Build the S3 filesystem client; the endpoint host is read from the AWS_S3_ENDPOINT environment variable\n",
"S3_ENDPOINT_URL = f\"https://{os.environ['AWS_S3_ENDPOINT']}\"\n",
"fs = s3fs.S3FileSystem(client_kwargs=dict(endpoint_url=S3_ENDPOINT_URL))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "27002f2f-a78a-414c-8e4f-b15bf6dd9e40",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
2024-02-13 00:00:09 +01:00
"/tmp/ipykernel_7740/1677066092.py:7: DtypeWarning: Columns (21,39) have mixed types. Specify dtype option on import or set low_memory=False.\n",
2024-02-12 23:49:13 +01:00
" dataset_train = pd.read_csv(file_in, sep=\",\")\n",
2024-02-13 00:00:09 +01:00
"/tmp/ipykernel_7740/1677066092.py:12: DtypeWarning: Columns (21,39) have mixed types. Specify dtype option on import or set low_memory=False.\n",
2024-02-12 23:49:13 +01:00
" dataset_test = pd.read_csv(file_in, sep=\",\")\n"
]
}
],
"source": [
"# Load the train / test samples from S3\n",
"BUCKET = \"projet-bdc2324-team1/1_Output/Logistique Regression databases - First approach\"\n",
"\n",
"\n",
"def load_dataset(file_name):\n",
"    \"\"\"Read one CSV stored under BUCKET and return it as a DataFrame.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    file_name : str\n",
"        Name of the CSV file inside BUCKET, e.g. \"dataset_train.csv\".\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"\n",
"    Notes\n",
"    -----\n",
"    low_memory=False makes pandas infer each column dtype from the whole file\n",
"    instead of chunk by chunk; it is the remedy suggested by the DtypeWarning\n",
"    (mixed types) that the default setting raised on these files.\n",
"    \"\"\"\n",
"    with fs.open(BUCKET + \"/\" + file_name, mode=\"rb\") as file_in:\n",
"        return pd.read_csv(file_in, sep=\",\", low_memory=False)\n",
"\n",
"\n",
"dataset_train = load_dataset(\"dataset_train.csv\")\n",
"dataset_test = load_dataset(\"dataset_test.csv\")\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "c3928b55-8821-46da-b3b5-a036efd6d2cf",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>event_type_id</th>\n",
" <th>name_event_types</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2.0</td>\n",
" <td>offre muséale individuel</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>4.0</td>\n",
" <td>spectacle vivant</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>5.0</td>\n",
" <td>offre muséale groupe</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" event_type_id name_event_types\n",
"0 2.0 offre muséale individuel\n",
"1 4.0 spectacle vivant\n",
"2 5.0 offre muséale groupe\n",
"3 NaN NaN"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Distinct (event_type_id, name_event_types) pairs present in the training set\n",
"event_type_columns = ['event_type_id', 'name_event_types']\n",
"dataset_train[event_type_columns].drop_duplicates()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "7e8a9d4d-7e55-4173-a7f4-8b8baa9610d2",
"metadata": {},
"outputs": [],
"source": [
"# Event type to model (one of the event_type_id values present in the data)\n",
"type_event_choosed = 5\n",
"\n",
"\n",
"def keep_event_type(dataset, event_type_id):\n",
"    \"\"\"Return the rows of `dataset` for one event type, with a complete target.\n",
"\n",
"    Rows whose event_type_id is missing are kept alongside the rows of the\n",
"    requested type, and missing y_has_purchased values are replaced by 0.\n",
"    A new DataFrame is returned (via .assign), so no column is ever written\n",
"    into a filtered slice of the input.\n",
"    \"\"\"\n",
"    event_type = dataset['event_type_id']\n",
"    selected = dataset[(event_type == event_type_id) | event_type.isna()]\n",
"    return selected.assign(y_has_purchased=selected['y_has_purchased'].fillna(0))\n",
"\n",
"\n",
"dataset_test = keep_event_type(dataset_test, type_event_choosed)\n",
"dataset_train = keep_event_type(dataset_train, type_event_choosed)"
]
},
{
"cell_type": "code",
2024-02-13 00:00:09 +01:00
"execution_count": 7,
2024-02-12 23:49:13 +01:00
"id": "e20ced8f-df1c-43bb-8d15-79f414c8225c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"customer_id 0.000000\n",
2024-02-13 00:00:09 +01:00
"event_type_id 0.950522\n",
2024-02-12 23:49:13 +01:00
"nb_tickets 0.000000\n",
"nb_purchases 0.000000\n",
"total_amount 0.000000\n",
"nb_suppliers 0.000000\n",
"vente_internet_max 0.000000\n",
2024-02-13 00:00:09 +01:00
"purchase_date_min 0.950522\n",
"purchase_date_max 0.950522\n",
"time_between_purchase 0.950522\n",
2024-02-12 23:49:13 +01:00
"nb_tickets_internet 0.000000\n",
2024-02-13 00:00:09 +01:00
"name_event_types 0.950522\n",
"avg_amount 0.950522\n",
"birthdate 0.961918\n",
2024-02-12 23:49:13 +01:00
"street_id 0.000000\n",
"is_partner 0.000000\n",
"gender 0.000000\n",
"is_email_true 0.000000\n",
"opt_in 0.000000\n",
2024-02-13 00:00:09 +01:00
"structure_id 0.863048\n",
"profession 0.952160\n",
"language 0.991778\n",
"mcp_contact_id 0.297275\n",
"last_buying_date 0.611718\n",
"max_price 0.611718\n",
2024-02-12 23:49:13 +01:00
"ticket_sum 0.000000\n",
2024-02-13 00:00:09 +01:00
"average_price 0.102225\n",
2024-02-12 23:49:13 +01:00
"fidelity 0.000000\n",
2024-02-13 00:00:09 +01:00
"average_purchase_delay 0.611718\n",
"average_price_basket 0.611718\n",
"average_ticket_basket 0.611718\n",
"total_price 0.509493\n",
2024-02-12 23:49:13 +01:00
"purchase_count 0.000000\n",
2024-02-13 00:00:09 +01:00
"first_buying_date 0.611718\n",
"country 0.063488\n",
"age 0.961918\n",
2024-02-12 23:49:13 +01:00
"tenant_id 0.000000\n",
"nb_campaigns 0.000000\n",
"nb_campaigns_opened 0.000000\n",
2024-02-13 00:00:09 +01:00
"time_to_open 0.543355\n",
2024-02-12 23:49:13 +01:00
"y_has_purchased 0.000000\n",
"dtype: float64"
]
},
2024-02-13 00:00:09 +01:00
"execution_count": 7,
2024-02-12 23:49:13 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Share of missing values per column in the training set\n",
"dataset_train.isna().mean()"
]
},
{
"cell_type": "code",
2024-02-13 00:00:09 +01:00
"execution_count": 8,
2024-02-12 23:49:13 +01:00
"id": "34bae3f7-d579-4f80-a38d-a83eb5ea8a7b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-02-13 00:00:09 +01:00
"Accuracy: 0.9985491193310349\n",
2024-02-12 23:49:13 +01:00
"Confusion Matrix:\n",
2024-02-13 00:00:09 +01:00
" [[127988 49]\n",
" [ 137 24]]\n",
2024-02-12 23:49:13 +01:00
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
2024-02-13 00:00:09 +01:00
" 0.0 1.00 1.00 1.00 128037\n",
" 1.0 0.33 0.15 0.21 161\n",
2024-02-12 23:49:13 +01:00
"\n",
2024-02-13 00:00:09 +01:00
" accuracy 1.00 128198\n",
" macro avg 0.66 0.57 0.60 128198\n",
"weighted avg 1.00 1.00 1.00 128198\n",
2024-02-12 23:49:13 +01:00
"\n"
]
}
],
"source": [
"# Explanatory variables used by the model\n",
"reg_columns = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet', 'opt_in', 'fidelity', 'nb_campaigns', 'nb_campaigns_opened']\n",
"\n",
"X_train = dataset_train[reg_columns]\n",
"y_train = dataset_train['y_has_purchased']\n",
"X_test = dataset_test[reg_columns]\n",
"y_test = dataset_test['y_has_purchased']\n",
"\n",
"# Standardise the features: the scaler learns its mean / std on the training data only\n",
"scaler = StandardScaler()\n",
"X_train_scaled = scaler.fit_transform(X_train)\n",
"\n",
"# Apply the training statistics to the test set. Calling fit_transform here would\n",
"# re-fit the scaler on the test data, putting it on a different scale from the one\n",
"# the model coefficients were learned on.\n",
"X_test_scaled = scaler.transform(X_test)\n",
"\n",
"# Create and fit the L1-penalised logistic regression.\n",
"# random_state pins the data shuffling done by the liblinear solver so reruns are reproducible.\n",
"logit_model = LogisticRegression(penalty='l1', solver='liblinear', C=1.0, random_state=42)\n",
"logit_model.fit(X_train_scaled, y_train)\n",
"\n",
"y_pred = logit_model.predict(X_test_scaled)\n",
"\n",
"# Model evaluation.\n",
"# NOTE(review): in the stored run only 161 of the 128198 test rows are positive, so\n",
"# accuracy is dominated by class 0; read the per-class precision / recall instead.\n",
"accuracy = accuracy_score(y_test, y_pred)\n",
"conf_matrix = confusion_matrix(y_test, y_pred)\n",
"class_report = classification_report(y_test, y_pred)\n",
"\n",
"print(\"Accuracy:\", accuracy)\n",
"print(\"Confusion Matrix:\\n\", conf_matrix)\n",
"print(\"Classification Report:\\n\", class_report)"
]
},
{
"cell_type": "code",
2024-02-13 00:00:09 +01:00
"execution_count": 9,
2024-02-12 23:49:13 +01:00
"id": "ccc78c36-3287-46e6-89ac-7494c1a7106a",
"metadata": {},
"outputs": [
{
"data": {
2024-02-13 00:00:09 +01:00
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjYAAAHFCAYAAADhWLMfAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAABUp0lEQVR4nO3dfVyN9/8H8Nfp7qjo6EblkPvWJBsyCXMz5K7iu+2LZdFmMaE1uVkzt9vK3RcjcjM3m7G2ITM3LdPGGpFoijBETEducpBUO12/P/xc21FWca7rcLye38f1eDjX9T7X9T7HN733/nw+16UQBEEAERERkQkwM3YCRERERIbCwoaIiIhMBgsbIiIiMhksbIiIiMhksLAhIiIik8HChoiIiEwGCxsiIiIyGSxsiIiIyGSwsCEiIiKTwcKGTNrRo0fx1ltvoXHjxqhRowZq1qyJNm3aYM6cObh+/bqk1z5y5Ai6dOkClUoFhUKBhQsXGvwaCoUC06dPN/h5K7N27VooFAooFAr88ssv5Y4LgoBmzZpBoVCga9euj3SNpUuXYu3atdV6zy+//PLQnIjo2WBh7ASIpLJy5UqEhYXBw8MDEyZMgKenJ0pLS3Ho0CEsW7YM+/fvR0JCgmTXf/vtt1FYWIj4+HjY29ujUaNGBr/G/v37Ub9+fYOft6pq1aqFVatWlSte9uzZgzNnzqBWrVqPfO6lS5fCyckJISEhVX5PmzZtsH//fnh6ej7ydYno6cbChkzS/v37MWrUKPTs2RNbtmyBUqkUj/Xs2RORkZFITEyUNIesrCyEhoaiT58+kl2jffv2kp27KgYNGoT169djyZIlsLOzE/evWrUKvr6+uHnzpix5lJaWQqFQwM7OzujfCREZF4eiyCRFR0dDoVBgxYoVekXNfVZWVggMDBRfl5WVYc6cOXj++eehVCrh7OyMoUOH4uLFi3rv69q1K7y8vJCWloaXX34ZNjY2aNKkCWbNmoWysjIAfw/T/PXXX4iLixOHbABg+vTp4p//6f57zp07J+5LTk5G165d4ejoCGtrazRo0ACvvfYa7ty5I8ZUNBSVlZWF/v37w97eHjVq1ECrVq3wxRdf6MXcH7L5+uuvMXnyZKjVatjZ2aFHjx44efJk1b5kAG+88QYA4Ouvvxb3abVabNq0CW+//XaF75kxYwZ8fHzg4OAAOzs7tGnTBqtWrcI/n8fbqFEjHDt2DHv27BG/v/sdr/u5r1u3DpGRkahXrx6USiVOnz5dbijq6tWrcHNzQ4cOHVBaWiqe//jx47C1tUVwcHCVPysRPR1Y2JDJ0el0SE5Ohre3N9zc3Kr0nlGjRmHSpEno2bMntm7dio8//hiJiYno0KEDrl69qher0WgwZMgQvPnmm9i6dSv69OmDqKgofPXVVwCAfv36Yf/+/QCA119/Hfv37xdfV9W5c+fQr18/WFlZYfXq1UhMTMSsWbNga2uLkpKSh77v5MmT6NChA44dO4ZFixZh8+bN8PT0REhICObMmVMu/sMPP8T58+fx+eefY8WKFfjjjz8QEBAAnU5XpTzt7Ozw+uuvY/Xq1eK+r7/+GmZmZhg0aNBDP9vIkSPx7bffYvPmzXj11VcxduxYfPzxx2JMQkICmjRpgtatW4vf34PDhlFRUcjNzcWyZcvwww8/wNnZudy1nJycEB8fj7S0NEyaNAkAcOfOHfz3v/9FgwYNsGzZsip9TiJ6ighEJkaj0QgAhMGDB1cpPjs7WwAghIWF6e0/cOCAAED48MMPxX1dunQRAAgHDhzQi/X09BR69eqltw+AMHr0aL1906ZNEyr6sVuzZo0AQMjJyREEQRA2btwoABAyMjL+NXcAwrRp08TXgwcPFpRKpZCbm6sX16dPH8HGxka4ceOGIAiC8PPPPwsAhL59++rFffvttwIAYf/+/f963fv5pqWliefKysoSBEEQXnrpJSEkJEQQBEFo0aKF0KVLl4eeR6fTCa
WlpcLMmTMFR0dHoaysTDz2sPfev17nzp0feuznn3/W2z979mwBgJCQkCAMGzZMsLa2Fo4ePfqvn5GInk7s2NAz7+effwaAcpNU27Vrh+bNm2P37t16+11dXdGuXTu9fS+88ALOnz9vsJxatWoFKysrjBgxAl988QXOnj1bpfclJyeje/fu5TpVISEhuHPnTrnO0T+H44B7nwNAtT5Lly5d0LRpU6xevRqZmZlIS0t76DDU/Rx79OgBlUoFc3NzWFpaYurUqbh27Rry8/OrfN3XXnutyrETJkxAv3798MYbb+CLL77A4sWL0bJlyyq/n4ieHixsyOQ4OTnBxsYGOTk5VYq/du0aAKBu3brljqnVavH4fY6OjuXilEolioqKHiHbijVt2hQ//fQTnJ2dMXr0aDRt2hRNmzbFZ5999q/vu3bt2kM/x/3j//TgZ7k/H6k6n0WhUOCtt97CV199hWXLluG5557Dyy+/XGHswYMH4efnB+DeqrXffvsNaWlpmDx5crWvW9Hn/LccQ0JCcPfuXbi6unJuDZEJY2FDJsfc3Bzdu3dHenp6ucm/Fbn/yz0vL6/csUuXLsHJyclgudWoUQMAUFxcrLf/wXk8APDyyy/jhx9+gFarRWpqKnx9fREREYH4+PiHnt/R0fGhnwOAQT/LP4WEhODq1atYtmwZ3nrrrYfGxcfHw9LSEtu2bcPAgQPRoUMHtG3b9pGuWdEk7IfJy8vD6NGj0apVK1y7dg3jx49/pGsS0ZOPhQ2ZpKioKAiCgNDQ0Aon25aWluKHH34AALzyyisAIE7+vS8tLQ3Z2dno3r27wfK6v7Ln6NGjevvv51IRc3Nz+Pj4YMmSJQCAw4cPPzS2e/fuSE5OFguZ+7788kvY2NhIthS6Xr16mDBhAgICAjBs2LCHxikUClhYWMDc3FzcV1RUhHXr1pWLNVQXTKfT4Y033oBCocDOnTsRExODxYsXY/PmzY99biJ68vA+NmSSfH19ERcXh7CwMHh7e2PUqFFo0aIFSktLceTIEaxYsQJeXl4ICAiAh4cHRowYgcWLF8PMzAx9+vTBuXPnMGXKFLi5ueH99983WF59+/aFg4MDhg8fjpkzZ8LCwgJr167FhQsX9OKWLVuG5ORk9OvXDw0aNMDdu3fFlUc9evR46PmnTZuGbdu2oVu3bpg6dSocHBywfv16bN++HXPmzIFKpTLYZ3nQrFmzKo3p168f5s+fj6CgIIwYMQLXrl3DvHnzKlyS37JlS8THx+Obb75BkyZNUKNGjUeaFzNt2jT8+uuvSEpKgqurKyIjI7Fnzx4MHz4crVu3RuPGjat9TiJ6crGwIZMVGhqKdu3aYcGCBZg9ezY0Gg0sLS3x3HPPISgoCGPGjBFj4+Li0LRpU6xatQpLliyBSqVC7969ERMTU+GcmkdlZ2eHxMRERERE4M0330Tt2rXxzjvvoE+fPnjnnXfEuFatWiEpKQnTpk2DRqNBzZo14eXlha1bt4pzVCri4eGBffv24cMPP8To0aNRVFSE5s2bY82aNdW6g69UXnnlFaxevRqzZ89GQEAA6tWrh9DQUDg7O2P48OF6sTNmzEBeXh5CQ0Nx69YtNGzYUO8+P1Wxa9cuxMTEYMqUKXqdt7Vr16J169YYNGgQUlJSYGVlZYiPR0RPAIUg/OOuWERERERPMc6xISIiIpPBwoaIiIhMBgsbIiIiMhksbIiIiMhksLAhIiIik8HChoiIiEwGCxsiIiIyGSZ5gz7r1mMqDyJ6BhWkxRo7BaInTg0ZfhMa6vdS0RH+DFeGHRsiIiIyGSbZsSEiInqiKNhHkAsLGyIiIqkpFMbO4JnBwoaIiEhq7NjIht80ERERmQx2bIiIiKTGoSjZsLAhIiKSGoeiZMNvmoiIiEwGOzZERERS41CUbFjYEBERSY1DUbLhN01EREQmgx0bIiIiqXEoSjYsbIiIiKTGoSjZ8JsmIiIik8GODRERkdQ4FCUbFjZERERS41
CUbFjYEBERSY0dG9mwhCQiIiKTwcKGiIhIagozw2zVtHfvXgQEBECtVkOhUGDLli3isdLSUkyaNAktW7aEra0t1Go
2024-02-12 23:49:13 +01:00
"text/plain": [
"<Figure size 640x480 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Confusion matrix as an annotated heatmap (y axis: actual class, x axis: predicted class)\n",
"class_labels = ['Class 0', 'Class 1']\n",
"fig, ax = plt.subplots()\n",
"sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels, ax=ax)\n",
"ax.set_xlabel('Predicted')\n",
"ax.set_ylabel('Actual')\n",
"ax.set_title('Confusion Matrix')\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
2024-02-12 23:49:13 +01:00
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}