BDC-team-1/useless/2_Regression_logistique.ipynb

375 lines
39 KiB
Plaintext
Raw Normal View History

2024-03-02 09:58:05 +01:00
{
"cells": [
{
"cell_type": "markdown",
"id": "ac01a6ea-bef6-4ace-89ff-1dc03a4215c2",
"metadata": {},
"source": [
"# Segmentation des clients par régression logistique"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "bca785be-39f7-4583-9bd8-67c1134ae275",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import os\n",
"import s3fs\n",
"import re\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n",
"from sklearn.preprocessing import StandardScaler\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "3bf57816-b023-4e84-9450-095620bddebc",
"metadata": {},
"outputs": [],
"source": [
"# Build the S3 endpoint URL from the AWS_S3_ENDPOINT environment variable,\n",
"# then open a filesystem client pointed at that endpoint\n",
"S3_ENDPOINT_URL = f\"https://{os.environ['AWS_S3_ENDPOINT']}\"\n",
"fs = s3fs.S3FileSystem(client_kwargs={\"endpoint_url\": S3_ENDPOINT_URL})"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "27002f2f-a78a-414c-8e4f-b15bf6dd9e40",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_23374/1677066092.py:7: DtypeWarning: Columns (11,40) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" dataset_train = pd.read_csv(file_in, sep=\",\")\n",
"/tmp/ipykernel_23374/1677066092.py:12: DtypeWarning: Columns (40) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" dataset_test = pd.read_csv(file_in, sep=\",\")\n"
]
}
],
"source": [
"# Importation des données\n",
"BUCKET = \"projet-bdc2324-team1/1_Output/Logistique Regression databases - First approach\"\n",
"\n",
"FILE_PATH_S3 = BUCKET + \"/\" + \"dataset_train.csv\"\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" dataset_train = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"FILE_PATH_S3 = BUCKET + \"/\" + \"dataset_test.csv\"\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" dataset_test = pd.read_csv(file_in, sep=\",\")\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "c3928b55-8821-46da-b3b5-a036efd6d2cf",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>event_type_id</th>\n",
" <th>name_event_types</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2.0</td>\n",
" <td>offre muséale individuel</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>4.0</td>\n",
" <td>spectacle vivant</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>5.0</td>\n",
" <td>offre muséale groupe</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" event_type_id name_event_types\n",
"0 2.0 offre muséale individuel\n",
"1 4.0 spectacle vivant\n",
"2 5.0 offre muséale groupe\n",
"3 NaN NaN"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset_train[['event_type_id', 'name_event_types']].drop_duplicates()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "7e8a9d4d-7e55-4173-a7f4-8b8baa9610d2",
"metadata": {},
"outputs": [],
"source": [
"# Event type to model; 5 is 'offre muséale groupe' in the event_type_id table shown above\n",
"type_event_choosed = 5\n",
"\n",
"\n",
"def select_event_type(dataset, event_type_id):\n",
"    \"\"\"Keep the rows of one event type, plus the rows that have no event type.\n",
"\n",
"    Rows whose `event_type_id` is missing are retained, and any missing\n",
"    `y_has_purchased` is set to 0. Returns a new DataFrame; `dataset` itself\n",
"    is not modified.\n",
"    \"\"\"\n",
"    keep = (dataset['event_type_id'] == event_type_id) | dataset['event_type_id'].isna()\n",
"    # .assign builds a new frame, so the fill never writes into a slice of `dataset`\n",
"    return dataset[keep].assign(y_has_purchased=lambda df: df['y_has_purchased'].fillna(0))\n",
"\n",
"\n",
"dataset_test = select_event_type(dataset_test, type_event_choosed)\n",
"dataset_train = select_event_type(dataset_train, type_event_choosed)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "b4078b8e-2172-47e6-9f92-106dc3015fc9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"228.0"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset_train['y_has_purchased'].sum()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "e20ced8f-df1c-43bb-8d15-79f414c8225c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"customer_id 0.000000\n",
"event_type_id 0.967882\n",
"nb_tickets 0.000000\n",
"nb_purchases 0.000000\n",
"total_amount 0.000000\n",
"nb_suppliers 0.000000\n",
"vente_internet_max 0.000000\n",
"purchase_date_min 0.967882\n",
"purchase_date_max 0.967882\n",
"time_between_purchase 0.967882\n",
"nb_tickets_internet 0.000000\n",
"name_event_types 0.967882\n",
"avg_amount 0.967882\n",
"street_id 0.000000\n",
"is_partner 0.000000\n",
"gender 0.000000\n",
"is_email_true 0.000000\n",
"opt_in 0.000000\n",
"structure_id 0.856471\n",
"mcp_contact_id 0.297844\n",
"last_buying_date 0.642312\n",
"max_price 0.642312\n",
"ticket_sum 0.000000\n",
"average_price 0.107403\n",
"fidelity 0.000000\n",
"average_purchase_delay 0.642312\n",
"average_price_basket 0.642312\n",
"average_ticket_basket 0.642312\n",
"total_price 0.534909\n",
"purchase_count 0.000000\n",
"first_buying_date 0.642312\n",
"country 0.066622\n",
"tenant_id 0.000000\n",
"gender_label 0.000000\n",
"gender_female 0.000000\n",
"gender_male 0.000000\n",
"gender_other 0.000000\n",
"country_fr 0.066622\n",
"nb_campaigns 0.000000\n",
"nb_campaigns_opened 0.000000\n",
"time_to_open 0.553988\n",
"y_has_purchased 0.000000\n",
"dtype: float64"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset_train.isna().sum()/len(dataset_train)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "2ce94258-e2d1-472a-81fc-fc11e247b423",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"121789.0"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(dataset_train) - dataset_train['y_has_purchased'].sum()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "34bae3f7-d579-4f80-a38d-a83eb5ea8a7b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy: 0.9986037223669636\n",
"Confusion Matrix:\n",
" [[128000 37]\n",
" [ 142 19]]\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" 0.0 1.00 1.00 1.00 128037\n",
" 1.0 0.34 0.12 0.18 161\n",
"\n",
" accuracy 1.00 128198\n",
" macro avg 0.67 0.56 0.59 128198\n",
"weighted avg 1.00 1.00 1.00 128198\n",
"\n"
]
}
],
"source": [
"\n",
"reg_columns = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet', 'opt_in', 'fidelity', 'nb_campaigns', 'nb_campaigns_opened']\n",
"\n",
"X_train = dataset_train[reg_columns]\n",
"y_train = dataset_train['y_has_purchased']\n",
"X_test = dataset_test[reg_columns]\n",
"y_test = dataset_test['y_has_purchased']\n",
"\n",
"# Fit and transform the scaler on the training data\n",
"scaler = StandardScaler()\n",
"\n",
"# Transform the test data using the same scaler\n",
"X_train_scaled = scaler.fit_transform(X_train)\n",
"X_test_scaled = scaler.fit_transform(X_test)\n",
"\n",
"# Create and fit the linear regression model\n",
"logit_model = LogisticRegression(penalty='l1', solver='liblinear', C=1.0)\n",
"logit_model.fit(X_train_scaled, y_train)\n",
"\n",
"y_pred = logit_model.predict(X_test_scaled)\n",
"\n",
"#Evaluation du modèle \n",
"accuracy = accuracy_score(y_test, y_pred)\n",
"conf_matrix = confusion_matrix(y_test, y_pred)\n",
"class_report = classification_report(y_test, y_pred)\n",
"\n",
"print(\"Accuracy:\", accuracy)\n",
"print(\"Confusion Matrix:\\n\", conf_matrix)\n",
"print(\"Classification Report:\\n\", class_report)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "ccc78c36-3287-46e6-89ac-7494c1a7106a",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjYAAAHFCAYAAADhWLMfAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAABT1ElEQVR4nO3dfVyN9/8H8Nfp7qjU0Y3KIYQ0qY1lEpubodDN7A5ri8ZiQprcrG1otpXwxcj90GasGdpsaJk21ohEUxhDxJRCTkqq1fX7w8+1HWUV51wnx+v5fVyPh67rfa7rfU7fw3vvz+dzXTJBEAQQERER6QEDXSdAREREpCksbIiIiEhvsLAhIiIivcHChoiIiPQGCxsiIiLSGyxsiIiISG+wsCEiIiK9wcKGiIiI9AYLGyIiItIbLGxIrx07dgxvvvkmnJyc0KRJEzRt2hRPP/005s2bh+vXr2v12kePHkWfPn2gUCggk8mwePFijV9DJpMhKipK4+etS3x8PGQyGWQyGX755ZcaxwVBQIcOHSCTydC3b98Husby5csRHx/foNf88ssv982JiB4PRrpOgEhb1qxZg9DQULi4uGDatGlwdXVFZWUlDh8+jJUrV+LAgQNITEzU2vVHjx6N0tJSJCQkwMrKCm3bttX4NQ4cOIBWrVpp/Lz1ZWFhgbVr19YoXvbu3YuzZ8/CwsLigc+9fPly2NraIjg4uN6vefrpp3HgwAG4uro+8HWJ6NHGwob00oEDBzB+/HgMHDgQ3377LeRyuXhs4MCBiIiIQFJSklZzyM7ORkhICAYPHqy1a/To0UNr566P4cOHY+PGjVi2bBksLS3F/WvXroWXlxeKi4slyaOyshIymQyWlpY6/0yISLc4FEV6KTo6GjKZDKtXr1Yrau4yMTFBQECA+HN1dTXmzZuHJ554AnK5HHZ2dhg5ciQuXbqk9rq+ffvCzc0N6enpeO6552BmZoZ27dph7ty5qK6uBvDPMM3ff/+NFStWiEM2ABAVFSX++d/uvub8+fPivpSUFPTt2xc2NjYwNTVF69at8fLLL+PWrVtiTG1DUdnZ2XjhhRdgZWWFJk2aoEuXLvj888/VYu4O2Xz11Vd4//33oVQqYWlpiQEDBuDUqVP1+5ABvPbaawCAr776StynUqmwdetWjB49utbXfPjhh/D09IS1tTUsLS3x9NNPY+3atfj383jbtm2L48ePY+/eveLnd7fjdTf3DRs2ICIiAi1btoRcLseZM2dqDEVdvXoVjo6O6NmzJyorK8XznzhxAubm5ggKCqr3eyWiRwMLG9I7VVVVSElJgYeHBxwdHev1mvHjx2PGjBkYOHAgtm/fjo8++ghJSUno2bMnrl69qhabn5+P119/HW+88Qa2b9+OwYMHIzIyEl9++SUAwNfXFwcOHAAAvPLKKzhw4ID4c32dP38evr6+MDExwbp165CUlIS5c+fC3NwcFRUV933dqVOn0LNnTxw/fhxLlizBtm3b4OrqiuDgYMybN69G/HvvvYcLFy7gs88+w+rVq/Hnn3/C398fVVVV9crT0tISr7zyCtatWyfu++qrr2BgYIDhw4ff972NGzcOmzdvxrZt2/DSSy9h0qRJ+Oijj8SYxMREtGvXDl27dhU/v3uHDSMjI5Gbm4uVK1fi+++/h52dXY1r2draIiEhAenp6ZgxYwYA4NatW3j11VfRunVrrFy5sl7vk4geIQKRnsnPzxcACCNGjKhX/MmTJwUAQmhoqNr+gwcPCgCE9957T9zXp08fAYBw8OBBtVhXV1fBx8dHbR8AYcKECWr7Zs+eLdT2tVu/fr0AQMjJyREEQRC2bNkiABAyMzP/M3cAwuzZs8WfR4wYIcjlciE3N1ctbvDgwYKZmZlw48YNQRAE4eeffxYACEOGDFGL27x5swBAOHDgwH9e926+6enp4rmys7MFQRCEZ555RggODhYEQRA6d+4s9OnT577nqaqqEiorK4
U5c+YINjY2QnV1tXjsfq+9e73evXvf99jPP/+stj82NlYAICQmJgqjRo0STE1NhWPHjv3neySiRxM7NvTY+/nnnwGgxiTV7t27o1OnTtizZ4/afgcHB3Tv3l1t35NPPokLFy5oLKcuXbrAxMQEY8eOxeeff45z587V63UpKSno379/jU5VcHAwbt26VaNz9O/hOODO+wDQoPfSp08ftG/fHuvWrUNWVhbS09PvOwx1N8cBAwZAoVDA0NAQxsbGmDVrFq5du4aCgoJ6X/fll1+ud+y0adPg6+uL1157DZ9//jmWLl0Kd3f3er+eiB4dLGxI79ja2sLMzAw5OTn1ir927RoAoEWLFjWOKZVK8fhdNjY2NeLkcjnKysoeINvatW/fHj/99BPs7OwwYcIEtG/fHu3bt8enn376n6+7du3afd/H3eP/du97uTsfqSHvRSaT4c0338SXX36JlStXomPHjnjuuedqjT106BC8vb0B3Fm19ttvvyE9PR3vv/9+g69b2/v8rxyDg4Nx+/ZtODg4cG4NkR5jYUN6x9DQEP3790dGRkaNyb+1ufuPe15eXo1jly9fhq2trcZya9KkCQCgvLxcbf+983gA4LnnnsP3338PlUqFtLQ0eHl5ITw8HAkJCfc9v42NzX3fBwCNvpd/Cw4OxtWrV7Fy5Uq8+eab941LSEiAsbExfvjhBwwbNgw9e/ZEt27dHuiatU3Cvp+8vDxMmDABXbp0wbVr1zB16tQHuiYRNX4sbEgvRUZGQhAEhISE1DrZtrKyEt9//z0A4PnnnwcAcfLvXenp6Th58iT69++vsbzuruw5duyY2v67udTG0NAQnp6eWLZsGQDgyJEj943t378/UlJSxELmri+++AJmZmZaWwrdsmVLTJs2Df7+/hg1atR942QyGYyMjGBoaCjuKysrw4YNG2rEaqoLVlVVhddeew0ymQy7du1CTEwMli5dim3btj30uYmo8eF9bEgveXl5YcWKFQgNDYWHhwfGjx+Pzp07o7KyEkePHsXq1avh5uYGf39/uLi4YOzYsVi6dCkMDAwwePBgnD9/HjNnzoSjoyPeeecdjeU1ZMgQWFtbY8yYMZgzZw6MjIwQHx+PixcvqsWtXLkSKSkp8PX1RevWrXH79m1x5dGAAQPue/7Zs2fjhx9+QL9+/TBr1ixYW1tj48aN2LFjB+bNmweFQqGx93KvuXPn1hnj6+uLhQsXIjAwEGPHjsW1a9ewYMGCWpfku7u7IyEhAV9//TXatWuHJk2aPNC8mNmzZ+PXX39FcnIyHBwcEBERgb1792LMmDHo2rUrnJycGnxOImq8WNiQ3goJCUH37t2xaNEixMbGIj8/H8bGxujYsSMCAwMxceJEMXbFihVo37491q5di2XLlkGhUGDQoEGIiYmpdU7Ng7K0tERSUhLCw8PxxhtvoFmzZnjrrbcwePBgvPXWW2Jcly5dkJycjNmzZyM/Px9NmzaFm5sbtm/fLs5RqY2Liwv279+P9957DxMmTEBZWRk6deqE9evXN+gOvtry/PPPY926dYiNjYW/vz9atmyJkJAQ2NnZYcyYMWqxH374IfLy8hASEoKbN2+iTZs2avf5qY/du3cjJiYGM2fOVOu8xcfHo2vXrhg+fDhSU1NhYmKiibdHRI2ATBD+dVcsIiIiokcY59gQERGR3mBhQ0RERHqDhQ0RERHpDRY2REREpDdY2BAREZHeYGFDREREeoOFDREREekNvbxBn2nXiXUHET2GitLjdJ0CUaPTRIJ/CTX171LZUX6H68KODREREekNvezYEBERNSoy9hGkwsKGiIhI22QyXWfw2GBhQ0REpG3s2EiGnzQRERHpDXZsiIiItI1DUZJhYUNERKRtHIqSDD9pIiIi0hvs2BAREWkbh6Ikw8KGiIhI2zgUJRl+0kRERKQ32LEhIiLSNg5FSYaFDRERkbZxKEoy/KSJiIhIb7BjQ0REpG0cipIMCxsiIiJt41CUZF
jYEBERaRs7NpJhCUlERER6g4UNERGRtskMNLM10L59++Dv7w+lUgmZTIZvv/1WPFZZWYkZM2bA3d0d5ubmUCqVGDl
"text/plain": [
"<Figure size 640x480 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])\n",
"plt.xlabel('Predicted')\n",
"plt.ylabel('Actual')\n",
"plt.title('Confusion Matrix')\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}