BDC-team-1/2_Regression_logistique.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ac01a6ea-bef6-4ace-89ff-1dc03a4215c2",
   "metadata": {},
   "source": [
    "# Segmentation des clients par régression logistique"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "bca785be-39f7-4583-9bd8-67c1134ae275",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "import s3fs\n",
    "import re\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "59ce5096-4e2c-45c1-be78-43e14db4142c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): this whole cell is commented out and is never executed;\n",
    "# it refers to a dataframe (df1_customer_product) that this notebook does not define.\n",
    "# # recoding of the categorical variables\n",
    " \n",
    "# ### gender variable\n",
    "# df1_customer_product[\"gender_label\"] = df1_customer_product[\"gender\"].map({\n",
    "#     0: 'female',\n",
    "#     1: 'male',\n",
    "#     2: 'other'\n",
    "# })\n",
    " \n",
    "# ### country variable -> flag whether the country is France\n",
    "# df1_customer_product[\"country_fr\"] = df1_customer_product[\"country\"].apply(lambda x : int(x==\"fr\") if pd.notna(x) else np.nan)\n",
    "\n",
    "# # Build the gender dummy variables\n",
    "# gender_dummies = pd.get_dummies(df1_customer_product[\"gender_label\"], prefix='gender').astype(int)\n",
    " \n",
    "# # Concatenate the dummy variables with the original dataframe\n",
    "# df1_customer_product = pd.concat([df1_customer_product, gender_dummies], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "3bf57816-b023-4e84-9450-095620bddebc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the S3 filesystem client. The endpoint host comes from the\n",
    "# AWS_S3_ENDPOINT environment variable (a KeyError is raised if it is unset).\n",
    "S3_ENDPOINT_URL = f\"https://{os.environ['AWS_S3_ENDPOINT']}\"\n",
    "fs = s3fs.S3FileSystem(client_kwargs=dict(endpoint_url=S3_ENDPOINT_URL))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "27002f2f-a78a-414c-8e4f-b15bf6dd9e40",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_7740/1677066092.py:7: DtypeWarning: Columns (21,39) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  dataset_train = pd.read_csv(file_in, sep=\",\")\n",
      "/tmp/ipykernel_7740/1677066092.py:12: DtypeWarning: Columns (21,39) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  dataset_test = pd.read_csv(file_in, sep=\",\")\n"
     ]
    }
   ],
   "source": [
    "# Load the train and test datasets from S3\n",
    "BUCKET = \"projet-bdc2324-team1/1_Output/Logistique Regression databases - First approach\"\n",
    "\n",
    "\n",
    "def load_dataset(file_name):\n",
    "    \"\"\"Read one CSV file stored under BUCKET on S3 and return it as a DataFrame.\n",
    "\n",
    "    low_memory=False makes pandas infer each column's dtype from the whole\n",
    "    file instead of chunk by chunk, which removes the mixed-type DtypeWarning\n",
    "    that the default chunked parsing emits on these files.\n",
    "    \"\"\"\n",
    "    with fs.open(BUCKET + \"/\" + file_name, mode=\"rb\") as file_in:\n",
    "        return pd.read_csv(file_in, sep=\",\", low_memory=False)\n",
    "\n",
    "\n",
    "dataset_train = load_dataset(\"dataset_train.csv\")\n",
    "dataset_test = load_dataset(\"dataset_test.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "c3928b55-8821-46da-b3b5-a036efd6d2cf",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>event_type_id</th>\n",
       "      <th>name_event_types</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2.0</td>\n",
       "      <td>offre muséale individuel</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4.0</td>\n",
       "      <td>spectacle vivant</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>5.0</td>\n",
       "      <td>offre muséale groupe</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   event_type_id          name_event_types\n",
       "0            2.0  offre muséale individuel\n",
       "1            4.0          spectacle vivant\n",
       "2            5.0      offre muséale groupe\n",
       "3            NaN                       NaN"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the distinct (event_type_id, name_event_types) pairs found in the training set\n",
    "dataset_train[['event_type_id', 'name_event_types']].drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "7e8a9d4d-7e55-4173-a7f4-8b8baa9610d2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# event_type_id value to keep for the model\n",
    "type_event_choosed = 5\n",
    "\n",
    "\n",
    "def keep_event_type(dataset, event_type_id):\n",
    "    \"\"\"Return a copy of `dataset` restricted to a single event type.\n",
    "\n",
    "    Rows whose event_type_id is missing are kept as well, and missing values\n",
    "    of y_has_purchased are replaced by 0. The filtered rows are copied before\n",
    "    the target column is rewritten, so the assignment never hits a view of the\n",
    "    input frame (no SettingWithCopyWarning, input left untouched).\n",
    "    \"\"\"\n",
    "    event_ids = dataset['event_type_id']\n",
    "    kept = dataset.loc[(event_ids == event_type_id) | event_ids.isna()].copy()\n",
    "    kept['y_has_purchased'] = kept['y_has_purchased'].fillna(0)\n",
    "    return kept\n",
    "\n",
    "\n",
    "dataset_test = keep_event_type(dataset_test, type_event_choosed)\n",
    "dataset_train = keep_event_type(dataset_train, type_event_choosed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "e20ced8f-df1c-43bb-8d15-79f414c8225c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "customer_id               0.000000\n",
       "event_type_id             0.950522\n",
       "nb_tickets                0.000000\n",
       "nb_purchases              0.000000\n",
       "total_amount              0.000000\n",
       "nb_suppliers              0.000000\n",
       "vente_internet_max        0.000000\n",
       "purchase_date_min         0.950522\n",
       "purchase_date_max         0.950522\n",
       "time_between_purchase     0.950522\n",
       "nb_tickets_internet       0.000000\n",
       "name_event_types          0.950522\n",
       "avg_amount                0.950522\n",
       "birthdate                 0.961918\n",
       "street_id                 0.000000\n",
       "is_partner                0.000000\n",
       "gender                    0.000000\n",
       "is_email_true             0.000000\n",
       "opt_in                    0.000000\n",
       "structure_id              0.863048\n",
       "profession                0.952160\n",
       "language                  0.991778\n",
       "mcp_contact_id            0.297275\n",
       "last_buying_date          0.611718\n",
       "max_price                 0.611718\n",
       "ticket_sum                0.000000\n",
       "average_price             0.102225\n",
       "fidelity                  0.000000\n",
       "average_purchase_delay    0.611718\n",
       "average_price_basket      0.611718\n",
       "average_ticket_basket     0.611718\n",
       "total_price               0.509493\n",
       "purchase_count            0.000000\n",
       "first_buying_date         0.611718\n",
       "country                   0.063488\n",
       "age                       0.961918\n",
       "tenant_id                 0.000000\n",
       "nb_campaigns              0.000000\n",
       "nb_campaigns_opened       0.000000\n",
       "time_to_open              0.543355\n",
       "y_has_purchased           0.000000\n",
       "dtype: float64"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Share of missing values per column of the training set\n",
    "dataset_train.isna().mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "34bae3f7-d579-4f80-a38d-a83eb5ea8a7b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.9985491193310349\n",
      "Confusion Matrix:\n",
      " [[127988     49]\n",
      " [   137     24]]\n",
      "Classification Report:\n",
      "               precision    recall  f1-score   support\n",
      "\n",
      "         0.0       1.00      1.00      1.00    128037\n",
      "         1.0       0.33      0.15      0.21       161\n",
      "\n",
      "    accuracy                           1.00    128198\n",
      "   macro avg       0.66      0.57      0.60    128198\n",
      "weighted avg       1.00      1.00      1.00    128198\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Features used by the model\n",
    "reg_columns = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet', 'opt_in', 'fidelity', 'nb_campaigns', 'nb_campaigns_opened']\n",
    "\n",
    "X_train = dataset_train[reg_columns]\n",
    "y_train = dataset_train['y_has_purchased']\n",
    "X_test = dataset_test[reg_columns]\n",
    "y_test = dataset_test['y_has_purchased']\n",
    "\n",
    "# Standardise the features. The scaler is fitted on the training data only and\n",
    "# then applied unchanged to the test data: re-fitting it on the test set would\n",
    "# scale the two sets with different means/variances and use test-set statistics.\n",
    "scaler = StandardScaler()\n",
    "X_train_scaled = scaler.fit_transform(X_train)\n",
    "X_test_scaled = scaler.transform(X_test)\n",
    "\n",
    "# Create and fit the L1-penalised logistic regression model.\n",
    "# random_state is fixed because the liblinear solver shuffles the data,\n",
    "# so the fit is reproducible from one run to the next.\n",
    "logit_model = LogisticRegression(penalty='l1', solver='liblinear', C=1.0, random_state=42)\n",
    "logit_model.fit(X_train_scaled, y_train)\n",
    "\n",
    "y_pred = logit_model.predict(X_test_scaled)\n",
    "\n",
    "# Evaluate the model on the test set.\n",
    "# NOTE(review): if purchasers are a small minority, accuracy is dominated by the\n",
    "# majority class; read the per-class precision/recall in the report instead.\n",
    "accuracy = accuracy_score(y_test, y_pred)\n",
    "conf_matrix = confusion_matrix(y_test, y_pred)\n",
    "class_report = classification_report(y_test, y_pred)\n",
    "\n",
    "print(\"Accuracy:\", accuracy)\n",
    "print(\"Confusion Matrix:\\n\", conf_matrix)\n",
    "print(\"Classification Report:\\n\", class_report)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "ccc78c36-3287-46e6-89ac-7494c1a7106a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAjYAAAHFCAYAAADhWLMfAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAABUp0lEQVR4nO3dfVyN9/8H8Nfp7qjo6EblkPvWJBsyCXMz5K7iu+2LZdFmMaE1uVkzt9vK3RcjcjM3m7G2ITM3LdPGGpFoijBETEducpBUO12/P/xc21FWca7rcLye38f1eDjX9T7X9T7HN733/nw+16UQBEEAERERkQkwM3YCRERERIbCwoaIiIhMBgsbIiIiMhksbIiIiMhksLAhIiIik8HChoiIiEwGCxsiIiIyGSxsiIiIyGSwsCEiIiKTwcKGTNrRo0fx1ltvoXHjxqhRowZq1qyJNm3aYM6cObh+/bqk1z5y5Ai6dOkClUoFhUKBhQsXGvwaCoUC06dPN/h5K7N27VooFAooFAr88ssv5Y4LgoBmzZpBoVCga9euj3SNpUuXYu3atdV6zy+//PLQnIjo2WBh7ASIpLJy5UqEhYXBw8MDEyZMgKenJ0pLS3Ho0CEsW7YM+/fvR0JCgmTXf/vtt1FYWIj4+HjY29ujUaNGBr/G/v37Ub9+fYOft6pq1aqFVatWlSte9uzZgzNnzqBWrVqPfO6lS5fCyckJISEhVX5PmzZtsH//fnh6ej7ydYno6cbChkzS/v37MWrUKPTs2RNbtmyBUqkUj/Xs2RORkZFITEyUNIesrCyEhoaiT58+kl2jffv2kp27KgYNGoT169djyZIlsLOzE/evWrUKvr6+uHnzpix5lJaWQqFQwM7OzujfCREZF4eiyCRFR0dDoVBgxYoVekXNfVZWVggMDBRfl5WVYc6cOXj++eehVCrh7OyMoUOH4uLFi3rv69q1K7y8vJCWloaXX34ZNjY2aNKkCWbNmoWysjIAfw/T/PXXX4iLixOHbABg+vTp4p//6f57zp07J+5LTk5G165d4ejoCGtrazRo0ACvvfYa7ty5I8ZUNBSVlZWF/v37w97eHjVq1ECrVq3wxRdf6MXcH7L5+uuvMXnyZKjVatjZ2aFHjx44efJk1b5kAG+88QYA4Ouvvxb3abVabNq0CW+//XaF75kxYwZ8fHzg4OAAOzs7tGnTBqtWrcI/n8fbqFEjHDt2DHv27BG/v/sdr/u5r1u3DpGRkahXrx6USiVOnz5dbijq6tWrcHNzQ4cOHVBaWiqe//jx47C1tUVwcHCVPysRPR1Y2JDJ0el0SE5Ohre3N9zc3Kr0nlGjRmHSpEno2bMntm7dio8//hiJiYno0KEDrl69qher0WgwZMgQvPnmm9i6dSv69OmDqKgofPXVVwCAfv36Yf/+/QCA119/Hfv37xdfV9W5c+fQr18/WFlZYfXq1UhMTMSsWbNga2uLkpKSh77v5MmT6NChA44dO4ZFixZh8+bN8PT0REhICObMmVMu/sMPP8T58+fx+eefY8WKFfjjjz8QEBAAnU5XpTzt7Ozw+uuvY/Xq1eK+r7/+GmZmZhg0aNBDP9vIkSPx7bffYvPmzXj11VcxduxYfPzxx2JMQkICmjRpgtatW4vf34PDhlFRUcjNzcWyZcvwww8/wNnZudy1nJycEB8fj7S0NEyaNAkAcOfOHfz3v/9FgwYNsGzZsip9TiJ6ighEJkaj0QgAhMGDB1cpPjs7WwAghIWF6e0/cOCAAED48MMPxX1dunQRAAgHDhzQi/X09BR69eqltw+AMHr0aL1906ZNEyr6sVuzZo0AQMjJyREEQRA2btwoABAyMjL+NXcAwrRp08TXgwcPFpRKpZCbm6sX16dPH8HGxka4ceOGIAiC8PPPPwsAhL59++rFffvttwIAYf/+/f963fv5pqWliefKysoSBEEQXnrpJSEkJEQQBEFo0aKF0KVLl4ee
R6fTCaWlpcLMmTMFR0dHoaysTDz2sPfev17nzp0feuznn3/W2z979mwBgJCQkCAMGzZMsLa2Fo4ePfqvn5GInk7s2NAz7+effwaAcpNU27Vrh+bNm2P37t16+11dXdGuXTu9fS+88ALOnz9vsJxatWoFKysrjBgxAl988QXOnj1bpfclJyeje/fu5TpVISEhuHPnTrnO0T+H44B7nwNAtT5Lly5d0LRpU6xevRqZmZlIS0t76DDU/Rx79OgBlUoFc3NzWFpaYurUqbh27Rry8/OrfN3XXnutyrETJkxAv3798MYbb+CLL77A4sWL0bJlyyq/n4ieHixsyOQ4OTnBxsYGOTk5VYq/du0aAKBu3brljqnVavH4fY6OjuXilEolioqKHiHbijVt2hQ//fQTnJ2dMXr0aDRt2hRNmzbFZ5999q/vu3bt2kM/x/3j//TgZ7k/H6k6n0WhUOCtt97CV199hWXLluG5557Dyy+/XGHswYMH4efnB+DeqrXffvsNaWlpmDx5crWvW9Hn/LccQ0JCcPfuXbi6unJuDZEJY2FDJsfc3Bzdu3dHenp6ucm/Fbn/yz0vL6/csUuXLsHJyclgudWoUQMAUFxcrLf/wXk8APDyyy/jhx9+gFarRWpqKnx9fREREYH4+PiHnt/R0fGhnwOAQT/LP4WEhODq1atYtmwZ3nrrrYfGxcfHw9LSEtu2bcPAgQPRoUMHtG3b9pGuWdEk7IfJy8vD6NGj0apVK1y7dg3jx49/pGsS0ZOPhQ2ZpKioKAiCgNDQ0Aon25aWluKHH34AALzyyisAIE7+vS8tLQ3Z2dno3r27wfK6v7Ln6NGjevvv51IRc3Nz+Pj4YMmSJQCAw4cPPzS2e/fuSE5OFguZ+7788kvY2NhIthS6Xr16mDBhAgICAjBs2LCHxikUClhYWMDc3FzcV1RUhHXr1pWLNVQXTKfT4Y033oBCocDOnTsRExODxYsXY/PmzY99biJ68vA+NmSSfH19ERcXh7CwMHh7e2PUqFFo0aIFSktLceTIEaxYsQJeXl4ICAiAh4cHRowYgcWLF8PMzAx9+vTBuXPnMGXKFLi5ueH99983WF59+/aFg4MDhg8fjpkzZ8LCwgJr167FhQsX9OKWLVuG5ORk9OvXDw0aNMDdu3fFlUc9evR46PmnTZuGbdu2oVu3bpg6dSocHBywfv16bN++HXPmzIFKpTLYZ3nQrFmzKo3p168f5s+fj6CgIIwYMQLXrl3DvHnzKlyS37JlS8THx+Obb75BkyZNUKNGjUeaFzNt2jT8+uuvSEpKgqurKyIjI7Fnzx4MHz4crVu3RuPGjat9TiJ6crGwIZMVGhqKdu3aYcGCBZg9ezY0Gg0sLS3x3HPPISgoCGPGjBFj4+Li0LRpU6xatQpLliyBSqVC7969ERMTU+GcmkdlZ2eHxMRERERE4M0330Tt2rXxzjvvoE+fPnjnnXfEuFatWiEpKQnTpk2DRqNBzZo14eXlha1bt4pzVCri4eGBffv24cMPP8To0aNRVFSE5s2bY82aNdW6g69UXnnlFaxevRqzZ89GQEAA6tWrh9DQUDg7O2P48OF6sTNmzEBeXh5CQ0Nx69YtNGzYUO8+P1Wxa9cuxMTEYMqUKXqdt7Vr16J169YYNGgQUlJSYGVlZYiPR0RPAIUg/OOuWERERERPMc6xISIiIpPBwoaIiIhMBgsbIiIiMhksbIiIiMhksLAhIiIik8HChoiIiEwGCxsiIiIyGSZ5gz7r1mMqDyJ6BhWkxRo7BaInTg0ZfhMa6vdS0RH+DFeGHRsiIiIyGSbZsSEiInqiKNhHkAsLGyIiIqkpFMbO4JnBwoaIiEhq7NjIht80ERERmQx2bIiIiKTGoSjZsLAhIiKSGoeiZMNvmoiIiEwGOzZERERS41CUbFjYEBERSY1DUbLhN01EREQmgx0bIiIiqXEoSjYsbIiIiKTGoSjZ8JsmIiIik8GODRERkdQ4FCUbFjZE
RERS41CUbFjYEBERSY0dG9mwhCQiIiKTwcKGiIhIagozw2zVtHfvXgQEBECtVkOhUGDLli3isdLSUkyaNAktW7aEra0t1Go
      "text/plain": [
       "<Figure size 640x480 with 2 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Draw the confusion matrix as an annotated heatmap on an explicit Axes\n",
    "class_names = ['Class 0', 'Class 1']\n",
    "fig, ax = plt.subplots()\n",
    "sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names, ax=ax)\n",
    "ax.set_xlabel('Predicted')\n",
    "ax.set_ylabel('Actual')\n",
    "ax.set_title('Confusion Matrix')\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
Ajout fichier .py pour nettoyage et fusions 2024-02-11 23:55:11 +01:00			`{`
			`"cells": [`
			`{`
			`"cell_type": "markdown",`
			`"id": "ac01a6ea-bef6-4ace-89ff-1dc03a4215c2",`
			`"metadata": {},`
			`"source": [`
			`"# Segmentation des clients par régression logistique"`
			`]`
Ajout régression logistique 2024-02-12 23:49:13 +01:00			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 1,`
			`"id": "bca785be-39f7-4583-9bd8-67c1134ae275",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"import pandas as pd\n",`
			`"import numpy as np\n",`
			`"import os\n",`
			`"import s3fs\n",`
			`"import re\n",`
			`"from sklearn.linear_model import LogisticRegression\n",`
			`"from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n",`
			`"from sklearn.preprocessing import StandardScaler\n",`
			`"import seaborn as sns\n",`
			`"import matplotlib.pyplot as plt"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 2,`
			`"id": "59ce5096-4e2c-45c1-be78-43e14db4142c",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"# # modification des variables categorielles\n",`
			`" \n",`
			`"# ### variable gender\n",`
			`"# df1_customer_product[\"gender_label\"] = df1_customer_product[\"gender\"].map({\n",`
			`"# 0: 'female',\n",`
			`"# 1: 'male',\n",`
			`"# 2: 'other'\n",`
			`"# })\n",`
			`" \n",`
			`"# ### variable country -> on indique si le pays est france\n",`
			`"# df1_customer_product[\"country_fr\"] = df1_customer_product[\"country\"].apply(lambda x : int(x==\"fr\") if pd.notna(x) else np.nan)\n",`
			`"\n",`
			`"# # Création des indicatrices de gender\n",`
			`"# gender_dummies = pd.get_dummies(df1_customer_product[\"gender_label\"], prefix='gender').astype(int)\n",`
			`" \n",`
			`"# # Concaténation des indicatrices avec le dataframe d'origine\n",`
			`"# df1_customer_product = pd.concat([df1_customer_product, gender_dummies], axis=1)"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 3,`
			`"id": "3bf57816-b023-4e84-9450-095620bddebc",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"# Create filesystem object\n",`
			`"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",`
			`"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 4,`
			`"id": "27002f2f-a78a-414c-8e4f-b15bf6dd9e40",`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"name": "stderr",`
			`"output_type": "stream",`
			`"text": [`
Event type 5 2024-02-13 00:00:09 +01:00			`"/tmp/ipykernel_7740/1677066092.py:7: DtypeWarning: Columns (21,39) have mixed types. Specify dtype option on import or set low_memory=False.\n",`
Ajout régression logistique 2024-02-12 23:49:13 +01:00			`" dataset_train = pd.read_csv(file_in, sep=\",\")\n",`
Event type 5 2024-02-13 00:00:09 +01:00			`"/tmp/ipykernel_7740/1677066092.py:12: DtypeWarning: Columns (21,39) have mixed types. Specify dtype option on import or set low_memory=False.\n",`
Ajout régression logistique 2024-02-12 23:49:13 +01:00			`" dataset_test = pd.read_csv(file_in, sep=\",\")\n"`
			`]`
			`}`
			`],`
			`"source": [`
			`"# Importation des données\n",`
			`"BUCKET = \"projet-bdc2324-team1/1_Output/Logistique Regression databases - First approach\"\n",`
			`"\n",`
			`"FILE_PATH_S3 = BUCKET + \"/\" + \"dataset_train.csv\"\n",`
			`"\n",`
			`"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",`
			`" dataset_train = pd.read_csv(file_in, sep=\",\")\n",`
			`"\n",`
			`"FILE_PATH_S3 = BUCKET + \"/\" + \"dataset_test.csv\"\n",`
			`"\n",`
			`"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",`
			`" dataset_test = pd.read_csv(file_in, sep=\",\")\n"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 5,`
			`"id": "c3928b55-8821-46da-b3b5-a036efd6d2cf",`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"data": {`
			`"text/html": [`
			`"<div>\n",`
			`"<style scoped>\n",`
			`" .dataframe tbody tr th:only-of-type {\n",`
			`" vertical-align: middle;\n",`
			`" }\n",`
			`"\n",`
			`" .dataframe tbody tr th {\n",`
			`" vertical-align: top;\n",`
			`" }\n",`
			`"\n",`
			`" .dataframe thead th {\n",`
			`" text-align: right;\n",`
			`" }\n",`
			`"</style>\n",`
			`"<table border=\"1\" class=\"dataframe\">\n",`
			`" <thead>\n",`
			`" <tr style=\"text-align: right;\">\n",`
			`" <th></th>\n",`
			`" <th>event_type_id</th>\n",`
			`" <th>name_event_types</th>\n",`
			`" </tr>\n",`
			`" </thead>\n",`
			`" <tbody>\n",`
			`" <tr>\n",`
			`" <th>0</th>\n",`
			`" <td>2.0</td>\n",`
			`" <td>offre muséale individuel</td>\n",`
			`" </tr>\n",`
			`" <tr>\n",`
			`" <th>1</th>\n",`
			`" <td>4.0</td>\n",`
			`" <td>spectacle vivant</td>\n",`
			`" </tr>\n",`
			`" <tr>\n",`
			`" <th>2</th>\n",`
			`" <td>5.0</td>\n",`
			`" <td>offre muséale groupe</td>\n",`
			`" </tr>\n",`
			`" <tr>\n",`
			`" <th>3</th>\n",`
			`" <td>NaN</td>\n",`
			`" <td>NaN</td>\n",`
			`" </tr>\n",`
			`" </tbody>\n",`
			`"</table>\n",`
			`"</div>"`
			`],`
			`"text/plain": [`
			`" event_type_id name_event_types\n",`
			`"0 2.0 offre muséale individuel\n",`
			`"1 4.0 spectacle vivant\n",`
			`"2 5.0 offre muséale groupe\n",`
			`"3 NaN NaN"`
			`]`
			`},`
			`"execution_count": 5,`
			`"metadata": {},`
			`"output_type": "execute_result"`
			`}`
			`],`
			`"source": [`
			`"dataset_train[['event_type_id', 'name_event_types']].drop_duplicates()"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": 6,`
			`"id": "7e8a9d4d-7e55-4173-a7f4-8b8baa9610d2",`
			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"#Choose type of event \n",`
			`"type_event_choosed = 5\n",`
			`"\n",`
			`"dataset_test = dataset_test[(dataset_test['event_type_id'] == type_event_choosed) \| np.isnan(dataset_test['event_type_id'])]\n",`
			`"dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)\n",`
			`"dataset_train = dataset_train[(dataset_train['event_type_id'] == type_event_choosed) \| np.isnan(dataset_train['event_type_id'])]\n",`
			`"dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
Event type 5 2024-02-13 00:00:09 +01:00			`"execution_count": 7,`
Ajout régression logistique 2024-02-12 23:49:13 +01:00			`"id": "e20ced8f-df1c-43bb-8d15-79f414c8225c",`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"data": {`
			`"text/plain": [`
			`"customer_id 0.000000\n",`
Event type 5 2024-02-13 00:00:09 +01:00			`"event_type_id 0.950522\n",`
Ajout régression logistique 2024-02-12 23:49:13 +01:00			`"nb_tickets 0.000000\n",`
			`"nb_purchases 0.000000\n",`
			`"total_amount 0.000000\n",`
			`"nb_suppliers 0.000000\n",`
			`"vente_internet_max 0.000000\n",`
Event type 5 2024-02-13 00:00:09 +01:00			`"purchase_date_min 0.950522\n",`
			`"purchase_date_max 0.950522\n",`
			`"time_between_purchase 0.950522\n",`
Ajout régression logistique 2024-02-12 23:49:13 +01:00			`"nb_tickets_internet 0.000000\n",`
Event type 5 2024-02-13 00:00:09 +01:00			`"name_event_types 0.950522\n",`
			`"avg_amount 0.950522\n",`
			`"birthdate 0.961918\n",`
Ajout régression logistique 2024-02-12 23:49:13 +01:00			`"street_id 0.000000\n",`
			`"is_partner 0.000000\n",`
			`"gender 0.000000\n",`
			`"is_email_true 0.000000\n",`
			`"opt_in 0.000000\n",`
Event type 5 2024-02-13 00:00:09 +01:00			`"structure_id 0.863048\n",`
			`"profession 0.952160\n",`
			`"language 0.991778\n",`
			`"mcp_contact_id 0.297275\n",`
			`"last_buying_date 0.611718\n",`
			`"max_price 0.611718\n",`
Ajout régression logistique 2024-02-12 23:49:13 +01:00			`"ticket_sum 0.000000\n",`
Event type 5 2024-02-13 00:00:09 +01:00			`"average_price 0.102225\n",`
Ajout régression logistique 2024-02-12 23:49:13 +01:00			`"fidelity 0.000000\n",`
Event type 5 2024-02-13 00:00:09 +01:00			`"average_purchase_delay 0.611718\n",`
			`"average_price_basket 0.611718\n",`
			`"average_ticket_basket 0.611718\n",`
			`"total_price 0.509493\n",`
Ajout régression logistique 2024-02-12 23:49:13 +01:00			`"purchase_count 0.000000\n",`
Event type 5 2024-02-13 00:00:09 +01:00			`"first_buying_date 0.611718\n",`
			`"country 0.063488\n",`
			`"age 0.961918\n",`
Ajout régression logistique 2024-02-12 23:49:13 +01:00			`"tenant_id 0.000000\n",`
			`"nb_campaigns 0.000000\n",`
			`"nb_campaigns_opened 0.000000\n",`
Event type 5 2024-02-13 00:00:09 +01:00			`"time_to_open 0.543355\n",`
Ajout régression logistique 2024-02-12 23:49:13 +01:00			`"y_has_purchased 0.000000\n",`
			`"dtype: float64"`
			`]`
			`},`
Event type 5 2024-02-13 00:00:09 +01:00			`"execution_count": 7,`
Ajout régression logistique 2024-02-12 23:49:13 +01:00			`"metadata": {},`
			`"output_type": "execute_result"`
			`}`
			`],`
			`"source": [`
			`"dataset_train.isna().sum()/len(dataset_train)"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
Event type 5 2024-02-13 00:00:09 +01:00			`"execution_count": 8,`
Ajout régression logistique 2024-02-12 23:49:13 +01:00			`"id": "34bae3f7-d579-4f80-a38d-a83eb5ea8a7b",`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"name": "stdout",`
			`"output_type": "stream",`
			`"text": [`
Event type 5 2024-02-13 00:00:09 +01:00			`"Accuracy: 0.9985491193310349\n",`
Ajout régression logistique 2024-02-12 23:49:13 +01:00			`"Confusion Matrix:\n",`
Event type 5 2024-02-13 00:00:09 +01:00			`" [[127988 49]\n",`
			`" [ 137 24]]\n",`
Ajout régression logistique 2024-02-12 23:49:13 +01:00			`"Classification Report:\n",`
			`" precision recall f1-score support\n",`
			`"\n",`
Event type 5 2024-02-13 00:00:09 +01:00			`" 0.0 1.00 1.00 1.00 128037\n",`
			`" 1.0 0.33 0.15 0.21 161\n",`
Ajout régression logistique 2024-02-12 23:49:13 +01:00			`"\n",`
Event type 5 2024-02-13 00:00:09 +01:00			`" accuracy 1.00 128198\n",`
			`" macro avg 0.66 0.57 0.60 128198\n",`
			`"weighted avg 1.00 1.00 1.00 128198\n",`
Ajout régression logistique 2024-02-12 23:49:13 +01:00			`"\n"`
			`]`
			`}`
			`],`
			`"source": [`
			`"\n",`
			`"reg_columns = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet', 'opt_in', 'fidelity', 'nb_campaigns', 'nb_campaigns_opened']\n",`
			`"\n",`
			`"X_train = dataset_train[reg_columns]\n",`
			`"y_train = dataset_train['y_has_purchased']\n",`
			`"X_test = dataset_test[reg_columns]\n",`
			`"y_test = dataset_test['y_has_purchased']\n",`
			`"\n",`
			`"# Fit and transform the scaler on the training data\n",`
			`"scaler = StandardScaler()\n",`
			`"\n",`
			`"# Transform the test data using the same scaler\n",`
			`"X_train_scaled = scaler.fit_transform(X_train)\n",`
			`"X_test_scaled = scaler.fit_transform(X_test)\n",`
			`"\n",`
			`"# Create and fit the linear regression model\n",`
			`"logit_model = LogisticRegression(penalty='l1', solver='liblinear', C=1.0)\n",`
			`"logit_model.fit(X_train_scaled, y_train)\n",`
			`"\n",`
			`"y_pred = logit_model.predict(X_test_scaled)\n",`
			`"\n",`
			`"#Evaluation du modèle \n",`
			`"accuracy = accuracy_score(y_test, y_pred)\n",`
			`"conf_matrix = confusion_matrix(y_test, y_pred)\n",`
			`"class_report = classification_report(y_test, y_pred)\n",`
			`"\n",`
			`"print(\"Accuracy:\", accuracy)\n",`
			`"print(\"Confusion Matrix:\\n\", conf_matrix)\n",`
			`"print(\"Classification Report:\\n\", class_report)"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
Event type 5 2024-02-13 00:00:09 +01:00			`"execution_count": 9,`
Ajout régression logistique 2024-02-12 23:49:13 +01:00			`"id": "ccc78c36-3287-46e6-89ac-7494c1a7106a",`
			`"metadata": {},`
			`"outputs": [`
			`{`
			`"data": {`
Event type 5 2024-02-13 00:00:09 +01:00			"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjYAAAHFCAYAAADhWLMfAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8g+/7EAAAACXBIWXMAAA9hAAAPYQGoP6dpAABUp0lEQVR4nO3dfVyN9/8H8Nfp7qjo6EblkPvWJBsyCXMz5K7iu+2LZdFmMaE1uVkzt9vK3RcjcjM3m7G2ITM3LdPGGpFoijBETEducpBUO12/P/xc21FWca7rcLye38f1eDjX9T7X9T7HN733/nw+16UQBEEAERERkQkwM3YCRERERIbCwoaIiIhMBgsbIiIiMhksbIiIiMhksLAhIiIik8HChoiIiEwGCxsiIiIyGSxsiIiIyGSwsCEiIiKTwcKGTNrRo0fx1ltvoXHjxqhRowZq1qyJNm3aYM6cObh+/bqk1z5y5Ai6dOkClUoFhUKBhQsXGvwaCoUC06dPN/h5K7N27VooFAooFAr88ssv5Y4LgoBmzZpBoVCga9euj3SNpUuXYu3atdV6zy+//PLQnIjo2WBh7ASIpLJy5UqEhYXBw8MDEyZMgKenJ0pLS3Ho0CEsW7YM+/fvR0JCgmTXf/vtt1FYWIj4+HjY29ujUaNGBr/G/v37Ub9+fYOft6pq1aqFVatWlSte9uzZgzNnzqBWrVqPfO6lS5fCyckJISEhVX5PmzZtsH//fnh6ej7ydYno6cbChkzS/v37MWrUKPTs2RNbtmyBUqkUj/Xs2RORkZFITEyUNIesrCyEhoaiT58+kl2jffv2kp27KgYNGoT169djyZIlsLOzE/evWrUKvr6+uHnzpix5lJaWQqFQwM7OzujfCREZF4eiyCRFR0dDoVBgxYoVekXNfVZWVggMDBRfl5WVYc6cOXj++eehVCrh7OyMoUOH4uLFi3rv69q1K7y8vJCWloaXX34ZNjY2aNKkCWbNmoWysjIAfw/T/PXXX4iLixOHbABg+vTp4p//6f57zp07J+5LTk5G165d4ejoCGtrazRo0ACvvfYa7ty5I8ZUNBSVlZWF/v37w97eHjVq1ECrVq3wxRdf6MXcH7L5+uuvMXnyZKjVatjZ2aFHjx44efJk1b5kAG+88QYA4Ouvvxb3abVabNq0CW+//XaF75kxYwZ8fHzg4OAAOzs7tGnTBqtWrcI/n8fbqFEjHDt2DHv27BG/v/sdr/u5r1u3DpGRkahXrx6USiVOnz5dbijq6tWrcHNzQ4cOHVBaWiqe//jx47C1tUVwcHCVPysRPR1Y2JDJ0el0SE5Ohre3N9zc3Kr0nlGjRmHSpEno2bMntm7dio8//hiJiYno0KEDrl69qher0WgwZMgQvPnmm9i6dSv69OmDqKgofPXVVwCAfv36Yf/+/QCA119/Hfv37xdfV9W5c+fQr18/WFlZYfXq1UhMTMSsWbNga2uLkpKSh77v5MmT6NChA44dO4ZFixZh8+bN8PT0REhICObMmVMu/sMPP8T58+fx+eefY8WKFfjjjz8QEBAAnU5XpTzt7Ozw+uuvY/Xq1eK+r7/+GmZmZhg0aNBDP9vIkSPx7bffYvPmzXj11VcxduxYfPzxx2JMQkICmjRpgtatW4vf34PDhlFRUcjNzcWyZcvwww8/wNnZudy1nJycEB8fj7S0NEyaNAkAcOfOHfz3v/9FgwYNsGzZsip9TiJ6ighEJkaj0QgAhMGDB1cpPjs7WwAghIWF6e0/cOCAAED48MMPxX1dunQRAAgHDhzQi/X09BR69eqltw+AMHr0aL1906ZNEyr6sVuzZo0AQMjJyREEQRA2btwoABAyMjL+NXcAwrRp08TXgwcPFpRKpZCbm6sX16dPH8HGxka4ceOGIAiC8PPPPwsAhL59++rFffvttwIAYf/+/f963fv5pqWliefK
ysoSBEEQXnrpJSEkJEQQBEFo0aKF0KVLl4eeR6fTCaWlpcLMmTMFR0dHoaysTDz2sPfev17nzp0feuznn3/W2z979mwBgJCQkCAMGzZMsLa2Fo4ePfqvn5GInk7s2NAz7+effwaAcpNU27Vrh+bNm2P37t16+11dXdGuXTu9fS+88ALOnz9vsJxatWoFKysrjBgxAl988QXOnj1bpfclJyeje/fu5TpVISEhuHPnTrnO0T+H44B7nwNAtT5Lly5d0LRpU6xevRqZmZlIS0t76DDU/Rx79OgBlUoFc3NzWFpaYurUqbh27Rry8/OrfN3XXnutyrETJkxAv3798MYbb+CLL77A4sWL0bJlyyq/n4ieHixsyOQ4OTnBxsYGOTk5VYq/du0aAKBu3brljqnVavH4fY6OjuXilEolioqKHiHbijVt2hQ//fQTnJ2dMXr0aDRt2hRNmzbFZ5999q/vu3bt2kM/x/3j//TgZ7k/H6k6n0WhUOCtt97CV199hWXLluG5557Dyy+/XGHswYMH4efnB+DeqrXffvsNaWlpmDx5crWvW9Hn/LccQ0JCcPfuXbi6unJuDZEJY2FDJsfc3Bzdu3dHenp6ucm/Fbn/yz0vL6/csUuXLsHJyclgudWoUQMAUFxcrLf/wXk8APDyyy/jhx9+gFarRWpqKnx9fREREYH4+PiHnt/R0fGhnwOAQT/LP4WEhODq1atYtmwZ3nrrrYfGxcfHw9LSEtu2bcPAgQPRoUMHtG3b9pGuWdEk7IfJy8vD6NGj0apVK1y7dg3jx49/pGsS0ZOPhQ2ZpKioKAiCgNDQ0Aon25aWluKHH34AALzyyisAIE7+vS8tLQ3Z2dno3r27wfK6v7Ln6NGjevvv51IRc3Nz+Pj4YMmSJQCAw4cPPzS2e/fuSE5OFguZ+7788kvY2NhIthS6Xr16mDBhAgICAjBs2LCHxikUClhYWMDc3FzcV1RUhHXr1pWLNVQXTKfT4Y033oBCocDOnTsRExODxYsXY/PmzY99biJ68vA+NmSSfH19ERcXh7CwMHh7e2PUqFFo0aIFSktLceTIEaxYsQJeXl4ICAiAh4cHRowYgcWLF8PMzAx9+vTBuXPnMGXKFLi5ueH99983WF59+/aFg4MDhg8fjpkzZ8LCwgJr167FhQsX9OKWLVuG5ORk9OvXDw0aNMDdu3fFlUc9evR46PmnTZuGbdu2oVu3bpg6dSocHBywfv16bN++HXPmzIFKpTLYZ3nQrFmzKo3p168f5s+fj6CgIIwYMQLXrl3DvHnzKlyS37JlS8THx+Obb75BkyZNUKNGjUeaFzNt2jT8+uuvSEpKgqurKyIjI7Fnzx4MHz4crVu3RuPGjat9TiJ6crGwIZMVGhqKdu3aYcGCBZg9ezY0Gg0sLS3x3HPPISgoCGPGjBFj4+Li0LRpU6xatQpLliyBSqVC7969ERMTU+GcmkdlZ2eHxMRERERE4M0330Tt2rXxzjvvoE+fPnjnnXfEuFatWiEpKQnTpk2DRqNBzZo14eXlha1bt4pzVCri4eGBffv24cMPP8To0aNRVFSE5s2bY82aNdW6g69UXnnlFaxevRqzZ89GQEAA6tWrh9DQUDg7O2P48OF6sTNmzEBeXh5CQ0Nx69YtNGzYUO8+P1Wxa9cuxMTEYMqUKXqdt7Vr16J169YYNGgQUlJSYGVlZYiPR0RPAIUg/OOuWERERERPMc6xISIiIpPBwoaIiIhMBgsbIiIiMhksbIiIiMhksLAhIiIik8HChoiIiEwGCxsiIiIyGSZ5gz7r1mMqDyJ6BhWkxRo7BaInTg0ZfhMa6vdS0RH+DFeGHRsiIiIyGSbZsSEiInqiKNhHkAsLGyIiIqkpFMbO4JnBwoaIiEhq7NjIht80ERERmQx2bIiIiKTGoSjZsLAhIiKSGoeiZMNvmoiIiEwGOzZERERS41CUbFjYEBERSY1DUbLhN01EREQmgx0bIiIiqXEoSjYsbIiI
iKTGoSjZ8JsmIiIik8GODRERkdQ4FCUbFjZERERS41CUbFjYEBERSY0dG9mwhCQiIiKTwcKGiIhIagozw2zVtHfvXgQEBECtVkOhUGDLli3isdLSUkyaNAktW7aEra0t1Go
Ajout régression logistique 2024-02-12 23:49:13 +01:00			`"text/plain": [`
			`"<Figure size 640x480 with 2 Axes>"`
			`]`
			`},`
			`"metadata": {},`
			`"output_type": "display_data"`
			`}`
			`],`
			`"source": [`
			`"sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])\n",`
			`"plt.xlabel('Predicted')\n",`
			`"plt.ylabel('Actual')\n",`
			`"plt.title('Confusion Matrix')\n",`
			`"plt.show()"`
			`]`
Ajout fichier .py pour nettoyage et fusions 2024-02-11 23:55:11 +01:00			`}`
			`],`
			`"metadata": {`
			`"kernelspec": {`
			`"display_name": "Python 3 (ipykernel)",`
			`"language": "python",`
			`"name": "python3"`
			`},`
			`"language_info": {`
			`"codemirror_mode": {`
			`"name": "ipython",`
			`"version": 3`
			`},`
			`"file_extension": ".py",`
			`"mimetype": "text/x-python",`
			`"name": "python",`
			`"nbconvert_exporter": "python",`
			`"pygments_lexer": "ipython3",`
Ajout régression logistique 2024-02-12 23:49:13 +01:00			`"version": "3.11.6"`
Ajout fichier .py pour nettoyage et fusions 2024-02-11 23:55:11 +01:00			`}`
			`},`
			`"nbformat": 4,`
			`"nbformat_minor": 5`
			`}`