{ "cells": [ { "cell_type": "markdown", "id": "ac01a6ea-bef6-4ace-89ff-1dc03a4215c2", "metadata": {}, "source": [ "# Segmentation des clients par régression logistique" ] }, { "cell_type": "code", "execution_count": 1, "id": "bca785be-39f7-4583-9bd8-67c1134ae275", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import os\n", "import s3fs\n", "import re\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n", "from sklearn.preprocessing import StandardScaler\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": 2, "id": "59ce5096-4e2c-45c1-be78-43e14db4142c", "metadata": {}, "outputs": [], "source": [ "# NOTE(review): this whole cell is commented out and `df1_customer_product` is not\n", "# defined in the cells above - presumably leftover preprocessing; confirm before re-enabling.\n", "\n", "# # recoding of the categorical variables\n", " \n", "# ### gender variable\n", "# df1_customer_product[\"gender_label\"] = df1_customer_product[\"gender\"].map({\n", "# 0: 'female',\n", "# 1: 'male',\n", "# 2: 'other'\n", "# })\n", " \n", "# ### country variable -> flag whether the country is France\n", "# df1_customer_product[\"country_fr\"] = df1_customer_product[\"country\"].apply(lambda x : int(x==\"fr\") if pd.notna(x) else np.nan)\n", "\n", "# # Create the gender dummy variables\n", "# gender_dummies = pd.get_dummies(df1_customer_product[\"gender_label\"], prefix='gender').astype(int)\n", " \n", "# # Concatenate the dummy variables with the original dataframe\n", "# df1_customer_product = pd.concat([df1_customer_product, gender_dummies], axis=1)" ] }, { "cell_type": "code", "execution_count": 3, "id": "3bf57816-b023-4e84-9450-095620bddebc", "metadata": {}, "outputs": [], "source": [ "# Create the S3 filesystem object; the endpoint host is read from the\n", "# AWS_S3_ENDPOINT environment variable (KeyError if it is not set)\n", "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})" ] }, { "cell_type": "code", "execution_count": 4, "id": "27002f2f-a78a-414c-8e4f-b15bf6dd9e40", "metadata": {}, 
"outputs": [], "source": [ "# Load the train and test datasets from S3\n", "BUCKET = \"projet-bdc2324-team1/1_Output/Logistique Regression databases - First approach\"\n", "\n", "\n", "def load_dataset(file_name):\n", "    \"\"\"Read the CSV file `file_name` stored under BUCKET from S3 into a DataFrame.\"\"\"\n", "    with fs.open(BUCKET + \"/\" + file_name, mode=\"rb\") as file_in:\n", "        # low_memory=False is what pandas' DtypeWarning recommends for columns with\n", "        # mixed types: the file is no longer parsed in chunks with per-chunk inference.\n", "        return pd.read_csv(file_in, sep=\",\", low_memory=False)\n", "\n", "\n", "dataset_train = load_dataset(\"dataset_train.csv\")\n", "dataset_test = load_dataset(\"dataset_test.csv\")\n" ] }, { "cell_type": "code", "execution_count": 5, "id": "c3928b55-8821-46da-b3b5-a036efd6d2cf", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | event_type_id | \n", "name_event_types | \n", "
---|---|---|
0 | \n", "2.0 | \n", "offre muséale individuel | \n", "
1 | \n", "4.0 | \n", "spectacle vivant | \n", "
2 | \n", "5.0 | \n", "offre muséale groupe | \n", "
3 | \n", "NaN | \n", "NaN | \n", "