{ "cells": [ { "cell_type": "markdown", "id": "84b6e27e-4bda-4d38-8689-ec7fc0da1848", "metadata": {}, "source": [ "# Define segment and predict sales associated" ] }, { "cell_type": "markdown", "id": "ec059482-45d3-4ae6-99bc-9b4ced115db3", "metadata": {}, "source": [ "## Importations of packages " ] }, { "cell_type": "code", "execution_count": 1, "id": "9771bf29-d08e-4674-8c23-9a2672fbef8f", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from pandas import DataFrame\n", "import numpy as np\n", "import os\n", "import s3fs\n", "import re\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score\n", "from sklearn.utils import class_weight\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.compose import ColumnTransformer\n", "from sklearn.preprocessing import OneHotEncoder\n", "from sklearn.impute import SimpleImputer\n", "from sklearn.model_selection import GridSearchCV\n", "from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler\n", "from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score\n", "from sklearn.exceptions import ConvergenceWarning, DataConversionWarning\n", "from sklearn.naive_bayes import GaussianNB\n", "from scipy.optimize import fsolve\n", "import io\n", "\n", "import pickle\n", "import warnings" ] }, { "cell_type": "markdown", "id": "048fcd7c-800a-4a6b-b725-faf8410f924a", "metadata": {}, "source": [ "## load databases" ] }, { "cell_type": "code", "execution_count": 2, "id": "539ccbdf-f29f-4f04-99c1-8c88d0efe514", "metadata": {}, "outputs": [], "source": [ "# Create filesystem object\n", "S3_ENDPOINT_URL = 
\"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})" ] }, { "cell_type": "code", "execution_count": 75, "id": "d6017ed0-6233-4888-85a7-05dec50a255b", "metadata": {}, "outputs": [], "source": [ "type_of_activity = \"musique\"" ] }, { "cell_type": "code", "execution_count": 4, "id": "0c3a6ddc-9345-4a42-b6bf-a20a95de3028", "metadata": {}, "outputs": [], "source": [ "def load_train_test(type_of_activity):\n", " # BUCKET = f\"projet-bdc2324-team1/Generalization/{type_of_activity}\"\n", " BUCKET = f\"projet-bdc2324-team1/Generalization_v2/{type_of_activity}\"\n", " File_path_train = BUCKET + \"/Train_set.csv\"\n", " File_path_test = BUCKET + \"/Test_set.csv\"\n", " \n", " with fs.open( File_path_train, mode=\"rb\") as file_in:\n", " dataset_train = pd.read_csv(file_in, sep=\",\")\n", " # dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)\n", "\n", " with fs.open(File_path_test, mode=\"rb\") as file_in:\n", " dataset_test = pd.read_csv(file_in, sep=\",\")\n", " # dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)\n", " \n", " return dataset_train, dataset_test" ] }, { "cell_type": "code", "execution_count": 76, "id": "2831d546-b365-498b-8248-c618bd9c3057", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_552/3983721681.py:8: DtypeWarning: Columns (10,19,20,21,24) have mixed types. Specify dtype option on import or set low_memory=False.\n", " dataset_train = pd.read_csv(file_in, sep=\",\")\n", "/tmp/ipykernel_552/3983721681.py:12: DtypeWarning: Columns (19,20,21,24) have mixed types. Specify dtype option on import or set low_memory=False.\n", " dataset_test = pd.read_csv(file_in, sep=\",\")\n" ] }, { "data": { "text/plain": [ "customer_id 0\n", "street_id 0\n", "structure_id 327020\n", "mcp_contact_id 135470\n", "fidelity 0\n", " ... 
\n", "purchases_8_2021 113963\n", "purchases_8_2022 0\n", "purchases_9_2021 113963\n", "purchases_9_2022 0\n", "y_has_purchased 0\n", "Length: 87, dtype: int64" ] }, "execution_count": 76, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset_train, dataset_test = load_train_test(type_of_activity)\n", "dataset_train.isna().sum()" ] }, { "cell_type": "code", "execution_count": 17, "id": "b8827f7b-b304-4f51-9814-c7a98ed88cf0", "metadata": {}, "outputs": [], "source": [ "def features_target_split(dataset_train, dataset_test):\n", " \n", " features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'purchase_date_min', 'purchase_date_max', \n", " 'time_between_purchase', 'fidelity', 'is_email_true', 'opt_in', #'is_partner', 'nb_tickets_internet',, 'vente_internet_max'\n", " 'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']\n", "\n", " # we suppress fidelity, time between purchase, and gender other (colinearity issue)\n", " \"\"\"\n", " features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', \n", " 'purchase_date_min', 'purchase_date_max', 'nb_tickets_internet', 'is_email_true', \n", " 'opt_in', 'gender_female', 'gender_male', 'nb_campaigns', 'nb_campaigns_opened']\n", " \"\"\"\n", " \n", " X_train = dataset_train # [features_l]\n", " y_train = dataset_train[['y_has_purchased']]\n", "\n", " X_test = dataset_test # [features_l]\n", " y_test = dataset_test[['y_has_purchased']]\n", " return X_train, X_test, y_train, y_test" ] }, { "cell_type": "code", "execution_count": 77, "id": "c18195fc-ed40-4e39-a59e-c9ecc5a8e6c3", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Shape train : (354365, 87)\n", "Shape test : (151874, 87)\n" ] } ], "source": [ "X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)\n", "print(\"Shape train : \", X_train.shape)\n", "print(\"Shape test : \", X_test.shape)" ] }, 
{ "cell_type": "markdown", "id": "74eda066-5e01-43aa-b0cf-cc6d9bbf770e", "metadata": {}, "source": [ "## get results from the logit cross validated model" ] }, { "cell_type": "code", "execution_count": 78, "id": "7c81390e-598c-4f02-bd56-dd03b00dcb33", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
customer_idstreet_idstructure_idmcp_contact_idfidelitytenant_idis_partnerdeleted_atis_email_trueopt_in...purchases_5_2022purchases_6_2021purchases_6_2022purchases_7_2021purchases_7_2022purchases_8_2021purchases_8_2022purchases_9_2021purchases_9_2022y_has_purchased
010_699783139NaN186852.00875FalseNaNTrue0...0.00.00.00.00.00.00.00.00.00.0
110_38307862NaN17621.07875FalseNaNTrue0...0.00.00.00.00.00.00.00.00.00.0
210_5561011063NaN136909.00875FalseNaNTrue1...0.00.00.00.00.00.00.00.00.00.0
310_686663443226NaN186611.01875FalseNaNTrue1...0.00.00.00.00.00.01.00.00.00.0
410_91656316684NaN21559.02875FalseNaNTrue0...0.00.00.00.00.00.00.00.00.00.0
..................................................................
15186914_1843791718883224.0394849.01862FalseNaNTrue1...0.0NaN0.0NaN0.0NaN0.0NaN0.00.0
15187014_4630858741826NaN1555631.00862FalseNaNTrue1...0.0NaN0.0NaN0.0NaN0.0NaN0.00.0
15187114_4659926871477NaN1542180.00862FalseNaNTrue1...0.0NaN0.0NaN0.0NaN0.0NaN0.00.0
15187214_4881492917272NaNNaN1862FalseNaNTrue1...0.0NaN0.0NaN0.0NaN0.0NaN1.00.0
15187314_81242762NaN10077.02862FalseNaNTrue0...0.0NaN0.0NaN0.0NaN0.0NaN0.00.0
\n", "

151874 rows × 87 columns

\n", "
" ], "text/plain": [ " customer_id street_id structure_id mcp_contact_id fidelity \\\n", "0 10_699783 139 NaN 186852.0 0 \n", "1 10_38307 862 NaN 17621.0 7 \n", "2 10_556101 1063 NaN 136909.0 0 \n", "3 10_686663 443226 NaN 186611.0 1 \n", "4 10_91656 316684 NaN 21559.0 2 \n", "... ... ... ... ... ... \n", "151869 14_1843791 718883 224.0 394849.0 1 \n", "151870 14_4630858 741826 NaN 1555631.0 0 \n", "151871 14_4659926 871477 NaN 1542180.0 0 \n", "151872 14_4881492 917272 NaN NaN 1 \n", "151873 14_8124 2762 NaN 10077.0 2 \n", "\n", " tenant_id is_partner deleted_at is_email_true opt_in ... \\\n", "0 875 False NaN True 0 ... \n", "1 875 False NaN True 0 ... \n", "2 875 False NaN True 1 ... \n", "3 875 False NaN True 1 ... \n", "4 875 False NaN True 0 ... \n", "... ... ... ... ... ... ... \n", "151869 862 False NaN True 1 ... \n", "151870 862 False NaN True 1 ... \n", "151871 862 False NaN True 1 ... \n", "151872 862 False NaN True 1 ... \n", "151873 862 False NaN True 0 ... \n", "\n", " purchases_5_2022 purchases_6_2021 purchases_6_2022 purchases_7_2021 \\\n", "0 0.0 0.0 0.0 0.0 \n", "1 0.0 0.0 0.0 0.0 \n", "2 0.0 0.0 0.0 0.0 \n", "3 0.0 0.0 0.0 0.0 \n", "4 0.0 0.0 0.0 0.0 \n", "... ... ... ... ... \n", "151869 0.0 NaN 0.0 NaN \n", "151870 0.0 NaN 0.0 NaN \n", "151871 0.0 NaN 0.0 NaN \n", "151872 0.0 NaN 0.0 NaN \n", "151873 0.0 NaN 0.0 NaN \n", "\n", " purchases_7_2022 purchases_8_2021 purchases_8_2022 \\\n", "0 0.0 0.0 0.0 \n", "1 0.0 0.0 0.0 \n", "2 0.0 0.0 0.0 \n", "3 0.0 0.0 1.0 \n", "4 0.0 0.0 0.0 \n", "... ... ... ... \n", "151869 0.0 NaN 0.0 \n", "151870 0.0 NaN 0.0 \n", "151871 0.0 NaN 0.0 \n", "151872 0.0 NaN 0.0 \n", "151873 0.0 NaN 0.0 \n", "\n", " purchases_9_2021 purchases_9_2022 y_has_purchased \n", "0 0.0 0.0 0.0 \n", "1 0.0 0.0 0.0 \n", "2 0.0 0.0 0.0 \n", "3 0.0 0.0 0.0 \n", "4 0.0 0.0 0.0 \n", "... ... ... ... 
\n", "151869 NaN 0.0 0.0 \n", "151870 NaN 0.0 0.0 \n", "151871 NaN 0.0 0.0 \n", "151872 NaN 1.0 0.0 \n", "151873 NaN 0.0 0.0 \n", "\n", "[151874 rows x 87 columns]" ] }, "execution_count": 78, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_test" ] }, { "cell_type": "code", "execution_count": 20, "id": "c708f439-bb75-4688-bf4f-4c04e13deaae", "metadata": {}, "outputs": [], "source": [ "def load_model(type_of_activity, model):\n", " # BUCKET = f\"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/\"\n", " BUCKET = f\"projet-bdc2324-team1/basique/{type_of_activity}/{model}/\"\n", " filename = model + '.pkl'\n", " file_path = BUCKET + filename\n", " with fs.open(file_path, mode=\"rb\") as f:\n", " model_bytes = f.read()\n", "\n", " model = pickle.loads(model_bytes)\n", " return model" ] }, { "cell_type": "code", "execution_count": 92, "id": "5261a803-05b8-41a0-968c-dc7bde48ddd3", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Pipeline(steps=[('preprocessor',\n",
       "                 ColumnTransformer(transformers=[('num',\n",
       "                                                  Pipeline(steps=[('imputer',\n",
       "                                                                   SimpleImputer(fill_value=0,\n",
       "                                                                                 strategy='constant')),\n",
       "                                                                  ('scaler',\n",
       "                                                                   StandardScaler())]),\n",
       "                                                  ['nb_campaigns',\n",
       "                                                   'taux_ouverture_mail',\n",
       "                                                   'prop_purchases_internet',\n",
       "                                                   'nb_tickets', 'nb_purchases',\n",
       "                                                   'total_amount',\n",
       "                                                   'nb_suppliers',\n",
       "                                                   'purchases_10_2021',\n",
       "                                                   'purchases_10_2022',\n",
       "                                                   'purchases_...\n",
       "                                                   'categorie_age_40_50',\n",
       "                                                   'categorie_age_50_60',\n",
       "                                                   'categorie_age_60_70',\n",
       "                                                   'categorie_age_70_80',\n",
       "                                                   'categorie_age_plus_80',\n",
       "                                                   'categorie_age_inconnue',\n",
       "                                                   'country_fr',\n",
       "                                                   'is_profession_known',\n",
       "                                                   'is_zipcode_known',\n",
       "                                                   'opt_in'])])),\n",
       "                ('LogisticRegression_Benchmark',\n",
       "                 LogisticRegression(class_weight={0.0: 0.5480249666729557,\n",
       "                                                  1.0: 5.705625684291879},\n",
       "                                    max_iter=5000, n_jobs=-1, solver='saga'))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "Pipeline(steps=[('preprocessor',\n", " ColumnTransformer(transformers=[('num',\n", " Pipeline(steps=[('imputer',\n", " SimpleImputer(fill_value=0,\n", " strategy='constant')),\n", " ('scaler',\n", " StandardScaler())]),\n", " ['nb_campaigns',\n", " 'taux_ouverture_mail',\n", " 'prop_purchases_internet',\n", " 'nb_tickets', 'nb_purchases',\n", " 'total_amount',\n", " 'nb_suppliers',\n", " 'purchases_10_2021',\n", " 'purchases_10_2022',\n", " 'purchases_...\n", " 'categorie_age_40_50',\n", " 'categorie_age_50_60',\n", " 'categorie_age_60_70',\n", " 'categorie_age_70_80',\n", " 'categorie_age_plus_80',\n", " 'categorie_age_inconnue',\n", " 'country_fr',\n", " 'is_profession_known',\n", " 'is_zipcode_known',\n", " 'opt_in'])])),\n", " ('LogisticRegression_Benchmark',\n", " LogisticRegression(class_weight={0.0: 0.5480249666729557,\n", " 1.0: 5.705625684291879},\n", " max_iter=5000, n_jobs=-1, solver='saga'))])" ] }, "execution_count": 92, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model = load_model(type_of_activity, \"LogisticRegression_Benchmark\")\n", "# model = load_model(type_of_activity, \"randomF_cv\")\n", "model" ] }, { "cell_type": "markdown", "id": "006819e7-e9c5-48d9-85ee-aa43d5e4c9c2", "metadata": {}, "source": [ "## Quartile clustering" ] }, { "cell_type": "code", "execution_count": 93, "id": "018d8ff4-3436-4eec-8507-d1a265cbabf1", "metadata": {}, "outputs": [], "source": [ "y_pred = model.predict(X_test)\n", "y_pred_prob = model.predict_proba(X_test)[:, 1]" ] }, { "cell_type": "code", "execution_count": 94, "id": "846f53b9-73c2-4a8b-9d9e-f11bf59ce9ba", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
customer_idstreet_idstructure_idmcp_contact_idfidelitytenant_idis_partnerdeleted_atis_email_trueopt_in...purchases_8_2021purchases_8_2022purchases_9_2021purchases_9_2022y_has_purchasedhas_purchasedhas_purchased_estimscorequartilescore_adjusted
010_699783139NaN186852.00875FalseNaNTrue0...0.00.00.00.00.00.00.00.13118010.017574
110_38307862NaN17621.07875FalseNaNTrue0...0.00.00.00.00.00.00.00.32163520.042466
210_5561011063NaN136909.00875FalseNaNTrue1...0.00.00.00.00.00.00.00.00506810.000676
310_686663443226NaN186611.01875FalseNaNTrue1...0.01.00.00.00.00.00.00.16697910.018397
410_91656316684NaN21559.02875FalseNaNTrue0...0.00.00.00.00.00.00.00.16152310.018632
510_35956106204NaNNaN1875FalseNaNTrue0...0.00.00.00.00.00.00.00.09813910.010129
610_5600581063NaN161812.00875FalseNaNTrue1...0.00.00.00.00.00.00.00.00537710.000715
710_386035136421865.07660.04875FalseNaNTrue1...0.00.00.00.01.01.01.00.90669840.461388
810_5632941063NaN167549.00875FalseNaNTrue1...0.00.00.00.00.00.00.00.00739910.000974
910_548983268636NaN173318.01875FalseNaNTrue0...0.00.00.00.00.00.00.00.16352910.022102
\n", "

10 rows × 92 columns

\n", "
" ], "text/plain": [ " customer_id street_id structure_id mcp_contact_id fidelity tenant_id \\\n", "0 10_699783 139 NaN 186852.0 0 875 \n", "1 10_38307 862 NaN 17621.0 7 875 \n", "2 10_556101 1063 NaN 136909.0 0 875 \n", "3 10_686663 443226 NaN 186611.0 1 875 \n", "4 10_91656 316684 NaN 21559.0 2 875 \n", "5 10_35956 106204 NaN NaN 1 875 \n", "6 10_560058 1063 NaN 161812.0 0 875 \n", "7 10_38603 513642 1865.0 7660.0 4 875 \n", "8 10_563294 1063 NaN 167549.0 0 875 \n", "9 10_548983 268636 NaN 173318.0 1 875 \n", "\n", " is_partner deleted_at is_email_true opt_in ... purchases_8_2021 \\\n", "0 False NaN True 0 ... 0.0 \n", "1 False NaN True 0 ... 0.0 \n", "2 False NaN True 1 ... 0.0 \n", "3 False NaN True 1 ... 0.0 \n", "4 False NaN True 0 ... 0.0 \n", "5 False NaN True 0 ... 0.0 \n", "6 False NaN True 1 ... 0.0 \n", "7 False NaN True 1 ... 0.0 \n", "8 False NaN True 1 ... 0.0 \n", "9 False NaN True 0 ... 0.0 \n", "\n", " purchases_8_2022 purchases_9_2021 purchases_9_2022 y_has_purchased \\\n", "0 0.0 0.0 0.0 0.0 \n", "1 0.0 0.0 0.0 0.0 \n", "2 0.0 0.0 0.0 0.0 \n", "3 1.0 0.0 0.0 0.0 \n", "4 0.0 0.0 0.0 0.0 \n", "5 0.0 0.0 0.0 0.0 \n", "6 0.0 0.0 0.0 0.0 \n", "7 0.0 0.0 0.0 1.0 \n", "8 0.0 0.0 0.0 0.0 \n", "9 0.0 0.0 0.0 0.0 \n", "\n", " has_purchased has_purchased_estim score quartile score_adjusted \n", "0 0.0 0.0 0.131180 1 0.017574 \n", "1 0.0 0.0 0.321635 2 0.042466 \n", "2 0.0 0.0 0.005068 1 0.000676 \n", "3 0.0 0.0 0.166979 1 0.018397 \n", "4 0.0 0.0 0.161523 1 0.018632 \n", "5 0.0 0.0 0.098139 1 0.010129 \n", "6 0.0 0.0 0.005377 1 0.000715 \n", "7 1.0 1.0 0.906698 4 0.461388 \n", "8 0.0 0.0 0.007399 1 0.000974 \n", "9 0.0 0.0 0.163529 1 0.022102 \n", "\n", "[10 rows x 92 columns]" ] }, "execution_count": 94, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_test_segment = X_test\n", "\n", "X_test_segment[\"has_purchased\"] = y_test\n", "X_test_segment[\"has_purchased_estim\"] = y_pred\n", "X_test_segment[\"score\"] = y_pred_prob\n", 
"X_test_segment[\"quartile\"] = np.where(X_test['score']<0.25, '1',\n", " np.where(X_test['score']<0.5, '2',\n", " np.where(X_test['score']<0.75, '3', '4')))\n", "X_test_segment.head(10)" ] }, { "cell_type": "code", "execution_count": 24, "id": "fb592fe3-ea40-4e83-8fe9-c52b9ee42f2a", "metadata": {}, "outputs": [], "source": [ "def df_segment(df, y, model) :\n", "\n", " y_pred = model.predict(df)\n", " y_pred_prob = model.predict_proba(df)[:, 1]\n", "\n", " df_segment = df\n", "\n", " df_segment[\"has_purchased\"] = y\n", " df_segment[\"has_purchased_estim\"] = y_pred\n", " df_segment[\"score\"] = y_pred_prob\n", " df_segment[\"quartile\"] = np.where(df_segment['score']<0.25, '1',\n", " np.where(df_segment['score']<0.5, '2',\n", " np.where(df_segment['score']<0.75, '3', '4')))\n", "\n", " return df_segment" ] }, { "cell_type": "code", "execution_count": 88, "id": "968645d5-58cc-485a-bd8b-99f4cfc26fec", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_1080/2624515794.py:8: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_segment[\"has_purchased\"] = y\n", "/tmp/ipykernel_1080/2624515794.py:9: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_segment[\"has_purchased_estim\"] = y_pred\n", "/tmp/ipykernel_1080/2624515794.py:10: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", 
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_segment[\"score\"] = y_pred_prob\n", "/tmp/ipykernel_1080/2624515794.py:11: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_segment[\"quartile\"] = np.where(df_segment['score']<0.25, '1',\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nb_ticketsnb_purchasestotal_amountnb_suppliersvente_internet_maxpurchase_date_minpurchase_date_maxtime_between_purchasenb_tickets_internetfidelity...opt_ingender_femalegender_malegender_othernb_campaignsnb_campaigns_openedhas_purchasedhas_purchased_estimscorequartile
04.01.0100.001.00.05.1771875.1771870.0000000.01...False1000.00.00.00.00.0060661
11.01.055.001.00.0426.265613426.2656130.0000000.02...True0100.00.01.00.00.2888472
217.01.080.001.00.0436.033437436.0334370.0000000.02...True1000.00.00.00.00.1032641
34.01.0120.001.00.05.1964125.1964120.0000000.01...False1000.00.00.00.00.0089281
434.02.0416.001.00.0478.693148115.631470363.0616780.04...False1000.00.01.01.00.9928094
..................................................................
960911.01.067.311.01.0278.442257278.4422570.0000001.02...False01015.05.01.00.00.3517622
960921.01.061.411.01.0189.207373189.2073730.0000001.01...False01012.09.00.01.00.5678143
960930.00.00.000.00.0550.000000550.000000-1.0000000.01...True10029.03.00.00.00.0046521
960941.01.079.431.01.0279.312905279.3129050.0000001.01...False01020.04.00.00.00.2930422
960950.00.00.000.00.0550.000000550.000000-1.0000000.02...False01031.04.00.01.00.7878524
\n", "

96096 rows × 21 columns

\n", "
" ], "text/plain": [ " nb_tickets nb_purchases total_amount nb_suppliers \\\n", "0 4.0 1.0 100.00 1.0 \n", "1 1.0 1.0 55.00 1.0 \n", "2 17.0 1.0 80.00 1.0 \n", "3 4.0 1.0 120.00 1.0 \n", "4 34.0 2.0 416.00 1.0 \n", "... ... ... ... ... \n", "96091 1.0 1.0 67.31 1.0 \n", "96092 1.0 1.0 61.41 1.0 \n", "96093 0.0 0.0 0.00 0.0 \n", "96094 1.0 1.0 79.43 1.0 \n", "96095 0.0 0.0 0.00 0.0 \n", "\n", " vente_internet_max purchase_date_min purchase_date_max \\\n", "0 0.0 5.177187 5.177187 \n", "1 0.0 426.265613 426.265613 \n", "2 0.0 436.033437 436.033437 \n", "3 0.0 5.196412 5.196412 \n", "4 0.0 478.693148 115.631470 \n", "... ... ... ... \n", "96091 1.0 278.442257 278.442257 \n", "96092 1.0 189.207373 189.207373 \n", "96093 0.0 550.000000 550.000000 \n", "96094 1.0 279.312905 279.312905 \n", "96095 0.0 550.000000 550.000000 \n", "\n", " time_between_purchase nb_tickets_internet fidelity ... opt_in \\\n", "0 0.000000 0.0 1 ... False \n", "1 0.000000 0.0 2 ... True \n", "2 0.000000 0.0 2 ... True \n", "3 0.000000 0.0 1 ... False \n", "4 363.061678 0.0 4 ... False \n", "... ... ... ... ... ... \n", "96091 0.000000 1.0 2 ... False \n", "96092 0.000000 1.0 1 ... False \n", "96093 -1.000000 0.0 1 ... True \n", "96094 0.000000 1.0 1 ... False \n", "96095 -1.000000 0.0 2 ... False \n", "\n", " gender_female gender_male gender_other nb_campaigns \\\n", "0 1 0 0 0.0 \n", "1 0 1 0 0.0 \n", "2 1 0 0 0.0 \n", "3 1 0 0 0.0 \n", "4 1 0 0 0.0 \n", "... ... ... ... ... \n", "96091 0 1 0 15.0 \n", "96092 0 1 0 12.0 \n", "96093 1 0 0 29.0 \n", "96094 0 1 0 20.0 \n", "96095 0 1 0 31.0 \n", "\n", " nb_campaigns_opened has_purchased has_purchased_estim score \\\n", "0 0.0 0.0 0.0 0.006066 \n", "1 0.0 1.0 0.0 0.288847 \n", "2 0.0 0.0 0.0 0.103264 \n", "3 0.0 0.0 0.0 0.008928 \n", "4 0.0 1.0 1.0 0.992809 \n", "... ... ... ... ... 
\n", "96091 5.0 1.0 0.0 0.351762 \n", "96092 9.0 0.0 1.0 0.567814 \n", "96093 3.0 0.0 0.0 0.004652 \n", "96094 4.0 0.0 0.0 0.293042 \n", "96095 4.0 0.0 1.0 0.787852 \n", "\n", " quartile \n", "0 1 \n", "1 2 \n", "2 1 \n", "3 1 \n", "4 4 \n", "... ... \n", "96091 2 \n", "96092 3 \n", "96093 1 \n", "96094 2 \n", "96095 4 \n", "\n", "[96096 rows x 21 columns]" ] }, "execution_count": 88, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_segment(X_test, y_test, model)" ] }, { "cell_type": "markdown", "id": "ad16b8ab-7e01-404b-971e-866e9b9d5aa4", "metadata": {}, "source": [ "## definition of functions to compute the bias of scores and adjust it \n", "\n", "Le biais est calculé de la façon suivante. \n", "En notant $\\hat{p(x_i)}$ le score calculé (estimé par la modélisation) et $p(x_i)$ le vrai score (sans biais), et $\\beta$ le logarithme du biais, on a : \\\n", "$\\ln{\\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}}} = \\beta + \\ln{\\frac{p(x_i)}{1-p(x_i)}}$ \\\n", "$ \\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}} = \\exp(\\beta) . \\frac{p(x_i)}{1-p(x_i)} $ , soit : \\\n", "$p(x_i) = {\\frac{\\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}}}{B+\\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}}}}$ \\\n", "Ce qu'on appelle biais et qu'on estime dans le code par la suite est : $B=\\exp(\\beta) $. Les probabilités ne sont donc pas biaisées si $B=1$. Il y a surestimation si $B>1$. \n", "\n", "On cherche le B qui permette d'ajuster les probabilités de telle sorte que la somme des scores soit égale à la somme des y_has_purchased. Cela revient à résoudre : \n", "\n", "\\begin{equation}\n", "\\sum_{i}{\\frac{\\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}}}{B+\\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}}}} = \\sum_{i}{Y_i}\n", "\\end{equation}\n", "\n", "C'est ce que fait la fonction find_bias. 
\n", "\n", "Note sur les notations : \\\n", "$\\hat{p(x_i)}$ correspond à ce qu'on appelle le score et $p(x_i)$ à ce qu'on appellera le score adjusted" ] }, { "cell_type": "code", "execution_count": 25, "id": "f0379536-a6c5-4b16-bde5-d0319ec1b140", "metadata": {}, "outputs": [], "source": [ "# compute adjusted score from odd ratios (cf formula above)\n", "def adjusted_score(odd_ratio, bias) :\n", " adjusted_score = odd_ratio/(bias+odd_ratio)\n", " return adjusted_score" ] }, { "cell_type": "code", "execution_count": 26, "id": "32a0dfd0-f49d-4785-a56f-706d381bfe41", "metadata": {}, "outputs": [], "source": [ "# when the score is 1 we cannot compute the odd ratio, so we adjust scores equal to 1\n", "# we set the second best score instead\n", "\n", "def adjust_score_1(score) :\n", " second_best_score = np.array([element for element in score if element !=1]).max()\n", " new_score = np.array([element if element!=1 else second_best_score for element in score]) \n", " return new_score" ] }, { "cell_type": "code", "execution_count": 27, "id": "2dff1def-02df-413e-afce-b4aeaf7752b6", "metadata": {}, "outputs": [], "source": [ "def odd_ratio(score) :\n", " return score / (1 - score)" ] }, { "cell_type": "code", "execution_count": 28, "id": "683d71fc-7442-4028-869c-49c57592d6e9", "metadata": {}, "outputs": [], "source": [ "# definition of a function that automatically detects the bias\n", "\n", "def find_bias(odd_ratios, y_objective, initial_guess=6) :\n", " \"\"\"\n", " results = minimize(lambda bias : (sum([adjusted_score(element, bias) for element in list(odd_ratios)]) - y_objective)**2 ,\n", " initial_guess , method = \"BFGS\")\n", "\n", " estimated_bias = results.x[0]\n", " \"\"\"\n", "\n", " # faster method\n", " bias_estimated = fsolve(lambda bias : sum([adjusted_score(element, bias) for element in list(odd_ratios)]) - y_objective, x0=6)\n", " \n", " return bias_estimated[0]" ] }, { "cell_type": "code", "execution_count": 95, "id": 
# %%
# Sanity check: mean raw score vs. observed purchase rate on the test set,
# then the purchase rate on the train set.
print(X_test_segment["score"].mean(), y_test["y_has_purchased"].mean())
y_train["y_has_purchased"].mean()

# %%
# Odds of the test scores: computed once here and reused below, because
# adjust_score_1 / odd_ratio were previously re-run on the full test set in
# three separate cells.
odds_test = odd_ratio(adjust_score_1(X_test_segment["score"]))

# computation with the function defined
bias_test_set = find_bias(odd_ratios=odds_test,
                          y_objective=y_test["y_has_purchased"].sum(),
                          initial_guess=6)
bias_test_set

# %%
# comparison with bias of the train set
X_train_score = model.predict_proba(X_train)[:, 1]

bias_train_set = find_bias(odd_ratios=odd_ratio(adjust_score_1(X_train_score)),
                           y_objective=y_train["y_has_purchased"].sum(),
                           initial_guess=10)
bias_train_set

# %%
# difference of the log-biases (beta) estimated on test and on train
print("betâ test - betâ train = ", np.log(bias_test_set / bias_train_set))

# %%
# impact of using the bias computed on the train set instead of the test set - negligible
# (both lines adjust the *test* scores; only the bias differs)
score_adjusted_test = adjusted_score(odds_test, bias=bias_test_set)
score_adjusted_train = adjusted_score(odds_test, bias=bias_train_set)

print("mean absolute erreur", abs(score_adjusted_test - score_adjusted_train).mean())

# %%
# adjust scores accordingly
# We are not supposed to know the test labels, so the bias estimated on the
# train set is the one applied to the test scores.
X_test_segment["score_adjusted"] = score_adjusted_train

# %%
# check: squared error before / after adjustment, and total expected buyers
MSE_score = ((X_test_segment["score"] - X_test_segment["has_purchased"]) ** 2).mean()
MSE_ajusted_score = ((X_test_segment["score_adjusted"] - X_test_segment["has_purchased"]) ** 2).mean()
print(f"MSE for score : {MSE_score}")
print(f"MSE for ajusted score : {MSE_ajusted_score}")

print("sum of y_has_purchased :", y_test["y_has_purchased"].sum())
print("sum of adjusted scores :", X_test_segment["score_adjusted"].sum())
# %%
# mean absolute error - roughly halved by the adjustment
MAE_score = abs(X_test_segment["score"] - X_test_segment["has_purchased"]).mean()
MAE_ajusted_score = abs(X_test_segment["score_adjusted"] - X_test_segment["has_purchased"]).mean()
print(f"MAE for score : {MAE_score}")
print(f"MAE for adjusted score : {MAE_ajusted_score}")


# %%
# visualization: histogram of the raw and of the adjusted probabilities
def plot_hist_scores(df, score, score_adjusted, type_of_activity):
    """Draw overlaid histograms of the raw and adjusted scores on a new figure.

    The figure is left open as matplotlib's current figure and nothing is
    returned, so a following ``plt.savefig(...)`` / ``plt.close()`` applies to it.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding both score columns.
    score : str
        Name of the raw score column.
    score_adjusted : str
        Name of the adjusted score column.
    type_of_activity : str
        Activity label inserted in the title.
    """
    fig, ax = plt.subplots()
    ax.hist(df[score], label="score", alpha=0.6)
    ax.hist(df[score_adjusted], label="adjusted score", alpha=0.6)
    ax.legend()
    ax.set_xlabel("probability of a future purchase")
    ax.set_ylabel("count")
    ax.set_title(f"Comparison between score and adjusted score for {type_of_activity} companies")
# %%
plot_hist_scores(X_test_segment, score="score", score_adjusted="score_adjusted",
                 type_of_activity=type_of_activity)

# %%
# define path to save graphics
# type_of_activity is deliberately NOT reassigned here: it must stay the
# activity the data and model were loaded for, otherwise results are written
# under another activity's folder and file name.
PATH = f"projet-bdc2324-team1/Output_expected_CA/{type_of_activity}/"
print(PATH)

# %%
# export png

# plot adjusted scores and save (to be tested)
plot_hist_scores(X_test_segment, score="score", score_adjusted="score_adjusted",
                 type_of_activity=type_of_activity)

# render the current figure into memory, then push the bytes to S3
image_buffer = io.BytesIO()
plt.savefig(image_buffer, format='png')
image_buffer.seek(0)
file_name = "hist_score_adjusted_"
FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + ".png"
with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
    s3_file.write(image_buffer.read())
plt.close()

# %% [markdown]
# ## Compute number of tickets and CA by segment with the recalibrated score
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
quartilescore (%)score adjusted (%)has purchased (%)
018.800.941.02
1236.165.174.70
2361.0613.3314.62
3489.8653.7453.19
\n", "
def project_tickets_CA(df, nb_purchases, nb_tickets, total_amount, score_adjusted, duration_ref, duration_projection):
    """Project tickets and revenue over a new period and weight them by the score.

    Works on a copy: the input frame is left untouched (it used to be modified
    in place, so calling the function twice on the same frame overwrote the
    first result).

    Parameters
    ----------
    df : pandas.DataFrame
        One row per customer.
    nb_purchases, nb_tickets, total_amount, score_adjusted : str
        Names of the columns holding the number of purchases, the number of
        tickets, the amount spent and the purchase probability.
    duration_ref : float
        Length of the observation period.
    duration_projection : float
        Length of the projection period (same unit as ``duration_ref``).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with five extra columns:
        ``nb_tickets_projected`` / ``total_amount_projected`` (observed values
        divided by ``duration_ref / duration_projection``),
        ``nb_tickets_expected`` / ``total_amount_expected`` (projected values
        times the score) and ``pace_purchase`` (``duration_ref`` divided by the
        number of purchases, NaN when that ratio is infinite).
    """
    duration_ratio = duration_ref / duration_projection

    df_output = df.copy()

    df_output["nb_tickets_projected"] = df_output[nb_tickets] / duration_ratio
    df_output["total_amount_projected"] = df_output[total_amount] / duration_ratio

    df_output["nb_tickets_expected"] = df_output[score_adjusted] * df_output["nb_tickets_projected"]
    df_output["total_amount_expected"] = df_output[score_adjusted] * df_output["total_amount_projected"]

    # customers without any purchase give an infinite pace: store NaN instead
    df_output["pace_purchase"] = (duration_ref / df_output[nb_purchases]).replace([np.inf, -np.inf], np.nan)

    return df_output
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
customer_idstreet_idstructure_idmcp_contact_idfidelitytenant_idis_partnerdeleted_atis_email_trueopt_in...has_purchasedhas_purchased_estimscorequartilescore_adjustednb_tickets_projectedtotal_amount_projectednb_tickets_expectedtotal_amount_expectedpace_purchase
01_81918114NaN834.001311FalseNaNTrue1...0.00.00.40854620.0270660.0000000.0000000.0000000.000000NaN
11_147922NaN251178.001311FalseNaNTrue1...0.00.00.02704610.0011180.0000000.0000000.0000000.000000NaN
21_304662NaN2355.001311FalseNaNTrue1...0.00.00.18085110.0088130.0000000.0000000.0000000.000000NaN
31_4189820244203714.097973.001311FalseNaNTrue1...0.00.00.22087210.0112880.0000000.0000000.0000000.000000NaN
41_587462NaN82026.011311FalseNaNTrue1...0.00.00.10095110.0045020.0000000.0000000.0000000.000000NaN
..................................................................
1861154_24295103884NaN96913.001342FalseNaNTrue1...0.00.00.46664420.0340370.0000000.0000000.0000000.000000NaN
1861164_4444343315NaN234734.001342FalseNaNTrue0...0.00.00.42764120.0292110.0000000.0000000.0000000.000000NaN
1861174_33439472NaNNaN11342FalseNaNTrue0...0.00.00.46846420.0342780.70588220.4705880.0241960.70168617.0
1861184_4775246460NaN89791.001342FalseNaNTrue1...0.00.00.36010020.0221610.0000000.0000000.0000000.000000NaN
1861194_3544934592NaN119197.001342FalseNaNTrue1...0.01.00.72890730.0977050.0000000.0000000.0000000.000000NaN
\n", "

186120 rows × 97 columns

\n", "
" ], "text/plain": [ " customer_id street_id structure_id mcp_contact_id fidelity \\\n", "0 1_8191 8114 NaN 834.0 0 \n", "1 1_14792 2 NaN 251178.0 0 \n", "2 1_30466 2 NaN 2355.0 0 \n", "3 1_41898 20244 203714.0 97973.0 0 \n", "4 1_58746 2 NaN 82026.0 1 \n", "... ... ... ... ... ... \n", "186115 4_24295 103884 NaN 96913.0 0 \n", "186116 4_44443 43315 NaN 234734.0 0 \n", "186117 4_3343947 2 NaN NaN 1 \n", "186118 4_47752 46460 NaN 89791.0 0 \n", "186119 4_35449 34592 NaN 119197.0 0 \n", "\n", " tenant_id is_partner deleted_at is_email_true opt_in ... \\\n", "0 1311 False NaN True 1 ... \n", "1 1311 False NaN True 1 ... \n", "2 1311 False NaN True 1 ... \n", "3 1311 False NaN True 1 ... \n", "4 1311 False NaN True 1 ... \n", "... ... ... ... ... ... ... \n", "186115 1342 False NaN True 1 ... \n", "186116 1342 False NaN True 0 ... \n", "186117 1342 False NaN True 0 ... \n", "186118 1342 False NaN True 1 ... \n", "186119 1342 False NaN True 1 ... \n", "\n", " has_purchased has_purchased_estim score quartile score_adjusted \\\n", "0 0.0 0.0 0.408546 2 0.027066 \n", "1 0.0 0.0 0.027046 1 0.001118 \n", "2 0.0 0.0 0.180851 1 0.008813 \n", "3 0.0 0.0 0.220872 1 0.011288 \n", "4 0.0 0.0 0.100951 1 0.004502 \n", "... ... ... ... ... ... \n", "186115 0.0 0.0 0.466644 2 0.034037 \n", "186116 0.0 0.0 0.427641 2 0.029211 \n", "186117 0.0 0.0 0.468464 2 0.034278 \n", "186118 0.0 0.0 0.360100 2 0.022161 \n", "186119 0.0 1.0 0.728907 3 0.097705 \n", "\n", " nb_tickets_projected total_amount_projected nb_tickets_expected \\\n", "0 0.000000 0.000000 0.000000 \n", "1 0.000000 0.000000 0.000000 \n", "2 0.000000 0.000000 0.000000 \n", "3 0.000000 0.000000 0.000000 \n", "4 0.000000 0.000000 0.000000 \n", "... ... ... ... 
# generalization with a function

def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, total_amount, pace_purchase,
                        duration_ref=17, duration_projection=12):
    """Summarise expected tickets and revenue per segment.

    Every aggregate is computed on the same group-by and aligned on the
    segment label. A segment in which no customer has a defined pace therefore
    gets NaN (it previously dropped out of the pace group-by, which made the
    positional assignment fail on a length mismatch).

    Parameters
    ----------
    df : pandas.DataFrame
        One row per customer.
    segment : str
        Column defining the segments.
    nb_tickets_expected, total_amount_expected : str
        Columns summed per segment.
    total_amount : str
        Column with the observed amount, used as denominator of the share of
        revenue recovered.
    pace_purchase : str
        Column averaged per segment, ignoring missing values.
    duration_ref, duration_projection : float
        Their ratio rescales the expected amount before comparing it with the
        observed amount.

    Returns
    -------
    pandas.DataFrame
        One row per segment with columns: ``segment``, ``size``,
        ``size_perct``, ``nb_tickets_expected``, ``total_amount_expected``,
        ``revenue_recovered_perct``, ``pace_purchase``.
    """
    by_segment = df.groupby(segment)

    # compute nb tickets estimated and total amount expected
    df_expected_CA = by_segment[[nb_tickets_expected, total_amount_expected]].sum()

    # number of customers by segment, and share of all customers (in percent)
    df_expected_CA.insert(0, "size", by_segment.size())
    df_expected_CA.insert(1, "size_perct", 100 * df_expected_CA["size"] / df_expected_CA["size"].sum())

    # share of the observed revenue that is expected to be recovered
    duration_ratio = duration_ref / duration_projection
    df_expected_CA["revenue_recovered_perct"] = (
        100 * duration_ratio * df_expected_CA[total_amount_expected] / by_segment[total_amount].sum()
    )

    # group-by mean skips NaN, so rows need not be dropped beforehand
    df_expected_CA["pace_purchase"] = by_segment[pace_purchase].mean()

    return df_expected_CA.reset_index()
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
quartilesizesize_perctnb_tickets_expectedtotal_amount_expectedrevenue_recovered_perctpace_purchase
018162243.85263.123258.540.8816.72
126081132.671984.5627052.822.4716.57
232891315.533476.6343945.796.3415.77
34147747.9458598.68523568.9360.0313.26
\n", "
# %%
# Per-quartile summary of expected tickets / revenue, rounded to 2 decimals.
X_test_expected_CA = round(
    summary_expected_CA(
        df=X_test_segment,
        segment="quartile",
        nb_tickets_expected="nb_tickets_expected",
        total_amount_expected="total_amount_expected",
        total_amount="total_amount",
        pace_purchase="pace_purchase",
    ),
    2,
)

X_test_expected_CA

# %%
# Build the column-name mapping used for the LaTeX export
mapping_dict = {
    col: col.replace("perct", "(%)").replace("_", " ")
    for col in X_test_expected_CA.columns
}

X_test_expected_CA.rename(columns=mapping_dict).to_latex(index=False)

# %%
# export summary table to the MinIO storage

file_name = "table_expected_CA_"
FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + ".csv"
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
    X_test_expected_CA.to_csv(file_out, index=False)

# %%
# show where the table was written
PATH = f"projet-bdc2324-team1/Output_expected_CA/{type_of_activity}/"
file_name = "table_expected_CA_"
FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + ".csv"
FILE_PATH_OUT_S3

# %% [markdown]
# ## Test : vizu tables saved
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
quartilescore (%)score adjusted (%)has purchased (%)
0113.252.511.57
1233.898.009.85
2363.0622.5821.47
3490.5266.2065.01
\n", "
# %%
# Read back the adjusted-score table written earlier. The path is rebuilt
# exactly as in the export cell ("table_adjusted_score_" + activity + ".csv");
# the former hard-coded path pointed at another activity and at a file name
# without the trailing underscore.
path = PATH + "table_adjusted_score_" + type_of_activity + ".csv"

with fs.open(path, mode="rb") as file_in:
    df = pd.read_csv(file_in, sep=",")
df

# %% [markdown]
# ## Just to try, same computation with score instead of score adjusted
#
# seems overestimated : if only 14% of customers come back, how can we recover 22% of the revenue from the segment that is least likely to buy ?? ...
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
quartilesizesize_perctnb_tickets_expectedtotal_amount_expectedperct_revenue_recovered
013741038.93419.769245.0821.71
122951730.7211549.06296522.0239.24
232013720.9629997.85954751.9163.34
3490329.40244655.8210736011.9597.72
\n", "
# %%
# Same projection with the raw (uncalibrated) score, for comparison only.
# Fixes: the calls now match the current signatures (nb_purchases and
# pace_purchase were missing), the durations are passed to both functions, and
# a copy is projected so the calibrated columns of X_test_segment stay intact.
duration_ref_bis = 1.5
duration_projection_bis = 1

X_test_segment_bis = project_tickets_CA(X_test_segment.copy(), "nb_purchases", "nb_tickets", "total_amount", "score",
                                        duration_ref=duration_ref_bis, duration_projection=duration_projection_bis)

X_test_expected_CA_bis = round(summary_expected_CA(df=X_test_segment_bis, segment="quartile",
                                                   nb_tickets_expected="nb_tickets_expected",
                                                   total_amount_expected="total_amount_expected",
                                                   total_amount="total_amount", pace_purchase="pace_purchase",
                                                   duration_ref=duration_ref_bis,
                                                   duration_projection=duration_projection_bis), 2)

X_test_expected_CA_bis

# %%
# duration_ratio used to exist only inside the functions; define it explicitly
duration_ratio_bis = duration_ref_bis / duration_projection_bis
print("overall share of revenue recovered : ",
      round(100 * duration_ratio_bis * X_test_expected_CA_bis["total_amount_expected"].sum() /
            X_test_segment_bis["total_amount"].sum(), 2), "%")

# %% [markdown]
# ## Last pieces of analysis

# %%
# global revenue recovered (same durations as the main projection: 17 over 12)
duration_ratio = 17 / 12
global_revenue_recovered = round(100 * duration_ratio * X_test_expected_CA["total_amount_expected"].sum() /
                                 X_test_segment["total_amount"].sum(), 2)
print(f"overall share of revenue recovered : {global_revenue_recovered}%")

# %%
# share of the expected revenue coming from each segment (in percent)
100 * X_test_expected_CA["total_amount_expected"] / X_test_expected_CA["total_amount_expected"].sum()

# %%
# segment 4 accounts for 83% of the current revenue and 96% of next year's
# revenue coming from existing customers
100 * X_test_segment.groupby("quartile")["total_amount"].sum() / X_test_segment["total_amount"].sum()

# %%
# NOTE(review): the minimum is negative - negative total amounts need investigating
X_test_segment["total_amount"].describe()

# %%
# cumulative revenue over customers sorted by increasing total amount
np.cumsum(X_test_segment["total_amount"].sort_values()).reset_index()["total_amount"]