{ "cells": [ { "cell_type": "markdown", "id": "84b6e27e-4bda-4d38-8689-ec7fc0da1848", "metadata": {}, "source": [ "# Define segment and predict sales associated" ] }, { "cell_type": "markdown", "id": "ec059482-45d3-4ae6-99bc-9b4ced115db3", "metadata": {}, "source": [ "## Importations of packages " ] }, { "cell_type": "code", "execution_count": 70, "id": "9771bf29-d08e-4674-8c23-9a2672fbef8f", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from pandas import DataFrame\n", "import numpy as np\n", "import os\n", "import s3fs\n", "import re\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score\n", "from sklearn.utils import class_weight\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.compose import ColumnTransformer\n", "from sklearn.preprocessing import OneHotEncoder\n", "from sklearn.impute import SimpleImputer\n", "from sklearn.model_selection import GridSearchCV\n", "from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler\n", "from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score\n", "from sklearn.exceptions import ConvergenceWarning, DataConversionWarning\n", "from sklearn.naive_bayes import GaussianNB\n", "from scipy.optimize import fsolve\n", "import io\n", "\n", "import pickle\n", "import warnings" ] }, { "cell_type": "markdown", "id": "048fcd7c-800a-4a6b-b725-faf8410f924a", "metadata": {}, "source": [ "## load databases" ] }, { "cell_type": "code", "execution_count": 71, "id": "539ccbdf-f29f-4f04-99c1-8c88d0efe514", "metadata": {}, "outputs": [], "source": [ "# Create filesystem object\n", "S3_ENDPOINT_URL = 
\"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})" ] }, { "cell_type": "code", "execution_count": 270, "id": "d6017ed0-6233-4888-85a7-05dec50a255b", "metadata": {}, "outputs": [], "source": [ "type_of_activity = \"musee\"" ] }, { "cell_type": "code", "execution_count": 73, "id": "0c3a6ddc-9345-4a42-b6bf-a20a95de3028", "metadata": {}, "outputs": [], "source": [ "def load_train_test(type_of_activity):\n", " BUCKET = f\"projet-bdc2324-team1/Generalization/{type_of_activity}\"\n", " File_path_train = BUCKET + \"/Train_set.csv\"\n", " File_path_test = BUCKET + \"/Test_set.csv\"\n", " \n", " with fs.open( File_path_train, mode=\"rb\") as file_in:\n", " dataset_train = pd.read_csv(file_in, sep=\",\")\n", " # dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)\n", "\n", " with fs.open(File_path_test, mode=\"rb\") as file_in:\n", " dataset_test = pd.read_csv(file_in, sep=\",\")\n", " # dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)\n", " \n", " return dataset_train, dataset_test" ] }, { "cell_type": "code", "execution_count": 271, "id": "2831d546-b365-498b-8248-c618bd9c3057", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_1080/2350085345.py:7: DtypeWarning: Columns (38) have mixed types. 
def features_target_split(dataset_train, dataset_test):
    """Split the train and test datasets into feature frames and target frames.

    Parameters
    ----------
    dataset_train, dataset_test : pandas.DataFrame
        Datasets containing at least the feature columns listed below and the
        ``y_has_purchased`` column.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, X_test, y_train, y_test)``. The targets are one-column
        DataFrames (not Series). All four frames are independent copies of the
        inputs.
    """
    # 'is_partner' is deliberately left out of the feature list.
    # A reduced list (without fidelity, time_between_purchase and gender_other)
    # was considered because of a collinearity issue, but is not the one in use.
    features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max',
                  'purchase_date_min', 'purchase_date_max', 'time_between_purchase', 'nb_tickets_internet',
                  'fidelity', 'is_email_true', 'opt_in',
                  'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']
    target_l = ['y_has_purchased']

    # Explicit .copy(): a bare column selection is flagged by pandas as derived
    # from its parent frame, so adding a column to it afterwards raises
    # SettingWithCopyWarning (as happened further down in this notebook).
    X_train = dataset_train[features_l].copy()
    y_train = dataset_train[target_l].copy()

    X_test = dataset_test[features_l].copy()
    y_test = dataset_test[target_l].copy()
    return X_train, X_test, y_train, y_test
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nb_ticketsnb_purchasestotal_amountnb_suppliersvente_internet_maxpurchase_date_minpurchase_date_maxtime_between_purchasenb_tickets_internetfidelityis_email_trueopt_ingender_femalegender_malegender_othernb_campaignsnb_campaigns_opened
04.01.0100.001.00.05.1771875.1771870.0000000.01TrueFalse1000.00.0
11.01.055.001.00.0426.265613426.2656130.0000000.02TrueTrue0100.00.0
217.01.080.001.00.0436.033437436.0334370.0000000.02TrueTrue1000.00.0
34.01.0120.001.00.05.1964125.1964120.0000000.01TrueFalse1000.00.0
434.02.0416.001.00.0478.693148115.631470363.0616780.04TrueFalse1000.00.0
......................................................
960911.01.067.311.01.0278.442257278.4422570.0000001.02TrueFalse01015.05.0
960921.01.061.411.01.0189.207373189.2073730.0000001.01TrueFalse01012.09.0
960930.00.00.000.00.0550.000000550.000000-1.0000000.01TrueTrue10029.03.0
960941.01.079.431.01.0279.312905279.3129050.0000001.01TrueFalse01020.04.0
960950.00.00.000.00.0550.000000550.000000-1.0000000.02TrueFalse01031.04.0
\n", "

96096 rows × 17 columns

\n", "
" ], "text/plain": [ " nb_tickets nb_purchases total_amount nb_suppliers \\\n", "0 4.0 1.0 100.00 1.0 \n", "1 1.0 1.0 55.00 1.0 \n", "2 17.0 1.0 80.00 1.0 \n", "3 4.0 1.0 120.00 1.0 \n", "4 34.0 2.0 416.00 1.0 \n", "... ... ... ... ... \n", "96091 1.0 1.0 67.31 1.0 \n", "96092 1.0 1.0 61.41 1.0 \n", "96093 0.0 0.0 0.00 0.0 \n", "96094 1.0 1.0 79.43 1.0 \n", "96095 0.0 0.0 0.00 0.0 \n", "\n", " vente_internet_max purchase_date_min purchase_date_max \\\n", "0 0.0 5.177187 5.177187 \n", "1 0.0 426.265613 426.265613 \n", "2 0.0 436.033437 436.033437 \n", "3 0.0 5.196412 5.196412 \n", "4 0.0 478.693148 115.631470 \n", "... ... ... ... \n", "96091 1.0 278.442257 278.442257 \n", "96092 1.0 189.207373 189.207373 \n", "96093 0.0 550.000000 550.000000 \n", "96094 1.0 279.312905 279.312905 \n", "96095 0.0 550.000000 550.000000 \n", "\n", " time_between_purchase nb_tickets_internet fidelity is_email_true \\\n", "0 0.000000 0.0 1 True \n", "1 0.000000 0.0 2 True \n", "2 0.000000 0.0 2 True \n", "3 0.000000 0.0 1 True \n", "4 363.061678 0.0 4 True \n", "... ... ... ... ... \n", "96091 0.000000 1.0 2 True \n", "96092 0.000000 1.0 1 True \n", "96093 -1.000000 0.0 1 True \n", "96094 0.000000 1.0 1 True \n", "96095 -1.000000 0.0 2 True \n", "\n", " opt_in gender_female gender_male gender_other nb_campaigns \\\n", "0 False 1 0 0 0.0 \n", "1 True 0 1 0 0.0 \n", "2 True 1 0 0 0.0 \n", "3 False 1 0 0 0.0 \n", "4 False 1 0 0 0.0 \n", "... ... ... ... ... ... \n", "96091 False 0 1 0 15.0 \n", "96092 False 0 1 0 12.0 \n", "96093 True 1 0 0 29.0 \n", "96094 False 0 1 0 20.0 \n", "96095 False 0 1 0 31.0 \n", "\n", " nb_campaigns_opened \n", "0 0.0 \n", "1 0.0 \n", "2 0.0 \n", "3 0.0 \n", "4 0.0 \n", "... ... 
def load_model(type_of_activity, model):
    """Fetch a pickled model from S3 and return the deserialized object.

    The file read is
    ``projet-bdc2324-team1/Output_model/<type_of_activity>/<model>/<model>.pkl``,
    opened through the module-level ``fs`` filesystem object.

    Parameters
    ----------
    type_of_activity : str
        Name of the activity sub-folder.
    model : str
        Model name; used both as the folder name and as the pickle file stem.

    Returns
    -------
    object
        Whatever object was pickled in the file.
    """
    model_dir = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/"
    pickle_path = model_dir + (model + '.pkl')

    with fs.open(pickle_path, mode="rb") as stream:
        payload = stream.read()

    # NOTE(review): unpickling can execute arbitrary code; only point this at
    # buckets whose content is trusted.
    return pickle.loads(payload)
GridSearchCV(cv=3, error_score='raise',\n",
       "             estimator=Pipeline(steps=[('preprocessor',\n",
       "                                        ColumnTransformer(transformers=[('num',\n",
       "                                                                         Pipeline(steps=[('scaler',\n",
       "                                                                                          StandardScaler())]),\n",
       "                                                                         ['nb_tickets',\n",
       "                                                                          'nb_purchases',\n",
       "                                                                          'total_amount',\n",
       "                                                                          'nb_suppliers',\n",
       "                                                                          'vente_internet_max',\n",
       "                                                                          'purchase_date_min',\n",
       "                                                                          'purchase_date_max',\n",
       "                                                                          'time_between_purchase',\n",
       "                                                                          'nb_tickets_internet',\n",
       "                                                                          'nb_campaigns',\n",
       "                                                                          'nb_...\n",
       "       1.562500e-02, 3.125000e-02, 6.250000e-02, 1.250000e-01,\n",
       "       2.500000e-01, 5.000000e-01, 1.000000e+00, 2.000000e+00,\n",
       "       4.000000e+00, 8.000000e+00, 1.600000e+01, 3.200000e+01,\n",
       "       6.400000e+01]),\n",
       "                         'LogisticRegression_cv__class_weight': ['balanced',\n",
       "                                                                 {0.0: 0.5223906809346011,\n",
       "                                                                  1.0: 11.665359406898034}],\n",
       "                         'LogisticRegression_cv__penalty': ['l1', 'l2']},\n",
       "             scoring=make_scorer(recall_score, response_method='predict'))
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "GridSearchCV(cv=3, error_score='raise',\n", " estimator=Pipeline(steps=[('preprocessor',\n", " ColumnTransformer(transformers=[('num',\n", " Pipeline(steps=[('scaler',\n", " StandardScaler())]),\n", " ['nb_tickets',\n", " 'nb_purchases',\n", " 'total_amount',\n", " 'nb_suppliers',\n", " 'vente_internet_max',\n", " 'purchase_date_min',\n", " 'purchase_date_max',\n", " 'time_between_purchase',\n", " 'nb_tickets_internet',\n", " 'nb_campaigns',\n", " 'nb_...\n", " 1.562500e-02, 3.125000e-02, 6.250000e-02, 1.250000e-01,\n", " 2.500000e-01, 5.000000e-01, 1.000000e+00, 2.000000e+00,\n", " 4.000000e+00, 8.000000e+00, 1.600000e+01, 3.200000e+01,\n", " 6.400000e+01]),\n", " 'LogisticRegression_cv__class_weight': ['balanced',\n", " {0.0: 0.5223906809346011,\n", " 1.0: 11.665359406898034}],\n", " 'LogisticRegression_cv__penalty': ['l1', 'l2']},\n", " scoring=make_scorer(recall_score, response_method='predict'))" ] }, "execution_count": 286, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model = load_model(type_of_activity, \"LogisticRegression_cv\")\n", "# model = load_model(type_of_activity, \"randomF_cv\")\n", "model" ] }, { "cell_type": "markdown", "id": "006819e7-e9c5-48d9-85ee-aa43d5e4c9c2", "metadata": {}, "source": [ "## Quartile clustering" ] }, { "cell_type": "code", "execution_count": 287, "id": "018d8ff4-3436-4eec-8507-d1a265cbabf1", "metadata": {}, "outputs": [], "source": [ "y_pred = model.predict(X_test)\n", "y_pred_prob = model.predict_proba(X_test)[:, 1]" ] }, { "cell_type": "code", "execution_count": 288, "id": "846f53b9-73c2-4a8b-9d9e-f11bf59ce9ba", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_1080/375041546.py:3: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: 
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " X_test_segment[\"has_purchased\"] = y_test\n", "/tmp/ipykernel_1080/375041546.py:4: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " X_test_segment[\"has_purchased_estim\"] = y_pred\n", "/tmp/ipykernel_1080/375041546.py:5: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " X_test_segment[\"score\"] = y_pred_prob\n", "/tmp/ipykernel_1080/375041546.py:6: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " X_test_segment[\"quartile\"] = np.where(X_test['score']<0.25, '1',\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nb_ticketsnb_purchasestotal_amountnb_suppliersvente_internet_maxpurchase_date_minpurchase_date_maxtime_between_purchasenb_tickets_internetfidelity...gender_femalegender_malegender_othernb_campaignsnb_campaigns_openedhas_purchasedhas_purchased_estimscorequartilescore_adjusted
02.01.022.01.01.0307.203553307.2035530.0000002.01...0010.00.00.00.00.36796120.010594
1269.08.050.02.01.0378.20809039.389595338.81849566.010...00165.01.01.01.00.99873140.397108
20.00.00.00.00.0550.000000550.000000-1.0000000.00...0104.02.00.00.00.21199710.014916
30.00.00.00.00.0550.000000550.000000-1.0000000.00...1002.00.00.00.00.24656310.024670
40.00.00.00.00.0550.000000550.000000-1.0000000.00...0014.00.00.00.00.10857510.025205
50.00.00.00.00.0550.000000550.000000-1.0000000.00...1007.00.00.00.00.25724420.046644
60.00.00.00.00.0550.000000550.000000-1.0000000.01...0102.00.00.00.00.20319610.023026
70.00.00.00.00.0550.000000550.000000-1.0000000.00...01010.08.00.00.00.24004910.003825
81.01.011.01.01.0456.255104456.2551040.0000001.01...0013.03.00.00.00.34009820.006850
90.00.00.00.00.0550.000000550.000000-1.0000000.00...01010.06.00.00.00.23447010.003745
\n", "

10 rows × 22 columns

\n", "
" ], "text/plain": [ " nb_tickets nb_purchases total_amount nb_suppliers vente_internet_max \\\n", "0 2.0 1.0 22.0 1.0 1.0 \n", "1 269.0 8.0 50.0 2.0 1.0 \n", "2 0.0 0.0 0.0 0.0 0.0 \n", "3 0.0 0.0 0.0 0.0 0.0 \n", "4 0.0 0.0 0.0 0.0 0.0 \n", "5 0.0 0.0 0.0 0.0 0.0 \n", "6 0.0 0.0 0.0 0.0 0.0 \n", "7 0.0 0.0 0.0 0.0 0.0 \n", "8 1.0 1.0 11.0 1.0 1.0 \n", "9 0.0 0.0 0.0 0.0 0.0 \n", "\n", " purchase_date_min purchase_date_max time_between_purchase \\\n", "0 307.203553 307.203553 0.000000 \n", "1 378.208090 39.389595 338.818495 \n", "2 550.000000 550.000000 -1.000000 \n", "3 550.000000 550.000000 -1.000000 \n", "4 550.000000 550.000000 -1.000000 \n", "5 550.000000 550.000000 -1.000000 \n", "6 550.000000 550.000000 -1.000000 \n", "7 550.000000 550.000000 -1.000000 \n", "8 456.255104 456.255104 0.000000 \n", "9 550.000000 550.000000 -1.000000 \n", "\n", " nb_tickets_internet fidelity ... gender_female gender_male \\\n", "0 2.0 1 ... 0 0 \n", "1 66.0 10 ... 0 0 \n", "2 0.0 0 ... 0 1 \n", "3 0.0 0 ... 1 0 \n", "4 0.0 0 ... 0 0 \n", "5 0.0 0 ... 1 0 \n", "6 0.0 1 ... 0 1 \n", "7 0.0 0 ... 0 1 \n", "8 1.0 1 ... 0 0 \n", "9 0.0 0 ... 
def df_segment(df, y, model) :
    """Score a feature frame with ``model`` and attach the results as columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Features passed as-is to ``model.predict`` and ``model.predict_proba``.
        It is NOT modified: the result is built on a copy.
    y : pandas.Series or one-column pandas.DataFrame
        Observed outcome, stored in the ``has_purchased`` column.
    model : object
        Fitted estimator exposing ``predict`` and ``predict_proba``; column 1
        of ``predict_proba`` is used as the score.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with four extra columns: ``has_purchased``,
        ``has_purchased_estim``, ``score`` and ``quartile``. Despite its name,
        ``quartile`` is a string label '1'..'4' obtained from fixed score
        thresholds (0.25, 0.5, 0.75), not from empirical quartiles.
    """
    # Predict first, on the untouched features.
    y_pred = model.predict(df)
    y_pred_prob = model.predict_proba(df)[:, 1]

    # Work on a copy: aliasing the input (the previous behaviour) silently added
    # columns to the caller's frame and triggered SettingWithCopyWarning.
    segmented = df.copy()

    segmented["has_purchased"] = y
    segmented["has_purchased_estim"] = y_pred
    segmented["score"] = y_pred_prob
    segmented["quartile"] = np.where(segmented['score'] < 0.25, '1',
                            np.where(segmented['score'] < 0.5, '2',
                            np.where(segmented['score'] < 0.75, '3', '4')))

    return segmented
"/tmp/ipykernel_1080/2624515794.py:8: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_segment[\"has_purchased\"] = y\n", "/tmp/ipykernel_1080/2624515794.py:9: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_segment[\"has_purchased_estim\"] = y_pred\n", "/tmp/ipykernel_1080/2624515794.py:10: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_segment[\"score\"] = y_pred_prob\n", "/tmp/ipykernel_1080/2624515794.py:11: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_segment[\"quartile\"] = np.where(df_segment['score']<0.25, '1',\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nb_ticketsnb_purchasestotal_amountnb_suppliersvente_internet_maxpurchase_date_minpurchase_date_maxtime_between_purchasenb_tickets_internetfidelity...opt_ingender_femalegender_malegender_othernb_campaignsnb_campaigns_openedhas_purchasedhas_purchased_estimscorequartile
04.01.0100.001.00.05.1771875.1771870.0000000.01...False1000.00.00.00.00.0060661
11.01.055.001.00.0426.265613426.2656130.0000000.02...True0100.00.01.00.00.2888472
217.01.080.001.00.0436.033437436.0334370.0000000.02...True1000.00.00.00.00.1032641
34.01.0120.001.00.05.1964125.1964120.0000000.01...False1000.00.00.00.00.0089281
434.02.0416.001.00.0478.693148115.631470363.0616780.04...False1000.00.01.01.00.9928094
..................................................................
960911.01.067.311.01.0278.442257278.4422570.0000001.02...False01015.05.01.00.00.3517622
960921.01.061.411.01.0189.207373189.2073730.0000001.01...False01012.09.00.01.00.5678143
960930.00.00.000.00.0550.000000550.000000-1.0000000.01...True10029.03.00.00.00.0046521
960941.01.079.431.01.0279.312905279.3129050.0000001.01...False01020.04.00.00.00.2930422
960950.00.00.000.00.0550.000000550.000000-1.0000000.02...False01031.04.00.01.00.7878524
\n", "

96096 rows × 21 columns

\n", "
" ], "text/plain": [ " nb_tickets nb_purchases total_amount nb_suppliers \\\n", "0 4.0 1.0 100.00 1.0 \n", "1 1.0 1.0 55.00 1.0 \n", "2 17.0 1.0 80.00 1.0 \n", "3 4.0 1.0 120.00 1.0 \n", "4 34.0 2.0 416.00 1.0 \n", "... ... ... ... ... \n", "96091 1.0 1.0 67.31 1.0 \n", "96092 1.0 1.0 61.41 1.0 \n", "96093 0.0 0.0 0.00 0.0 \n", "96094 1.0 1.0 79.43 1.0 \n", "96095 0.0 0.0 0.00 0.0 \n", "\n", " vente_internet_max purchase_date_min purchase_date_max \\\n", "0 0.0 5.177187 5.177187 \n", "1 0.0 426.265613 426.265613 \n", "2 0.0 436.033437 436.033437 \n", "3 0.0 5.196412 5.196412 \n", "4 0.0 478.693148 115.631470 \n", "... ... ... ... \n", "96091 1.0 278.442257 278.442257 \n", "96092 1.0 189.207373 189.207373 \n", "96093 0.0 550.000000 550.000000 \n", "96094 1.0 279.312905 279.312905 \n", "96095 0.0 550.000000 550.000000 \n", "\n", " time_between_purchase nb_tickets_internet fidelity ... opt_in \\\n", "0 0.000000 0.0 1 ... False \n", "1 0.000000 0.0 2 ... True \n", "2 0.000000 0.0 2 ... True \n", "3 0.000000 0.0 1 ... False \n", "4 363.061678 0.0 4 ... False \n", "... ... ... ... ... ... \n", "96091 0.000000 1.0 2 ... False \n", "96092 0.000000 1.0 1 ... False \n", "96093 -1.000000 0.0 1 ... True \n", "96094 0.000000 1.0 1 ... False \n", "96095 -1.000000 0.0 2 ... False \n", "\n", " gender_female gender_male gender_other nb_campaigns \\\n", "0 1 0 0 0.0 \n", "1 0 1 0 0.0 \n", "2 1 0 0 0.0 \n", "3 1 0 0 0.0 \n", "4 1 0 0 0.0 \n", "... ... ... ... ... \n", "96091 0 1 0 15.0 \n", "96092 0 1 0 12.0 \n", "96093 1 0 0 29.0 \n", "96094 0 1 0 20.0 \n", "96095 0 1 0 31.0 \n", "\n", " nb_campaigns_opened has_purchased has_purchased_estim score \\\n", "0 0.0 0.0 0.0 0.006066 \n", "1 0.0 1.0 0.0 0.288847 \n", "2 0.0 0.0 0.0 0.103264 \n", "3 0.0 0.0 0.0 0.008928 \n", "4 0.0 1.0 1.0 0.992809 \n", "... ... ... ... ... 
\n", "96091 5.0 1.0 0.0 0.351762 \n", "96092 9.0 0.0 1.0 0.567814 \n", "96093 3.0 0.0 0.0 0.004652 \n", "96094 4.0 0.0 0.0 0.293042 \n", "96095 4.0 0.0 1.0 0.787852 \n", "\n", " quartile \n", "0 1 \n", "1 2 \n", "2 1 \n", "3 1 \n", "4 4 \n", "... ... \n", "96091 2 \n", "96092 3 \n", "96093 1 \n", "96094 2 \n", "96095 4 \n", "\n", "[96096 rows x 21 columns]" ] }, "execution_count": 88, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_segment(X_test, y_test, model)" ] }, { "cell_type": "markdown", "id": "ad16b8ab-7e01-404b-971e-866e9b9d5aa4", "metadata": {}, "source": [ "## definition of functions to compute the bias of scores and adjust it \n", "\n", "Le biais est calculé de la façon suivante. \n", "En notant $\\hat{p(x_i)}$ le score calculé (estimé par la modélisation) et $p(x_i)$ le vrai score (sans biais), et $\\beta$ le logarithme du biais, on a : \\\n", "$\\ln{\\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}}} = \\beta + \\ln{\\frac{p(x_i)}{1-p(x_i)}}$ \\\n", "$ \\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}} = \\exp(\\beta) . \\frac{p(x_i)}{1-p(x_i)} $ , soit : \\\n", "$p(x_i) = {\\frac{\\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}}}{B+\\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}}}}$ \\\n", "Ce qu'on appelle biais et qu'on estime dans le code par la suite est : $B=\\exp(\\beta) $. Les probabilités ne sont donc pas biaisées si $B=1$. Il y a surestimation si $B>1$. \n", "\n", "On cherche le B qui permette d'ajuster les probabilités de telle sorte que la somme des scores soit égale à la somme des y_has_purchased. Cela revient à résoudre : \n", "\n", "\\begin{equation}\n", "\\sum_{i}{\\frac{\\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}}}{B+\\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}}}} = \\sum_{i}{Y_i}\n", "\\end{equation}\n", "\n", "C'est ce que fait la fonction find_bias. 
\n", "\n", "Note sur les notations : \\\n", "$\\hat{p(x_i)}$ correspond à ce qu'on appelle le score et $p(x_i)$ à ce qu'on appellera le score adjusted" ] }, { "cell_type": "code", "execution_count": 89, "id": "f0379536-a6c5-4b16-bde5-d0319ec1b140", "metadata": {}, "outputs": [], "source": [
"# compute adjusted score from odd ratios (cf formula above)\n",
"def adjusted_score(odd_ratio, bias):\n",
"    \"\"\"Return the bias-corrected score odd_ratio / (bias + odd_ratio).\n",
"\n",
"    odd_ratio is the odds p / (1 - p) of the raw score (scalar, array or Series)\n",
"    and bias is the multiplicative bias B of the formula above.\n",
"    \"\"\"\n",
"    return odd_ratio / (bias + odd_ratio)"
] }, { "cell_type": "code", "execution_count": 90, "id": "32a0dfd0-f49d-4785-a56f-706d381bfe41", "metadata": {}, "outputs": [], "source": [
"# when the score is 1 we cannot compute the odd ratio, so we adjust scores equal to 1\n",
"# we set the second best score instead\n",
"\n",
"def adjust_score_1(score):\n",
"    \"\"\"Return the scores as an array in which every value equal to 1 is replaced\n",
"    by the largest score different from 1.\n",
"\n",
"    Raises ValueError when no score differs from 1 (there is no second best score).\n",
"    \"\"\"\n",
"    scores = np.asarray(score)\n",
"    not_one = scores[scores != 1]\n",
"    if not_one.size == 0:\n",
"        raise ValueError(\"adjust_score_1 needs at least one score different from 1\")\n",
"    # vectorized replacement instead of two Python-level loops over every score\n",
"    return np.where(scores == 1, not_one.max(), scores)"
] }, { "cell_type": "code", "execution_count": 91, "id": "2dff1def-02df-413e-afce-b4aeaf7752b6", "metadata": {}, "outputs": [], "source": [
"def odd_ratio(score):\n",
"    \"\"\"Return the odds score / (1 - score); not defined for a score equal to 1 (see adjust_score_1).\"\"\"\n",
"    return score / (1 - score)"
] }, { "cell_type": "code", "execution_count": 92, "id": "683d71fc-7442-4028-869c-49c57592d6e9", "metadata": {}, "outputs": [], "source": [
"# definition of a function that automatically detects the bias\n",
"\n",
"def find_bias(odd_ratios, y_objective, initial_guess=6):\n",
"    \"\"\"Estimate the bias B for which the adjusted scores sum to y_objective.\n",
"\n",
"    odd_ratios : odds of the raw scores (any array-like of numbers).\n",
"    y_objective : target value for the sum of the adjusted scores.\n",
"    initial_guess : starting point handed to scipy.optimize.fsolve.\n",
"\n",
"    Returns the root found by fsolve (first and only component).\n",
"    \"\"\"\n",
"    odds = np.asarray(odd_ratios, dtype=float)\n",
"\n",
"    def gap(bias):\n",
"        # difference between the sum of the adjusted scores and the target\n",
"        return adjusted_score(odds, bias).sum() - y_objective\n",
"\n",
"    # root finding is faster than minimizing the squared gap (e.g. with BFGS);\n",
"    # the starting point is the initial_guess argument, it used to be hard-coded to 6\n",
"    bias_estimated = fsolve(gap, x0=initial_guess)\n",
"\n",
"    return bias_estimated[0]"
] }, { "cell_type": "code", "execution_count": 289, "id": 
"f17dc6ca-7a48-441b-8c04-11c47b8b9741", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.3940650533525649 0.04284869976359338\n" ] }, { "data": { "text/plain": [ "0.04286194557403322" ] }, "execution_count": 289, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(X_test_segment[\"score\"].mean(), y_test[\"y_has_purchased\"].mean())\n", "y_train[\"y_has_purchased\"].mean()" ] }, { "cell_type": "code", "execution_count": 290, "id": "781b0d40-c954-4c54-830a-e709c8667328", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "22.577005337484817" ] }, "execution_count": 290, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# computation with the function defined\n", "\n", "bias_test_set = find_bias(odd_ratios = odd_ratio(adjust_score_1(X_test_segment[\"score\"])), \n", " y_objective = y_test[\"y_has_purchased\"].sum(),\n", " initial_guess=6)\n", "bias_test_set" ] }, { "cell_type": "code", "execution_count": 291, "id": "248cb862-418e-4767-9933-70c4885ecf40", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "22.690061493186622" ] }, "execution_count": 291, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# comparison with bias of the train set\n", "X_train_score = model.predict_proba(X_train)[:, 1]\n", "\n", "bias_train_set = find_bias(odd_ratios = odd_ratio(adjust_score_1(X_train_score)), \n", " y_objective = y_train[\"y_has_purchased\"].sum(),\n", " initial_guess=6)\n", "bias_train_set" ] }, { "cell_type": "code", "execution_count": 292, "id": "fff6cbe6-7bb3-4732-9b81-b9ac5383bbcf", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "betâ test - betâ train = -0.0049950835646278635\n" ] } ], "source": [ "print(\"betâ test - betâ train = \",np.log(bias_test_set/bias_train_set))" ] }, { "cell_type": "code", "execution_count": 293, "id": "f506870d-4a8a-4b2c-8f0b-e0789080b20c", "metadata": {}, "outputs": [ { "name": "stdout", 
"output_type": "stream", "text": [ "mean absolute erreur 0.00017894295558797563\n" ] } ], "source": [ "# impact of considering a bias computed on train set instead of test set - totally neglectable\n", "\n", "score_adjusted_test = adjusted_score(odd_ratio(adjust_score_1(X_test_segment[\"score\"])), bias = bias_test_set)\n", "score_adjusted_train = adjusted_score(odd_ratio(adjust_score_1(X_test_segment[\"score\"])), bias = bias_train_set)\n", "\n", "print(\"mean absolute erreur\",abs(score_adjusted_test-score_adjusted_train).mean())" ] }, { "cell_type": "code", "execution_count": 294, "id": "8213d0e4-063b-49fa-90b7-677fc34f4c01", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_1080/1825363704.py:7: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " X_test_segment[\"score_adjusted\"] = score_adjusted_train\n" ] } ], "source": [ "# adjust scores accordingly \n", "\n", "# X_test_segment[\"score_adjusted\"] = adjusted_score(odd_ratio(adjust_score_1(X_test_segment[\"score\"])), bias = bias_test_set)\n", "\n", "# actually, we are not supposed to have X_test, so the biais is estimated on X_train\n", "# X_test_segment[\"score_adjusted\"] = adjusted_score(odd_ratio(adjust_score_1(X_test_segment[\"score\"])), bias = bias_train_set)\n", "X_test_segment[\"score_adjusted\"] = score_adjusted_train" ] }, { "cell_type": "code", "execution_count": 295, "id": "834d3723-2e72-4c65-9c62-e2d595c69461", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "MSE for score : 0.18391062438077188\n", "MSE for ajusted score : 0.037093800862222845\n", "sum of y_has_purchased : 7975.0\n", "sum of adjusted scores : 7941.695137104767\n" ] } ], "source": [ "# 
check \n", "\n", "MSE_score = ((X_test_segment[\"score\"]-X_test_segment[\"has_purchased\"])**2).mean()\n", "MSE_ajusted_score = ((X_test_segment[\"score_adjusted\"]-X_test_segment[\"has_purchased\"])**2).mean()\n", "print(f\"MSE for score : {MSE_score}\")\n", "print(f\"MSE for ajusted score : {MSE_ajusted_score}\")\n", "\n", "print(\"sum of y_has_purchased :\",y_test[\"y_has_purchased\"].sum())\n", "print(\"sum of adjusted scores :\", X_test_segment[\"score_adjusted\"].sum())" ] }, { "cell_type": "code", "execution_count": 296, "id": "9f30a4dd-a9d8-405a-a7d5-5324ae88cf70", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "MAE for score : 0.38422988971624206\n", "MAE for adjusted score : 0.07284616452278603\n" ] } ], "source": [ "# mean absolute error of the raw score and of the adjusted score against has_purchased\n", "# (in the stored output the adjusted score has the lower error)\n", "\n", "MAE_score = abs(X_test_segment[\"score\"]-X_test_segment[\"has_purchased\"]).mean()\n", "MAE_ajusted_score = abs(X_test_segment[\"score_adjusted\"]-X_test_segment[\"has_purchased\"]).mean()\n", "print(f\"MAE for score : {MAE_score}\")\n", "print(f\"MAE for adjusted score : {MAE_ajusted_score}\")" ] }, { "cell_type": "code", "execution_count": 103, "id": "6f9396db-e213-408c-a596-eaeec3bc79f3", "metadata": {}, "outputs": [], "source": [ "# visualization\n", "\n", "# histogram of the scores and of the adjusted scores\n", "\n", "def plot_hist_scores(df, score, score_adjusted, type_of_activity) :\n", " \"\"\"Draw, on a new pyplot figure, the overlaid histograms of df[score] and df[score_adjusted].\n", "\n", " score and score_adjusted are column labels of df; type_of_activity is only inserted in the title.\n", " The function returns nothing and does not call plt.show().\n", " \"\"\"\n", "\n", " plt.figure()\n", " plt.hist(df[score], label = \"score\", alpha=0.6)\n", " plt.hist(df[score_adjusted], label=\"adjusted score\", alpha=0.6)\n", " plt.legend()\n", " plt.xlabel(\"probability of a future purchase\")\n", " plt.ylabel(\"count\")\n", " plt.title(f\"Comparison between score and adjusted score for {type_of_activity} companies\")\n", " # plt.show()" ] }, { "cell_type": "code", "execution_count": 64, "id": "def64c16-f4dd-493c-909c-d886d7f53947", "metadata": {}, "outputs": [ { "data": { "text/plain": [ 
"'projet-bdc2324-team1/Output_expected_CA/sport/hist_score_adjustedsport.png'" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "PATH + file_name + type_of_activity + \".png\"" ] }, { "cell_type": "code", "execution_count": 297, "id": "b478d40d-9677-4204-87bd-16fb0bc1fe9a", "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "plot_hist_scores(X_test_segment, score = \"score\", score_adjusted = \"score_adjusted\", type_of_activity = type_of_activity)" ] }, { "cell_type": "code", "execution_count": 40, "id": "add631d7-0757-45a5-bb5b-f7f4b4baa961", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "projet-bdc2324-team1/Output_expected_CA/sport/\n" ] } ], "source": [ "# define path so save graphics\n", "\n", "# define type of activity \n", "type_of_activity = \"sport\"\n", "PATH = f\"projet-bdc2324-team1/Output_expected_CA/{type_of_activity}/\"\n", "print(PATH)" ] }, { "cell_type": "code", "execution_count": 68, "id": "3a5b5bd9-e033-4436-8c56-bf5fb61df87f", "metadata": {}, "outputs": [], "source": [ "# export png \n", "\n", "# plot adjusted scores and save (to be tested)\n", "plot_hist_scores(X_test_segment, score = \"score\", score_adjusted = \"score_adjusted\", type_of_activity = type_of_activity)\n", "\n", "image_buffer = io.BytesIO()\n", "plt.savefig(image_buffer, format='png')\n", "image_buffer.seek(0)\n", "file_name = \"hist_score_adjusted_\"\n", "FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + \".png\"\n", "with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:\n", " s3_file.write(image_buffer.read())\n", "plt.close()" ] }, { "cell_type": "markdown", "id": "e6fae260-fab8-4f51-90dc-9b6d7314c77b", "metadata": {}, "source": [ "## Compute number of tickets and CA by segment with the recalibrated score" ] }, { "cell_type": "code", "execution_count": 298, "id": "90c4c2b5-0ede-4001-889f-749cfbd9df04", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
quartilescore (%)score adjusted (%)has purchased (%)
0117.780.960.67
1236.122.492.83
2363.147.297.04
3486.0329.2129.20
\n", "
" ], "text/plain": [ " quartile score (%) score adjusted (%) has purchased (%)\n", "0 1 17.78 0.96 0.67\n", "1 2 36.12 2.49 2.83\n", "2 3 63.14 7.29 7.04\n", "3 4 86.03 29.21 29.20" ] }, "execution_count": 298, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_test_table_adjusted_scores = (100 * X_test_segment.groupby(\"quartile\")[[\"score\",\"score_adjusted\", \"has_purchased\"]].mean()).round(2).reset_index()\n", "X_test_table_adjusted_scores = X_test_table_adjusted_scores.rename(columns = {col : f\"{col.replace('_', ' ')} (%)\" for col in X_test_table_adjusted_scores.columns if col in [\"score\",\"score_adjusted\", \"has_purchased\"]})\n", "X_test_table_adjusted_scores" ] }, { "cell_type": "code", "execution_count": 162, "id": "d0b8740c-cf48-4a3e-83cb-23d95059f62f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'\\\\begin{tabular}{lrrr}\\n\\\\toprule\\nquartile & score (%) & score adjusted (%) & has purchased (%) \\\\\\\\\\n\\\\midrule\\n1 & 13.250000 & 2.510000 & 1.570000 \\\\\\\\\\n2 & 33.890000 & 8.000000 & 9.850000 \\\\\\\\\\n3 & 63.060000 & 22.580000 & 21.470000 \\\\\\\\\\n4 & 90.520000 & 66.200000 & 65.010000 \\\\\\\\\\n\\\\bottomrule\\n\\\\end{tabular}\\n'" ] }, "execution_count": 162, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_test_table_adjusted_scores.to_latex(index=False)" ] }, { "cell_type": "code", "execution_count": 43, "id": "d6a04d3e-c454-43e4-ae4c-0746e928575b", "metadata": {}, "outputs": [], "source": [ "# comparison between score and adjusted score - export csv associated\n", "\n", "file_name = \"table_adjusted_score_\"\n", "FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + \".csv\"\n", "with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n", " X_test_table_adjusted_scores.to_csv(file_out, index = False)" ] }, { "cell_type": "code", "execution_count": 106, "id": "a974589f-7952-4db2-bebf-7b69c6b09372", "metadata": {}, "outputs": [], "source": [
"def project_tickets_CA(df, nb_purchases, nb_tickets, total_amount, score_adjusted, duration_ref, duration_projection):\n",
"    \"\"\"Project tickets and revenue on a future period and weight them by the adjusted score.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df : DataFrame holding the columns named below.\n",
"    nb_purchases, nb_tickets, total_amount, score_adjusted : labels of the columns of df to use.\n",
"    duration_ref : length of the reference period.\n",
"    duration_projection : length of the projection period, in the same unit as duration_ref.\n",
"\n",
"    Returns\n",
"    -------\n",
"    A copy of df with the extra columns nb_tickets_projected, total_amount_projected,\n",
"    nb_tickets_expected, total_amount_expected and pace_purchase. df itself is not modified.\n",
"    \"\"\"\n",
"    duration_ratio = duration_ref / duration_projection\n",
"\n",
"    # work on a copy: the caller's frame is left untouched and no SettingWithCopyWarning\n",
"    # is raised when df is itself a slice of another DataFrame\n",
"    df_output = df.copy()\n",
"\n",
"    # quantities rescaled from the reference period to the projection period\n",
"    df_output[\"nb_tickets_projected\"] = df_output[nb_tickets] / duration_ratio\n",
"    df_output[\"total_amount_projected\"] = df_output[total_amount] / duration_ratio\n",
"\n",
"    # projected quantities weighted by the adjusted score\n",
"    df_output[\"nb_tickets_expected\"] = df_output[score_adjusted] * df_output[\"nb_tickets_projected\"]\n",
"    df_output[\"total_amount_expected\"] = df_output[score_adjusted] * df_output[\"total_amount_projected\"]\n",
"\n",
"    # reference duration divided by the number of purchases; a division by zero (no purchase)\n",
"    # gives an infinite value, which is stored as NaN\n",
"    pace = duration_ref / df_output[nb_purchases]\n",
"    df_output[\"pace_purchase\"] = pace.replace([np.inf, -np.inf], np.nan)\n",
"\n",
"    return df_output\n"
] }, { "cell_type": "code", "execution_count": 107, "id": "dd8a52e1-d06e-4790-8687-8e58e3e6b84e", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_1080/3982240549.py:7: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_output.loc[:,\"nb_tickets_projected\"] = df_output.loc[:,nb_tickets] / duration_ratio\n", "/tmp/ipykernel_1080/3982240549.py:8: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_output.loc[:,\"total_amount_projected\"] = df_output.loc[:,total_amount] / duration_ratio\n", "/tmp/ipykernel_1080/3982240549.py:10: SettingWithCopyWarning: \n", "A value is trying to be set on a copy 
of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_output.loc[:,\"nb_tickets_expected\"] = df_output.loc[:,score_adjusted] * df_output.loc[:,\"nb_tickets_projected\"]\n", "/tmp/ipykernel_1080/3982240549.py:11: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_output.loc[:,\"total_amount_expected\"] = df_output.loc[:,score_adjusted] * df_output.loc[:,\"total_amount_projected\"]\n", "/tmp/ipykernel_1080/3982240549.py:13: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_output.loc[:,\"pace_purchase\"] = (duration_ref/df_output.loc[:,nb_purchases]).apply(lambda x : np.nan if x==np.inf else x)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nb_ticketsnb_purchasestotal_amountnb_suppliersvente_internet_maxpurchase_date_minpurchase_date_maxtime_between_purchasenb_tickets_internetfidelity...has_purchasedhas_purchased_estimscorequartilescore_adjustednb_tickets_projectedtotal_amount_projectednb_tickets_expectedtotal_amount_expectedpace_purchase
04.01.0100.001.00.05.1771875.1771870.0000000.01...0.00.00.00606610.0017132.82352970.5882350.0048360.12089017.0
11.01.055.001.00.0426.265613426.2656130.0000000.02...1.00.00.28884720.1024770.70588238.8235290.0723373.97852017.0
217.01.080.001.00.0436.033437436.0334370.0000000.02...0.00.00.10326410.03135612.00000056.4705880.3762741.77070117.0
34.01.0120.001.00.05.1964125.1964120.0000000.01...0.00.00.00892810.0025262.82352984.7058820.0071320.21396817.0
434.02.0416.001.00.0478.693148115.631470363.0616780.04...1.01.00.99280940.97488024.000000293.64705923.397112286.2705418.5
..................................................................
960911.01.067.311.01.0278.442257278.4422570.0000001.02...1.00.00.35176220.1323530.70588247.5129410.0934266.28847817.0
960921.01.061.411.01.0189.207373189.2073730.0000001.01...0.01.00.56781430.2697140.70588243.3482350.19038711.69164517.0
960930.00.00.000.00.0550.000000550.000000-1.0000000.01...0.00.00.00465210.0013120.0000000.0000000.0000000.000000NaN
960941.01.079.431.01.0279.312905279.3129050.0000001.01...0.00.00.29304220.1043620.70588256.0682350.0736685.85142017.0
960950.00.00.000.00.0550.000000550.000000-1.0000000.02...0.01.00.78785240.5107530.0000000.0000000.0000000.000000NaN
\n", "

96096 rows × 27 columns

\n", "
" ], "text/plain": [ " nb_tickets nb_purchases total_amount nb_suppliers \\\n", "0 4.0 1.0 100.00 1.0 \n", "1 1.0 1.0 55.00 1.0 \n", "2 17.0 1.0 80.00 1.0 \n", "3 4.0 1.0 120.00 1.0 \n", "4 34.0 2.0 416.00 1.0 \n", "... ... ... ... ... \n", "96091 1.0 1.0 67.31 1.0 \n", "96092 1.0 1.0 61.41 1.0 \n", "96093 0.0 0.0 0.00 0.0 \n", "96094 1.0 1.0 79.43 1.0 \n", "96095 0.0 0.0 0.00 0.0 \n", "\n", " vente_internet_max purchase_date_min purchase_date_max \\\n", "0 0.0 5.177187 5.177187 \n", "1 0.0 426.265613 426.265613 \n", "2 0.0 436.033437 436.033437 \n", "3 0.0 5.196412 5.196412 \n", "4 0.0 478.693148 115.631470 \n", "... ... ... ... \n", "96091 1.0 278.442257 278.442257 \n", "96092 1.0 189.207373 189.207373 \n", "96093 0.0 550.000000 550.000000 \n", "96094 1.0 279.312905 279.312905 \n", "96095 0.0 550.000000 550.000000 \n", "\n", " time_between_purchase nb_tickets_internet fidelity ... \\\n", "0 0.000000 0.0 1 ... \n", "1 0.000000 0.0 2 ... \n", "2 0.000000 0.0 2 ... \n", "3 0.000000 0.0 1 ... \n", "4 363.061678 0.0 4 ... \n", "... ... ... ... ... \n", "96091 0.000000 1.0 2 ... \n", "96092 0.000000 1.0 1 ... \n", "96093 -1.000000 0.0 1 ... \n", "96094 0.000000 1.0 1 ... \n", "96095 -1.000000 0.0 2 ... \n", "\n", " has_purchased has_purchased_estim score quartile score_adjusted \\\n", "0 0.0 0.0 0.006066 1 0.001713 \n", "1 1.0 0.0 0.288847 2 0.102477 \n", "2 0.0 0.0 0.103264 1 0.031356 \n", "3 0.0 0.0 0.008928 1 0.002526 \n", "4 1.0 1.0 0.992809 4 0.974880 \n", "... ... ... ... ... ... \n", "96091 1.0 0.0 0.351762 2 0.132353 \n", "96092 0.0 1.0 0.567814 3 0.269714 \n", "96093 0.0 0.0 0.004652 1 0.001312 \n", "96094 0.0 0.0 0.293042 2 0.104362 \n", "96095 0.0 1.0 0.787852 4 0.510753 \n", "\n", " nb_tickets_projected total_amount_projected nb_tickets_expected \\\n", "0 2.823529 70.588235 0.004836 \n", "1 0.705882 38.823529 0.072337 \n", "2 12.000000 56.470588 0.376274 \n", "3 2.823529 84.705882 0.007132 \n", "4 24.000000 293.647059 23.397112 \n", "... ... ... ... 
\n", "96091 0.705882 47.512941 0.093426 \n", "96092 0.705882 43.348235 0.190387 \n", "96093 0.000000 0.000000 0.000000 \n", "96094 0.705882 56.068235 0.073668 \n", "96095 0.000000 0.000000 0.000000 \n", "\n", " total_amount_expected pace_purchase \n", "0 0.120890 17.0 \n", "1 3.978520 17.0 \n", "2 1.770701 17.0 \n", "3 0.213968 17.0 \n", "4 286.270541 8.5 \n", "... ... ... \n", "96091 6.288478 17.0 \n", "96092 11.691645 17.0 \n", "96093 0.000000 NaN \n", "96094 5.851420 17.0 \n", "96095 0.000000 NaN \n", "\n", "[96096 rows x 27 columns]" ] }, "execution_count": 107, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_test_segment = project_tickets_CA (X_test_segment, \"nb_purchases\", \"nb_tickets\", \"total_amount\", \"score_adjusted\", \n", " duration_ref=17, duration_projection=12)\n", "X_test_segment" ] }, { "cell_type": "code", "execution_count": 108, "id": "cb66a8ea-65f7-460f-b3fc-ba76a3b91faa", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "quartile\n", "1 15.578346\n", "2 15.403993\n", "3 12.415869\n", "4 5.983541\n", "Name: pace_purchase, dtype: float64" ] }, "execution_count": 108, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_test_segment.groupby(\"quartile\")[\"pace_purchase\"].mean()" ] }, { "cell_type": "code", "execution_count": 109, "id": "f58f9151-2f91-45df-abb7-1ddcf0652adc", "metadata": {}, "outputs": [], "source": [
"# generalization with a function\n",
"\n",
"def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, total_amount, pace_purchase,\n",
"                        duration_ref=17, duration_projection=12):\n",
"    \"\"\"Summarize expected tickets and revenue by segment.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df : DataFrame holding the columns named below.\n",
"    segment : label of the column to group by.\n",
"    nb_tickets_expected, total_amount_expected, total_amount, pace_purchase : labels of columns of df.\n",
"    duration_ref, duration_projection : durations whose ratio rescales the revenue share.\n",
"\n",
"    Returns\n",
"    -------\n",
"    DataFrame with one row per segment and the columns: segment, size, size_perct,\n",
"    the two expected sums, revenue_recovered_perct and pace_purchase.\n",
"    \"\"\"\n",
"    # a single groupby: every aggregate below shares the same segment index, so the columns\n",
"    # are aligned on the segment labels instead of being glued together by position\n",
"    grouped = df.groupby(segment)\n",
"\n",
"    # compute nb tickets estimated and total amount expected\n",
"    df_expected_CA = grouped[[nb_tickets_expected, total_amount_expected]].sum()\n",
"\n",
"    # number of customers by segment, then size in percent of all customers\n",
"    df_expected_CA.insert(0, \"size\", grouped.size())\n",
"    df_expected_CA.insert(1, \"size_perct\", 100 * df_expected_CA[\"size\"] / df_expected_CA[\"size\"].sum())\n",
"\n",
"    # compute share of CA recovered\n",
"    duration_ratio = duration_ref / duration_projection\n",
"    df_expected_CA[\"revenue_recovered_perct\"] = (\n",
"        100 * duration_ratio * df_expected_CA[total_amount_expected] / grouped[total_amount].sum()\n",
"    )\n",
"\n",
"    # the group mean skips missing paces; a segment without any valid pace gets NaN\n",
"    # (dropping the rows first removed such a segment and broke the column assignment)\n",
"    df_expected_CA[\"pace_purchase\"] = grouped[pace_purchase].mean()\n",
"\n",
"    return df_expected_CA.reset_index()"
] }, { "cell_type": "code", "execution_count": 110, "id": "c8df6c80-43e8-4f00-9cd3-eb9022744313", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
quartilesizesize_perctnb_tickets_expectedtotal_amount_expectedrevenue_recovered_perctpace_purchase
015356555.741067.9127626.393.4215.58
122460725.614748.18169941.7215.2715.40
23971610.1111629.33309933.7932.4112.42
3482088.54215729.8610042427.5089.695.98
\n", "
" ], "text/plain": [ " quartile size size_perct nb_tickets_expected total_amount_expected \\\n", "0 1 53565 55.74 1067.91 27626.39 \n", "1 2 24607 25.61 4748.18 169941.72 \n", "2 3 9716 10.11 11629.33 309933.79 \n", "3 4 8208 8.54 215729.86 10042427.50 \n", "\n", " revenue_recovered_perct pace_purchase \n", "0 3.42 15.58 \n", "1 15.27 15.40 \n", "2 32.41 12.42 \n", "3 89.69 5.98 " ] }, "execution_count": 110, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_test_expected_CA = round(summary_expected_CA(df=X_test_segment, segment=\"quartile\", \n", " nb_tickets_expected=\"nb_tickets_expected\", total_amount_expected=\"total_amount_expected\", \n", " total_amount=\"total_amount\", pace_purchase=\"pace_purchase\"),2)\n", "\n", "X_test_expected_CA" ] }, { "cell_type": "code", "execution_count": 64, "id": "ac706ed7-defa-4df1-82e1-06f12fc1b6ad", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'\\\\begin{tabular}{lrrrrrr}\\n\\\\toprule\\nquartile & size & size (%) & nb tickets expected & total amount expected & revenue recovered (%) & pace purchase \\\\\\\\\\n\\\\midrule\\n1 & 53626 & 35.310000 & 398.260000 & 13949.330000 & 2.350000 & 16.480000 \\\\\\\\\\n2 & 55974 & 36.860000 & 3113.770000 & 101639.450000 & 6.240000 & 16.470000 \\\\\\\\\\n3 & 30435 & 20.040000 & 6214.350000 & 208267.220000 & 14.270000 & 15.710000 \\\\\\\\\\n4 & 11839 & 7.800000 & 72929.460000 & 1835702.430000 & 75.380000 & 11.480000 \\\\\\\\\\n\\\\bottomrule\\n\\\\end{tabular}\\n'" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Création du dictionnaire de mapping pour les noms de colonnes\n", "mapping_dict = {col: col.replace(\"perct\", \"(%)\").replace(\"_\", \" \") for col in X_test_expected_CA.columns}\n", "\n", "X_test_expected_CA.rename(columns=mapping_dict).to_latex(index=False)" ] }, { "cell_type": "code", "execution_count": 122, "id": "771da0cf-c49f-4e7e-b52f-ebcfb0fb2df3", "metadata": {}, "outputs": [], "source": [ "# 
export summary table to the MinIO storage\n", "\n", "file_name = \"table_expected_CA_\"\n", "FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + \".csv\"\n", "with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n", " X_test_expected_CA.to_csv(file_out, index = False)" ] }, { "cell_type": "code", "execution_count": 53, "id": "c805dc10-4d07-4f7d-a677-5461a92845d7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'projet-bdc2324-team1/Output_expected_CA/musique/table_expected_CA_musique.csv'" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "PATH = f\"projet-bdc2324-team1/Output_expected_CA/{type_of_activity}/\"\n", "file_name = \"table_expected_CA_\"\n", "FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + \".csv\"\n", "FILE_PATH_OUT_S3" ] }, { "cell_type": "markdown", "id": "e35ccfff-1845-41f0-9bde-f09b09b67877", "metadata": {}, "source": [ "## Test : vizu tables saved" ] }, { "cell_type": "code", "execution_count": 66, "id": "4e9e88e4-ea10-41f4-9bf1-20b55269a20d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
quartilescore (%)score adjusted (%)has purchased (%)
0113.252.511.57
1233.898.009.85
2363.0622.5821.47
3490.5266.2065.01
\n", "
" ], "text/plain": [ " quartile score (%) score adjusted (%) has purchased (%)\n", "0 1 13.25 2.51 1.57\n", "1 2 33.89 8.00 9.85\n", "2 3 63.06 22.58 21.47\n", "3 4 90.52 66.20 65.01" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "path = 'projet-bdc2324-team1/Output_expected_CA/sport/table_adjusted_scoresport.csv'\n", "\n", "with fs.open( path, mode=\"rb\") as file_in:\n", " df = pd.read_csv(file_in, sep=\",\")\n", "df" ] }, { "cell_type": "markdown", "id": "9c471bdd-25c2-420a-a8a1-3add9f003cbc", "metadata": {}, "source": [ "## Just to try, same computation with score instead of score adjusted\n", "\n", "seems overestimated : if only 14% of customers come back, how can we recover 22% of the revenue from the segment that is least likely to buy ?? ..." ] }, { "cell_type": "code", "execution_count": 80, "id": "53684a24-1809-465f-8e21-b9295e34582a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
quartilesizesize_perctnb_tickets_expectedtotal_amount_expectedperct_revenue_recovered
013741038.93419.769245.0821.71
122951730.7211549.06296522.0239.24
232013720.9629997.85954751.9163.34
3490329.40244655.8210736011.9597.72
\n", "
" ], "text/plain": [ " quartile size size_perct nb_tickets_expected total_amount_expected \\\n", "0 1 37410 38.93 419.76 9245.08 \n", "1 2 29517 30.72 11549.06 296522.02 \n", "2 3 20137 20.96 29997.85 954751.91 \n", "3 4 9032 9.40 244655.82 10736011.95 \n", "\n", " perct_revenue_recovered \n", "0 21.71 \n", "1 39.24 \n", "2 63.34 \n", "3 97.72 " ] }, "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# same projection with the raw score instead of the adjusted one; the arguments follow the\n",
"# signatures of project_tickets_CA and summary_expected_CA defined in this notebook.\n",
"# A copy is passed so that the columns of X_test_segment computed from the adjusted score are kept.\n",
"X_test_segment_bis = project_tickets_CA(X_test_segment.copy(), \"nb_purchases\", \"nb_tickets\", \"total_amount\", \"score\",\n",
"                                        duration_ref=1.5, duration_projection=1)\n",
"\n",
"X_test_expected_CA_bis = round(summary_expected_CA(df=X_test_segment_bis, segment=\"quartile\", nb_tickets_expected=\"nb_tickets_expected\", \n",
"                                        total_amount_expected=\"total_amount_expected\", total_amount=\"total_amount\",\n",
"                                        pace_purchase=\"pace_purchase\", duration_ref=1.5, duration_projection=1),2)\n",
"\n",
"X_test_expected_CA_bis"
] }, { "cell_type": "code", "execution_count": 81, "id": "7dc66d1e-da03-4513-96e4-d9a43ac0a2c8", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "overall share of revenue recovered : 90.26 %\n" ] } ], "source": [
"# ratio of the durations used for the bis projection above\n",
"duration_ratio_bis = 1.5 / 1\n",
"print(\"overall share of revenue recovered : \", round(100 * duration_ratio_bis * X_test_expected_CA_bis[\"total_amount_expected\"].sum() / \\\n",
"X_test_segment_bis[\"total_amount\"].sum(),2), \"%\")"
] }, { "cell_type": "markdown", "id": "673f2969-7b9a-44c1-abf5-5679fca877ce", "metadata": {}, "source": [ "## Last pieces of analysis" ] }, { "cell_type": "code", "execution_count": 161, "id": "2365bb13-0f3f-49d5-bf91-52c92abebcee", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "overall share of revenue recovered : 77.64%\n" ] } ], "source": [
"# global revenue recovered\n",
"# duration_ratio is only assigned inside the projection functions in the cells above, so it is\n",
"# defined here explicitly (17 and 12 are the durations passed to project_tickets_CA for X_test_segment)\n",
"duration_ratio = 17 / 12\n",
"global_revenue_recovered = round(100 * duration_ratio * X_test_expected_CA[\"total_amount_expected\"].sum() / \\\n",
"X_test_segment[\"total_amount\"].sum(),2)\n",
"print(f\"overall share of revenue recovered : {global_revenue_recovered}%\")"
] }, { "cell_type": "code", 
"execution_count": 163, "id": "16b17f35-57dd-459a-8989-129143dc0952", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 0.018093\n", "1 0.721519\n", "2 3.336101\n", "3 95.924287\n", "Name: total_amount_expected, dtype: float64" ] }, "execution_count": 163, "metadata": {}, "output_type": "execute_result" } ], "source": [ "100 * X_test_expected_CA[\"total_amount_expected\"]/X_test_expected_CA[\"total_amount_expected\"].sum()" ] }, { "cell_type": "code", "execution_count": 166, "id": "dee4a200-eefe-4377-8e80-59ad33edd3c0", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "quartile\n", "1 0.320407\n", "2 5.685020\n", "3 11.339715\n", "4 82.654858\n", "Name: total_amount, dtype: float64" ] }, "execution_count": 166, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# le segment 4 représente 83% du CA actuel et 96% du CA lié aux anciens clients pour l'année prochaine\n", "100 * X_test_segment.groupby(\"quartile\")[\"total_amount\"].sum()/X_test_segment[\"total_amount\"].sum()" ] }, { "cell_type": "code", "execution_count": 177, "id": "c1e6f020-ef18-40b4-bfc1-19f98cb2796e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 96096.000000\n", "mean 207.475735\n", "std 4720.046248\n", "min -48831.800000\n", "25% 0.000000\n", "50% 0.000000\n", "75% 60.000000\n", "max 624890.000000\n", "Name: total_amount, dtype: float64" ] }, "execution_count": 177, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_test_segment[\"total_amount\"].describe() # total amount négatif ???\n" ] }, { "cell_type": "code", "execution_count": 184, "id": "d301a50e-7c68-40f0-9245-a4eea64c387b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 -4.883180e+04\n", "1 -6.483180e+04\n", "2 -7.683860e+04\n", "3 -8.683860e+04\n", "4 -9.683860e+04\n", " ... 
\n", "96091 1.802247e+07\n", "96092 1.839238e+07\n", "96093 1.877219e+07\n", "96094 1.931270e+07\n", "96095 1.993759e+07\n", "Name: total_amount, Length: 96096, dtype: float64" ] }, "execution_count": 184, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.cumsum(X_test_segment[\"total_amount\"].sort_values()).reset_index()[\"total_amount\"]" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.6" } }, "nbformat": 4, "nbformat_minor": 5 }