In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
"
],
"text/plain": [
"GridSearchCV(cv=3, error_score='raise',\n",
" estimator=Pipeline(steps=[('preprocessor',\n",
" ColumnTransformer(transformers=[('num',\n",
" Pipeline(steps=[('scaler',\n",
" StandardScaler())]),\n",
" ['nb_tickets',\n",
" 'nb_purchases',\n",
" 'total_amount',\n",
" 'nb_suppliers',\n",
" 'vente_internet_max',\n",
" 'purchase_date_min',\n",
" 'purchase_date_max',\n",
" 'time_between_purchase',\n",
" 'nb_tickets_internet',\n",
" 'nb_campaigns',\n",
" 'nb_...\n",
" 1.562500e-02, 3.125000e-02, 6.250000e-02, 1.250000e-01,\n",
" 2.500000e-01, 5.000000e-01, 1.000000e+00, 2.000000e+00,\n",
" 4.000000e+00, 8.000000e+00, 1.600000e+01, 3.200000e+01,\n",
" 6.400000e+01]),\n",
" 'LogisticRegression_cv__class_weight': ['balanced',\n",
" {0.0: 0.5837086520288036,\n",
" 1.0: 3.486549107420539}],\n",
" 'LogisticRegression_cv__penalty': ['l1', 'l2']},\n",
" scoring=make_scorer(recall_score, response_method='predict'))"
]
},
"execution_count": 81,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"logit_cv"
]
},
{
"cell_type": "markdown",
"id": "006819e7-e9c5-48d9-85ee-aa43d5e4c9c2",
"metadata": {},
"source": [
"## Quartile clustering"
]
},
{
"cell_type": "code",
"execution_count": 88,
"id": "018d8ff4-3436-4eec-8507-d1a265cbabf1",
"metadata": {},
"outputs": [],
"source": [
"# Score the test customers with the fitted grid-search pipeline.\n",
"# Column 1 of predict_proba is used as the purchase probability (the score).\n",
"y_pred_prob = logit_cv.predict_proba(X_test)[:, 1]\n",
"\n",
"# Hard class predictions from the same fitted object.\n",
"y_pred = logit_cv.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 90,
"id": "846f53b9-73c2-4a8b-9d9e-f11bf59ce9ba",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_620/375041546.py:3: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" X_test_segment[\"has_purchased\"] = y_test\n",
"/tmp/ipykernel_620/375041546.py:4: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" X_test_segment[\"has_purchased_estim\"] = y_pred\n",
"/tmp/ipykernel_620/375041546.py:5: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" X_test_segment[\"score\"] = y_pred_prob\n",
"/tmp/ipykernel_620/375041546.py:6: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" X_test_segment[\"quartile\"] = np.where(X_test['score']<0.25, '1',\n"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
nb_tickets
\n",
"
nb_purchases
\n",
"
total_amount
\n",
"
nb_suppliers
\n",
"
vente_internet_max
\n",
"
purchase_date_min
\n",
"
purchase_date_max
\n",
"
time_between_purchase
\n",
"
nb_tickets_internet
\n",
"
fidelity
\n",
"
...
\n",
"
opt_in
\n",
"
gender_female
\n",
"
gender_male
\n",
"
gender_other
\n",
"
nb_campaigns
\n",
"
nb_campaigns_opened
\n",
"
has_purchased
\n",
"
has_purchased_estim
\n",
"
score
\n",
"
quartile
\n",
"
\n",
" \n",
" \n",
"
\n",
"
0
\n",
"
4.0
\n",
"
1.0
\n",
"
100.0
\n",
"
1.0
\n",
"
0.0
\n",
"
5.177187
\n",
"
5.177187
\n",
"
0.000000
\n",
"
0.0
\n",
"
1
\n",
"
...
\n",
"
False
\n",
"
1
\n",
"
0
\n",
"
0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
1.0
\n",
"
0.657671
\n",
"
3
\n",
"
\n",
"
\n",
"
1
\n",
"
1.0
\n",
"
1.0
\n",
"
55.0
\n",
"
1.0
\n",
"
0.0
\n",
"
426.265613
\n",
"
426.265613
\n",
"
0.000000
\n",
"
0.0
\n",
"
2
\n",
"
...
\n",
"
True
\n",
"
0
\n",
"
1
\n",
"
0
\n",
"
0.0
\n",
"
0.0
\n",
"
1.0
\n",
"
0.0
\n",
"
0.266538
\n",
"
2
\n",
"
\n",
"
\n",
"
2
\n",
"
17.0
\n",
"
1.0
\n",
"
80.0
\n",
"
1.0
\n",
"
0.0
\n",
"
436.033437
\n",
"
436.033437
\n",
"
0.000000
\n",
"
0.0
\n",
"
2
\n",
"
...
\n",
"
True
\n",
"
1
\n",
"
0
\n",
"
0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.214668
\n",
"
1
\n",
"
\n",
"
\n",
"
3
\n",
"
4.0
\n",
"
1.0
\n",
"
120.0
\n",
"
1.0
\n",
"
0.0
\n",
"
5.196412
\n",
"
5.196412
\n",
"
0.000000
\n",
"
0.0
\n",
"
1
\n",
"
...
\n",
"
False
\n",
"
1
\n",
"
0
\n",
"
0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
1.0
\n",
"
0.657770
\n",
"
3
\n",
"
\n",
"
\n",
"
4
\n",
"
34.0
\n",
"
2.0
\n",
"
416.0
\n",
"
1.0
\n",
"
0.0
\n",
"
478.693148
\n",
"
115.631470
\n",
"
363.061678
\n",
"
0.0
\n",
"
4
\n",
"
...
\n",
"
False
\n",
"
1
\n",
"
0
\n",
"
0
\n",
"
0.0
\n",
"
0.0
\n",
"
1.0
\n",
"
1.0
\n",
"
0.894173
\n",
"
4
\n",
"
\n",
"
\n",
"
5
\n",
"
2.0
\n",
"
1.0
\n",
"
60.0
\n",
"
1.0
\n",
"
0.0
\n",
"
5.140069
\n",
"
5.140069
\n",
"
0.000000
\n",
"
0.0
\n",
"
1
\n",
"
...
\n",
"
False
\n",
"
0
\n",
"
1
\n",
"
0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
1.0
\n",
"
0.717482
\n",
"
3
\n",
"
\n",
"
\n",
"
6
\n",
"
5.0
\n",
"
1.0
\n",
"
61.0
\n",
"
1.0
\n",
"
1.0
\n",
"
105.053773
\n",
"
105.053773
\n",
"
0.000000
\n",
"
5.0
\n",
"
1
\n",
"
...
\n",
"
False
\n",
"
0
\n",
"
0
\n",
"
1
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
1.0
\n",
"
0.541855
\n",
"
3
\n",
"
\n",
"
\n",
"
7
\n",
"
4.0
\n",
"
1.0
\n",
"
80.0
\n",
"
1.0
\n",
"
0.0
\n",
"
63.206030
\n",
"
63.206030
\n",
"
0.000000
\n",
"
0.0
\n",
"
1
\n",
"
...
\n",
"
True
\n",
"
0
\n",
"
1
\n",
"
0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.461164
\n",
"
2
\n",
"
\n",
"
\n",
"
8
\n",
"
1.0
\n",
"
1.0
\n",
"
10.0
\n",
"
1.0
\n",
"
0.0
\n",
"
44.698090
\n",
"
44.698090
\n",
"
0.000000
\n",
"
0.0
\n",
"
1
\n",
"
...
\n",
"
True
\n",
"
0
\n",
"
0
\n",
"
1
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.310828
\n",
"
2
\n",
"
\n",
"
\n",
"
9
\n",
"
3.0
\n",
"
3.0
\n",
"
165.0
\n",
"
1.0
\n",
"
1.0
\n",
"
266.012106
\n",
"
258.012106
\n",
"
8.000000
\n",
"
3.0
\n",
"
2
\n",
"
...
\n",
"
False
\n",
"
0
\n",
"
0
\n",
"
1
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.452877
\n",
"
2
\n",
"
\n",
" \n",
"
\n",
"
10 rows × 21 columns
\n",
"
"
],
"text/plain": [
" nb_tickets nb_purchases total_amount nb_suppliers vente_internet_max \\\n",
"0 4.0 1.0 100.0 1.0 0.0 \n",
"1 1.0 1.0 55.0 1.0 0.0 \n",
"2 17.0 1.0 80.0 1.0 0.0 \n",
"3 4.0 1.0 120.0 1.0 0.0 \n",
"4 34.0 2.0 416.0 1.0 0.0 \n",
"5 2.0 1.0 60.0 1.0 0.0 \n",
"6 5.0 1.0 61.0 1.0 1.0 \n",
"7 4.0 1.0 80.0 1.0 0.0 \n",
"8 1.0 1.0 10.0 1.0 0.0 \n",
"9 3.0 3.0 165.0 1.0 1.0 \n",
"\n",
" purchase_date_min purchase_date_max time_between_purchase \\\n",
"0 5.177187 5.177187 0.000000 \n",
"1 426.265613 426.265613 0.000000 \n",
"2 436.033437 436.033437 0.000000 \n",
"3 5.196412 5.196412 0.000000 \n",
"4 478.693148 115.631470 363.061678 \n",
"5 5.140069 5.140069 0.000000 \n",
"6 105.053773 105.053773 0.000000 \n",
"7 63.206030 63.206030 0.000000 \n",
"8 44.698090 44.698090 0.000000 \n",
"9 266.012106 258.012106 8.000000 \n",
"\n",
" nb_tickets_internet fidelity ... opt_in gender_female gender_male \\\n",
"0 0.0 1 ... False 1 0 \n",
"1 0.0 2 ... True 0 1 \n",
"2 0.0 2 ... True 1 0 \n",
"3 0.0 1 ... False 1 0 \n",
"4 0.0 4 ... False 1 0 \n",
"5 0.0 1 ... False 0 1 \n",
"6 5.0 1 ... False 0 0 \n",
"7 0.0 1 ... True 0 1 \n",
"8 0.0 1 ... True 0 0 \n",
"9 3.0 2 ... False 0 0 \n",
"\n",
" gender_other nb_campaigns nb_campaigns_opened has_purchased \\\n",
"0 0 0.0 0.0 0.0 \n",
"1 0 0.0 0.0 1.0 \n",
"2 0 0.0 0.0 0.0 \n",
"3 0 0.0 0.0 0.0 \n",
"4 0 0.0 0.0 1.0 \n",
"5 0 0.0 0.0 0.0 \n",
"6 1 0.0 0.0 0.0 \n",
"7 0 0.0 0.0 0.0 \n",
"8 1 0.0 0.0 0.0 \n",
"9 1 0.0 0.0 0.0 \n",
"\n",
" has_purchased_estim score quartile \n",
"0 1.0 0.657671 3 \n",
"1 0.0 0.266538 2 \n",
"2 0.0 0.214668 1 \n",
"3 1.0 0.657770 3 \n",
"4 1.0 0.894173 4 \n",
"5 1.0 0.717482 3 \n",
"6 1.0 0.541855 3 \n",
"7 0.0 0.461164 2 \n",
"8 0.0 0.310828 2 \n",
"9 0.0 0.452877 2 \n",
"\n",
"[10 rows x 21 columns]"
]
},
"execution_count": 90,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Work on an explicit copy of the test features. With the plain alias\n",
"# `X_test_segment = X_test`, every column assignment below raised a\n",
"# SettingWithCopyWarning and also added the label/score columns to X_test itself.\n",
"X_test_segment = X_test.copy()\n",
"\n",
"X_test_segment[\"has_purchased\"] = y_test\n",
"X_test_segment[\"has_purchased_estim\"] = y_pred\n",
"X_test_segment[\"score\"] = y_pred_prob\n",
"\n",
"# Segment label from fixed score thresholds (0.25 / 0.5 / 0.75), stored as strings '1'..'4'.\n",
"# The score is read from X_test_segment's own column rather than from X_test.\n",
"X_test_segment[\"quartile\"] = np.where(X_test_segment[\"score\"] < 0.25, '1',\n",
"                             np.where(X_test_segment[\"score\"] < 0.5, '2',\n",
"                             np.where(X_test_segment[\"score\"] < 0.75, '3', '4')))\n",
"X_test_segment.head(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0916f099-3faa-4c47-9b60-d1ee797b3c9d",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "ad16b8ab-7e01-404b-971e-866e9b9d5aa4",
"metadata": {},
"source": [
"## Definition of functions to compute the bias of scores and adjust it\n",
"\n",
"Le biais est calculé de la façon suivante. \n",
"En notant $\\hat{p}(x_i)$ le score calculé (estimé par la modélisation), $p(x_i)$ le vrai score (sans biais) et $\\beta$ le logarithme du biais, on a : \\\n",
"$\\ln{\\frac{\\hat{p}(x_i)}{1-\\hat{p}(x_i)}} = \\beta + \\ln{\\frac{p(x_i)}{1-p(x_i)}}$ \\\n",
"$\\frac{\\hat{p}(x_i)}{1-\\hat{p}(x_i)} = \\exp(\\beta) \\cdot \\frac{p(x_i)}{1-p(x_i)}$, soit, en posant $B=\\exp(\\beta)$ : \\\n",
"$p(x_i) = \\frac{\\frac{\\hat{p}(x_i)}{1-\\hat{p}(x_i)}}{B+\\frac{\\hat{p}(x_i)}{1-\\hat{p}(x_i)}}$ \\\n",
"Ce qu'on appelle biais et qu'on estime dans le code par la suite est : $B=\\exp(\\beta)$. Les probabilités ne sont donc pas biaisées si $B=1$. Il y a surestimation si $B>1$. \n",
"\n",
"On cherche le $B$ qui permette d'ajuster les probabilités de telle sorte que la somme des scores ajustés soit égale à la somme des `y_has_purchased`. Cela revient à résoudre : \n",
"\n",
"\\begin{equation}\n",
"\\sum_{i}{\\frac{\\frac{\\hat{p}(x_i)}{1-\\hat{p}(x_i)}}{B+\\frac{\\hat{p}(x_i)}{1-\\hat{p}(x_i)}}} = \\sum_{i}{Y_i}\n",
"\\end{equation}\n",
"\n",
"C'est ce que fait la fonction `find_bias`. \n",
"\n",
"Note sur les notations : \\\n",
"$\\hat{p}(x_i)$ correspond à ce qu'on appelle le score et $p(x_i)$ à ce qu'on appellera le score ajusté (`score_adjusted`)"
]
},
{
"cell_type": "code",
"execution_count": 91,
"id": "f0379536-a6c5-4b16-bde5-d0319ec1b140",
"metadata": {},
"outputs": [],
"source": [
"def adjusted_score(odd_ratio, bias):\n",
"    \"\"\"Recalibrated score computed from odds: odd_ratio / (bias + odd_ratio).\n",
"\n",
"    With bias = 1 this gives back the probability whose odds are `odd_ratio`;\n",
"    a larger bias pulls the score towards 0.\n",
"    \"\"\"\n",
"    denominator = bias + odd_ratio\n",
"    return odd_ratio / denominator"
]
},
{
"cell_type": "code",
"execution_count": 92,
"id": "32a0dfd0-f49d-4785-a56f-706d381bfe41",
"metadata": {},
"outputs": [],
"source": [
"def adjust_score_1(score):\n",
"    \"\"\"Replace scores exactly equal to 1 by the largest score different from 1.\n",
"\n",
"    The odds score / (1 - score) are undefined for a score of 1, so those scores are\n",
"    capped at the best remaining score before any odds are computed.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    score : array-like\n",
"        Scores to clean (list, NumPy array or pandas Series).\n",
"\n",
"    Returns\n",
"    -------\n",
"    numpy.ndarray\n",
"        Scores in the original order, with every 1 replaced.\n",
"\n",
"    Raises\n",
"    ------\n",
"    ValueError\n",
"        If the input is non-empty and every score equals 1.\n",
"    \"\"\"\n",
"    scores = np.asarray(score)\n",
"    not_one = scores != 1\n",
"\n",
"    if not not_one.any():\n",
"        if scores.size == 0:\n",
"            # nothing to adjust\n",
"            return scores.copy()\n",
"        raise ValueError(\"adjust_score_1: all scores are equal to 1, there is no second best score to substitute\")\n",
"\n",
"    second_best_score = scores[not_one].max()\n",
"    # vectorized replacement instead of two Python-level passes over the scores\n",
"    return np.where(not_one, scores, second_best_score)"
]
},
{
"cell_type": "code",
"execution_count": 93,
"id": "2dff1def-02df-413e-afce-b4aeaf7752b6",
"metadata": {},
"outputs": [],
"source": [
"def odd_ratio(score):\n",
"    \"\"\"Return the odds score / (1 - score) of a probability score.\n",
"\n",
"    The division is not defined for a score of 1; the notebook passes scores\n",
"    through adjust_score_1 before calling this function.\n",
"    \"\"\"\n",
"    complement = 1 - score\n",
"    return score / complement"
]
},
{
"cell_type": "code",
"execution_count": 94,
"id": "683d71fc-7442-4028-869c-49c57592d6e9",
"metadata": {},
"outputs": [],
"source": [
"def find_bias(odd_ratios, y_objective, initial_guess=6):\n",
"    \"\"\"Estimate the bias B for which the adjusted scores sum to y_objective.\n",
"\n",
"    Solves sum_i adjusted_score(odd_ratios[i], B) = y_objective for B with fsolve.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    odd_ratios : array-like\n",
"        Odds of the raw scores (output of odd_ratio).\n",
"    y_objective : float\n",
"        Target value for the sum of adjusted scores; the notebook passes the\n",
"        sum of y_has_purchased.\n",
"    initial_guess : float, default 6\n",
"        Starting point of the root search.\n",
"\n",
"    Returns\n",
"    -------\n",
"    float\n",
"        First component of the solution returned by fsolve.\n",
"\n",
"    Notes\n",
"    -----\n",
"    An earlier variant minimized the squared gap with minimize(method=\"BFGS\");\n",
"    the root search was kept because it is faster.\n",
"    \"\"\"\n",
"    odds = np.asarray(odd_ratios, dtype=float)\n",
"\n",
"    def gap(bias):\n",
"        # fsolve passes a length-1 array; broadcasting applies it to every odds value\n",
"        return np.atleast_1d(adjusted_score(odds, bias).sum() - y_objective)\n",
"\n",
"    # start from the caller's initial_guess (it was previously ignored in favour of a hard-coded 6)\n",
"    bias_estimated = fsolve(gap, x0=initial_guess)\n",
"\n",
"    return bias_estimated[0]"
]
},
{
"cell_type": "code",
"execution_count": 98,
"id": "781b0d40-c954-4c54-830a-e709c8667328",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"6.172331113516847"
]
},
"execution_count": 98,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Bias estimated on the test set: the value for which the adjusted test scores\n",
"# sum to the number of purchases observed in y_test.\n",
"bias_test_set = find_bias(\n",
"    odd_ratios=odd_ratio(adjust_score_1(X_test_segment[\"score\"])),\n",
"    y_objective=y_test[\"y_has_purchased\"].sum(),\n",
"    initial_guess=6,\n",
")\n",
"bias_test_set"
]
},
{
"cell_type": "code",
"execution_count": 102,
"id": "248cb862-418e-4767-9933-70c4885ecf40",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"6.070461139075353"
]
},
"execution_count": 102,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Same estimation on the train set, to compare with the test-set bias.\n",
"X_train_score = logit_cv.predict_proba(X_train)[:, 1]\n",
"\n",
"bias_train_set = find_bias(\n",
"    odd_ratios=odd_ratio(adjust_score_1(X_train_score)),\n",
"    y_objective=y_train[\"y_has_purchased\"].sum(),\n",
"    initial_guess=6,\n",
")\n",
"bias_train_set"
]
},
{
"cell_type": "code",
"execution_count": 103,
"id": "fff6cbe6-7bb3-4732-9b81-b9ac5383bbcf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"betâ test - betâ train = 0.016642008368292337\n"
]
}
],
"source": [
"# Gap between the test and train estimates on the log scale (beta = ln B).\n",
"print(\"betâ test - betâ train = \",np.log(bias_test_set/bias_train_set))"
]
},
{
"cell_type": "code",
"execution_count": 116,
"id": "f506870d-4a8a-4b2c-8f0b-e0789080b20c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"mean absolute erreur 0.001409799678121875\n"
]
}
],
"source": [
"# Impact of using the bias estimated on the train set instead of the test set: negligible.\n",
"# Both arrays hold the adjusted *test* scores; only the bias applied differs.\n",
"score_adjusted_test = adjusted_score(\n",
"    odd_ratio(adjust_score_1(X_test_segment[\"score\"])),\n",
"    bias=bias_test_set,\n",
")\n",
"score_adjusted_train = adjusted_score(\n",
"    odd_ratio(adjust_score_1(X_test_segment[\"score\"])),\n",
"    bias=bias_train_set,\n",
")\n",
"\n",
"print(\"mean absolute erreur\", np.abs(score_adjusted_test - score_adjusted_train).mean())"
]
},
{
"cell_type": "code",
"execution_count": 117,
"id": "8213d0e4-063b-49fa-90b7-677fc34f4c01",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_620/1825363704.py:7: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" X_test_segment[\"score_adjusted\"] = score_adjusted_train\n"
]
}
],
"source": [
"# Store the recalibrated score.\n",
"#\n",
"# The test labels are not supposed to be available, so the bias applied is the one\n",
"# estimated on the train set (score_adjusted_train), not the one estimated on the\n",
"# test set (score_adjusted_test).\n",
"X_test_segment[\"score_adjusted\"] = score_adjusted_train"
]
},
{
"cell_type": "code",
"execution_count": 118,
"id": "834d3723-2e72-4c65-9c62-e2d595c69461",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MSE for score : 0.15494387585189107\n",
"MSE for ajusted score : 0.08851697393139933\n",
"sum of y_has_purchased : 13690.0\n",
"sum of adjusted scores : 13825.476109871417\n"
]
}
],
"source": [
"# Check: mean squared error against the observed outcome, before and after adjustment,\n",
"# then the two totals that the adjustment is meant to bring together.\n",
"MSE_score = (X_test_segment[\"score\"] - X_test_segment[\"has_purchased\"]).pow(2).mean()\n",
"MSE_ajusted_score = (X_test_segment[\"score_adjusted\"] - X_test_segment[\"has_purchased\"]).pow(2).mean()\n",
"\n",
"print(f\"MSE for score : {MSE_score}\")\n",
"print(f\"MSE for ajusted score : {MSE_ajusted_score}\")\n",
"\n",
"print(\"sum of y_has_purchased :\", y_test[\"y_has_purchased\"].sum())\n",
"print(\"sum of adjusted scores :\", X_test_segment[\"score_adjusted\"].sum())"
]
},
{
"cell_type": "code",
"execution_count": 130,
"id": "ed27a165-68d2-44f8-8cec-b12dad2cca5d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"29169.0"
]
},
"execution_count": 130,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of test customers predicted as buyers by the 0/1 output of the classifier.\n",
"X_test_segment[\"has_purchased_estim\"].sum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "761146b7-3d0d-44b1-8b91-87e6d54f1626",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 119,
"id": "9f30a4dd-a9d8-405a-a7d5-5324ae88cf70",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MAE for score : 0.32116357895490416\n",
"MAE for adjusted score : 0.17359227315595824\n"
]
}
],
"source": [
"# Mean absolute error against the observed outcome: roughly halved with our method.\n",
"MAE_score = (X_test_segment[\"score\"] - X_test_segment[\"has_purchased\"]).abs().mean()\n",
"MAE_ajusted_score = (X_test_segment[\"score_adjusted\"] - X_test_segment[\"has_purchased\"]).abs().mean()\n",
"\n",
"print(f\"MAE for score : {MAE_score}\")\n",
"print(f\"MAE for adjusted score : {MAE_ajusted_score}\")"
]
},
{
"cell_type": "code",
"execution_count": 208,
"id": "6f9396db-e213-408c-a596-eaeec3bc79f3",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Visualization: histograms of the raw and of the adjusted probabilities.\n",
"\n",
"def plot_comp_scores(df, score, score_adjusted):\n",
"    \"\"\"Overlay the histograms of two score columns of df.\n",
"\n",
"    df is the frame holding the scores; score and score_adjusted are the names\n",
"    of the raw and adjusted score columns.\n",
"    \"\"\"\n",
"    fig, ax = plt.subplots()\n",
"    for column, label in ((score, \"score\"), (score_adjusted, \"adjusted score\")):\n",
"        ax.hist(df[column], label=label, alpha=0.6)\n",
"    ax.legend()\n",
"    ax.set_xlabel(\"probability of a future purchase\")\n",
"    ax.set_ylabel(\"count\")\n",
"    ax.set_title(\"Comparison between score and adjusted score\")\n",
"    plt.show()\n",
"\n",
"plot_comp_scores(X_test_segment, score=\"score\", score_adjusted=\"score_adjusted\")"
]
},
{
"cell_type": "markdown",
"id": "e6fae260-fab8-4f51-90dc-9b6d7314c77b",
"metadata": {},
"source": [
"## Compute the expected number of tickets and revenue (CA) by segment with the recalibrated score"
]
},
{
"cell_type": "code",
"execution_count": 121,
"id": "c618cebc-c295-47f7-bd76-b7e18778a17c",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
"
],
"text/plain": [
" quartile size size_perct nb_tickets_expected total_amount_expected \\\n",
"0 1 37410 38.929820 84.764915 1.867190e+03 \n",
"1 2 29517 30.716159 2899.288091 7.446102e+04 \n",
"2 3 20137 20.955087 10876.786661 3.442867e+05 \n",
"3 4 9032 9.398934 215194.829104 9.899418e+06 \n",
"\n",
" perct_revenue_recovered \n",
"0 4.384354 \n",
"1 9.854069 \n",
"2 22.842135 \n",
"3 90.107285 "
]
},
"execution_count": 169,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Expected number of tickets and expected revenue, summed per segment.\n",
"X_test_expected_CA = (\n",
"    X_test_segment\n",
"    .groupby(\"quartile\")[[\"nb_tickets_expected\", \"total_amount_expected\"]]\n",
"    .sum()\n",
"    .reset_index()\n",
")\n",
"\n",
"# Number of customers in each segment, then its share of all customers (in %).\n",
"X_test_expected_CA.insert(1, \"size\", X_test_segment.groupby(\"quartile\").size().values)\n",
"X_test_expected_CA.insert(\n",
"    2, \"size_perct\", 100 * X_test_expected_CA[\"size\"] / X_test_expected_CA[\"size\"].sum()\n",
")\n",
"\n",
"# Share of revenue recovered: expected revenue, scaled by the ratio between the reference\n",
"# and projection durations, relative to each segment's total_amount.\n",
"# NOTE(review): the unit of the durations (presumably years) is not stated here - confirm.\n",
"duration_ref = 1.5\n",
"duration_projection = 1\n",
"duration_ratio = duration_ref / duration_projection\n",
"\n",
"X_test_expected_CA[\"perct_revenue_recovered\"] = (\n",
"    100 * duration_ratio * X_test_expected_CA[\"total_amount_expected\"]\n",
"    / X_test_segment.groupby(\"quartile\")[\"total_amount\"].sum().values\n",
")\n",
"\n",
"X_test_expected_CA"
]
},
{
"cell_type": "markdown",
"id": "9c471bdd-25c2-420a-a8a1-3add9f003cbc",
"metadata": {},
"source": [
"## Just to try, same computation with the raw score instead of the adjusted score\n",
"\n",
"This seems overestimated: if only 14% of customers come back, how can we recover 22% of the revenue from the segment that is least likely to buy?"
]
},
{
"cell_type": "code",
"execution_count": 201,
"id": "53684a24-1809-465f-8e21-b9295e34582a",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_620/3599949626.py:7: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_output[\"nb_tickets_projected\"] = df_output[nb_tickets] / duration_ratio\n",
"/tmp/ipykernel_620/3599949626.py:8: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_output[\"total_amount_projected\"] = df_output[total_amount] / duration_ratio\n",
"/tmp/ipykernel_620/3599949626.py:10: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_output[\"nb_tickets_expected\"] = df_output[score_adjusted] * df_output[\"nb_tickets_projected\"]\n",
"/tmp/ipykernel_620/3599949626.py:11: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_output[\"total_amount_expected\"] = df_output[score_adjusted] * df_output[\"total_amount_projected\"]\n"
]
},
{
"data": {
"text/html": [
"