In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
"
],
"text/plain": [
"GridSearchCV(cv=3, error_score='raise',\n",
" estimator=Pipeline(steps=[('preprocessor',\n",
" ColumnTransformer(transformers=[('num',\n",
" Pipeline(steps=[('scaler',\n",
" StandardScaler())]),\n",
" ['nb_tickets',\n",
" 'nb_purchases',\n",
" 'total_amount',\n",
" 'nb_suppliers',\n",
" 'vente_internet_max',\n",
" 'purchase_date_min',\n",
" 'purchase_date_max',\n",
" 'time_between_purchase',\n",
" 'nb_tickets_internet',\n",
" 'nb_campaigns',\n",
" 'nb_...\n",
" 1.562500e-02, 3.125000e-02, 6.250000e-02, 1.250000e-01,\n",
" 2.500000e-01, 5.000000e-01, 1.000000e+00, 2.000000e+00,\n",
" 4.000000e+00, 8.000000e+00, 1.600000e+01, 3.200000e+01,\n",
" 6.400000e+01]),\n",
" 'LogisticRegression_cv__class_weight': ['balanced',\n",
" {0.0: 0.5223906809346011,\n",
" 1.0: 11.665359406898034}],\n",
" 'LogisticRegression_cv__penalty': ['l1', 'l2']},\n",
" scoring=make_scorer(recall_score, response_method='predict'))"
]
},
"execution_count": 286,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model = load_model(type_of_activity, \"LogisticRegression_cv\")\n",
"# model = load_model(type_of_activity, \"randomF_cv\")\n",
"model"
]
},
{
"cell_type": "markdown",
"id": "006819e7-e9c5-48d9-85ee-aa43d5e4c9c2",
"metadata": {},
"source": [
"## Quartile clustering"
]
},
{
"cell_type": "code",
"execution_count": 287,
"id": "018d8ff4-3436-4eec-8507-d1a265cbabf1",
"metadata": {},
"outputs": [],
"source": [
"y_pred = model.predict(X_test)\n",
"y_pred_prob = model.predict_proba(X_test)[:, 1]"
]
},
{
"cell_type": "code",
"execution_count": 288,
"id": "846f53b9-73c2-4a8b-9d9e-f11bf59ce9ba",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_1080/375041546.py:3: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" X_test_segment[\"has_purchased\"] = y_test\n",
"/tmp/ipykernel_1080/375041546.py:4: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" X_test_segment[\"has_purchased_estim\"] = y_pred\n",
"/tmp/ipykernel_1080/375041546.py:5: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" X_test_segment[\"score\"] = y_pred_prob\n",
"/tmp/ipykernel_1080/375041546.py:6: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" X_test_segment[\"quartile\"] = np.where(X_test['score']<0.25, '1',\n"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
nb_tickets
\n",
"
nb_purchases
\n",
"
total_amount
\n",
"
nb_suppliers
\n",
"
vente_internet_max
\n",
"
purchase_date_min
\n",
"
purchase_date_max
\n",
"
time_between_purchase
\n",
"
nb_tickets_internet
\n",
"
fidelity
\n",
"
...
\n",
"
gender_female
\n",
"
gender_male
\n",
"
gender_other
\n",
"
nb_campaigns
\n",
"
nb_campaigns_opened
\n",
"
has_purchased
\n",
"
has_purchased_estim
\n",
"
score
\n",
"
quartile
\n",
"
score_adjusted
\n",
"
\n",
" \n",
" \n",
"
\n",
"
0
\n",
"
2.0
\n",
"
1.0
\n",
"
22.0
\n",
"
1.0
\n",
"
1.0
\n",
"
307.203553
\n",
"
307.203553
\n",
"
0.000000
\n",
"
2.0
\n",
"
1
\n",
"
...
\n",
"
0
\n",
"
0
\n",
"
1
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.367961
\n",
"
2
\n",
"
0.010594
\n",
"
\n",
"
\n",
"
1
\n",
"
269.0
\n",
"
8.0
\n",
"
50.0
\n",
"
2.0
\n",
"
1.0
\n",
"
378.208090
\n",
"
39.389595
\n",
"
338.818495
\n",
"
66.0
\n",
"
10
\n",
"
...
\n",
"
0
\n",
"
0
\n",
"
1
\n",
"
65.0
\n",
"
1.0
\n",
"
1.0
\n",
"
1.0
\n",
"
0.998731
\n",
"
4
\n",
"
0.397108
\n",
"
\n",
"
\n",
"
2
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
550.000000
\n",
"
550.000000
\n",
"
-1.000000
\n",
"
0.0
\n",
"
0
\n",
"
...
\n",
"
0
\n",
"
1
\n",
"
0
\n",
"
4.0
\n",
"
2.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.211997
\n",
"
1
\n",
"
0.014916
\n",
"
\n",
"
\n",
"
3
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
550.000000
\n",
"
550.000000
\n",
"
-1.000000
\n",
"
0.0
\n",
"
0
\n",
"
...
\n",
"
1
\n",
"
0
\n",
"
0
\n",
"
2.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.246563
\n",
"
1
\n",
"
0.024670
\n",
"
\n",
"
\n",
"
4
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
550.000000
\n",
"
550.000000
\n",
"
-1.000000
\n",
"
0.0
\n",
"
0
\n",
"
...
\n",
"
0
\n",
"
0
\n",
"
1
\n",
"
4.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.108575
\n",
"
1
\n",
"
0.025205
\n",
"
\n",
"
\n",
"
5
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
550.000000
\n",
"
550.000000
\n",
"
-1.000000
\n",
"
0.0
\n",
"
0
\n",
"
...
\n",
"
1
\n",
"
0
\n",
"
0
\n",
"
7.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.257244
\n",
"
2
\n",
"
0.046644
\n",
"
\n",
"
\n",
"
6
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
550.000000
\n",
"
550.000000
\n",
"
-1.000000
\n",
"
0.0
\n",
"
1
\n",
"
...
\n",
"
0
\n",
"
1
\n",
"
0
\n",
"
2.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.203196
\n",
"
1
\n",
"
0.023026
\n",
"
\n",
"
\n",
"
7
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
550.000000
\n",
"
550.000000
\n",
"
-1.000000
\n",
"
0.0
\n",
"
0
\n",
"
...
\n",
"
0
\n",
"
1
\n",
"
0
\n",
"
10.0
\n",
"
8.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.240049
\n",
"
1
\n",
"
0.003825
\n",
"
\n",
"
\n",
"
8
\n",
"
1.0
\n",
"
1.0
\n",
"
11.0
\n",
"
1.0
\n",
"
1.0
\n",
"
456.255104
\n",
"
456.255104
\n",
"
0.000000
\n",
"
1.0
\n",
"
1
\n",
"
...
\n",
"
0
\n",
"
0
\n",
"
1
\n",
"
3.0
\n",
"
3.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.340098
\n",
"
2
\n",
"
0.006850
\n",
"
\n",
"
\n",
"
9
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
550.000000
\n",
"
550.000000
\n",
"
-1.000000
\n",
"
0.0
\n",
"
0
\n",
"
...
\n",
"
0
\n",
"
1
\n",
"
0
\n",
"
10.0
\n",
"
6.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.234470
\n",
"
1
\n",
"
0.003745
\n",
"
\n",
" \n",
"
\n",
"
10 rows × 22 columns
\n",
"
"
],
"text/plain": [
" nb_tickets nb_purchases total_amount nb_suppliers vente_internet_max \\\n",
"0 2.0 1.0 22.0 1.0 1.0 \n",
"1 269.0 8.0 50.0 2.0 1.0 \n",
"2 0.0 0.0 0.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 0.0 0.0 \n",
"4 0.0 0.0 0.0 0.0 0.0 \n",
"5 0.0 0.0 0.0 0.0 0.0 \n",
"6 0.0 0.0 0.0 0.0 0.0 \n",
"7 0.0 0.0 0.0 0.0 0.0 \n",
"8 1.0 1.0 11.0 1.0 1.0 \n",
"9 0.0 0.0 0.0 0.0 0.0 \n",
"\n",
" purchase_date_min purchase_date_max time_between_purchase \\\n",
"0 307.203553 307.203553 0.000000 \n",
"1 378.208090 39.389595 338.818495 \n",
"2 550.000000 550.000000 -1.000000 \n",
"3 550.000000 550.000000 -1.000000 \n",
"4 550.000000 550.000000 -1.000000 \n",
"5 550.000000 550.000000 -1.000000 \n",
"6 550.000000 550.000000 -1.000000 \n",
"7 550.000000 550.000000 -1.000000 \n",
"8 456.255104 456.255104 0.000000 \n",
"9 550.000000 550.000000 -1.000000 \n",
"\n",
" nb_tickets_internet fidelity ... gender_female gender_male \\\n",
"0 2.0 1 ... 0 0 \n",
"1 66.0 10 ... 0 0 \n",
"2 0.0 0 ... 0 1 \n",
"3 0.0 0 ... 1 0 \n",
"4 0.0 0 ... 0 0 \n",
"5 0.0 0 ... 1 0 \n",
"6 0.0 1 ... 0 1 \n",
"7 0.0 0 ... 0 1 \n",
"8 1.0 1 ... 0 0 \n",
"9 0.0 0 ... 0 1 \n",
"\n",
" gender_other nb_campaigns nb_campaigns_opened has_purchased \\\n",
"0 1 0.0 0.0 0.0 \n",
"1 1 65.0 1.0 1.0 \n",
"2 0 4.0 2.0 0.0 \n",
"3 0 2.0 0.0 0.0 \n",
"4 1 4.0 0.0 0.0 \n",
"5 0 7.0 0.0 0.0 \n",
"6 0 2.0 0.0 0.0 \n",
"7 0 10.0 8.0 0.0 \n",
"8 1 3.0 3.0 0.0 \n",
"9 0 10.0 6.0 0.0 \n",
"\n",
" has_purchased_estim score quartile score_adjusted \n",
"0 0.0 0.367961 2 0.010594 \n",
"1 1.0 0.998731 4 0.397108 \n",
"2 0.0 0.211997 1 0.014916 \n",
"3 0.0 0.246563 1 0.024670 \n",
"4 0.0 0.108575 1 0.025205 \n",
"5 0.0 0.257244 2 0.046644 \n",
"6 0.0 0.203196 1 0.023026 \n",
"7 0.0 0.240049 1 0.003825 \n",
"8 0.0 0.340098 2 0.006850 \n",
"9 0.0 0.234470 1 0.003745 \n",
"\n",
"[10 rows x 22 columns]"
]
},
"execution_count": 288,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Work on a copy: X_test stays a clean feature matrix and pandas no longer\n",
"# raises SettingWithCopyWarning when the result columns are added.\n",
"X_test_segment = X_test.copy()\n",
"\n",
"X_test_segment[\"has_purchased\"] = y_test\n",
"X_test_segment[\"has_purchased_estim\"] = y_pred\n",
"X_test_segment[\"score\"] = y_pred_prob\n",
"# fixed score bins (not empirical quartiles): '1' if score < 0.25, '2' if < 0.5, '3' if < 0.75, else '4'\n",
"X_test_segment[\"quartile\"] = np.where(X_test_segment['score']<0.25, '1',\n",
"                   np.where(X_test_segment['score']<0.5, '2',\n",
"                   np.where(X_test_segment['score']<0.75, '3', '4')))\n",
"X_test_segment.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 86,
"id": "fb592fe3-ea40-4e83-8fe9-c52b9ee42f2a",
"metadata": {},
"outputs": [],
"source": [
"def df_segment(df, y, model) :\n",
"    \"\"\"Score `df` with `model` and attach target, prediction, score and score bin.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df : pandas.DataFrame\n",
"        Features passed as-is to `model.predict` and `model.predict_proba`.\n",
"    y : array-like or single-column DataFrame\n",
"        Observed target, stored in the `has_purchased` column.\n",
"    model : fitted estimator\n",
"        Must expose `predict` and `predict_proba`.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        Copy of `df` with the columns `has_purchased`, `has_purchased_estim`,\n",
"        `score` (second column of `predict_proba`) and `quartile` added.\n",
"        `df` itself is left untouched.\n",
"    \"\"\"\n",
"    # never write into the caller's frame: it may be a slice of another frame\n",
"    # (SettingWithCopyWarning) and is reused as model input elsewhere\n",
"    scored = df.copy()\n",
"\n",
"    scored[\"has_purchased\"] = y\n",
"    scored[\"has_purchased_estim\"] = model.predict(df)\n",
"    scored[\"score\"] = model.predict_proba(df)[:, 1]\n",
"    # fixed score bins (not empirical quartiles): '1' if score < 0.25, '2' if < 0.5, '3' if < 0.75, else '4'\n",
"    scored[\"quartile\"] = np.where(scored['score']<0.25, '1',\n",
"                         np.where(scored['score']<0.5, '2',\n",
"                         np.where(scored['score']<0.75, '3', '4')))\n",
"\n",
"    return scored"
]
},
{
"cell_type": "code",
"execution_count": 88,
"id": "968645d5-58cc-485a-bd8b-99f4cfc26fec",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_1080/2624515794.py:8: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_segment[\"has_purchased\"] = y\n",
"/tmp/ipykernel_1080/2624515794.py:9: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_segment[\"has_purchased_estim\"] = y_pred\n",
"/tmp/ipykernel_1080/2624515794.py:10: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_segment[\"score\"] = y_pred_prob\n",
"/tmp/ipykernel_1080/2624515794.py:11: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_segment[\"quartile\"] = np.where(df_segment['score']<0.25, '1',\n"
]
},
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
nb_tickets
\n",
"
nb_purchases
\n",
"
total_amount
\n",
"
nb_suppliers
\n",
"
vente_internet_max
\n",
"
purchase_date_min
\n",
"
purchase_date_max
\n",
"
time_between_purchase
\n",
"
nb_tickets_internet
\n",
"
fidelity
\n",
"
...
\n",
"
opt_in
\n",
"
gender_female
\n",
"
gender_male
\n",
"
gender_other
\n",
"
nb_campaigns
\n",
"
nb_campaigns_opened
\n",
"
has_purchased
\n",
"
has_purchased_estim
\n",
"
score
\n",
"
quartile
\n",
"
\n",
" \n",
" \n",
"
\n",
"
0
\n",
"
4.0
\n",
"
1.0
\n",
"
100.00
\n",
"
1.0
\n",
"
0.0
\n",
"
5.177187
\n",
"
5.177187
\n",
"
0.000000
\n",
"
0.0
\n",
"
1
\n",
"
...
\n",
"
False
\n",
"
1
\n",
"
0
\n",
"
0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.006066
\n",
"
1
\n",
"
\n",
"
\n",
"
1
\n",
"
1.0
\n",
"
1.0
\n",
"
55.00
\n",
"
1.0
\n",
"
0.0
\n",
"
426.265613
\n",
"
426.265613
\n",
"
0.000000
\n",
"
0.0
\n",
"
2
\n",
"
...
\n",
"
True
\n",
"
0
\n",
"
1
\n",
"
0
\n",
"
0.0
\n",
"
0.0
\n",
"
1.0
\n",
"
0.0
\n",
"
0.288847
\n",
"
2
\n",
"
\n",
"
\n",
"
2
\n",
"
17.0
\n",
"
1.0
\n",
"
80.00
\n",
"
1.0
\n",
"
0.0
\n",
"
436.033437
\n",
"
436.033437
\n",
"
0.000000
\n",
"
0.0
\n",
"
2
\n",
"
...
\n",
"
True
\n",
"
1
\n",
"
0
\n",
"
0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.103264
\n",
"
1
\n",
"
\n",
"
\n",
"
3
\n",
"
4.0
\n",
"
1.0
\n",
"
120.00
\n",
"
1.0
\n",
"
0.0
\n",
"
5.196412
\n",
"
5.196412
\n",
"
0.000000
\n",
"
0.0
\n",
"
1
\n",
"
...
\n",
"
False
\n",
"
1
\n",
"
0
\n",
"
0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.008928
\n",
"
1
\n",
"
\n",
"
\n",
"
4
\n",
"
34.0
\n",
"
2.0
\n",
"
416.00
\n",
"
1.0
\n",
"
0.0
\n",
"
478.693148
\n",
"
115.631470
\n",
"
363.061678
\n",
"
0.0
\n",
"
4
\n",
"
...
\n",
"
False
\n",
"
1
\n",
"
0
\n",
"
0
\n",
"
0.0
\n",
"
0.0
\n",
"
1.0
\n",
"
1.0
\n",
"
0.992809
\n",
"
4
\n",
"
\n",
"
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
\n",
"
\n",
"
96091
\n",
"
1.0
\n",
"
1.0
\n",
"
67.31
\n",
"
1.0
\n",
"
1.0
\n",
"
278.442257
\n",
"
278.442257
\n",
"
0.000000
\n",
"
1.0
\n",
"
2
\n",
"
...
\n",
"
False
\n",
"
0
\n",
"
1
\n",
"
0
\n",
"
15.0
\n",
"
5.0
\n",
"
1.0
\n",
"
0.0
\n",
"
0.351762
\n",
"
2
\n",
"
\n",
"
\n",
"
96092
\n",
"
1.0
\n",
"
1.0
\n",
"
61.41
\n",
"
1.0
\n",
"
1.0
\n",
"
189.207373
\n",
"
189.207373
\n",
"
0.000000
\n",
"
1.0
\n",
"
1
\n",
"
...
\n",
"
False
\n",
"
0
\n",
"
1
\n",
"
0
\n",
"
12.0
\n",
"
9.0
\n",
"
0.0
\n",
"
1.0
\n",
"
0.567814
\n",
"
3
\n",
"
\n",
"
\n",
"
96093
\n",
"
0.0
\n",
"
0.0
\n",
"
0.00
\n",
"
0.0
\n",
"
0.0
\n",
"
550.000000
\n",
"
550.000000
\n",
"
-1.000000
\n",
"
0.0
\n",
"
1
\n",
"
...
\n",
"
True
\n",
"
1
\n",
"
0
\n",
"
0
\n",
"
29.0
\n",
"
3.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.004652
\n",
"
1
\n",
"
\n",
"
\n",
"
96094
\n",
"
1.0
\n",
"
1.0
\n",
"
79.43
\n",
"
1.0
\n",
"
1.0
\n",
"
279.312905
\n",
"
279.312905
\n",
"
0.000000
\n",
"
1.0
\n",
"
1
\n",
"
...
\n",
"
False
\n",
"
0
\n",
"
1
\n",
"
0
\n",
"
20.0
\n",
"
4.0
\n",
"
0.0
\n",
"
0.0
\n",
"
0.293042
\n",
"
2
\n",
"
\n",
"
\n",
"
96095
\n",
"
0.0
\n",
"
0.0
\n",
"
0.00
\n",
"
0.0
\n",
"
0.0
\n",
"
550.000000
\n",
"
550.000000
\n",
"
-1.000000
\n",
"
0.0
\n",
"
2
\n",
"
...
\n",
"
False
\n",
"
0
\n",
"
1
\n",
"
0
\n",
"
31.0
\n",
"
4.0
\n",
"
0.0
\n",
"
1.0
\n",
"
0.787852
\n",
"
4
\n",
"
\n",
" \n",
"
\n",
"
96096 rows × 21 columns
\n",
"
"
],
"text/plain": [
" nb_tickets nb_purchases total_amount nb_suppliers \\\n",
"0 4.0 1.0 100.00 1.0 \n",
"1 1.0 1.0 55.00 1.0 \n",
"2 17.0 1.0 80.00 1.0 \n",
"3 4.0 1.0 120.00 1.0 \n",
"4 34.0 2.0 416.00 1.0 \n",
"... ... ... ... ... \n",
"96091 1.0 1.0 67.31 1.0 \n",
"96092 1.0 1.0 61.41 1.0 \n",
"96093 0.0 0.0 0.00 0.0 \n",
"96094 1.0 1.0 79.43 1.0 \n",
"96095 0.0 0.0 0.00 0.0 \n",
"\n",
" vente_internet_max purchase_date_min purchase_date_max \\\n",
"0 0.0 5.177187 5.177187 \n",
"1 0.0 426.265613 426.265613 \n",
"2 0.0 436.033437 436.033437 \n",
"3 0.0 5.196412 5.196412 \n",
"4 0.0 478.693148 115.631470 \n",
"... ... ... ... \n",
"96091 1.0 278.442257 278.442257 \n",
"96092 1.0 189.207373 189.207373 \n",
"96093 0.0 550.000000 550.000000 \n",
"96094 1.0 279.312905 279.312905 \n",
"96095 0.0 550.000000 550.000000 \n",
"\n",
" time_between_purchase nb_tickets_internet fidelity ... opt_in \\\n",
"0 0.000000 0.0 1 ... False \n",
"1 0.000000 0.0 2 ... True \n",
"2 0.000000 0.0 2 ... True \n",
"3 0.000000 0.0 1 ... False \n",
"4 363.061678 0.0 4 ... False \n",
"... ... ... ... ... ... \n",
"96091 0.000000 1.0 2 ... False \n",
"96092 0.000000 1.0 1 ... False \n",
"96093 -1.000000 0.0 1 ... True \n",
"96094 0.000000 1.0 1 ... False \n",
"96095 -1.000000 0.0 2 ... False \n",
"\n",
" gender_female gender_male gender_other nb_campaigns \\\n",
"0 1 0 0 0.0 \n",
"1 0 1 0 0.0 \n",
"2 1 0 0 0.0 \n",
"3 1 0 0 0.0 \n",
"4 1 0 0 0.0 \n",
"... ... ... ... ... \n",
"96091 0 1 0 15.0 \n",
"96092 0 1 0 12.0 \n",
"96093 1 0 0 29.0 \n",
"96094 0 1 0 20.0 \n",
"96095 0 1 0 31.0 \n",
"\n",
" nb_campaigns_opened has_purchased has_purchased_estim score \\\n",
"0 0.0 0.0 0.0 0.006066 \n",
"1 0.0 1.0 0.0 0.288847 \n",
"2 0.0 0.0 0.0 0.103264 \n",
"3 0.0 0.0 0.0 0.008928 \n",
"4 0.0 1.0 1.0 0.992809 \n",
"... ... ... ... ... \n",
"96091 5.0 1.0 0.0 0.351762 \n",
"96092 9.0 0.0 1.0 0.567814 \n",
"96093 3.0 0.0 0.0 0.004652 \n",
"96094 4.0 0.0 0.0 0.293042 \n",
"96095 4.0 0.0 1.0 0.787852 \n",
"\n",
" quartile \n",
"0 1 \n",
"1 2 \n",
"2 1 \n",
"3 1 \n",
"4 4 \n",
"... ... \n",
"96091 2 \n",
"96092 3 \n",
"96093 1 \n",
"96094 2 \n",
"96095 4 \n",
"\n",
"[96096 rows x 21 columns]"
]
},
"execution_count": 88,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_segment(X_test, y_test, model)"
]
},
{
"cell_type": "markdown",
"id": "ad16b8ab-7e01-404b-971e-866e9b9d5aa4",
"metadata": {},
"source": [
"## Definition of functions to compute the bias of the scores and adjust them\n",
"\n",
"Le biais est calculé de la façon suivante. \n",
"En notant $\\hat{p(x_i)}$ le score calculé (estimé par la modélisation) et $p(x_i)$ le vrai score (sans biais), et $\\beta$ le logarithme du biais, on a : \\\n",
"$\\ln{\\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}}} = \\beta + \\ln{\\frac{p(x_i)}{1-p(x_i)}}$ \\\n",
"$ \\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}} = \\exp(\\beta) . \\frac{p(x_i)}{1-p(x_i)} $ , soit : \\\n",
"$p(x_i) = {\\frac{\\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}}}{B+\\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}}}}$ \\\n",
"Ce qu'on appelle biais et qu'on estime dans le code par la suite est : $B=\\exp(\\beta) $. Les probabilités ne sont donc pas biaisées si $B=1$. Il y a surestimation si $B>1$. \n",
"\n",
"On cherche le B qui permette d'ajuster les probabilités de telle sorte que la somme des scores soit égale à la somme des y_has_purchased. Cela revient à résoudre : \n",
"\n",
"\\begin{equation}\n",
"\\sum_{i}{\\frac{\\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}}}{B+\\frac{\\hat{p(x_i)}}{1-\\hat{p(x_i)}}}} = \\sum_{i}{Y_i}\n",
"\\end{equation}\n",
"\n",
"C'est ce que fait la fonction find_bias. \n",
"\n",
"Note sur les notations : \\\n",
"$\\hat{p(x_i)}$ correspond à ce qu'on appelle le score et $p(x_i)$ à ce qu'on appellera le score adjusted"
]
},
{
"cell_type": "code",
"execution_count": 89,
"id": "f0379536-a6c5-4b16-bde5-d0319ec1b140",
"metadata": {},
"outputs": [],
"source": [
"# adjusted score derived from an odds ratio (cf formula above)\n",
"def adjusted_score(odd_ratio, bias) :\n",
"    \"\"\"Return the bias-corrected score `odd_ratio / (bias + odd_ratio)`.\n",
"\n",
"    `odd_ratio` and `bias` may be scalars or NumPy arrays; the expression is\n",
"    evaluated element-wise with the usual broadcasting rules.\n",
"    \"\"\"\n",
"    return odd_ratio / (bias + odd_ratio)"
]
},
{
"cell_type": "code",
"execution_count": 90,
"id": "32a0dfd0-f49d-4785-a56f-706d381bfe41",
"metadata": {},
"outputs": [],
"source": [
"# when the score is 1 we cannot compute the odd ratio, so we adjust scores equal to 1\n",
"# we set the second best score instead\n",
"\n",
"def adjust_score_1(score) :\n",
"    \"\"\"Replace every score equal to 1 by the largest score different from 1.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    score : array-like\n",
"        Scores (list, NumPy array or pandas Series).\n",
"\n",
"    Returns\n",
"    -------\n",
"    numpy.ndarray\n",
"        New array of the same length; the input is not modified. Input without\n",
"        any score equal to 1 (including empty input) is returned unchanged.\n",
"\n",
"    Raises\n",
"    ------\n",
"    ValueError\n",
"        If every score equals 1, since there is no second best score to use.\n",
"    \"\"\"\n",
"    scores = np.asarray(score)\n",
"    is_one = scores == 1\n",
"\n",
"    if not is_one.any():\n",
"        # nothing to cap; this branch also handles empty input\n",
"        return scores.copy()\n",
"    if is_one.all():\n",
"        raise ValueError(\"adjust_score_1: every score equals 1, no second best score available\")\n",
"\n",
"    second_best_score = scores[~is_one].max()\n",
"    return np.where(is_one, second_best_score, scores)"
]
},
{
"cell_type": "code",
"execution_count": 91,
"id": "2dff1def-02df-413e-afce-b4aeaf7752b6",
"metadata": {},
"outputs": [],
"source": [
"def odd_ratio(score) :\n",
"    \"\"\"Convert a score into its odds `score / (1 - score)` (element-wise for arrays).\"\"\"\n",
"    complement = 1 - score\n",
"    return score / complement"
]
},
{
"cell_type": "code",
"execution_count": 92,
"id": "683d71fc-7442-4028-869c-49c57592d6e9",
"metadata": {},
"outputs": [],
"source": [
"# definition of a function that automatically detects the bias\n",
"\n",
"def find_bias(odd_ratios, y_objective, initial_guess=6) :\n",
"    \"\"\"Estimate the bias B that makes the adjusted scores sum to `y_objective`.\n",
"\n",
"    Solves, for B, the equation\n",
"    sum_i adjusted_score(odd_ratios_i, B) = y_objective\n",
"    with `scipy.optimize.fsolve`. An earlier least-squares formulation based on\n",
"    `scipy.optimize.minimize` (BFGS) was dropped because root finding is faster.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    odd_ratios : array-like\n",
"        Odds ratios of the raw scores.\n",
"    y_objective : float\n",
"        Target value for the sum of adjusted scores (the callers pass the sum\n",
"        of the observed `y_has_purchased`).\n",
"    initial_guess : float, default 6\n",
"        Starting point handed to the solver.\n",
"\n",
"    Returns\n",
"    -------\n",
"    float\n",
"        Estimated bias B.\n",
"    \"\"\"\n",
"    odds = np.asarray(odd_ratios, dtype=float)\n",
"\n",
"    def total_gap(bias):\n",
"        # fsolve passes `bias` as a length-1 array; broadcasting against `odds`\n",
"        # evaluates all adjusted scores in a single vectorised call\n",
"        return np.sum(adjusted_score(odds, bias)) - y_objective\n",
"\n",
"    bias_estimated = fsolve(total_gap, x0=initial_guess)\n",
"\n",
"    return bias_estimated[0]"
]
},
{
"cell_type": "code",
"execution_count": 289,
"id": "f17dc6ca-7a48-441b-8c04-11c47b8b9741",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.3940650533525649 0.04284869976359338\n"
]
},
{
"data": {
"text/plain": [
"0.04286194557403322"
]
},
"execution_count": 289,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print(X_test_segment[\"score\"].mean(), y_test[\"y_has_purchased\"].mean())\n",
"y_train[\"y_has_purchased\"].mean()"
]
},
{
"cell_type": "code",
"execution_count": 290,
"id": "781b0d40-c954-4c54-830a-e709c8667328",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"22.577005337484817"
]
},
"execution_count": 290,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# computation with the function defined\n",
"\n",
"bias_test_set = find_bias(odd_ratios = odd_ratio(adjust_score_1(X_test_segment[\"score\"])), \n",
" y_objective = y_test[\"y_has_purchased\"].sum(),\n",
" initial_guess=6)\n",
"bias_test_set"
]
},
{
"cell_type": "code",
"execution_count": 291,
"id": "248cb862-418e-4767-9933-70c4885ecf40",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"22.690061493186622"
]
},
"execution_count": 291,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# comparison with bias of the train set\n",
"X_train_score = model.predict_proba(X_train)[:, 1]\n",
"\n",
"bias_train_set = find_bias(odd_ratios = odd_ratio(adjust_score_1(X_train_score)), \n",
" y_objective = y_train[\"y_has_purchased\"].sum(),\n",
" initial_guess=6)\n",
"bias_train_set"
]
},
{
"cell_type": "code",
"execution_count": 292,
"id": "fff6cbe6-7bb3-4732-9b81-b9ac5383bbcf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"betâ test - betâ train = -0.0049950835646278635\n"
]
}
],
"source": [
"print(\"betâ test - betâ train = \",np.log(bias_test_set/bias_train_set))"
]
},
{
"cell_type": "code",
"execution_count": 293,
"id": "f506870d-4a8a-4b2c-8f0b-e0789080b20c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"mean absolute erreur 0.00017894295558797563\n"
]
}
],
"source": [
"# impact of using a bias computed on the train set instead of the test set - totally negligible\n",
"\n",
"score_adjusted_test = adjusted_score(odd_ratio(adjust_score_1(X_test_segment[\"score\"])), bias = bias_test_set)\n",
"score_adjusted_train = adjusted_score(odd_ratio(adjust_score_1(X_test_segment[\"score\"])), bias = bias_train_set)\n",
"\n",
"print(\"mean absolute erreur\",abs(score_adjusted_test-score_adjusted_train).mean())"
]
},
{
"cell_type": "code",
"execution_count": 294,
"id": "8213d0e4-063b-49fa-90b7-677fc34f4c01",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_1080/1825363704.py:7: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" X_test_segment[\"score_adjusted\"] = score_adjusted_train\n"
]
}
],
"source": [
"# adjust scores accordingly \n",
"\n",
"# X_test_segment[\"score_adjusted\"] = adjusted_score(odd_ratio(adjust_score_1(X_test_segment[\"score\"])), bias = bias_test_set)\n",
"\n",
"# actually, we are not supposed to have X_test, so the biais is estimated on X_train\n",
"# X_test_segment[\"score_adjusted\"] = adjusted_score(odd_ratio(adjust_score_1(X_test_segment[\"score\"])), bias = bias_train_set)\n",
"X_test_segment[\"score_adjusted\"] = score_adjusted_train"
]
},
{
"cell_type": "code",
"execution_count": 295,
"id": "834d3723-2e72-4c65-9c62-e2d595c69461",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MSE for score : 0.18391062438077188\n",
"MSE for ajusted score : 0.037093800862222845\n",
"sum of y_has_purchased : 7975.0\n",
"sum of adjusted scores : 7941.695137104767\n"
]
}
],
"source": [
"# check \n",
"\n",
"MSE_score = ((X_test_segment[\"score\"]-X_test_segment[\"has_purchased\"])**2).mean()\n",
"MSE_ajusted_score = ((X_test_segment[\"score_adjusted\"]-X_test_segment[\"has_purchased\"])**2).mean()\n",
"print(f\"MSE for score : {MSE_score}\")\n",
"print(f\"MSE for ajusted score : {MSE_ajusted_score}\")\n",
"\n",
"print(\"sum of y_has_purchased :\",y_test[\"y_has_purchased\"].sum())\n",
"print(\"sum of adjusted scores :\", X_test_segment[\"score_adjusted\"].sum())"
]
},
{
"cell_type": "code",
"execution_count": 296,
"id": "9f30a4dd-a9d8-405a-a7d5-5324ae88cf70",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MAE for score : 0.38422988971624206\n",
"MAE for adjusted score : 0.07284616452278603\n"
]
}
],
"source": [
"# mean absolute error - divided by 2 with our method\n",
"\n",
"MAE_score = abs(X_test_segment[\"score\"]-X_test_segment[\"has_purchased\"]).mean()\n",
"MAE_ajusted_score = abs(X_test_segment[\"score_adjusted\"]-X_test_segment[\"has_purchased\"]).mean()\n",
"print(f\"MAE for score : {MAE_score}\")\n",
"print(f\"MAE for adjusted score : {MAE_ajusted_score}\")"
]
},
{
"cell_type": "code",
"execution_count": 103,
"id": "6f9396db-e213-408c-a596-eaeec3bc79f3",
"metadata": {},
"outputs": [],
"source": [
"# visualization\n",
"\n",
"# histogram of the scores and of the adjusted scores\n",
"\n",
"def plot_hist_scores(df, score, score_adjusted, type_of_activity) :\n",
"    \"\"\"Overlay the histograms of two score columns of `df` on a new figure.\n",
"\n",
"    `score` and `score_adjusted` are column names of `df`; `type_of_activity`\n",
"    is only used in the title. The figure is created but not shown, and\n",
"    nothing is returned.\n",
"    \"\"\"\n",
"    _, ax = plt.subplots()\n",
"    for column, label in ((score, \"score\"), (score_adjusted, \"adjusted score\")):\n",
"        ax.hist(df[column], label=label, alpha=0.6)\n",
"    ax.legend()\n",
"    ax.set_xlabel(\"probability of a future purchase\")\n",
"    ax.set_ylabel(\"count\")\n",
"    ax.set_title(f\"Comparison between score and adjusted score for {type_of_activity} companies\")"
]
},
{
"cell_type": "code",
"execution_count": 64,
"id": "def64c16-f4dd-493c-909c-d886d7f53947",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'projet-bdc2324-team1/Output_expected_CA/sport/hist_score_adjustedsport.png'"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"PATH + file_name + type_of_activity + \".png\""
]
},
{
"cell_type": "code",
"execution_count": 297,
"id": "b478d40d-9677-4204-87bd-16fb0bc1fe9a",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"
"
],
"text/plain": [
" quartile score (%) score adjusted (%) has purchased (%)\n",
"0 1 17.78 0.96 0.67\n",
"1 2 36.12 2.49 2.83\n",
"2 3 63.14 7.29 7.04\n",
"3 4 86.03 29.21 29.20"
]
},
"execution_count": 298,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test_table_adjusted_scores = (100 * X_test_segment.groupby(\"quartile\")[[\"score\",\"score_adjusted\", \"has_purchased\"]].mean()).round(2).reset_index()\n",
"X_test_table_adjusted_scores = X_test_table_adjusted_scores.rename(columns = {col : f\"{col.replace('_', ' ')} (%)\" for col in X_test_table_adjusted_scores.columns if col in [\"score\",\"score_adjusted\", \"has_purchased\"]})\n",
"X_test_table_adjusted_scores"
]
},
{
"cell_type": "code",
"execution_count": 162,
"id": "d0b8740c-cf48-4a3e-83cb-23d95059f62f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'\\\\begin{tabular}{lrrr}\\n\\\\toprule\\nquartile & score (%) & score adjusted (%) & has purchased (%) \\\\\\\\\\n\\\\midrule\\n1 & 13.250000 & 2.510000 & 1.570000 \\\\\\\\\\n2 & 33.890000 & 8.000000 & 9.850000 \\\\\\\\\\n3 & 63.060000 & 22.580000 & 21.470000 \\\\\\\\\\n4 & 90.520000 & 66.200000 & 65.010000 \\\\\\\\\\n\\\\bottomrule\\n\\\\end{tabular}\\n'"
]
},
"execution_count": 162,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_test_table_adjusted_scores.to_latex(index=False)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "d6a04d3e-c454-43e4-ae4c-0746e928575b",
"metadata": {},
"outputs": [],
"source": [
"# comparison between score and adjusted score - export csv associated\n",
"\n",
"file_name = \"table_adjusted_score_\"\n",
"FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + \".csv\"\n",
"with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n",
" X_test_table_adjusted_scores.to_csv(file_out, index = False)"
]
},
{
"cell_type": "code",
"execution_count": 106,
"id": "a974589f-7952-4db2-bebf-7b69c6b09372",
"metadata": {},
"outputs": [],
"source": [
"def project_tickets_CA (df, nb_purchases, nb_tickets, total_amount, score_adjusted, duration_ref, duration_projection) :\n",
"    \"\"\"Project ticket counts and amounts onto another duration and weight them by the adjusted score.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df : pandas.DataFrame\n",
"        Input frame; it is not modified.\n",
"    nb_purchases, nb_tickets, total_amount, score_adjusted : str\n",
"        Names of the columns of `df` holding the corresponding quantities.\n",
"    duration_ref : float\n",
"        Duration covered by the observed columns.\n",
"    duration_projection : float\n",
"        Duration to project onto, in the same unit as `duration_ref`.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        Copy of `df` with the added columns `nb_tickets_projected`,\n",
"        `total_amount_projected`, `nb_tickets_expected`,\n",
"        `total_amount_expected` and `pace_purchase`.\n",
"    \"\"\"\n",
"    duration_ratio = duration_ref/duration_projection\n",
"\n",
"    # copy so that the caller's frame (possibly a slice) is never written to\n",
"    df_output = df.copy()\n",
"\n",
"    # rescale the observed volumes to the projection duration\n",
"    df_output[\"nb_tickets_projected\"] = df_output[nb_tickets] / duration_ratio\n",
"    df_output[\"total_amount_projected\"] = df_output[total_amount] / duration_ratio\n",
"\n",
"    # expected values: projected volumes weighted by the adjusted score\n",
"    df_output[\"nb_tickets_expected\"] = df_output[score_adjusted] * df_output[\"nb_tickets_projected\"]\n",
"    df_output[\"total_amount_expected\"] = df_output[score_adjusted] * df_output[\"total_amount_projected\"]\n",
"\n",
"    # reference duration per purchase; rows with zero purchases give +inf, stored as NaN\n",
"    df_output[\"pace_purchase\"] = (duration_ref/df_output[nb_purchases]).replace(np.inf, np.nan)\n",
"\n",
"    return df_output\n"
]
},
{
"cell_type": "code",
"execution_count": 107,
"id": "dd8a52e1-d06e-4790-8687-8e58e3e6b84e",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_1080/3982240549.py:7: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_output.loc[:,\"nb_tickets_projected\"] = df_output.loc[:,nb_tickets] / duration_ratio\n",
"/tmp/ipykernel_1080/3982240549.py:8: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_output.loc[:,\"total_amount_projected\"] = df_output.loc[:,total_amount] / duration_ratio\n",
"/tmp/ipykernel_1080/3982240549.py:10: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_output.loc[:,\"nb_tickets_expected\"] = df_output.loc[:,score_adjusted] * df_output.loc[:,\"nb_tickets_projected\"]\n",
"/tmp/ipykernel_1080/3982240549.py:11: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_output.loc[:,\"total_amount_expected\"] = df_output.loc[:,score_adjusted] * df_output.loc[:,\"total_amount_projected\"]\n",
"/tmp/ipykernel_1080/3982240549.py:13: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df_output.loc[:,\"pace_purchase\"] = (duration_ref/df_output.loc[:,nb_purchases]).apply(lambda x : np.nan if x==np.inf else x)\n"
]
},
{
"data": {
"text/html": [
"