Compare commits

...

243 Commits

Author SHA1 Message Date
4ed6bd809d Removed exploratory notebooks and drafts 2024-04-09 20:20:57 +00:00
9ca22fb9e7 changed names 2024-04-04 18:48:46 +00:00
6da3467108 fixed typos 2024-04-04 18:46:38 +00:00
473f8100b0 added packages versions 2024-04-04 14:57:39 +00:00
68b68ed3da added functions documentation 2024-04-04 14:29:16 +00:00
f5b6075431 adjust font size 2024-04-04 11:46:15 +00:00
1ebb83e3c4 fix mailing consent 2024-04-04 08:58:48 +00:00
e54e6c3b10 add type of variables 2024-04-04 08:58:34 +00:00
df4c28bdd8 add function description 2024-04-04 08:39:43 +00:00
09f4bd3fe4 push coefficient 2024-04-04 06:50:49 +00:00
b9aa0d7578 fix typo 2024-04-03 19:30:04 +00:00
5fa57cb4b9 final changes (I hope so) 2024-04-03 19:28:52 +00:00
0f5c9cb70f test 2024-04-03 19:25:43 +00:00
7bf011e2ed test 2024-04-03 19:24:59 +00:00
f4b430dbc1 test 2024-04-03 19:22:34 +00:00
7d7683b0a9 test 2024-04-03 19:21:06 +00:00
d14174dc07 test 2024-04-03 19:16:29 +00:00
c5aca36640 some changes 2024-04-03 19:15:52 +00:00
a3caa64c95 completed readme 2024-04-03 19:12:06 +00:00
15f950d87f test some changes 2024-04-03 18:37:19 +00:00
acf7621d9a fixed forecasting issues 2024-04-03 10:36:47 +00:00
14953b031a Add better dpi 2024-04-02 21:27:28 +00:00
ea3dcbb015 Improved the lazy customer plot + better framing + removed the title 2024-04-02 21:12:07 +00:00
091693c889 added printing options for business KPIs tables 2024-04-02 12:09:01 +00:00
197703a857 final changes for spider chart 2024-04-02 11:59:06 +00:00
41decc7acd last minor adjustment for spider chart 2024-04-02 11:47:26 +00:00
a21805db9b minor change : adjusted size of spider chart 2024-04-02 11:36:34 +00:00
21bf0c8408 use cv logit instead of benchmark for the segmentation 2024-04-02 11:26:06 +00:00
4e74483a69 increased plot resolution 2024-04-01 10:19:59 +00:00
c96e1b5f0c logit cv instead of benchmark 2024-04-01 01:18:53 +00:00
52b39e03be final changes for spider charts 2024-03-31 21:59:52 +00:00
b9a3d05a2f tests to prepare changes in code 06 2024-03-31 17:57:10 +00:00
1a62d2b60a Changed path 2024-03-31 17:16:46 +00:00
e5c99f09ab Changed folder 2024-03-31 17:03:49 +00:00
1577cc3291 Fixed path 2024-03-31 17:02:33 +00:00
ad1e9034f7 Changed name and path 2024-03-31 16:54:46 +00:00
8e61e9d2a4 Added marketing personae description 2024-03-31 16:35:58 +00:00
7341752be0 Changed file name 2024-03-31 16:35:21 +00:00
35638f2a2d Switched from an input prompt to a loop over activities 2024-03-31 16:34:55 +00:00
0a7900c07f take new databases as input 2024-03-30 11:00:49 +00:00
78aab14164 added age importation 2024-03-29 12:43:36 +00:00
8485bd755e Merge pull request 'generalization' (#16) from generalization into main
Reviewed-on: #16
2024-03-29 11:15:55 +01:00
354f6847b6 standard model 2024-03-29 10:15:28 +00:00
d6e2b2c57a fix path 2024-03-29 10:14:14 +00:00
42b4414a16 Changed architecture p1 2024-03-28 21:18:08 +00:00
3d6414728c Merge branch 'main' into segment_value 2024-03-28 20:44:06 +00:00
7be4179de4 added README 2024-03-28 16:48:22 +00:00
4facf5567c Merge pull request 'fix time to open' (#15) from generalization into main
Reviewed-on: #15
2024-03-28 15:23:42 +01:00
7ed8516009 fix time to open 2024-03-28 13:17:45 +00:00
56ee61e25f adjusted graphic options 2024-03-28 13:13:13 +00:00
0aed0911a1 Merge pull request 'fix loading' (#14) from generalization into main
Reviewed-on: #14
2024-03-28 12:39:50 +01:00
d5ab3c2d68 fix loading 2024-03-28 11:39:16 +00:00
0bd29e3a81 Merge pull request 'fix preproc' (#13) from generalization into main
Reviewed-on: #13
2024-03-28 12:38:00 +01:00
840ce876e2 fix preproc 2024-03-28 11:37:33 +00:00
b268cd980d Merge pull request 'fix premium' (#12) from generalization into main
Reviewed-on: #12
2024-03-28 12:19:21 +01:00
02a4ea20dd fix premium 2024-03-28 11:19:05 +00:00
25a356d6a4 tested adjustment of scores for different models 2024-03-28 10:37:23 +00:00
f0f69d710a added activity in the titles of graphics 2024-03-28 09:27:29 +00:00
eb87cc6998 Merge pull request 'generalization' (#11) from generalization into main
Reviewed-on: #11
2024-03-28 09:40:04 +01:00
7debe6590e fix conflicts 2024-03-28 08:38:43 +00:00
122c4c1f82 fix features 2024-03-28 08:35:02 +00:00
ebdbacbe34 fix features 2024-03-28 07:56:36 +00:00
adc62dd056 save at different steps 2024-03-28 07:37:10 +00:00
6d40cfe261 Added is_partner exploration 2024-03-27 22:21:26 +00:00
10824e5e24 CA estimation by segment works well 2024-03-27 18:59:05 +00:00
be0bcda0ba Completed the draft 2024-03-27 18:55:11 +00:00
0ffbe06b12 Added plots on targets 2024-03-27 18:39:54 +00:00
d3e13f4c56 completed CA projection 2024-03-27 17:58:30 +00:00
cf0b33c940 added input to select type of activity 2024-03-27 15:57:24 +00:00
905072b1db now file works well 2024-03-27 15:42:27 +00:00
bfa941f0a3 minor change 2024-03-27 15:20:28 +00:00
8d33c74d2f activity changed to type of activity 2024-03-27 15:10:57 +00:00
f55ade48b4 minor change 2024-03-27 15:06:31 +00:00
10fde045e5 from notebook to .py for segment analysis 2024-03-27 14:59:33 +00:00
133eb83e84 add path premium 2024-03-27 14:08:40 +00:00
f4b6f23394 Merge branch 'main' into generalization 2024-03-27 09:07:21 +00:00
38c3fc3148 Added a summary table of detected targets 2024-03-26 22:01:33 +00:00
e2d55e557e Unnecessary, already covered in the pipeline 2024-03-26 22:01:11 +00:00
dd5c3f416b v2 2024-03-26 21:07:29 +00:00
28cc7b94ea added options to save plots in S3 2024-03-26 15:00:39 +00:00
2165c7c16e completed segment mp analysis sport 2024-03-26 11:20:03 +00:00
5e37dd4d3d Added graph 2024-03-26 10:51:02 +00:00
e1f6f1ba68 Added graph for targets 2024-03-26 10:49:09 +00:00
c620f23507 Removed drop for segmentation description 2024-03-26 10:48:50 +00:00
a32cbe70e4 Add variables for segmentation description 2024-03-25 17:44:31 +00:00
c86c43cc7e New datasets 2024-03-24 19:01:29 +00:00
f5f993aba0 Removed the pre-computed variables 2024-03-24 18:43:30 +00:00
dbd87dadd9 fixed export hist.png issue 2024-03-24 10:44:22 +00:00
ba6c4a8a24 update notebook 2024-03-24 10:05:28 +00:00
c549752ba7 added exportation to MinIo option 2024-03-24 09:42:44 +00:00
ca30d1daa3 update CA segment analysis 2024-03-23 16:23:59 +00:00
7a9548f295 Added target KPI variables 2024-03-23 11:51:18 +00:00
1a0a5a40cf Added analyses 2024-03-23 09:48:47 +00:00
c1cb3ab396 added utils for CA estimation 2024-03-23 09:18:43 +00:00
1c8e19a70d clean notebook : adjust scores and estimates ca 2024-03-22 23:04:49 +00:00
a88c2df8f5 exploratory step : CA by segment 2024-03-22 09:15:59 +00:00
33df2fda4f added summary with weights 2024-03-21 13:21:40 +00:00
52fd738fe5 fix errors 2024-03-21 10:47:40 +00:00
a85036ad23 added summary for logit with penalty 2024-03-21 08:18:31 +00:00
089a8fd3d6 fix labels 2024-03-21 08:16:29 +00:00
b1e877508b Merge branch 'main' into generalization 2024-03-21 07:14:51 +00:00
9763dfe7f9 add result by companies 2024-03-21 07:10:10 +00:00
a0256c551b Merge pull request 'generalization' (#9) from generalization into main
Reviewed-on: #9
2024-03-20 21:26:04 +01:00
5cd1bcc222 add probability 2024-03-20 13:07:33 +00:00
3d03965084 commit segmentation 2024-03-20 12:07:24 +00:00
605876dfb1 save model to pickle 2024-03-20 12:06:47 +00:00
fbfc03a572 look at graph 2024-03-20 09:27:03 +00:00
95c4c6c4bf fix errors 2024-03-20 08:33:56 +00:00
0a41641956 stat desc 2024-03-19 13:54:31 +00:00
57cc7d077d Merge branch 'main' into generalization 2024-03-19 12:06:58 +00:00
d328caa665 add pipeline ML 2024-03-19 11:46:04 +00:00
ee86fcaf84 reduce random forest param grid 2024-03-19 11:43:44 +00:00
ef23181a05 Added purchases per month 2024-03-18 21:50:30 +00:00
9e5e364aa3 add steps 2024-03-18 19:38:01 +00:00
969cb8ec43 add machine learning automatisation 2024-03-18 16:23:52 +00:00
9155b397e9 utils_ml 2024-03-18 16:22:29 +00:00
6ac62d9957 Merge branch 'main' into generalization 2024-03-18 16:21:42 +00:00
52119c4354 added segment 2024-03-18 15:58:38 +00:00
2bd3edb444 added segmentation to the model 2024-03-18 15:49:36 +00:00
b892ca79c7 added segmentation to the model 2024-03-18 15:47:05 +00:00
9a0ac320d0 add benchmark random forest 2024-03-18 09:35:48 +00:00
5408ce677b add calibration curve 2024-03-18 09:10:28 +00:00
6eddec93bc completed with random forest + naive bayes 2024-03-17 11:49:48 +00:00
cc30d7deb9 Added KPIs on customerplus 2024-03-16 17:20:47 +00:00
ab3b033f09 delete former logit pipeline file 2024-03-16 15:18:21 +00:00
746f764973 finished : logit pipeline + visu (sports) 2024-03-16 15:16:34 +00:00
4c7bdf712b Fixed and ran customerplus 2024-03-16 15:01:52 +00:00
f8dc99df99 Merge branch 'correction_variables' 2024-03-16 14:48:32 +00:00
53f32000b5 Changes to customerplus 2024-03-16 14:47:46 +00:00
14423b1d34 init full modelization 2024-03-16 10:43:11 +00:00
83a3c039ec baseline logit - exploratory study of variables 2024-03-16 09:42:41 +00:00
15c102682a fix errors 2024-03-14 23:02:50 +00:00
3670299a0b Added a draft 2024-03-14 22:35:25 +00:00
dc5e3d0df1 fixed, renamed, filled NaN for tickets and mail 2024-03-14 22:34:36 +00:00
db6eaaaa8d debug 2024-03-14 21:14:40 +00:00
54fbad0344 identify target customer 2024-03-14 21:00:14 +00:00
6d0f67bd31 fix filter customer 2024-03-14 19:11:09 +00:00
4ac11c6b37 fix some plots 2024-03-14 19:04:03 +00:00
d42e81449a generalize statistics 2024-03-14 18:35:03 +00:00
ac6a3b365f created logit statsmodels (spectacle) 2024-03-14 11:10:15 +00:00
6db4af19ce Added start of target name cleaning 2024-03-13 22:24:38 +00:00
1c9e7e1778 Stat desc 101 2024-03-13 13:25:21 +00:00
96088d19cb Update 2024-03-12 19:15:04 +00:00
35d9965671 added sales trend graphic 2024-03-12 15:16:28 +00:00
03acb304f4 harmonize gender plot 2024-03-12 12:48:42 +00:00
8acc32de70 fixed graphic saving issues 2024-03-11 18:10:46 +00:00
99c7836182 added saving options for the graphics 2024-03-11 17:43:56 +00:00
ab83b7c20c update stats desc music 2024-03-11 10:40:29 +00:00
b35e04e307 run pipeline on music companies 2024-03-11 08:58:12 +00:00
c502e2fd64 logistic baseline 2024-03-11 08:36:25 +00:00
f0e9973533 Added modelling and segmentation 2024-03-10 21:31:37 +00:00
e96e5a2b08 stat 2024-03-10 20:00:29 +00:00
921c2c796b stat 2024-03-10 19:30:08 +00:00
858d1d2111 stat 2024-03-10 18:49:34 +00:00
a4d4803a1c stat 2024-03-10 18:08:50 +00:00
75664a33d7 stat 2024-03-10 16:41:43 +00:00
4aa781daf0 Fixed a methodological issue in the construction 2024-03-10 16:04:16 +00:00
f40ae6ead0 Fixed an error in building the modelling datasets 2024-03-10 15:30:23 +00:00
27ef78a486 fix pipeline 2024-03-10 12:30:10 +00:00
c2c749be3d Merge pull request 'generalization' (#8) from generalization into main
Reviewed-on: #8
2024-03-10 12:31:36 +01:00
aabf858c6c update stats desc spectacles 2024-03-10 11:31:28 +00:00
adc1da3e49 adjust pipeline 2024-03-10 11:30:57 +00:00
58c7cac17f work on pipeline 2024-03-10 10:09:53 +00:00
198ef45247 Merge branch 'main' into generalization 2024-03-10 08:46:23 +00:00
0eedea6e26 Changed folder 2024-03-09 17:50:46 +00:00
c6abfbe76e Fixed the construction 2024-03-09 17:50:32 +00:00
0b56e4e696 Added museum folder + started modelling 2024-03-09 17:49:45 +00:00
14922dccfa Added the common plots 2024-03-09 16:20:32 +00:00
11e2e86583 Added and saved 2024-03-09 14:50:58 +00:00
d3fa9f6870 prepare Pipeline 2024-03-08 13:48:38 +00:00
bb684633d7 work on stat desc 2024-03-08 13:26:27 +00:00
ced4747372 work on stat desc 2024-03-08 13:13:18 +00:00
3088e5f337 update stats desc 2024-03-08 09:30:12 +00:00
9a2316e843 update stats desc 2024-03-08 07:44:28 +00:00
e426a86b55 Old .py file 2024-03-07 18:33:41 +00:00
af84a57c54 Fix conflicts 2024-03-07 13:05:49 +00:00
a49d14853e Merge pull request 'generalization' (#7) from generalization into main
Reviewed-on: #7
2024-03-07 13:58:03 +01:00
c7ca8c560e fix conflicts 2024-03-07 12:57:21 +00:00
20fa01647a test train 2024-03-06 12:42:55 +00:00
bed6a5c901 fix condition 2024-03-06 12:42:39 +00:00
41f49edd1c explore sport 2024-03-06 11:49:51 +00:00
d8e2da70cb fix path + test and train customer allocation' 2024-03-06 11:49:37 +00:00
4503114435 work on stat 2024-03-06 10:56:52 +00:00
3ec803d0a6 Modification with 101 2024-03-05 19:26:41 +00:00
1308484706 Fixed the dataset dates 2024-03-05 17:46:06 +00:00
23551d29d2 stat 2024-03-05 14:50:46 +00:00
473afd9a89 added indicator 2024-03-05 14:37:29 +00:00
eaf1884bb6 anova 2024-03-05 13:36:03 +00:00
29ac99df14 test_anova 2024-03-05 13:34:43 +00:00
de342a2f77 Merge branch 'main' into generalization 2024-03-05 11:27:41 +00:00
da1f16d8ec Added statistics on tags 2024-03-05 10:57:40 +00:00
1667f99a83 stat 2024-03-05 02:57:08 +00:00
2bf81015ac stat 2024-03-05 02:51:39 +00:00
0052d4e78f stat 2024-03-05 02:25:59 +00:00
8f5abf52fd stat 2024-03-05 02:15:03 +00:00
1ec5b8743f code 2024-03-05 01:44:01 +00:00
dbb90fb364 stat 2024-03-05 01:43:40 +00:00
66754f957e base_test_train 2024-03-04 23:36:48 +00:00
71a5cb2a3e Added exploration 2024-03-04 22:30:25 +00:00
688410299f work on stat desc 2024-03-04 18:29:21 +00:00
286bd9cb85 work on stat desc sport 2024-03-04 15:55:58 +00:00
6e5383f594 Added site observations and a graph on the proportion of free tickets 2024-03-03 22:33:35 +00:00
228e626ba8 stats + graphs for campaigns_info and customer_plus 2024-03-03 08:32:45 +00:00
51b7844358 Added descriptive statistics on museums 2024-03-02 16:29:14 +00:00
020b092b04 stat 2024-03-02 13:05:48 +00:00
9dd9bd45e2 stat_desc_finale 2024-03-02 12:32:54 +00:00
169ce53c88 base_stat_desc 2024-03-02 12:05:51 +00:00
a3016ce78e stat_des 2024-03-02 11:16:24 +00:00
3d4e661be9 stat 2024-03-02 10:37:44 +00:00
b54b726cf3 moved files 2024-03-02 08:59:32 +00:00
9cca31377f moved files 2024-03-02 08:58:05 +00:00
ccddaf2f12 Merge pull request 'generalization' (#6) from generalization into main
Reviewed-on: #6
2024-02-29 20:26:00 +01:00
1d1594fc26 SPECT 2024-02-29 14:17:36 +00:00
9c0aff85e5 fix path 2024-02-29 11:12:46 +00:00
766463acaa work on DS dataset 2024-02-29 10:14:16 +00:00
b840b2403c Generate global modelization datasets 2024-02-29 09:25:40 +00:00
b71f842fe4 add prefix to customer_id 2024-02-29 09:09:04 +00:00
979c0fe6a8 fix path 2024-02-29 08:33:05 +00:00
2fabf98413 A few observations 2024-02-28 20:57:28 +00:00
80a8642484 add fill NaN 2024-02-28 20:38:42 +00:00
12427e7b18 stat 2024-02-28 05:51:50 +00:00
9097a1194d KPI creation 2024-02-28 04:06:07 +00:00
bba4820dd8 base_spectacle 2024-02-28 02:31:01 +00:00
9a06cbe96f added open function 2024-02-28 01:57:28 +00:00
e522615a8f spectacle 2024-02-28 01:52:06 +00:00
3c4f851d16 Fixed an error for company 101 2024-02-27 21:52:13 +00:00
23981e3cbc Modified the product purchased part: added start and end date, open + cleaned the ticket_1 database of company 101 2024-02-27 21:01:20 +00:00
d0c980f788 Added tag exploration 2024-02-26 21:47:36 +00:00
c9089de56c added folder + tidied notebooks 2024-02-26 15:51:31 +00:00
027ba3671e added folders - tidied notebooks 2024-02-26 15:49:40 +00:00
283c675448 Added target observation 2024-02-25 22:53:10 +00:00
716002bdcf Explored tags and target 2024-02-25 17:33:24 +00:00
fa9c1c790e add kpi function for customerplus 2024-02-25 17:31:14 +00:00
27e266c58e added random forest + performance visualisation + pickles 2024-02-23 18:57:05 +00:00
282d6cd8a5 Merge pull request 'generalization' (#5) from generalization into main
Reviewed-on: #5
2024-02-22 23:01:53 +01:00
79dc4f13ff generate train and test dataset for all companies 2024-02-22 14:57:34 +00:00
44fef6d618 investigate sport companies 2024-02-22 14:56:54 +00:00
71c5d86679 handle na for supplier 2024-02-22 14:56:00 +00:00
c26b5b11d8 Update 2024-02-21 22:08:33 +00:00
29eafcc6b2 Merge branch 'data_construction' 2024-02-20 22:46:14 +00:00
1f0892434f Missing values 2024-02-20 01:27:57 +00:00
2b4723e271 IDENTIFICATION 2024-02-20 01:27:30 +00:00
06d0223235 identification_entreprise_type_event 2024-02-20 01:03:23 +00:00
30 changed files with 2762 additions and 34698 deletions

@@ -1,128 +0,0 @@
# Business Data Challenge - Team 1
import pandas as pd
import numpy as np
import os
import s3fs
import re
import warnings
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
# Import cleaning and merge functions
exec(open('0_KPI_functions.py').read())
# Ignore warning
warnings.filterwarnings('ignore')
def dataset_construction(min_date, end_features_date, max_date, directory_path):
# Import customerplus
df_customerplus_clean = display_databases(directory_path, file_name = "customerplus_cleaned")
df_campaigns_information = display_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
df_products_purchased_reduced = display_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date'])
# Consistency filter for applying our method
max_date = pd.to_datetime(max_date, utc = True, format = 'ISO8601')
end_features_date = pd.to_datetime(end_features_date, utc = True, format = 'ISO8601')
min_date = pd.to_datetime(min_date, utc = True, format = 'ISO8601')
# Filter the df_campaigns_information database
df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] <= end_features_date) & (df_campaigns_information['sent_at'] >= min_date)]
df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')
# Filter the df_products_purchased_reduced database
df_products_purchased_reduced = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]
print("Data filtering : SUCCESS")
# Merge everything and build the KPIs
# KPIs on advertising campaigns
df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information)
# KPIs on purchasing behaviour
df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced)
# KPIs on socio-demographic data
## Gender
df_customerplus_clean["gender_label"] = df_customerplus_clean["gender"].map({
0: 'female',
1: 'male',
2: 'other'
})
gender_dummies = pd.get_dummies(df_customerplus_clean["gender_label"], prefix='gender').astype(int)
df_customerplus_clean = pd.concat([df_customerplus_clean, gender_dummies], axis=1)
## Indicator for whether the individual lives in France
df_customerplus_clean["country_fr"] = df_customerplus_clean["country"].apply(lambda x : int(x=="fr") if pd.notna(x) else np.nan)
print("KPIs construction : SUCCESS")
# Merge with customer-related KPIs
df_customer = pd.merge(df_customerplus_clean, df_campaigns_kpi, on = 'customer_id', how = 'left')
# Fill NaN values
df_customer[['nb_campaigns', 'nb_campaigns_opened']] = df_customer[['nb_campaigns', 'nb_campaigns_opened']].fillna(0)
# Merge with purchasing-behaviour KPIs
df_customer_product = pd.merge(df_tickets_kpi, df_customer, on = 'customer_id', how = 'outer')
# Fill NaN values
df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0)
print("Explanatory variable construction : SUCCESS")
# 2. Construction of the explained variable
df_products_purchased_to_predict = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= max_date) & (df_products_purchased_reduced['purchase_date'] > end_features_date)]
# Purchase indicator
df_products_purchased_to_predict['y_has_purchased'] = 1
y = df_products_purchased_to_predict[['customer_id', 'y_has_purchased']].drop_duplicates()
print("Explained variable construction : SUCCESS")
# 3. Merge between explained and explanatory variables
dataset = pd.merge(df_customer_product, y, on = ['customer_id'], how = 'left')
# 0 if there is no purchase
dataset[['y_has_purchased']].fillna(0)
return dataset
## Export
# Export folder
BUCKET_OUT = "projet-bdc2324-team1/1_Output/Logistique Regression databases - First approach"
# Dataset test
dataset_test = dataset_construction(min_date = "2021-08-01", end_features_date = "2023-08-01", max_date = "2023-11-01", directory_path = "1")
# # Exportation
# FILE_KEY_OUT_S3 = "dataset_test.csv"
# FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3
# with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
# dataset_test.to_csv(file_out, index = False)
# print("Exportation dataset test : SUCCESS")
# Dataset train
dataset_train = dataset_construction(min_date = "2021-05-01", end_features_date = "2023-05-01", max_date = "2023-08-01", directory_path = "1")
# Exportation
# FILE_KEY_OUT_S3 = "dataset_train.csv"
# FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3
# with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
# dataset_train.to_csv(file_out, index = False)
# print("Exportation dataset train : SUCCESS")
print("FIN DE LA GENERATION DES DATASETS : SUCCESS")

File diff suppressed because it is too large.

@@ -1,97 +0,0 @@
# KPI construction functions
def custom_date_parser(date_string):
return pd.to_datetime(date_string, utc = True, format = 'ISO8601')
def display_databases(directory_path, file_name, datetime_col = None):
"""
This function returns the file from s3 storage
"""
file_path = "projet-bdc2324-team1" + "/0_Input/Company_" + directory_path + "/" + file_name + ".csv"
print("File path : ", file_path)
with fs.open(file_path, mode="rb") as file_in:
df = pd.read_csv(file_in, sep=",", parse_dates = datetime_col, date_parser=custom_date_parser)
return df
def campaigns_kpi_function(campaigns_information = None):
# Number of e-mail campaigns
nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()
nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)
# Average opening time in minutes
campaigns_information['time_to_open'] = pd.to_datetime(campaigns_information['opened_at'], utc = True, format = 'ISO8601') - pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601')
time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()
# Number of e-mails opened
opened_campaign = campaigns_information[['customer_id', 'campaign_name', 'opened_at']]
opened_campaign.dropna(subset=['opened_at'], inplace=True)
opened_campaign = opened_campaign[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()
opened_campaign.rename(columns = {'campaign_name' : 'nb_campaigns_opened' }, inplace = True)
# Merge the indicators
campaigns_reduced = pd.merge(nb_campaigns, opened_campaign, on = 'customer_id', how = 'left')
campaigns_reduced = pd.merge(campaigns_reduced, time_to_open, on = 'customer_id', how = 'left')
# Fill the NaN values : nb_campaigns_opened
campaigns_reduced['nb_campaigns_opened'].fillna(0)
# Fill the NaT values : time_to_open (??)
return campaigns_reduced
def tickets_kpi_function(tickets_information = None):
tickets_information_copy = tickets_information.copy()
# Dummy : online sales channel
liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online'] # 'vad' = 'vente à distance' (distance selling)
tickets_information_copy['vente_internet'] = tickets_information_copy['supplier_name'].str.contains('|'.join(liste_mots), case=False).astype(int)
# Proportion of online sales
prop_vente_internet = tickets_information_copy[tickets_information_copy['vente_internet'] == 1].groupby(['customer_id'])['ticket_id'].count().reset_index()
prop_vente_internet.rename(columns = {'ticket_id' : 'nb_tickets_internet'}, inplace = True)
# Average amount
# avg_amount = (tickets_information_copy.groupby(["event_type_id", 'name_event_types'])
# .agg({"amount" : "mean"}).reset_index()
# .rename(columns = {'amount' : 'avg_amount'}))
tickets_kpi = (tickets_information_copy[['customer_id', 'purchase_id' ,'ticket_id','supplier_name', 'purchase_date', 'amount', 'vente_internet']]
.groupby(['customer_id'])
.agg({'ticket_id': 'count',
'purchase_id' : 'nunique',
'amount' : 'sum',
'supplier_name': 'nunique',
'vente_internet' : 'max',
'purchase_date' : ['min', 'max']})
.reset_index()
)
tickets_kpi.columns = tickets_kpi.columns.map('_'.join)
tickets_kpi.rename(columns = {'ticket_id_count' : 'nb_tickets',
'purchase_id_nunique' : 'nb_purchases',
'amount_sum' : 'total_amount',
'supplier_name_nunique' : 'nb_suppliers',
'customer_id_' : 'customer_id'}, inplace = True)
tickets_kpi['time_between_purchase'] = tickets_kpi['purchase_date_max'] - tickets_kpi['purchase_date_min']
tickets_kpi['time_between_purchase'] = tickets_kpi['time_between_purchase'] / np.timedelta64(1, 'D') # in number of days
# Convert the dates into numbers
max_date = tickets_kpi['purchase_date_max'].max()
tickets_kpi['purchase_date_max'] = (max_date - tickets_kpi['purchase_date_max']) / np.timedelta64(1, 'D')
tickets_kpi['purchase_date_min'] = (max_date - tickets_kpi['purchase_date_min']) / np.timedelta64(1, 'D')
tickets_kpi = tickets_kpi.merge(prop_vente_internet, on = ['customer_id'], how = 'left')
tickets_kpi['nb_tickets_internet'] = tickets_kpi['nb_tickets_internet'].fillna(0)
# tickets_kpi = tickets_kpi.merge(avg_amount, how='left', on= 'event_type_id')
return tickets_kpi
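Note on the column flattening in tickets_kpi_function above: after groupby().agg() with several aggregation functions, pandas returns MultiIndex columns, and tickets_kpi.columns.map('_'.join) is what produces names such as 'ticket_id_count' and the trailing underscore in 'customer_id_' that the rename then cleans up. A minimal, self-contained sketch on made-up data (illustration only, not part of the repository):

import pandas as pd

# Toy purchases table (made-up data)
df = pd.DataFrame({
    'customer_id': [1, 1, 2],
    'ticket_id':   [10, 11, 12],
    'amount':      [20.0, 15.0, 30.0],
})

# Mixing single and multiple aggregations yields MultiIndex columns such as
# ('ticket_id', 'count') and ('amount', 'sum'); ('customer_id', '') keeps an
# empty second level because it comes from reset_index().
kpi = (df.groupby('customer_id')
         .agg({'ticket_id': 'count', 'amount': ['sum', 'max']})
         .reset_index())

# '_'.join over each column tuple gives 'ticket_id_count', 'amount_sum', and
# the trailing underscore in 'customer_id_' that the script renames afterwards.
kpi.columns = kpi.columns.map('_'.join)
print(kpi.columns.tolist())
# ['customer_id_', 'ticket_id_count', 'amount_sum', 'amount_max']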

File diff suppressed because one or more lines are too long

@@ -6,13 +6,14 @@ import os
import s3fs
import re
import warnings
import time
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
# Import cleaning and merge functions
exec(open('0_Cleaning_and_merge_functions.py').read())
exec(open('utils_cleaning_and_merge.py').read())
# Output folder
BUCKET_OUT = "projet-bdc2324-team1"
@@ -20,15 +21,20 @@ BUCKET_OUT = "projet-bdc2324-team1"
# Ignore warning
warnings.filterwarnings('ignore')
start_all = time.time()
def export_dataset(df, output_name):
print('Exportation of dataset :', output_name)
print('Export of dataset :', output_name)
FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + output_name
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
df.to_csv(file_out, index = False)
## 1 - Cleaning of the datasets
for tenant_id in ("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "101"):
for tenant_id in ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14"]:#, "101"
# Timer
start = time.time()
# Cleaning customerplus
df1_customerplus_clean = preprocessing_customerplus(directory_path = tenant_id)
@@ -45,14 +51,22 @@ for tenant_id in ("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12",
## Exportation
export_dataset(df = df1_campaigns_information, output_name = "0_Input/Company_"+ tenant_id +"/campaigns_information.csv")
## Exportation
# export_dataset(df = df1_campaigns_information, output_name = "0_Temp/Company 1 - Campaigns dataset clean.csv")
if tenant_id == "101":
# Cleaning product area
products_purchased_reduced, products_purchased_reduced_1 = uniform_product_df(directory_path = tenant_id)
# Exportation
export_dataset(df = products_purchased_reduced, output_name = "0_Input/Company_"+ tenant_id +"/products_purchased_reduced.csv")
export_dataset(df = products_purchased_reduced_1, output_name = "0_Input/Company_"+ tenant_id +"/products_purchased_reduced_1.csv")
else :
# Cleaning product area
products_purchased_reduced = uniform_product_df(directory_path = tenant_id)
# Exportation
export_dataset(df = products_purchased_reduced, output_name = "0_Input/Company_"+ tenant_id +"/products_purchased_reduced.csv")
# Cleaning product area
df1_products_purchased_reduced = uniform_product_df(directory_path = tenant_id)
## Exportation
export_dataset(df = df1_products_purchased_reduced, output_name = "0_Input/Company_"+ tenant_id +"/products_purchased_reduced.csv")
#Exportation
# export_dataset(df = df1_products_purchased_reduced, output_name = "0_Temp/Company 1 - Purchases.csv")
# export_dataset(df = df1_products_purchased_reduced, output_name = "1_Temp/Company 1 - Purchases.csv")
print("Time to run the cleaning of company ", tenant_id , " : " ,time.time() - start)
print("\n ------------------------------------------------------------------ \n --------------------- END CLEANING COMPANY " + tenant_id + " --------------------- \n ------------------------------------------------------------------")
print("Time to run the cleaning of all used datasets : " , time.time() - start_all)

2_Datasets_Generation.py (new file, 176 lines)
@@ -0,0 +1,176 @@
# Purpose of the script : Construction of training and test datasets for modelling by company
# Input : KPI construction function and clean databases in the 0_Input folder
# Output : Train and test datasets by companies
# Packages
import pandas as pd
import numpy as np
import os
import s3fs
import re
import warnings
from datetime import date, timedelta, datetime
from sklearn.model_selection import train_test_split
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
# Import KPI construction functions
exec(open('utils_features_construction.py').read())
# Ignore warning
warnings.filterwarnings('ignore')
def dataset_construction(min_date, end_features_date, max_date, directory_path):
# Import of cleaned and merged datasets
df_customerplus_clean_0 = display_input_databases(directory_path, file_name = "customerplus_cleaned")
df_campaigns_information = display_input_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
df_products_purchased_reduced = display_input_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date'])
df_target_information = display_input_databases(directory_path, file_name = "target_information")
# Dates in datetime format
max_date = pd.to_datetime(max_date, utc = True, format = 'ISO8601')
end_features_date = pd.to_datetime(end_features_date, utc = True, format = 'ISO8601')
min_date = pd.to_datetime(min_date, utc = True, format = 'ISO8601')
# Filter for database df_campaigns_information
df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] < end_features_date) & (df_campaigns_information['sent_at'] >= min_date)]
df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')
# Filter for database df_products_purchased_reduced
df_products_purchased_features = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] < end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]
print("Data filtering : SUCCESS")
# Building and merging features
# Campaigns features
df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information, max_date = end_features_date)
# Purchasing behavior features
df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_features)
# Socio-demographic features
df_customerplus_clean = customerplus_kpi_function(customerplus_clean = df_customerplus_clean_0)
# Targets features
df_targets_kpi = targets_KPI(df_target = df_target_information)
print("KPIs construction : SUCCESS")
# Merge - campaigns features
df_customer = pd.merge(df_customerplus_clean, df_campaigns_kpi, on = 'customer_id', how = 'left')
# Fill NaN values
df_customer[['nb_campaigns', 'nb_campaigns_opened', 'taux_ouverture_mail']] = df_customer[['nb_campaigns', 'nb_campaigns_opened', 'taux_ouverture_mail']].fillna(0)
df_customer['time_to_open'] = df_customer['time_to_open'].fillna(df_customer['time_to_open'].mean())
# Merge - targets features
df_customer = pd.merge(df_customer, df_targets_kpi, on = 'customer_id', how = 'left')
# Fill NaN values
targets_columns = list(df_targets_kpi.columns)
targets_columns.remove('customer_id')
df_customer[targets_columns] = df_customer[targets_columns].fillna(0)
# We standardise the number of targets closely linked to the company's operations
df_customer['nb_targets'] = (df_customer['nb_targets'] - (df_customer['nb_targets'].mean())) / (df_customer['nb_targets'].std())
# Merge - purchasing behavior features
df_customer_product = pd.merge(df_customer, df_tickets_kpi, on = 'customer_id', how = 'left')
# Fill NaN values
special_fill_nan = ['customer_id', 'purchase_date_min', 'purchase_date_max', 'time_between_purchase']
simple_fill_nan = [column for column in list(df_tickets_kpi.columns) if column not in special_fill_nan]
df_customer_product[simple_fill_nan] = df_customer_product[simple_fill_nan].fillna(0)
max_interval = (end_features_date - min_date) / np.timedelta64(1, 'D') + 1
df_customer_product[['purchase_date_max', 'purchase_date_min']] = df_customer_product[['purchase_date_max', 'purchase_date_min']].fillna(max_interval)
df_customer_product[['time_between_purchase']] = df_customer_product[['time_between_purchase']].fillna(-1)
# Customers who have neither received an e-mail nor made a purchase during the feature estimation period are removed
df_customer_product = df_customer_product[(df_customer_product['nb_purchases'] > 0) | (df_customer_product['nb_campaigns'] > 0)]
print("Explanatory variable construction : SUCCESS")
# 2. Construction of the explained variable
df_products_purchased_to_predict = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] < max_date) & (df_products_purchased_reduced['purchase_date'] >= end_features_date)]
# Construction of the dependent variable
df_products_purchased_to_predict['y_has_purchased'] = 1
y = df_products_purchased_to_predict[['customer_id', 'y_has_purchased']].drop_duplicates()
print("Explained variable construction : SUCCESS")
# 3. Merge between explained and explanatory variables
dataset = pd.merge(df_customer_product, y, on = ['customer_id'], how = 'left')
# 0 if there is no purchase
dataset[['y_has_purchased']] = dataset[['y_has_purchased']].fillna(0)
# add id_company prefix to customer_id
dataset['customer_id'] = directory_path + '_' + dataset['customer_id'].astype('str')
return dataset
## Exportation
# Sectors
companies = {'musee' : ['1', '2', '3', '4'], # , '101'
'sport': ['5', '6', '7', '8', '9'],
'musique' : ['10', '11', '12', '13', '14']}
# Chosen sector
type_of_comp = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
list_of_comp = companies[type_of_comp]
# Export folder
BUCKET_OUT = f'projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_comp}'
# Dates used for the construction of features and the dependent variable
start_date = "2021-05-01"
end_of_features = "2022-11-01"
final_date = "2023-11-01"
# Anonymous customers to be deleted from the datasets
anonymous_customer = {'1' : '1_1', '2' : '2_12184', '3' : '3_1', '4' : '4_2', '101' : '101_1',
'5' : '5_191835', '6' : '6_591412', '7' : '7_49632', '8' : '8_1942', '9' : '9_19683',
'10' : '10_19521', '11' : '11_36', '12' : '12_1706757', '13' : '13_8422', '14' : '14_6354'}
for company in list_of_comp:
dataset = dataset_construction(min_date = start_date, end_features_date = end_of_features,
max_date = final_date, directory_path = company)
# Deletion of the anonymous customer
dataset = dataset[dataset['customer_id'] != anonymous_customer[company]]
# Split between train and test
dataset_train, dataset_test = train_test_split(dataset, test_size=0.3, random_state=42)
# Dataset Test
# Export
FILE_KEY_OUT_S3 = "dataset_test" + company + ".csv"
FILE_PATH_OUT_S3 = BUCKET_OUT + "/Test_set/" + FILE_KEY_OUT_S3
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
dataset_test.to_csv(file_out, index = False)
print("Export of dataset test : SUCCESS")
# Dataset train
# Export
FILE_KEY_OUT_S3 = "dataset_train" + company + ".csv"
FILE_PATH_OUT_S3 = BUCKET_OUT + "/Train_set/" + FILE_KEY_OUT_S3
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
dataset_train.to_csv(file_out, index = False)
print("Export of dataset train : SUCCESS")
print("End of dataset generation for ", type_of_comp," compagnies : SUCCESS")

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

3_Modelling_Datasets.py (new file, 68 lines)
@@ -0,0 +1,68 @@
# Business Data Challenge - Team 1
import pandas as pd
import numpy as np
import os
import s3fs
import re
import warnings
from datetime import date, timedelta, datetime
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
# Import KPI construction functions
exec(open('utils_features_construction.py').read())
# Ignore warning
warnings.filterwarnings('ignore')
# functions
def generate_test_set(type_of_comp):
file_path_list = fs.ls(f"projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_comp}/Test_set")
test_set = pd.DataFrame()
for file in file_path_list:
print(file)
with fs.open(file, mode="rb") as file_in:
df = pd.read_csv(file_in, sep=",")
test_set = pd.concat([test_set, df], ignore_index = True)
return test_set
def generate_train_set(type_of_comp):
file_path_list = fs.ls(f"projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_comp}/Train_set")
train_set = pd.DataFrame()
for file in file_path_list:
print(file)
with fs.open(file, mode="rb") as file_in:
df = pd.read_csv(file_in, sep=",")
train_set = pd.concat([train_set, df], ignore_index = True)
return train_set
type_of_comp = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
BUCKET_OUT = f'projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_comp}/'
# create test and train datasets
test_set = generate_test_set(type_of_comp)
train_set = generate_train_set(type_of_comp)
# Exportation test set
FILE_KEY_OUT_S3 = "Test_set.csv"
FILE_PATH_OUT_S3 = BUCKET_OUT + FILE_KEY_OUT_S3
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
test_set.to_csv(file_out, index = False)
print("Exportation dataset test : SUCCESS")
# Exportation train set
FILE_KEY_OUT_S3 = "Train_set.csv"
FILE_PATH_OUT_S3 = BUCKET_OUT + FILE_KEY_OUT_S3
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
train_set.to_csv(file_out, index = False)
print("Exportation dataset train : SUCCESS")

@@ -0,0 +1,82 @@
import pandas as pd
import numpy as np
import os
import io
import s3fs
import re
import warnings
from datetime import date, timedelta, datetime
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
# Ignore warning
warnings.filterwarnings('ignore')
exec(open('utils_features_construction.py').read())
exec(open('utils_stat_desc.py').read())
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
companies = {'musee' : ['1', '2', '3', '4'], # , '101'
'sport': ['5', '6', '7', '8', '9'],
'musique' : ['10', '11', '12', '13', '14']}
# type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
for type_of_activity in ['musee', 'sport', 'musique'] :
list_of_comp = companies[type_of_activity]
# Load files
customer, campaigns_kpi, campaigns_brut, tickets, products, targets = load_files(list_of_comp)
# Identify anonymous customer for each company and remove them from our datasets
outlier_list = outlier_detection(tickets, list_of_comp)
# Identify valid customer (customer who bought tickets after starting date or received mails after starting date)
customer_valid_list = valid_customer_detection(products, campaigns_brut)
databases = [customer, campaigns_kpi, campaigns_brut, tickets, products]
for dataset in databases:
dataset['customer_id'] = dataset['customer_id'].apply(lambda x: remove_elements(x, outlier_list))# remove outlier
dataset = dataset[dataset['customer_id'].isin(customer_valid_list)] # keep only valid customer
#print(f'shape of {dataset} : ', dataset.shape)
# Identify customer who bought during the period of y
customer_target_period = identify_purchase_during_target_periode(products)
customer['has_purchased_target_period'] = np.where(customer['customer_id'].isin(customer_target_period), 1, 0)
# Generate the graphs and automatically save them to the bucket
compute_nb_clients(customer, type_of_activity)
#maximum_price_paid(customer, type_of_activity)
target_proportion(customer, type_of_activity)
mailing_consent(customer, type_of_activity)
mailing_consent_by_target(customer, type_of_activity)
gender_bar(customer, type_of_activity)
country_bar(customer, type_of_activity)
lazy_customer_plot(campaigns_kpi, type_of_activity)
campaigns_effectiveness(customer, type_of_activity)
sale_dynamics(products, campaigns_brut, type_of_activity)
tickets_internet(tickets, type_of_activity)
already_bought_online(tickets, type_of_activity)
box_plot_price_tickets(tickets, type_of_activity)
target_description(targets, type_of_activity)

5_Modelling.py (new file, 87 lines)
@@ -0,0 +1,87 @@
import pandas as pd
import numpy as np
import os
import io
import s3fs
import re
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
from sklearn.utils import class_weight
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.calibration import calibration_curve
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.exceptions import ConvergenceWarning, DataConversionWarning
import pickle
import warnings
exec(open('utils_ml.py').read())
warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=DataConversionWarning)
# choose the type of companies for which you want to run the pipeline
type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
# choose the type of model
type_of_model = input('Choisissez le type de model : standard ? premium ?')
# load train and test set
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
dataset_train, dataset_test = load_train_test(type_of_activity, type_of_model)
X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)
print("Shape train : ", X_train.shape)
print("Shape test : ", X_test.shape)
# processing
weights = class_weight.compute_class_weight(class_weight = 'balanced', classes = np.unique(y_train['y_has_purchased']),
y = y_train['y_has_purchased'])
weight_dict = {np.unique(y_train['y_has_purchased'])[i]: weights[i] for i in range(len(np.unique(y_train['y_has_purchased'])))}
preproc = preprocess(type_of_model, type_of_activity)
# Object for storing results
model_result = pd.DataFrame(columns= ["Model", "Accuracy", "Recall", "F1_score", "AUC"])
# Naive Bayes
model_result = pipeline_naiveBayes_benchmark(X_train, y_train, X_test, y_test, model_result)
save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model)
print("Naive Bayes : Done")
# Logistic Regression
model_result = pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result)
print("Logistic : Done")
model_result = pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result)
save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model)
print("Logistic CV : Done")
# Random Forest
model_result = pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result)
save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model)
print("Random Forest : Done")
model_result = pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result)
save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model)
print("Random Forest CV: Done")
# Save result
save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model)

@@ -0,0 +1,86 @@
# Packages
import pandas as pd
import numpy as np
import os
import io
import s3fs
import re
import pickle
import warnings
import matplotlib.pyplot as plt
from tabulate import tabulate
###################################
# choose the model we use for the segmentation
# model_name = "LogisticRegression_Benchmark"
model_name = "LogisticRegression_cv"
###################################
# execute file including functions we need
exec(open('utils_segmentation.py').read())
warnings.filterwarnings('ignore')
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
# choose the type of companies for which you want to run the pipeline
# type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
for type_of_activity in ['musee', 'sport', 'musique'] :
# load test set
dataset_test = load_test_file(type_of_activity)
# Load Model
model = load_model(type_of_activity, model_name)
### Preprocessing of data
X_test = dataset_test.drop(columns = 'y_has_purchased')
y_test = dataset_test[['y_has_purchased']]
X_test_segment = X_test
# add y_has_purchased to X_test
X_test_segment["has_purchased"] = y_test
# Add prediction and probability to dataset_test
y_pred = model.predict(X_test)
X_test_segment["has_purchased_estim"] = y_pred
y_pred_prob = model.predict_proba(X_test)[:, 1]
X_test_segment['score'] = y_pred_prob
X_test_segment["segment"] = np.where(X_test_segment['score']<0.25, '1',
np.where(X_test_segment['score']<0.5, '2',
np.where(X_test_segment['score']<0.75, '3', '4')))
### 1. business KPIs
business_var = ["nb_tickets", "nb_purchases", "total_amount", "nb_campaigns"]
X_test_business_fig = df_business_fig(X_test_segment, "segment", business_var)
print(f"business figures for {type_of_activity} companies :\n")
print(X_test_business_fig)
print("\n")
# save histogram to Minio
hist_segment_business_KPIs(X_test_business_fig, "segment", "size", "nb_tickets",
"nb_purchases", "total_amount", "nb_campaigns", type_of_activity)
save_file_s3_mp(File_name = "segments_business_KPI_", type_of_activity = type_of_activity)
### 2. description of marketing personae
## A. Spider chart
radar_mp_plot_all(df = X_test_segment, type_of_activity = type_of_activity)
save_file_s3_mp(File_name = "spider_chart_all_", type_of_activity = type_of_activity)
## B. Latex table
known_sociodemo_caracteristics(df = X_test_segment, type_of_activity = type_of_activity)
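The nested np.where above assigns customers to four segments at fixed score thresholds (0.25, 0.5, 0.75). For reference, an equivalent formulation with pd.cut on made-up scores (illustration only, not part of the repository's code):

import numpy as np
import pandas as pd

# Made-up predicted purchase probabilities
scores = pd.Series([0.05, 0.30, 0.55, 0.90])

# Left-closed bins [0, .25), [.25, .5), [.5, .75), [.75, inf) reproduce the
# nested np.where thresholds, including the boundary handling.
segments = pd.cut(scores, bins=[0, 0.25, 0.5, 0.75, np.inf],
                  right=False, labels=['1', '2', '3', '4'])
print(segments.tolist())  # ['1', '2', '3', '4']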

7_Sales_Forecast.py (new file, 112 lines)
@@ -0,0 +1,112 @@
# imports
import pandas as pd
from pandas import DataFrame
import numpy as np
import os
import s3fs
import matplotlib.pyplot as plt
from scipy.optimize import fsolve
import pickle
import warnings
import io
# ignore warnings
warnings.filterwarnings('ignore')
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
# import the functions defined in utils_sales_forecast.py
exec(open('utils_sales_forecast.py').read())
# from utils_CA_segment import *
# define type of activity
type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
PATH = f"projet-bdc2324-team1/2_Output/2_3_Sales_Forecast/{type_of_activity}/"
# type of model for the score
type_of_model = "LogisticRegression_cv"
# type_of_model = "LogisticRegression_Benchmark"
# load train and test sets
dataset_train, dataset_test = load_train_test(type_of_activity)
# make features - define X train and X test
X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)
# choose model - logit cross validated
model = load_model(type_of_activity, type_of_model)
# create table X test segment from X test
X_test_segment = df_segment(X_test, y_test, model)
# comparison with bias of the train set - X train to be defined
X_train_score = model.predict_proba(X_train)[:, 1]
bias_train_set = find_bias(odd_ratios = odd_ratio(adjust_score_1(X_train_score)),
y_objective = y_train["y_has_purchased"].sum(),
initial_guess=10)
print("Bias estimated :", np.log(bias_train_set))
# create a score adjusted with the bias computed
score_adjusted_train = adjusted_score(odd_ratio(adjust_score_1(X_test_segment["score"])), bias = bias_train_set)
X_test_segment["score_adjusted"] = score_adjusted_train
print("The score was successfully adjusted")
MAE_score = abs(X_test_segment["score"]-X_test_segment["has_purchased"]).mean()
MAE_ajusted_score = abs(X_test_segment["score_adjusted"]-X_test_segment["has_purchased"]).mean()
print(f"MAE for score : {MAE_score}")
print(f"MAE for adjusted score : {MAE_ajusted_score}")
### 1. plot adjusted scores and save (to be tested)
plot_hist_scores(X_test_segment, score = "score", score_adjusted = "score_adjusted", type_of_activity = type_of_activity)
save_file_s3_ca("hist_score_adjusted_", type_of_activity)
### 2. comparison between score and adjusted score
X_test_table_adjusted_scores = (100 * X_test_segment.groupby("quartile")[["score","score_adjusted", "has_purchased"]].mean()).round(2).reset_index()
X_test_table_adjusted_scores = X_test_table_adjusted_scores.rename(columns = {col : f"{col} (%)" for col in X_test_table_adjusted_scores.columns if col in ["score","score_adjusted", "has_purchased"]})
print("Table of scores :\n")
print(X_test_table_adjusted_scores)
print("\n")
# save table
file_name = "table_adjusted_score_"
FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + ".csv"
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
X_test_table_adjusted_scores.to_csv(file_out, index = False)
# project revenue
X_test_segment = project_tickets_CA (X_test_segment, "nb_purchases", "nb_tickets", "total_amount", "score_adjusted",
duration_ref=17, duration_projection=12)
### 3. table summarizing projections (nb tickets, revenue)
"""
X_test_expected_CA = round(summary_expected_CA(df=X_test_segment, segment="quartile",
nb_tickets_expected="nb_tickets_expected", total_amount_expected="total_amount_expected",
total_amount="total_amount", pace_purchase="pace_purchase"),2)
"""
X_test_expected_CA = round(summary_expected_CA(df=X_test_segment, segment="quartile",
nb_tickets_expected="nb_tickets_expected", total_amount_expected="total_amount_expected",
total_amount="total_amount_corrected", pace_purchase="pace_purchase"),2)
# rename columns
mapping_dict = {col: col.replace("perct", "(%)").replace("_", " ") for col in X_test_expected_CA.columns}
X_test_expected_CA = X_test_expected_CA.rename(columns=mapping_dict)
print("Summary of forecast :\n")
print(X_test_expected_CA)
print("\n")
# save table
file_name = "table_expected_CA_"
FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + ".csv"
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
X_test_expected_CA.to_csv(file_out, index = False)
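The bias-correction block above calls odd_ratio, adjust_score_1, find_bias and adjusted_score, which are defined in utils_sales_forecast.py and are not shown in this diff. Purely as a speculative sketch of how such an odds-scaling recalibration could be written (matching the call signatures used above, but not the repository's implementation):

# Speculative sketch, not the repository's utils_sales_forecast.py
import numpy as np
from scipy.optimize import fsolve

def odd_ratio(p):
    # probability -> odds
    return p / (1 - p)

def adjust_score_1(p, eps=1e-6):
    # keep probabilities strictly below 1 so the odds stay finite
    return np.minimum(p, 1 - eps)

def adjusted_score(odds, bias):
    # scale the odds by a constant factor, then map back to a probability
    return (bias * odds) / (1 + bias * odds)

def find_bias(odd_ratios, y_objective, initial_guess=10):
    # choose the factor so that the adjusted scores sum to the observed
    # number of buyers on the training set
    equation = lambda b: adjusted_score(odd_ratios, b).sum() - y_objective
    return fsolve(equation, initial_guess)[0]

# Tiny made-up example: raw scores that over-predict purchases
train_scores = np.array([0.9, 0.8, 0.7, 0.6, 0.2])
bias = find_bias(odd_ratio(adjust_score_1(train_scores)), y_objective=1.0)
print(adjusted_score(odd_ratio(adjust_score_1(train_scores)), bias).sum())  # ~1.0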

File diff suppressed because it is too large.

File diff suppressed because one or more lines are too long

@@ -1,825 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "aa74dbe0-f974-4b5c-94f4-4dba9fbc64fa",
"metadata": {},
"source": [
"# Business Data Challenge - Team 1"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "94c498e7-7c50-45f9-b3f4-a1ab19b7ccc4",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"\n"
]
},
{
"cell_type": "markdown",
"id": "7a3b50ac-b1ff-4f3d-9938-e048fdc8e027",
"metadata": {},
"source": [
"Configuration de l'accès aux données"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "0b029d42-fb02-481e-a407-7e41886198a6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['bdc2324-data/1',\n",
" 'bdc2324-data/10',\n",
" 'bdc2324-data/101',\n",
" 'bdc2324-data/11',\n",
" 'bdc2324-data/12',\n",
" 'bdc2324-data/13',\n",
" 'bdc2324-data/14',\n",
" 'bdc2324-data/2',\n",
" 'bdc2324-data/3',\n",
" 'bdc2324-data/4',\n",
" 'bdc2324-data/5',\n",
" 'bdc2324-data/6',\n",
" 'bdc2324-data/7',\n",
" 'bdc2324-data/8',\n",
" 'bdc2324-data/9']"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import os\n",
"import s3fs\n",
"# Create filesystem object\n",
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n",
"\n",
"BUCKET = \"bdc2324-data\"\n",
"fs.ls(BUCKET)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "fbaf9aa7-ff70-4dbe-a969-b801c593510b",
"metadata": {},
"outputs": [],
"source": [
"# Chargement des fichiers campaign_stats.csv\n",
"FILE_PATH_S3 = 'bdc2324-data/1/1campaign_stats.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" campaign_stats_1 = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"FILE_PATH_S3 = 'bdc2324-data/2/2campaign_stats.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" campaign_stats_2 = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"FILE_PATH_S3 = 'bdc2324-data/3/3campaign_stats.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" campaign_stats_3 = pd.read_csv(file_in, sep=\",\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "1e0418bc-8e97-4a04-b7f3-bda3bef7d36e",
"metadata": {},
"outputs": [],
"source": [
"# Conversion des dates 'sent_at'\n",
"campaign_stats_1['sent_at'] = pd.to_datetime(campaign_stats_1['sent_at'], format = 'ISO8601', utc = True)\n",
"campaign_stats_2['sent_at'] = pd.to_datetime(campaign_stats_2['sent_at'], format = 'ISO8601', utc = True)\n",
"campaign_stats_3['sent_at'] = pd.to_datetime(campaign_stats_3['sent_at'], format = 'ISO8601', utc = True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "cc5c20ba-e827-4e5a-97a5-7f3947e0621c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2023-11-09 18:10:45+00:00\n",
"2020-06-02 08:24:08+00:00\n",
"2023-10-12 01:39:48+00:00\n",
"2023-10-10 17:06:29+00:00\n",
"2023-11-01 09:20:48+00:00\n",
"2021-03-31 14:59:02+00:00\n"
]
}
],
"source": [
"# Chaque unites correspond à une période ? --> Non, les dossiers ont juste pour but de réduire la taille des fichiers\n",
"print(campaign_stats_1['sent_at'].max())\n",
"print(campaign_stats_1['sent_at'].min())\n",
"\n",
"print(campaign_stats_2['sent_at'].max())\n",
"print(campaign_stats_2['sent_at'].min())\n",
"\n",
"print(campaign_stats_3['sent_at'].max())\n",
"print(campaign_stats_3['sent_at'].min())"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "c75632df-b018-4bb8-a99d-83f15af94369",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 2021-03-28 16:01:09+00:00\n",
"1 2021-03-28 16:01:09+00:00\n",
"2 2021-03-28 16:00:59+00:00\n",
"3 2021-03-28 16:00:59+00:00\n",
"4 2021-03-28 16:01:06+00:00\n",
" ... \n",
"6214803 2023-10-23 09:32:33+00:00\n",
"6214804 2023-10-23 09:32:49+00:00\n",
"6214805 2023-10-23 09:33:28+00:00\n",
"6214806 2023-10-23 09:31:53+00:00\n",
"6214807 2023-10-23 09:33:54+00:00\n",
"Name: sent_at, Length: 6214808, dtype: datetime64[ns, UTC]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"campaign_stats_1['sent_at']"
]
},
{
"cell_type": "markdown",
"id": "f4c0c63e-0418-4cfe-a57d-7af57bca0c22",
"metadata": {},
"source": [
"### Customersplus.csv"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "d3bf880d-1065-4d5b-9954-1830aa5081af",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_1362/4118060109.py:9: DtypeWarning: Columns (20) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" customers_plus_2 = pd.read_csv(file_in, sep=\",\")\n"
]
}
],
"source": [
"FILE_PATH_S3 = 'bdc2324-data/1/1customersplus.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" customers_plus_1 = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"FILE_PATH_S3 = 'bdc2324-data/2/2customersplus.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" customers_plus_2 = pd.read_csv(file_in, sep=\",\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "7368f381-db8e-4a4d-9fe2-5947eb55be58",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['id', 'lastname', 'firstname', 'birthdate', 'email', 'street_id',\n",
" 'created_at', 'updated_at', 'civility', 'is_partner', 'extra',\n",
" 'deleted_at', 'reference', 'gender', 'is_email_true', 'extra_field',\n",
" 'identifier', 'opt_in', 'structure_id', 'note', 'profession',\n",
" 'language', 'mcp_contact_id', 'need_reload', 'last_buying_date',\n",
" 'max_price', 'ticket_sum', 'average_price', 'fidelity',\n",
" 'average_purchase_delay', 'average_price_basket',\n",
" 'average_ticket_basket', 'total_price', 'preferred_category',\n",
" 'preferred_supplier', 'preferred_formula', 'purchase_count',\n",
" 'first_buying_date', 'last_visiting_date', 'zipcode', 'country', 'age',\n",
" 'tenant_id'],\n",
" dtype='object')"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"customers_plus_1.columns"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "08091935-b159-47fa-806c-e1444f3b227e",
"metadata": {},
"outputs": [],
"source": [
"customers_plus_1.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9f8c8868-c1ac-4cee-af08-533d928f6764",
"metadata": {},
"outputs": [],
"source": [
"customers_plus_1['id'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bf95daf2-4852-4718-b474-207a1ebd8ac4",
"metadata": {},
"outputs": [],
"source": [
"customers_plus_2['id'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1425c385-3216-4e4f-ae8f-a121624721ba",
"metadata": {},
"outputs": [],
"source": [
"common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])"
]
},
{
"cell_type": "code",
"execution_count": 61,
"id": "92533026-e27c-4f1f-81ca-64eda32a34c0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])\n",
"# Exemple id commun = caractéristiques communes\n",
"print(customers_plus_2[customers_plus_2['id'] == list(common_id)[0]])\n",
"\n",
"print(customers_plus_1[customers_plus_1['id'] == list(common_id)[0]])"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "bf9ebc94-0ba6-443d-8e53-22477a6e79a7",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id 0.000000\n",
"lastname 43.461341\n",
"firstname 44.995588\n",
"birthdate 96.419870\n",
"email 8.622075\n",
"street_id 0.000000\n",
"created_at 0.000000\n",
"updated_at 0.000000\n",
"civility 100.000000\n",
"is_partner 0.000000\n",
"extra 100.000000\n",
"deleted_at 100.000000\n",
"reference 100.000000\n",
"gender 0.000000\n",
"is_email_true 0.000000\n",
"extra_field 100.000000\n",
"identifier 0.000000\n",
"opt_in 0.000000\n",
"structure_id 88.072380\n",
"note 99.403421\n",
"profession 95.913503\n",
"language 99.280945\n",
"mcp_contact_id 34.876141\n",
"need_reload 0.000000\n",
"last_buying_date 51.653431\n",
"max_price 51.653431\n",
"ticket_sum 0.000000\n",
"average_price 8.639195\n",
"fidelity 0.000000\n",
"average_purchase_delay 51.653431\n",
"average_price_basket 51.653431\n",
"average_ticket_basket 51.653431\n",
"total_price 43.014236\n",
"preferred_category 100.000000\n",
"preferred_supplier 100.000000\n",
"preferred_formula 100.000000\n",
"purchase_count 0.000000\n",
"first_buying_date 51.653431\n",
"last_visiting_date 100.000000\n",
"zipcode 71.176564\n",
"country 5.459418\n",
"age 96.419870\n",
"tenant_id 0.000000\n",
"dtype: float64\n"
]
}
],
"source": [
"pd.DataFrame(customers_plus_1.isna().mean()*100)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "6d62e73f-3925-490f-9fd4-d0e838903cb2",
"metadata": {},
"outputs": [],
"source": [
"# Chargement de toutes les données\n",
"liste_base = ['customer_target_mappings', 'customersplus', 'target_types', 'tags', 'events', 'tickets', 'representations', 'purchases', 'products']\n",
"\n",
"for nom_base in liste_base:\n",
" FILE_PATH_S3 = 'bdc2324-data/11/11' + nom_base + '.csv'\n",
" with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" globals()[nom_base] = pd.read_csv(file_in, sep=\",\")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "12b24f1c-eb3e-45be-aaf3-b9273180caa3",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>lastname</th>\n",
" <th>firstname</th>\n",
" <th>birthdate</th>\n",
" <th>email</th>\n",
" <th>street_id</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>civility</th>\n",
" <th>is_partner</th>\n",
" <th>...</th>\n",
" <th>tenant_id</th>\n",
" <th>id_x</th>\n",
" <th>customer_id</th>\n",
" <th>purchase_date</th>\n",
" <th>type_of</th>\n",
" <th>is_from_subscription</th>\n",
" <th>amount</th>\n",
" <th>is_full_price</th>\n",
" <th>start_date_time</th>\n",
" <th>event_name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>405082</td>\n",
" <td>lastname405082</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>6</td>\n",
" <td>2023-01-12 06:30:31.197484+01:00</td>\n",
" <td>2023-01-12 06:30:31.197484+01:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>992423</td>\n",
" <td>405082</td>\n",
" <td>2023-01-11 17:08:41+01:00</td>\n",
" <td>3</td>\n",
" <td>False</td>\n",
" <td>13.0</td>\n",
" <td>False</td>\n",
" <td>2023-02-06 20:00:00+01:00</td>\n",
" <td>zaide</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>405082</td>\n",
" <td>lastname405082</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>6</td>\n",
" <td>2023-01-12 06:30:31.197484+01:00</td>\n",
" <td>2023-01-12 06:30:31.197484+01:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>992423</td>\n",
" <td>405082</td>\n",
" <td>2023-01-11 17:08:41+01:00</td>\n",
" <td>3</td>\n",
" <td>False</td>\n",
" <td>13.0</td>\n",
" <td>False</td>\n",
" <td>2023-02-06 20:00:00+01:00</td>\n",
" <td>zaide</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>411168</td>\n",
" <td>lastname411168</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>6</td>\n",
" <td>2023-03-17 06:30:35.431967+01:00</td>\n",
" <td>2023-03-17 06:30:35.431967+01:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>1053934</td>\n",
" <td>411168</td>\n",
" <td>2023-03-16 16:23:10+01:00</td>\n",
" <td>3</td>\n",
" <td>False</td>\n",
" <td>62.0</td>\n",
" <td>False</td>\n",
" <td>2023-03-19 16:00:00+01:00</td>\n",
" <td>luisa miller</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>411168</td>\n",
" <td>lastname411168</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>6</td>\n",
" <td>2023-03-17 06:30:35.431967+01:00</td>\n",
" <td>2023-03-17 06:30:35.431967+01:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>1053934</td>\n",
" <td>411168</td>\n",
" <td>2023-03-16 16:23:10+01:00</td>\n",
" <td>3</td>\n",
" <td>False</td>\n",
" <td>62.0</td>\n",
" <td>False</td>\n",
" <td>2023-03-19 16:00:00+01:00</td>\n",
" <td>luisa miller</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4380</td>\n",
" <td>lastname4380</td>\n",
" <td>firstname4380</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1</td>\n",
" <td>2021-04-22 14:51:55.432952+02:00</td>\n",
" <td>2022-04-14 11:41:33.738500+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>1189141</td>\n",
" <td>4380</td>\n",
" <td>2020-11-26 13:12:53+01:00</td>\n",
" <td>3</td>\n",
" <td>False</td>\n",
" <td>51.3</td>\n",
" <td>False</td>\n",
" <td>2020-12-01 20:00:00+01:00</td>\n",
" <td>iphigenie en tauride</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318964</th>\n",
" <td>19095</td>\n",
" <td>lastname19095</td>\n",
" <td>firstname19095</td>\n",
" <td>1979-07-16</td>\n",
" <td>email19095</td>\n",
" <td>6</td>\n",
" <td>2021-04-22 15:06:30.120537+02:00</td>\n",
" <td>2023-09-12 18:27:36.904104+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>1090839</td>\n",
" <td>19095</td>\n",
" <td>2019-05-19 21:18:36+02:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>4.5</td>\n",
" <td>False</td>\n",
" <td>2019-05-27 20:00:00+02:00</td>\n",
" <td>entre femmes</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318965</th>\n",
" <td>19095</td>\n",
" <td>lastname19095</td>\n",
" <td>firstname19095</td>\n",
" <td>1979-07-16</td>\n",
" <td>email19095</td>\n",
" <td>6</td>\n",
" <td>2021-04-22 15:06:30.120537+02:00</td>\n",
" <td>2023-09-12 18:27:36.904104+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>1090839</td>\n",
" <td>19095</td>\n",
" <td>2019-05-19 21:18:36+02:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>4.5</td>\n",
" <td>False</td>\n",
" <td>2019-05-27 20:00:00+02:00</td>\n",
" <td>entre femmes</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318966</th>\n",
" <td>19095</td>\n",
" <td>lastname19095</td>\n",
" <td>firstname19095</td>\n",
" <td>1979-07-16</td>\n",
" <td>email19095</td>\n",
" <td>6</td>\n",
" <td>2021-04-22 15:06:30.120537+02:00</td>\n",
" <td>2023-09-12 18:27:36.904104+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>1090839</td>\n",
" <td>19095</td>\n",
" <td>2019-05-19 21:18:36+02:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>4.5</td>\n",
" <td>False</td>\n",
" <td>2019-05-27 20:00:00+02:00</td>\n",
" <td>entre femmes</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318967</th>\n",
" <td>19095</td>\n",
" <td>lastname19095</td>\n",
" <td>firstname19095</td>\n",
" <td>1979-07-16</td>\n",
" <td>email19095</td>\n",
" <td>6</td>\n",
" <td>2021-04-22 15:06:30.120537+02:00</td>\n",
" <td>2023-09-12 18:27:36.904104+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>1244277</td>\n",
" <td>19095</td>\n",
" <td>2019-12-31 11:04:07+01:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>5.5</td>\n",
" <td>False</td>\n",
" <td>2020-02-03 20:00:00+01:00</td>\n",
" <td>a boire et a manger</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318968</th>\n",
" <td>19095</td>\n",
" <td>lastname19095</td>\n",
" <td>firstname19095</td>\n",
" <td>1979-07-16</td>\n",
" <td>email19095</td>\n",
" <td>6</td>\n",
" <td>2021-04-22 15:06:30.120537+02:00</td>\n",
" <td>2023-09-12 18:27:36.904104+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>1244277</td>\n",
" <td>19095</td>\n",
" <td>2019-12-31 11:04:07+01:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>5.5</td>\n",
" <td>False</td>\n",
" <td>2020-02-03 20:00:00+01:00</td>\n",
" <td>a boire et a manger</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>318969 rows × 52 columns</p>\n",
"</div>"
],
"text/plain": [
" id lastname firstname birthdate email \\\n",
"0 405082 lastname405082 NaN NaN NaN \n",
"1 405082 lastname405082 NaN NaN NaN \n",
"2 411168 lastname411168 NaN NaN NaN \n",
"3 411168 lastname411168 NaN NaN NaN \n",
"4 4380 lastname4380 firstname4380 NaN NaN \n",
"... ... ... ... ... ... \n",
"318964 19095 lastname19095 firstname19095 1979-07-16 email19095 \n",
"318965 19095 lastname19095 firstname19095 1979-07-16 email19095 \n",
"318966 19095 lastname19095 firstname19095 1979-07-16 email19095 \n",
"318967 19095 lastname19095 firstname19095 1979-07-16 email19095 \n",
"318968 19095 lastname19095 firstname19095 1979-07-16 email19095 \n",
"\n",
" street_id created_at \\\n",
"0 6 2023-01-12 06:30:31.197484+01:00 \n",
"1 6 2023-01-12 06:30:31.197484+01:00 \n",
"2 6 2023-03-17 06:30:35.431967+01:00 \n",
"3 6 2023-03-17 06:30:35.431967+01:00 \n",
"4 1 2021-04-22 14:51:55.432952+02:00 \n",
"... ... ... \n",
"318964 6 2021-04-22 15:06:30.120537+02:00 \n",
"318965 6 2021-04-22 15:06:30.120537+02:00 \n",
"318966 6 2021-04-22 15:06:30.120537+02:00 \n",
"318967 6 2021-04-22 15:06:30.120537+02:00 \n",
"318968 6 2021-04-22 15:06:30.120537+02:00 \n",
"\n",
" updated_at civility is_partner ... \\\n",
"0 2023-01-12 06:30:31.197484+01:00 NaN False ... \n",
"1 2023-01-12 06:30:31.197484+01:00 NaN False ... \n",
"2 2023-03-17 06:30:35.431967+01:00 NaN False ... \n",
"3 2023-03-17 06:30:35.431967+01:00 NaN False ... \n",
"4 2022-04-14 11:41:33.738500+02:00 NaN False ... \n",
"... ... ... ... ... \n",
"318964 2023-09-12 18:27:36.904104+02:00 NaN False ... \n",
"318965 2023-09-12 18:27:36.904104+02:00 NaN False ... \n",
"318966 2023-09-12 18:27:36.904104+02:00 NaN False ... \n",
"318967 2023-09-12 18:27:36.904104+02:00 NaN False ... \n",
"318968 2023-09-12 18:27:36.904104+02:00 NaN False ... \n",
"\n",
" tenant_id id_x customer_id purchase_date type_of \\\n",
"0 1556 992423 405082 2023-01-11 17:08:41+01:00 3 \n",
"1 1556 992423 405082 2023-01-11 17:08:41+01:00 3 \n",
"2 1556 1053934 411168 2023-03-16 16:23:10+01:00 3 \n",
"3 1556 1053934 411168 2023-03-16 16:23:10+01:00 3 \n",
"4 1556 1189141 4380 2020-11-26 13:12:53+01:00 3 \n",
"... ... ... ... ... ... \n",
"318964 1556 1090839 19095 2019-05-19 21:18:36+02:00 1 \n",
"318965 1556 1090839 19095 2019-05-19 21:18:36+02:00 1 \n",
"318966 1556 1090839 19095 2019-05-19 21:18:36+02:00 1 \n",
"318967 1556 1244277 19095 2019-12-31 11:04:07+01:00 1 \n",
"318968 1556 1244277 19095 2019-12-31 11:04:07+01:00 1 \n",
"\n",
" is_from_subscription amount is_full_price start_date_time \\\n",
"0 False 13.0 False 2023-02-06 20:00:00+01:00 \n",
"1 False 13.0 False 2023-02-06 20:00:00+01:00 \n",
"2 False 62.0 False 2023-03-19 16:00:00+01:00 \n",
"3 False 62.0 False 2023-03-19 16:00:00+01:00 \n",
"4 False 51.3 False 2020-12-01 20:00:00+01:00 \n",
"... ... ... ... ... \n",
"318964 False 4.5 False 2019-05-27 20:00:00+02:00 \n",
"318965 False 4.5 False 2019-05-27 20:00:00+02:00 \n",
"318966 False 4.5 False 2019-05-27 20:00:00+02:00 \n",
"318967 False 5.5 False 2020-02-03 20:00:00+01:00 \n",
"318968 False 5.5 False 2020-02-03 20:00:00+01:00 \n",
"\n",
" event_name \n",
"0 zaide \n",
"1 zaide \n",
"2 luisa miller \n",
"3 luisa miller \n",
"4 iphigenie en tauride \n",
"... ... \n",
"318964 entre femmes \n",
"318965 entre femmes \n",
"318966 entre femmes \n",
"318967 a boire et a manger \n",
"318968 a boire et a manger \n",
"\n",
"[318969 rows x 52 columns]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Jointure\n",
"merge_1 = pd.merge(purchases, tickets, left_on='id', right_on='purchase_id', how='inner')[['id_x', 'customer_id','product_id', 'purchase_date', 'type_of', 'is_from_subscription']]\n",
"merge_2 = pd.merge(products, merge_1, left_on='id', right_on='product_id', how='inner')[['id_x', 'customer_id', 'representation_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price']]\n",
"merge_3 = pd.merge(representations, merge_2, left_on='id', right_on='representation_id', how='inner')[['id_x', 'customer_id', 'event_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time']]\n",
"merge_4 = pd.merge(events, merge_3, left_on='id', right_on='event_id', how='inner')[['id_x', 'customer_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'name']]\n",
"merge_4 = merge_4.rename(columns={'name': 'event_name'})\n",
"df_customer_event = pd.merge(customersplus, merge_4, left_on = 'id', right_on = 'customer_id', how = 'inner')[['id_x', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'event_name']]\n",
"df_customer_event"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

69
README.md Normal file
View File

@@ -0,0 +1,69 @@
# Business data challenge 2023-2024 | ENSAE Paris
# Arenametrix: customer segmentation
<p align="center">
<img src="https://dev.arenametrix.fr/assets/logo_ax-806e8204f49bcc2c5e8cd34e9748d16a6038404e37fdb2dc9d61455bb06c6461.png" width=300>
</p>
## Team 1
* Antoine JOUBREL
* Alexis REVELLE
* Fanta RODRIGUE
* Thomas PIQUÉ
## Coaches
* Elia LAPENTA
* Michael VISSER
## Support team
* Patrice MICHEL (Datastorm)
* Hassan MAISSORO (Datastorm)
* Alexandre PRINC (Arenametrix)
## Microeconomics coordinator
* Yuanzhe TANG
### Description of the problem
The goal of this project is to create customer segments for 15 companies belonging to 3 different types of activities (sports companies, museums, and music companies).
### More detailed instructions provided by Arenametrix
- Definition of “marketing personae” that can be matched with a probability of buying a future event
- Matching between future events and people in the database (with, for instance, a probability of buying a future event)
- And thus, a forecast of the quantity of tickets sold per event, by “marketing persona” or by segment of the database
- BONUS: What is the best timing to send a communication to each contact in the database and each “marketing persona”?
- BONUS: What should we tell each contact in the database and each “marketing persona” to make them come back?
### Our approach
We opted for a sector-based approach, which means that 3 segmentations have been performed (one for each type of activity).
As the segments have to be linked to a probability of future purchase, we build them directly from the estimated probability of purchasing during the coming year. The first step of the modeling is a pipeline that fits 3 ML models (naive Bayes, random forest, and logistic regression) on the data to predict whether a customer will purchase during the year. We then use the estimated purchase probability to split the customers into 4 segments. For each segment, we can estimate the potential number of tickets sold and the revenue for the coming year.
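As a minimal illustration (not the project's exact pipeline, which also fits naive Bayes and random forest models with cross-validation), the scoring-and-segmentation step could look like the sketch below; the target column `y_has_purchased` appears in the project code, while the feature list, the preprocessing steps and the quartile-based split are assumptions made for this example.
```python
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

def score_and_segment(train: pd.DataFrame, test: pd.DataFrame, features: list[str]) -> pd.DataFrame:
    """Fit a purchase-propensity model on the train set and split test customers into 4 segments."""
    model = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value=0)),
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=5000, class_weight="balanced")),
    ])
    model.fit(train[features], train["y_has_purchased"])

    scored = test.copy()
    # Propensity score = estimated probability of purchasing during the coming year
    scored["score"] = model.predict_proba(test[features])[:, 1]
    # One possible segmentation rule: quartiles of the propensity score
    scored["segment"] = pd.qcut(scored["score"], q=4, labels=[1, 2, 3, 4])
    return scored
```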
### How to run the code
Scripts must be run in the order given by their numbers. Each of them is described below:
- `1_Input_cleaning.py` \
Clean raw data and generate dataframes that will be used to build datasets with insightful variables. Datasets are exported to location 0_Input/.
- `2_Datasets_generation.py` \
Use the dataframes previously created and aggregate them to build the test and train sets for each company. Databases are exported to location 1_Temp/1_0_Modelling_Datasets/ in a folder containing all 5 databases for a type of activity.
- `3_Modelling_datasets.py` \
For each type of activity, the test and train sets of the 5 tenants are concatenated. Databases are exported to location 1_Temp/1_0_Modelling_Datasets/.
- `4_Descriptive_statistics.py` \
Generate graphics providing some descriptive statistics about the data at the activity level. All graphics are exported to location 2_Output/2_0_Descriptive_Statistics/.
- `5_Modelling.py` \
3 ML models are fitted on the data, and the results are exported for all 3 types of activities. \
3 pipelines are built, one per type of model (Naive Bayes, Random Forest, Logistic Regression). For the latter 2 methods, cross-validation is performed to ensure generalization. Graphics displaying the quality of the training are provided. The optimal parameters found are saved in a pickle file (used in the 6th step to add propensity scores to the test set and then determine the customer segments). All these files are exported to location 2_Output/2_1_Modeling_results/
- `6_Segmentation_and_Marketing_Personae.py` \
The model with the optimal parameters computed previously is applied to the test set, and a propensity score (probability of a future purchase) is assigned to each customer of this dataset. Segmentation is performed according to these scores. Graphics describing the marketing personae associated with the segments, as well as their business value, are exported to location 2_Output/2_2_Segmentation_and_Marketing_Personae/.
- `7_Sales_Forecast.py` \
To ensure a decent recall, and because the target variable y is imbalanced (the overall probability of purchase lies between 4 and 14 %), the predicted purchase probabilities are overestimated. The scores are therefore adjusted so that their mean approximates the overall probability of a purchase (see the sketch below). This adjusted score is used to estimate, for each customer, the number of tickets sold and the revenue generated during the coming year. Results are aggregated at the segment level. A histogram displaying the adjusted propensity scores and 2 tables summarizing the forecast outcome are exported to location 2_Output/2_3_Sales_Forecast/.
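For illustration only (the exact implementation lives in `7_Sales_Forecast.py`), the score adjustment and the per-customer forecast can be sketched as below; the multiplicative rescaling rule, the columns `nb_tickets_per_year` and `avg_ticket_price`, and the value `0.08` for the observed purchase rate are assumptions made for this example.
```python
import pandas as pd

def adjust_and_forecast(scored: pd.DataFrame, observed_purchase_rate: float) -> pd.DataFrame:
    """Rescale propensity scores so their mean matches the observed purchase rate,
    then derive a per-customer ticket and revenue forecast."""
    out = scored.copy()
    # Multiplicative correction of the overestimated probabilities, capped at 1
    correction = observed_purchase_rate / out["score"].mean()
    out["score_adjusted"] = (out["score"] * correction).clip(upper=1.0)
    # Expected tickets and revenue for the coming year (hypothetical historical columns)
    out["expected_nb_tickets"] = out["score_adjusted"] * out["nb_tickets_per_year"]
    out["expected_revenue"] = out["expected_nb_tickets"] * out["avg_ticket_price"]
    return out

# Aggregate the forecast at segment level, as in the exported summary tables:
# forecast = adjust_and_forecast(scored_test_set, observed_purchase_rate=0.08)
# forecast.groupby("segment")[["expected_nb_tickets", "expected_revenue"]].sum()
```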

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

249
all_packages_versions.txt Normal file
View File

@@ -0,0 +1,249 @@
Package Version
------------------------- ---------------
aiohttp 3.9.1
aiosignal 1.3.1
alembic 1.13.1
anyio 4.2.0
archspec 0.2.2
argon2-cffi 23.1.0
argon2-cffi-bindings 21.2.0
arrow 1.3.0
astroid 3.0.2
asttokens 2.4.1
async-lru 2.0.4
attrs 23.2.0
Babel 2.14.0
bcrypt 4.1.2
beautifulsoup4 4.12.3
bleach 6.1.0
blinker 1.7.0
bokeh 3.3.4
boltons 23.1.1
boto3 1.34.29
botocore 1.34.29
branca 0.7.0
Brotli 1.1.0
cached-property 1.5.2
cachetools 5.3.2
certifi 2023.11.17
cffi 1.16.0
charset-normalizer 3.3.2
click 8.1.7
click-plugins 1.1.1
cligj 0.7.2
cloudpickle 3.0.0
colorama 0.4.6
comm 0.2.1
conda 23.11.0
conda-libmamba-solver 23.12.0
conda-package-handling 2.2.0
conda_package_streaming 0.9.0
configparser 5.3.0
contourpy 1.2.0
cryptography 41.0.7
cycler 0.12.1
cytoolz 0.12.2
dask 2024.1.1
databricks-cli 0.18.0
debugpy 1.8.0
decorator 5.1.1
defusedxml 0.7.1
dill 0.3.8
distributed 2024.1.1
distro 1.8.0
docker 7.0.0
duckdb 0.9.2
entrypoints 0.4
exceptiongroup 1.2.0
executing 2.0.1
fastjsonschema 2.19.1
fiona 1.9.5
flake8 7.0.0
Flask 3.0.1
folium 0.15.1
fonttools 4.47.2
fqdn 1.5.1
frozenlist 1.4.1
fsspec 2023.12.2
GDAL 3.8.3
gensim 4.3.2
geopandas 0.14.2
gitdb 4.0.11
GitPython 3.1.41
google-auth 2.27.0
greenlet 3.0.3
gunicorn 21.2.0
hvac 2.1.0
idna 3.6
importlib-metadata 7.0.1
importlib-resources 6.1.1
ipykernel 6.29.0
ipython 8.20.0
ipywidgets 8.1.1
isoduration 20.11.0
isort 5.13.2
itsdangerous 2.1.2
jedi 0.19.1
Jinja2 3.1.3
jmespath 1.0.1
joblib 1.3.2
json5 0.9.14
jsonpatch 1.33
jsonpointer 2.4
jsonschema 4.21.1
jsonschema-specifications 2023.12.1
jupyter-cache 1.0.0
jupyter_client 8.6.0
jupyter_core 5.7.1
jupyter-events 0.9.0
jupyter-lsp 2.2.2
jupyter_server 2.12.5
jupyter-server-mathjax 0.2.6
jupyter_server_terminals 0.5.2
jupyterlab 4.0.11
jupyterlab_git 0.50.0
jupyterlab_pygments 0.3.0
jupyterlab_server 2.25.2
jupyterlab-widgets 3.0.9
kiwisolver 1.4.5
kubernetes 29.0.0
libmambapy 1.5.5
llvmlite 0.41.1
locket 1.0.0
lz4 4.3.3
Mako 1.3.1
mamba 1.5.5
mapclassify 2.6.1
Markdown 3.5.2
MarkupSafe 2.1.4
matplotlib 3.8.2
matplotlib-inline 0.1.6
mccabe 0.7.0
menuinst 2.0.2
mistune 3.0.2
mlflow 2.10.0
msgpack 1.0.7
multidict 6.0.4
munkres 1.1.4
mypy 1.8.0
mypy-extensions 1.0.0
nbclient 0.8.0
nbconvert 7.14.2
nbdime 4.0.1
nbformat 5.9.2
nest_asyncio 1.6.0
networkx 3.2.1
nltk 3.8.1
notebook_shim 0.2.3
numba 0.58.1
numpy 1.26.3
oauthlib 3.2.2
opencv-python-headless 4.9.0.80
overrides 7.7.0
packaging 23.2
pandas 2.2.0
pandocfilters 1.5.0
paramiko 3.4.0
parso 0.8.3
partd 1.4.1
patsy 0.5.6
pexpect 4.9.0
pickleshare 0.7.5
pillow 10.2.0
pip 23.3.2
pkgutil_resolve_name 1.3.10
platformdirs 4.1.0
plotly 5.18.0
pluggy 1.3.0
polars 0.20.6
prometheus-client 0.19.0
prometheus-flask-exporter 0.23.0
prompt-toolkit 3.0.42
protobuf 4.24.4
psutil 5.9.8
ptyprocess 0.7.0
pure-eval 0.2.2
pyarrow 14.0.2
pyarrow-hotfix 0.6
pyasn1 0.5.1
pyasn1-modules 0.3.0
pycodestyle 2.11.1
pycosat 0.6.6
pycparser 2.21
pyflakes 3.2.0
Pygments 2.17.2
PyJWT 2.8.0
pylint 3.0.3
PyNaCl 1.5.0
pyOpenSSL 23.3.0
pyparsing 3.1.1
pyproj 3.6.1
PySocks 1.7.1
python-dateutil 2.8.2
python-json-logger 2.0.7
pytz 2023.3.post1
pyu2f 0.1.5
PyYAML 6.0.1
pyzmq 25.1.2
querystring-parser 1.2.4
referencing 0.32.1
regex 2023.12.25
requests 2.31.0
requests-oauthlib 1.3.1
rfc3339-validator 0.1.4
rfc3986-validator 0.1.1
rpds-py 0.17.1
rsa 4.9
Rtree 1.2.0
ruamel.yaml 0.18.5
ruamel.yaml.clib 0.2.7
s3fs 0.4.2
s3transfer 0.10.0
scikit-learn 1.4.0
scipy 1.12.0
seaborn 0.13.2
Send2Trash 1.8.2
setuptools 68.2.2
shapely 2.0.2
six 1.16.0
smart-open 6.4.0
smmap 5.0.0
sniffio 1.3.0
sortedcontainers 2.4.0
soupsieve 2.5
SQLAlchemy 2.0.25
sqlparse 0.4.4
stack-data 0.6.2
statsmodels 0.14.1
tabulate 0.9.0
tblib 3.0.0
tenacity 8.2.3
terminado 0.18.0
threadpoolctl 3.2.0
tinycss2 1.2.1
tomli 2.0.1
tomlkit 0.12.3
toolz 0.12.1
tornado 6.3.3
tqdm 4.66.1
traitlets 5.14.1
truststore 0.8.0
types-python-dateutil 2.8.19.20240106
typing_extensions 4.9.0
typing-utils 0.1.0
tzdata 2023.4
uri-template 1.3.0
urllib3 1.26.18
wcwidth 0.2.13
webcolors 1.13
webencodings 0.5.1
websocket-client 1.7.0
Werkzeug 3.0.1
wheel 0.42.0
widgetsnbextension 4.0.9
xgboost 2.0.3
xyzservices 2023.10.1
yarl 1.9.4
zict 3.0.0
zipp 3.17.0
zstandard 0.22.0

File diff suppressed because it is too large

View File

@@ -74,53 +74,11 @@ def preprocessing_customerplus(directory_path):
cleaning_date(customerplus_copy, 'last_visiting_date')
# Select variables
customerplus_copy.drop(['lastname', 'firstname', 'birthdate', 'profession', 'language', 'age', 'email', 'civility', 'note', 'extra', 'reference', 'extra_field', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode', 'last_visiting_date'], axis = 1, inplace=True)
customerplus_copy.drop(['lastname', 'firstname', 'birthdate', 'language', 'email', 'civility', 'note', 'extra', 'reference', 'extra_field', 'need_reload'], axis = 1, inplace=True) # 'preferred_category', 'preferred_supplier', 'preferred_formula', 'mcp_contact_id', 'last_visiting_date', 'deleted_at', 'last_buying_date', 'max_price', 'ticket_sum', 'average_price', 'average_purchase_delay' , 'average_price_basket', 'average_ticket_basket', 'total_price', 'purchase_count', 'first_buying_date', 'fidelity'
customerplus_copy.rename(columns = {'id' : 'customer_id'}, inplace = True)
return customerplus_copy
def preprocessing_tickets_area(directory_path):
# Datasets loading
tickets = load_dataset(directory_path, name = "tickets")
purchases = load_dataset(directory_path, name = "purchases")
suppliers = load_dataset(directory_path, name = "suppliers")
type_ofs = load_dataset(directory_path, name = "type_ofs")
# Tickets table
tickets = tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]
tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)
# Suppliers table
suppliers = suppliers[['id', 'name']]
suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)
suppliers['supplier_name'] = suppliers['supplier_name'].fillna('')
# Ticket types table
type_ofs = type_ofs[['id', 'name', 'children']]
type_ofs.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)
# Purchases table
# Clean the purchase date
# cleaning_date(purchases, 'purchase_date')
# Select variables
purchases = purchases[['id', 'purchase_date', 'customer_id']]
# Merges
# Merge with suppliers
ticket_information = pd.merge(tickets, suppliers, left_on = 'supplier_id', right_on = 'id', how = 'inner')
ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)
# Merge with ticket types
ticket_information = pd.merge(ticket_information, type_ofs, left_on = 'type_of', right_on = 'id', how = 'inner')
ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)
# Merge with purchases
ticket_information = pd.merge(ticket_information, purchases, left_on = 'purchase_id', right_on = 'id', how = 'inner')
ticket_information.drop(['id'], axis = 1, inplace=True)
return ticket_information
def preprocessing_target_area(directory_path):
# Datasets loading
@@ -169,6 +127,69 @@ def preprocessing_campaigns_area(directory_path):
return campaigns_full
def preprocessing_tickets_area(directory_path):
# Datasets loading
tickets = load_dataset(directory_path, name = "tickets")
# Supplementary tickets dataset for tenant 101
if directory_path == '101':
tickets_1 = load_dataset(directory_path, name = "tickets_1")
purchases = load_dataset(directory_path, name = "purchases")
suppliers = load_dataset(directory_path, name = "suppliers")
# type_ofs = load_dataset(directory_path, name = "type_ofs")
# Tickets table
tickets = tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]
tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)
if directory_path == '101':
tickets_1 = tickets_1[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]
tickets_1.rename(columns = {'id' : 'ticket_id'}, inplace = True)
# Suppliers table
suppliers = suppliers[['id', 'name']]
suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)
suppliers['supplier_name'] = suppliers['supplier_name'].fillna('')
# Ticket types table (kept commented out)
# type_ofs = type_ofs[['id', 'name', 'children']]
# type_ofs.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)
# Purchases table
# Clean the purchase date
# cleaning_date(purchases, 'purchase_date')
# Select variables
purchases = purchases[['id', 'purchase_date', 'customer_id']]
# Merges
# Merge with suppliers
ticket_information = pd.merge(tickets, suppliers, left_on = 'supplier_id', right_on = 'id', how = 'inner')
ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)
# Merge with ticket types (kept commented out)
# ticket_information = pd.merge(ticket_information, type_ofs, left_on = 'type_of', right_on = 'id', how = 'inner')
# ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)
# Merge with purchases
ticket_information = pd.merge(ticket_information, purchases, left_on = 'purchase_id', right_on = 'id', how = 'inner')
ticket_information.drop(['id'], axis = 1, inplace=True)
if directory_path == '101':
# Merge with suppliers
ticket_information_1 = pd.merge(tickets_1, suppliers, left_on = 'supplier_id', right_on = 'id', how = 'inner')
ticket_information_1.drop(['supplier_id', 'id'], axis = 1, inplace=True)
# Merge with purchases
ticket_information_1 = pd.merge(ticket_information_1, purchases, left_on = 'purchase_id', right_on = 'id', how = 'inner')
ticket_information_1.drop(['id'], axis = 1, inplace=True)
return ticket_information, ticket_information_1
else :
return ticket_information
def create_products_table(directory_path):
# first merge products and categories
print("first merge products and categories")
@@ -179,23 +200,21 @@ def create_products_table(directory_path):
categories = categories.drop(columns = ['extra_field', 'quota'])
#Merge
products_theme = products.merge(categories, how = 'left', left_on = 'category_id',
right_on = 'id', suffixes=('_products', '_categories'))
products_theme = products.merge(categories, how = 'left', left_on = 'category_id', right_on = 'id', suffixes=('_products', '_categories'))
products_theme = products_theme.rename(columns = {"name" : "name_categories"})
# Second merge products_theme and type of categories
print("Second merge products_theme and type of categories")
type_of_categories = load_dataset(directory_path, name = "type_of_categories")
type_of_categories = type_of_categories.drop(columns = 'id')
products_theme = products_theme.merge(type_of_categories, how = 'left', left_on = 'category_id',
right_on = 'category_id' )
# print("Second merge products_theme and type of categories")
# type_of_categories = load_dataset(directory_path, name = "type_of_categories")
# type_of_categories = type_of_categories.drop(columns = 'id')
# products_theme = products_theme.merge(type_of_categories, how = 'left', left_on = 'category_id',
# right_on = 'category_id' )
# Index cleaning
products_theme = products_theme.drop(columns = ['id_categories'])
products_theme = order_columns_id(products_theme)
return products_theme
def create_events_table(directory_path):
# first merge events and seasons :
print("first merge events and seasons : ")
@@ -233,16 +252,12 @@ def create_events_table(directory_path):
def create_representations_table(directory_path):
representations = load_dataset(directory_path, name = "representations")
representations = representations.drop(columns = ['serial', 'open', 'satisfaction', 'is_display', 'expected_filling',
'max_filling', 'extra_field', 'start_date_time', 'end_date_time', 'name',
'representation_type_id'])
representations = representations.drop(columns = ['serial', 'satisfaction', 'is_display', 'expected_filling', 'max_filling', 'extra_field', 'name', 'representation_type_id']) # 'start_date_time', 'end_date_time', 'open'
representations_capacity = load_dataset(directory_path, name = "representation_category_capacities")
representations_capacity = representations_capacity.drop(columns = ['expected_filling', 'max_filling'])
representations_theme = representations.merge(representations_capacity, how='left',
left_on='id', right_on='representation_id',
suffixes=('_representation', '_representation_cap'))
representations_theme = representations.merge(representations_capacity, how='left', left_on='id', right_on='representation_id', suffixes=('_representation', '_representation_cap'))
# index cleaning
representations_theme = representations_theme.drop(columns = ["id_representation"])
representations_theme = order_columns_id(representations_theme)
@@ -255,20 +270,29 @@ def uniform_product_df(directory_path):
products_theme = create_products_table(directory_path)
representation_theme = create_representations_table(directory_path)
events_theme = create_events_table(directory_path)
ticket_information = preprocessing_tickets_area(directory_path)
if directory_path == '101':
ticket_information, ticket_information_1 = preprocessing_tickets_area(directory_path)
else :
ticket_information = preprocessing_tickets_area(directory_path)
print("Products theme columns : ", products_theme.columns)
print("\n Representation theme columns : ", representation_theme.columns)
print("\n Events theme columns : ", events_theme.columns)
products_global = pd.merge(products_theme, representation_theme, how='left',
on= ["representation_id", "category_id"])
products_global = pd.merge(products_theme, representation_theme, how='left', on= ["representation_id", "category_id"])
products_global = pd.merge(products_global, events_theme, how='left', on='event_id',
suffixes = ("_representation", "_event"))
products_global = pd.merge(products_global, events_theme, how='left', on='event_id', suffixes = ("_representation", "_event"))
products_purchased = pd.merge(ticket_information, products_global, left_on = 'product_id', right_on = 'id_products', how = 'inner')
products_purchased_reduced = products_purchased[['ticket_id', 'customer_id', 'purchase_id' ,'event_type_id', 'supplier_name', 'purchase_date', 'type_of_ticket_name', 'amount', 'children', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons']]
products_purchased_reduced = products_purchased[['ticket_id', 'customer_id', 'purchase_id' ,'event_type_id', 'supplier_name', 'purchase_date', 'amount', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons', 'start_date_time', 'end_date_time', 'open']] # 'type_of_ticket_name', 'children',
return products_purchased_reduced
if directory_path == '101':
products_purchased_1 = pd.merge(ticket_information_1, products_global, left_on = 'product_id', right_on = 'id_products', how = 'inner')
products_purchased_reduced_1 = products_purchased_1[['ticket_id', 'customer_id', 'purchase_id' ,'event_type_id', 'supplier_name', 'purchase_date', 'amount', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons', 'start_date_time', 'end_date_time', 'open']] # 'type_of_ticket_name', 'children',
return products_purchased_reduced, products_purchased_reduced_1
else :
return products_purchased_reduced

View File

@@ -0,0 +1,165 @@
# KPI construction functions
def custom_date_parser(date_string):
return pd.to_datetime(date_string, utc = True, format = 'ISO8601')
def display_input_databases(directory_path, file_name, datetime_col = None):
"""
This function returns the file from s3 storage
"""
file_path = "projet-bdc2324-team1" + "/0_Input/Company_" + directory_path + "/" + file_name + ".csv"
print("File path : ", file_path)
with fs.open(file_path, mode="rb") as file_in:
df = pd.read_csv(file_in, sep=",", parse_dates = datetime_col, date_parser=custom_date_parser)
return df
def campaigns_kpi_function(campaigns_information = None, max_date = "2023-12-01"):
# Number of email campaigns received per customer
nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()
nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)
# Average time to open (in hours)
campaigns_information['time_to_open'] = ((pd.to_datetime(campaigns_information['opened_at'], utc = True, format = 'ISO8601') - pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h'))
campaigns_information['time_to_open'] = campaigns_information['time_to_open'].fillna((pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601') - pd.to_datetime(max_date, utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h'))
time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()
# Number of emails opened
opened_campaign = campaigns_information[['customer_id', 'campaign_name', 'opened_at']]
opened_campaign.dropna(subset=['opened_at'], inplace=True)
opened_campaign = opened_campaign[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()
opened_campaign.rename(columns = {'campaign_name' : 'nb_campaigns_opened' }, inplace = True)
# Merge the indicators
campaigns_reduced = pd.merge(nb_campaigns, opened_campaign, on = 'customer_id', how = 'left')
campaigns_reduced = pd.merge(campaigns_reduced, time_to_open, on = 'customer_id', how = 'left')
# Email open rate
campaigns_reduced['taux_ouverture_mail'] = campaigns_reduced['nb_campaigns_opened'] / campaigns_reduced['nb_campaigns']
# Fill NaN values
campaigns_reduced[['nb_campaigns', 'nb_campaigns_opened', 'taux_ouverture_mail']] = campaigns_reduced[['nb_campaigns', 'nb_campaigns_opened', 'taux_ouverture_mail']].fillna(0)
# Fill the NaT values: time_to_open (??)
return campaigns_reduced
def tickets_kpi_function(tickets_information = None):
tickets_information_copy = tickets_information.copy()
# Dummy: online sales channel
liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online'] # 'vad' = vente à distance (distance selling)
tickets_information_copy['vente_internet'] = tickets_information_copy['supplier_name'].fillna('').str.contains('|'.join(liste_mots), case=False).astype(int)
# Share of online purchases
prop_vente_internet = tickets_information_copy[tickets_information_copy['vente_internet'] == 1].groupby(['customer_id'])['purchase_id'].nunique().reset_index()
prop_vente_internet.rename(columns = {'purchase_id' : 'nb_purchases_internet'}, inplace = True)
# Combined purchase-behaviour KPIs
tickets_kpi = (tickets_information_copy[['customer_id', 'purchase_id' ,'ticket_id','supplier_name', 'purchase_date', 'amount', 'vente_internet']]
.groupby(['customer_id'])
.agg(nb_tickets=('ticket_id', 'nunique'),
nb_purchases=('purchase_id', 'nunique'),
total_amount=('amount', 'sum'),
nb_suppliers=('supplier_name', 'nunique'),
achat_internet=('vente_internet', 'max'),
purchase_date_min=('purchase_date', 'min'),
purchase_date_max=('purchase_date', 'max'))
.reset_index())
tickets_kpi['time_between_purchase'] = tickets_kpi['purchase_date_max'] - tickets_kpi['purchase_date_min']
tickets_kpi['time_between_purchase'] = tickets_kpi['time_between_purchase'] / np.timedelta64(1, 'D') # In days
# Convert dates to numbers (days elapsed relative to the most recent purchase)
max_date = tickets_kpi['purchase_date_max'].max()
tickets_kpi['purchase_date_max'] = (max_date - tickets_kpi['purchase_date_max']) / np.timedelta64(1, 'D')
tickets_kpi['purchase_date_min'] = (max_date - tickets_kpi['purchase_date_min']) / np.timedelta64(1, 'D')
# Share of internet purchases
tickets_kpi = tickets_kpi.merge(prop_vente_internet, on = ['customer_id'], how = 'left')
tickets_kpi['nb_purchases_internet'] = tickets_kpi['nb_purchases_internet'].fillna(0)
tickets_kpi['prop_purchases_internet'] = tickets_kpi['nb_purchases_internet'] / tickets_kpi['nb_purchases']
# Number of purchases per month and year
tickets_information_copy['month_year_purchase'] = 'purchases_' + tickets_information_copy['purchase_date'].dt.month.astype(str) + '_' + tickets_information_copy['purchase_date'].dt.year.astype(str)
purchases_by_month = tickets_information_copy.pivot_table(index='customer_id', columns='month_year_purchase', values='purchase_id', aggfunc='nunique', fill_value=0)
tickets_kpi = pd.merge(tickets_kpi, purchases_by_month, on = 'customer_id', how = 'left')
return tickets_kpi
def customerplus_kpi_function(customerplus_clean = None):
# KPIs on socio-demographic data
# Gender
customerplus_clean["gender_label"] = customerplus_clean["gender"].map({
0: 'female',
1: 'male',
2: 'other'
})
gender_dummies = pd.get_dummies(customerplus_clean["gender_label"], prefix='gender').astype(int)
customerplus_clean = pd.concat([customerplus_clean, gender_dummies], axis=1)
customerplus_clean.drop(columns = "gender", inplace = True)
# Age
customerplus_clean['categorie_age_0_10'] = ((customerplus_clean['age'] >= 0) & (customerplus_clean['age'] < 10)).astype(int)
customerplus_clean['categorie_age_10_20'] = ((customerplus_clean['age'] >= 10) & (customerplus_clean['age'] < 20)).astype(int)
customerplus_clean['categorie_age_20_30'] = ((customerplus_clean['age'] >= 20) & (customerplus_clean['age'] < 30)).astype(int)
customerplus_clean['categorie_age_30_40'] = ((customerplus_clean['age'] >= 30) & (customerplus_clean['age'] < 40)).astype(int)
customerplus_clean['categorie_age_40_50'] = ((customerplus_clean['age'] >= 40) & (customerplus_clean['age'] < 50)).astype(int)
customerplus_clean['categorie_age_50_60'] = ((customerplus_clean['age'] >= 50) & (customerplus_clean['age'] < 60)).astype(int)
customerplus_clean['categorie_age_60_70'] = ((customerplus_clean['age'] >= 60) & (customerplus_clean['age'] < 70)).astype(int)
customerplus_clean['categorie_age_70_80'] = ((customerplus_clean['age'] >= 70) & (customerplus_clean['age'] < 80)).astype(int)
customerplus_clean['categorie_age_plus_80'] = (customerplus_clean['age'] >= 80).astype(int)
customerplus_clean['categorie_age_inconnue'] = customerplus_clean['age'].apply(lambda x: 1 if pd.isna(x) else 0)
# customerplus_clean.drop(columns = "age", inplace = True)
# Mailing consent
customerplus_clean['opt_in'] = customerplus_clean['opt_in'].astype(int)
# Indicator: whether the individual lives in France
customerplus_clean["country_fr"] = customerplus_clean["country"].apply(lambda x : int(x=="fr") if pd.notna(x) else np.nan)
# customerplus_clean.drop(columns = "country", inplace = True)
customerplus_clean['is_profession_known'] = customerplus_clean['profession'].notna().astype(int)
# customerplus_clean.drop(columns = "profession", inplace = True)
customerplus_clean['is_zipcode_known'] = customerplus_clean['zipcode'].notna().astype(int)
# customerplus_clean.drop(columns = "zipcode", inplace = True)
return customerplus_clean
def targets_KPI(df_target = None):
df_target['target_name'] = df_target['target_name'].fillna('').str.lower()
# Target name categories for museums
df_target['target_jeune'] = df_target['target_name'].str.contains('|'.join(['jeune', 'pass_culture', 'etudiant', '12-25 ans', 'student', 'jeunesse']), case=False).astype(int)
df_target['target_optin'] = df_target['target_name'].str.contains('|'.join(['optin' ,'opt-in']), case=False).astype(int)
df_target['target_optout'] = df_target['target_name'].str.contains('|'.join(['optout', 'unsubscribed']), case=False).astype(int)
df_target['target_scolaire'] = df_target['target_name'].str.contains('|'.join(['scolaire' , 'enseignant', 'chercheur', 'schulen', 'école']), case=False).astype(int)
df_target['target_entreprise'] = df_target['target_name'].str.contains('|'.join(['b2b', 'btob', 'cse']), case=False).astype(int)
df_target['target_famille'] = df_target['target_name'].str.contains('|'.join(['famille', 'enfants', 'family']), case=False).astype(int)
df_target['target_newsletter'] = df_target['target_name'].str.contains('|'.join(['nl', 'newsletter']), case=False).astype(int)
# Target name categories for sports companies
df_target['target_abonne'] = ((
df_target['target_name']
.str.contains('|'.join(['abo', 'adh']), case=False)
& ~df_target['target_name'].str.contains('|'.join(['hors abo', 'anciens abo']), case=False)
).astype(int))
df_target_categorie = df_target.groupby('customer_id')[['target_jeune', 'target_optin', 'target_optout', 'target_scolaire', 'target_entreprise', 'target_famille', 'target_newsletter', 'target_abonne']].max()
target_agg = df_target.groupby('customer_id').agg(
nb_targets=('target_name', 'nunique') # Named aggregation: tuples specify the output column names
# all_targets=('target_name', concatenate_names),
# all_target_types=('target_type_name', concatenate_names)
).reset_index()
target_agg = pd.merge(target_agg, df_target_categorie, how='left', on='customer_id')
return target_agg

425
utils_ml.py Normal file
View File

@@ -0,0 +1,425 @@
import pandas as pd
import numpy as np
import os
import s3fs
import re
import io
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
from sklearn.utils import class_weight
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.calibration import calibration_curve
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.exceptions import ConvergenceWarning, DataConversionWarning
import pickle
import warnings
def load_train_test(type_of_activity, type_of_model):
BUCKET = f"projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_activity}"
File_path_train = BUCKET + "/Train_set.csv"
File_path_test = BUCKET + "/Test_set.csv"
with fs.open( File_path_train, mode="rb") as file_in:
dataset_train = pd.read_csv(file_in, sep=",")
# dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)
with fs.open(File_path_test, mode="rb") as file_in:
dataset_test = pd.read_csv(file_in, sep=",")
# dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)
if type_of_model=='premium':
dataset_train['company'] = dataset_train['customer_id'].apply(lambda x: x.split('_')[0])
dataset_test['company'] = dataset_test['customer_id'].apply(lambda x: x.split('_')[0])
dataset_train = dataset_train[dataset_train['company'].isin(['1', '3', '4', '5', '6', '7', '8', '10', '11', '13'])]
dataset_test = dataset_test[dataset_test['company'].isin(['1', '3', '4', '5', '6', '7', '8', '10', '11', '13'])]
return dataset_train, dataset_test
def save_file_s3(File_name, type_of_activity, type_of_model, model):
"""
save plot into s3 storage
"""
image_buffer = io.BytesIO()
plt.savefig(image_buffer, format='png')
image_buffer.seek(0)
FILE_PATH = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/{type_of_model}/{type_of_activity}/{model}/"
FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '_' + model + '.png'
with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
s3_file.write(image_buffer.read())
plt.close()
def save_result_set_s3(result_set, File_name, type_of_activity, type_of_model, model=None, model_path=False):
"""
save result into s3 storage
"""
if model_path:
FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/{type_of_model}/{type_of_activity}/{model}/" + File_name + '.csv'
else:
FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/{type_of_model}/{type_of_activity}/" + File_name + '.csv'
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
result_set.to_csv(file_out, index = False)
def save_model_s3(File_name, type_of_activity, type_of_model, model, classifier):
"""
save model into pickle file
"""
model_bytes = pickle.dumps(classifier)
FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/{type_of_model}/{type_of_activity}/{model}/" + File_name + '.pkl'
with fs.open(FILE_PATH_OUT_S3, 'wb') as f:
f.write(model_bytes)
def compute_recall(group):
return recall_score(group['y_has_purchased'], group['prediction'])
def compute_recall_companies(dataset_test, y_pred, type_of_activity, model):
test = dataset_test.copy()
test['prediction'] = y_pred
test['company'] = dataset_test['customer_id'].str.split('_', expand=True)[0]
recall_scores_by_company = test.groupby('company').apply(compute_recall).reset_index(name='recall_score')
save_result_set_s3(recall_scores_by_company, 'recall_scores_by_company', type_of_activity, type_of_model, model=model, model_path=True)
def features_target_split(dataset_train, dataset_test):
"""
return train and test set
"""
features_l = ['nb_campaigns', 'taux_ouverture_mail', 'prop_purchases_internet', 'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'time_to_open',
'purchases_10_2021','purchases_10_2022', 'purchases_11_2021', 'purchases_12_2021','purchases_1_2022', 'purchases_2_2022', 'purchases_3_2022',
'purchases_4_2022', 'purchases_5_2021', 'purchases_5_2022', 'purchases_6_2021', 'purchases_6_2022', 'purchases_7_2021', 'purchases_7_2022', 'purchases_8_2021',
'purchases_8_2022','purchases_9_2021', 'purchases_9_2022', 'purchase_date_min', 'purchase_date_max', 'nb_targets', 'gender_female', 'gender_male',
'achat_internet', 'categorie_age_0_10', 'categorie_age_10_20', 'categorie_age_20_30','categorie_age_30_40',
'categorie_age_40_50', 'categorie_age_50_60', 'categorie_age_60_70', 'categorie_age_70_80', 'categorie_age_plus_80','categorie_age_inconnue',
'country_fr', 'is_profession_known', 'is_zipcode_known', 'opt_in', 'target_optin', 'target_newsletter', 'target_scolaire', 'target_entreprise', 'target_famille',
'target_jeune', 'target_abonne']
X_train = dataset_train[features_l]
y_train = dataset_train[['y_has_purchased']]
X_test = dataset_test[features_l]
y_test = dataset_test[['y_has_purchased']]
return X_train, X_test, y_train, y_test
def preprocess(type_of_model, type_of_activity):
"""
preprocess variables before running machine learning pipeline
"""
numeric_features = ['nb_campaigns', 'taux_ouverture_mail', 'prop_purchases_internet', 'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',
'purchases_10_2021','purchases_10_2022', 'purchases_11_2021', 'purchases_12_2021','purchases_1_2022', 'purchases_2_2022', 'purchases_3_2022',
'purchases_4_2022', 'purchases_5_2021', 'purchases_5_2022', 'purchases_6_2021', 'purchases_6_2022', 'purchases_7_2021', 'purchases_7_2022', 'purchases_8_2021',
'purchases_8_2022','purchases_9_2021', 'purchases_9_2022', 'purchase_date_min', 'purchase_date_max', 'nb_targets', 'time_to_open']
binary_features = ['gender_female', 'gender_male', 'achat_internet', 'categorie_age_0_10', 'categorie_age_10_20', 'categorie_age_20_30','categorie_age_30_40',
'categorie_age_40_50', 'categorie_age_50_60', 'categorie_age_60_70', 'categorie_age_70_80', 'categorie_age_plus_80','categorie_age_inconnue',
'country_fr', 'is_profession_known', 'is_zipcode_known', 'opt_in']
if type_of_activity=='musee':
numeric_features.remove('time_to_open')
if type_of_model=='premium':
if type_of_activity=='musique':
binary_features.extend(['target_optin', 'target_newsletter'])
elif type_of_activity=='sport':
binary_features.extend(['target_jeune', 'target_entreprise', 'target_abonne'])
else:
binary_features.extend([ 'target_scolaire', 'target_entreprise', 'target_famille', 'target_newsletter'])
numeric_transformer = Pipeline(steps=[
("imputer", SimpleImputer(strategy="constant", fill_value=0)),
("scaler", StandardScaler())
])
binary_transformer = Pipeline(steps=[
("imputer", SimpleImputer(strategy="most_frequent")),
])
preproc = ColumnTransformer(
transformers=[
("num", numeric_transformer, numeric_features),
("bin", binary_transformer, binary_features)
]
)
return preproc
def draw_confusion_matrix(y_test, y_pred, model):
conf_matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'], annot_kws={"size": 14})
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
save_file_s3("Confusion_matrix_", type_of_activity, type_of_model, model)
def draw_roc_curve(X_test, y_pred_prob, model):
# Compute the false positive rate (FPR) and true positive rate (TPR)
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=1)
# Compute the area under the ROC curve (AUC)
roc_auc = auc(fpr, tpr)
plt.figure(figsize = (14, 8))
plt.plot(fpr, tpr, label="ROC curve(area = %0.3f)" % roc_auc)
plt.plot([0, 1], [0, 1], color="red",label="Random Baseline", linestyle="--")
plt.grid(color='gray', linestyle='--', linewidth=0.5)
plt.xlabel("False Positive Rate", fontsize=14)
plt.ylabel("True Positive Rate", fontsize=14)
plt.title("ROC Curve", size=18)
plt.legend(loc="lower right", fontsize=14)
plt.show()
save_file_s3("Roc_curve_", type_of_activity, type_of_model, model)
def draw_calibration_curve(X_test, y_pred_prob, model):
frac_pos, mean_pred = calibration_curve(y_test, y_pred_prob, n_bins=10)
# Plot the calibration curve
plt.plot(mean_pred, frac_pos, 's-', label=model)
plt.plot([0, 1], [0, 1], 'k--', label='Perfectly calibrated')
plt.xlabel('Mean predicted value')
plt.ylabel('Fraction of positive predictions')
plt.title("Calibration Curve")
plt.legend()
plt.show()
save_file_s3("Calib_curve_", type_of_activity, type_of_model, model)
def draw_features_importance(pipeline, model, randomF = False):
if randomF:
coefficients = pipeline.named_steps[model].feature_importances_
else:
coefficients = pipeline.named_steps[model].coef_[0]
feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()
# Plot feature importances
plt.figure(figsize=(12, 8))
plt.barh(feature_names, coefficients, color='skyblue')
plt.xlabel("Features' Importance")
plt.ylabel('Caractéristiques')
plt.title("Features' Importance")
plt.grid(True)
plt.show()
save_file_s3("Features_", type_of_activity, type_of_model, model)
def draw_prob_distribution(y_pred_prob, model):
plt.figure(figsize=(10, 8))
plt.hist(y_pred_prob, bins=10, range=(0, 1), color='blue', alpha=0.7)
plt.xlim(0, 1)
plt.ylim(0, None)
plt.title('Histogramme des probabilités pour la classe 1')
plt.xlabel('Probability')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()
save_file_s3("prob_dist_", type_of_activity, type_of_model, model)
def draw_prob_distribution_companies(y_pred_prob, model):
test = dataset_test.copy()
test['probability to buy'] = y_pred_prob
test['company'] = test['customer_id'].str.split('_', expand=True)[0]
sns.histplot(data=test, x='probability to buy', hue='company', element='step',
stat='count', common_norm=False, bins=10, palette='Set1', alpha=1)
plt.xlim(0, 1)
plt.ylim(0, None)
plt.title('Histogram of probabilities for class 1 by company')
plt.xlabel('Probability')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()
save_file_s3("prob_dist_companies_", type_of_activity, type_of_model, model)
def pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result):
pipeline = Pipeline(steps=[
('preprocessor', preproc),
('LogisticRegression_Benchmark', LogisticRegression(solver='saga', class_weight = weight_dict,
max_iter=5000, n_jobs=-1))
])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
y_pred_prob = pipeline.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
model = "LogisticRegression_Benchmark"
result = pd.DataFrame({"Model" : [model],
"Accuracy" : [accuracy_score(y_test, y_pred)],
"Recall" : [recall_score(y_test, y_pred)],
"F1_score" : [f1_score(y_test, y_pred, average="macro")],
"AUC" : [auc(fpr, tpr)]}
)
model_result = pd.concat([model_result, result])
compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
draw_confusion_matrix(y_test, y_pred, model)
draw_roc_curve(y_test, y_pred_prob, model)
draw_features_importance(pipeline, 'LogisticRegression_Benchmark')
draw_prob_distribution(y_pred_prob, model)
draw_prob_distribution_companies(y_pred_prob, model)
draw_calibration_curve(y_test, y_pred_prob, model)
save_model_s3('LogisticRegression_Benchmark', type_of_activity, type_of_model, model, pipeline)
return model_result
def pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result):
y_train = y_train['y_has_purchased']
param_grid = {'LogisticRegression_cv__C': np.logspace(-10, 6, 17, base=2),
'LogisticRegression_cv__penalty': ['l1', 'l2'],
'LogisticRegression_cv__class_weight': ['balanced', weight_dict]}
pipeline = Pipeline(steps=[
('preprocessor', preproc),
('LogisticRegression_cv', LogisticRegression(solver='saga', max_iter=5000))
])
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring=make_scorer(recall_score), error_score='raise',
n_jobs=-1)
grid_search.fit(X_train, y_train)
y_pred = grid_search.predict(X_test)
y_pred_prob = grid_search.predict_proba(X_test)[:, 1]
best_pipeline = grid_search.best_estimator_
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
model = "LogisticRegression_cv"
result = pd.DataFrame({"Model" : [model],
"Accuracy" : [accuracy_score(y_test, y_pred)],
"Recall" : [recall_score(y_test, y_pred)],
"F1_score" : [f1_score(y_test, y_pred, average="macro")],
"AUC" : [auc(fpr, tpr)]}
)
model_result = pd.concat([model_result, result])
compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
draw_confusion_matrix(y_test, y_pred, model)
draw_roc_curve(y_test, y_pred_prob, model)
draw_features_importance(best_pipeline, 'LogisticRegression_cv')
draw_prob_distribution(y_pred_prob, model)
draw_prob_distribution_companies(y_pred_prob, model)
draw_calibration_curve(y_test, y_pred_prob, model)
save_model_s3('LogisticRegression_cv', type_of_activity, type_of_model, model, grid_search)
return model_result
def pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result):
pipeline = Pipeline(steps=[
('preprocessor', preproc),
('randomF', RandomForestClassifier(class_weight = weight_dict,
n_jobs=-1))
])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
y_pred_prob = pipeline.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
model = "randomF"
result = pd.DataFrame({"Model" : [model],
"Accuracy" : [accuracy_score(y_test, y_pred)],
"Recall" : [recall_score(y_test, y_pred)],
"F1_score" : [f1_score(y_test, y_pred, average="macro")],
"AUC" : [auc(fpr, tpr)]}
)
model_result = pd.concat([model_result, result])
compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
draw_confusion_matrix(y_test, y_pred, model)
draw_roc_curve(y_test, y_pred_prob, model)
draw_features_importance(pipeline, 'randomF', randomF=True)
draw_prob_distribution(y_pred_prob, model)
draw_prob_distribution_companies(y_pred_prob, model)
draw_calibration_curve(y_test, y_pred_prob, model)
save_model_s3('randomF_Benchmark', type_of_activity, type_of_model, model, pipeline)
return model_result
def pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result):
y_train = y_train['y_has_purchased']
param_grid = {
'randomF_cv__n_estimators': [100, 300],
'randomF_cv__max_features': ['sqrt', 'log2'],
'randomF_cv__min_samples_split': [2, 10],
'randomF_cv__min_samples_leaf': [1, 4],
'randomF_cv__class_weight': [weight_dict]
}
pipeline = Pipeline(steps=[
('preprocessor', preproc),
('randomF_cv', RandomForestClassifier(n_jobs=-1))
])
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring=make_scorer(recall_score), error_score='raise',
n_jobs=-1)
grid_search.fit(X_train, y_train)
y_pred = grid_search.predict(X_test)
y_pred_prob = grid_search.predict_proba(X_test)[:, 1]
best_pipeline = grid_search.best_estimator_
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
model = "randomF_cv"
result = pd.DataFrame({"Model" : [model],
"Accuracy" : [accuracy_score(y_test, y_pred)],
"Recall" : [recall_score(y_test, y_pred)],
"F1_score" : [f1_score(y_test, y_pred, average="macro")],
"AUC" : [auc(fpr, tpr)]}
)
model_result = pd.concat([model_result, result])
compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
draw_confusion_matrix(y_test, y_pred, model)
draw_roc_curve(y_test, y_pred_prob, model)
draw_features_importance(best_pipeline, 'randomF_cv', randomF=True)
draw_prob_distribution(y_pred_prob, model)
draw_prob_distribution_companies(y_pred_prob, model)
draw_calibration_curve(y_test, y_pred_prob, model)
save_model_s3('randomF_cv', type_of_activity, type_of_model, model, grid_search)
return model_result
def pipeline_naiveBayes_benchmark(X_train, y_train, X_test, y_test, model_result):
unique_classes, counts = np.unique(y_train, return_counts=True)
class_priors = counts / counts.sum()
pipeline = Pipeline(steps=[
('preprocessor', preproc),
('Naive_Bayes', GaussianNB(priors=class_priors))
])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
y_pred_prob = pipeline.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label = 1)
model = "Naive_Bayes"
result = pd.DataFrame({"Model" : [model],
"Accuracy" : [accuracy_score(y_test, y_pred)],
"Recall" : [recall_score(y_test, y_pred)],
"F1_score" : [f1_score(y_test, y_pred, average="macro")],
"AUC" : [auc(fpr, tpr)]}
)
model_result = pd.concat([model_result, result])
compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
draw_confusion_matrix(y_test, y_pred, model)
draw_roc_curve(y_test, y_pred_prob, model)
draw_prob_distribution(y_pred_prob, model)
draw_calibration_curve(y_test, y_pred_prob, model)
save_model_s3('Naive_Bayes_Benchmark', type_of_activity, type_of_model, model, pipeline)
return model_result
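# Example usage sketch (illustrative only): one way the pipeline_* functions above
# could be chained, assuming the module-level objects they rely on (preproc,
# weight_dict, dataset_test, type_of_activity, type_of_model, fs) are already
# defined and the train/test sets are loaded.
# model_result = pd.DataFrame(columns=["Model", "Accuracy", "Recall", "F1_score", "AUC"])
# model_result = pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result)
# model_result = pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result)
# model_result = pipeline_naiveBayes_benchmark(X_train, y_train, X_test, y_test, model_result)
# print(model_result.sort_values("AUC", ascending=False))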

utils_sales_forecast.py (new file, 325 lines)
@@ -0,0 +1,325 @@
# imports
import pandas as pd
from pandas import DataFrame
import numpy as np
import os
import s3fs
import matplotlib.pyplot as plt
from scipy.optimize import fsolve
import pickle
import warnings
import io
# functions
def load_train_test(type_of_activity):
"""
Loads the training and test datasets from S3 storage for the type of activity specified.
Args:
- type_of_activity (str)
Returns:
DataFrame: Training dataset.
DataFrame: Test dataset.
"""
# BUCKET = f"projet-bdc2324-team1/Generalization/{type_of_activity}"
BUCKET = f"projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_activity}"
File_path_train = BUCKET + "/Train_set.csv"
File_path_test = BUCKET + "/Test_set.csv"
with fs.open( File_path_train, mode="rb") as file_in:
dataset_train = pd.read_csv(file_in, sep=",")
# dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)
with fs.open(File_path_test, mode="rb") as file_in:
dataset_test = pd.read_csv(file_in, sep=",")
# dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)
return dataset_train, dataset_test
def features_target_split(dataset_train, dataset_test):
"""
Splits the dataset into features and target variables for training and testing.
Args:
- dataset_train (DataFrame): Training dataset.
- dataset_test (DataFrame): Test dataset.
Returns:
DataFrame: Features of the training dataset.
DataFrame: Features of the test dataset.
DataFrame: Target variable of the training dataset.
DataFrame: Target variable of the test dataset.
"""
features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max',
'time_between_purchase', 'fidelity', 'is_email_true', 'opt_in', #'is_partner', 'nb_tickets_internet',
'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']
X_train = dataset_train # [features_l]
y_train = dataset_train[['y_has_purchased']]
X_test = dataset_test # [features_l]
y_test = dataset_test[['y_has_purchased']]
return X_train, X_test, y_train, y_test
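# Example usage sketch (illustrative only): loading the modelling datasets and
# splitting them, assuming `fs` (an s3fs.S3FileSystem) has been created by the
# calling script and that "sport" is a valid type of activity.
# dataset_train, dataset_test = load_train_test("sport")
# X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)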
def load_model(type_of_activity, model):
"""
Loads from S3 storage the optimal parameters of the chosen ML model saved in a pickle file.
Args:
- type_of_activity (str)
- model (str)
Returns:
Model: machine learning model pre-trained with a scikit learn pipeline.
"""
# BUCKET = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/"
BUCKET = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/standard/{type_of_activity}/{model}/"
filename = model + '.pkl'
file_path = BUCKET + filename
with fs.open(file_path, mode="rb") as f:
model_bytes = f.read()
model = pickle.loads(model_bytes)
return model
def df_segment(df, y, model) :
"""
Segments customers into 4 groups based on the propensity scores given by a previously-loaded ML model.
Args:
- df (DataFrame): DataFrame to be segmented.
- y (Series): True target variable.
- model (Model): Pre-trained machine learning model for prediction.
Returns:
DataFrame: Segmented DataFrame with predicted values and true values for y.
"""
y_pred = model.predict(df)
y_pred_prob = model.predict_proba(df)[:, 1]
df_segment = df
df_segment["has_purchased"] = y
df_segment["has_purchased_estim"] = y_pred
df_segment["score"] = y_pred_prob
df_segment["quartile"] = np.where(df_segment['score']<0.25, '1',
np.where(df_segment['score']<0.5, '2',
np.where(df_segment['score']<0.75, '3', '4')))
return df_segment
def odd_ratio(score) :
"""
Args:
- score (Union[float, int])
Returns:
float: Odd ratio value.
"""
return score / (1 - score)
def adjust_score_1(score) :
"""
Adjust scores by replacing values equal to 1 with the second-highest value,
so that odds ratios can then be computed (the odds ratio is undefined for a score of 1).
Args:
- score (List[Union[float, int]])
Returns:
np.ndarray: Adjusted score values.
"""
second_best_score = np.array([element for element in score if element !=1]).max()
new_score = np.array([element if element!=1 else second_best_score for element in score])
return new_score
def adjusted_score(odd_ratio, bias) :
"""
Adjust the score based on the odd ratio and bias.
Args:
- odd_ratio (Union[float, int])
- bias (Union[float, int])
Returns:
float: Adjusted score value.
"""
adjusted_score = odd_ratio/(bias+odd_ratio)
return adjusted_score
def find_bias(odd_ratios, y_objective, initial_guess=10) :
"""
Find the bias needed to adjust scores so that their sum is equal to the total number of purchases observed.
Args:
- odd_ratios (List[float]): List of odds ratios associated with the scores to be adjusted.
- y_objective (Union[float, int]): Target value, i.e. the total number of purchases observed.
- initial_guess (Union[float, int], optional): Initial guess for the bias. Default is 10 (the bias is approximately 6 for sports, 10 for music and 22 for museums).
Returns:
float: Estimated bias value.
"""
bias_estimated = fsolve(lambda bias : sum([adjusted_score(element, bias) for element in list(odd_ratios)]) - y_objective, x0=initial_guess)
return bias_estimated[0]
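# Example usage sketch (illustrative only): how the adjustment chain above fits
# together, assuming X_test_segment is the output of df_segment.
# scores = adjust_score_1(X_test_segment["score"].values)  # remove scores exactly equal to 1
# odd_ratios = odd_ratio(scores)                           # element-wise odds ratios
# bias = find_bias(odd_ratios, y_objective=X_test_segment["has_purchased"].sum())
# X_test_segment["score_adjusted"] = adjusted_score(odd_ratios, bias)
# # by construction, the adjusted scores now sum approximately to the observed number of purchases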
def plot_hist_scores(df, score, score_adjusted, type_of_activity) :
"""
Plot a histogram comparing scores and adjusted scores.
Args:
- df (DataFrame): DataFrame containing score data.
- score (str): Name of the column in df representing the original scores.
- score_adjusted (str): Name of the column in df representing the adjusted scores.
- type_of_activity (str) : type of activity of the companies considered.
Returns:
None
"""
plt.figure()
plt.hist(df[score], label = "score", alpha=0.6)
plt.hist(df[score_adjusted], label="adjusted score", alpha=0.6)
plt.legend()
plt.xlabel("probability of a future purchase")
plt.ylabel("count")
plt.title(f"Comparison between score and adjusted score for {type_of_activity} companies")
# plt.show()
def project_tickets_CA (df, nb_purchases, nb_tickets, total_amount, score_adjusted, duration_ref, duration_projection) :
"""
Project tickets sold and total amount based on the adjusted scores and the duration of periods of study / projection.
Args:
- df (DataFrame): DataFrame containing information about past sales.
- nb_purchases (str) : Name of the column in df representing the number of purchases.
- nb_tickets (str): Name of the column in df representing the number of tickets.
- total_amount (str): Name of the column in df representing the total amount.
- score_adjusted (str): Name of the column in df representing the adjusted score.
- duration_ref (int or float): Duration of the period of reference for the construction of the variables X.
- duration_projection (int or float): Duration of the period of projection of sales / revenue.
Returns:
DataFrame: DataFrame completed with sales and total amount projections.
"""
duration_ratio = duration_ref/duration_projection
df_output = df
# project number of tickets : at least 1 ticket purchased if the customer purchased
df_output.loc[:,"nb_tickets_projected"] = df_output.loc[:,nb_tickets].apply(lambda x : max(1, x /duration_ratio))
# project amount : if the customer buys a ticket, we expect the amount to be at least the average price of tickets
# for customers purchasing exactly one ticket
if df_output.loc[df_output[nb_tickets]==1].shape[0] > 0 :
avg_price = df_output.loc[df_output[nb_tickets]==1][total_amount].mean()
else :
avg_price = df_output[total_amount].mean()
# we compute the avg price of ticket for each customer
df_output["avg_ticket_price"] = df_output[total_amount]/df_output[nb_tickets]
# correct negative total amounts
df_output.loc[:,"total_amount_corrected"] = np.where(df_output[total_amount] < 0,
avg_price * df_output[nb_tickets],
df_output[total_amount])
df_output.loc[:,"total_amount_projected"] = np.where(
# if no ticket bought in the past, we take the average price
df_output[nb_tickets]==0, avg_price,
# if avg prices of tickets are negative, we recompute the expected amount based on the avg price of a ticket
# observed on the whole population
np.where(df_output["avg_ticket_price"] < 0, avg_price * df_output.loc[:,"nb_tickets_projected"],
# else, the amount projected is the average price of tickets bought by the customer * nb tickets projected
df_output["avg_ticket_price"] * df_output.loc[:,"nb_tickets_projected"])
)
df_output.loc[:,"nb_tickets_expected"] = df_output.loc[:,score_adjusted] * df_output.loc[:,"nb_tickets_projected"]
df_output.loc[:,"total_amount_expected"] = df_output.loc[:,score_adjusted] * df_output.loc[:,"total_amount_projected"]
df_output.loc[:,"pace_purchase"] = (duration_ref/df_output.loc[:,nb_purchases]).apply(lambda x : np.nan if x==np.inf else x)
return df_output
def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, total_amount, pace_purchase,
duration_ref=17, duration_projection=12) :
"""
Generate a summary of expected customer sales based on segments.
Args:
- df (DataFrame): DataFrame containing customer data.
- segment (str): Name of the column in df representing customer segments.
- nb_tickets_expected (str): Name of the column in df representing the expected number of tickets.
- total_amount_expected (str): Name of the column in df representing the expected total amount.
- total_amount (str): Name of the column in df representing the total amount.
- pace_purchase (str) : Name of the column in df representing the average time between 2 purchases in months.
- duration_ref (int or float): Duration of the period of reference for the construction of the variables X.
- duration_projection (int or float): Duration of the period of projection of sales / revenue.
Returns:
DataFrame: Summary DataFrame containing expected customer sales metrics.
"""
# compute nb tickets estimated and total amount expected
df_expected_CA = df.groupby(segment)[[nb_tickets_expected, total_amount_expected]].sum().reset_index()
# number of customers by segment
df_expected_CA.insert(1, "size", df.groupby(segment).size().values)
# size in percent of all customers
df_expected_CA.insert(2, "size_perct", 100 * df_expected_CA["size"]/df_expected_CA["size"].sum())
# compute share of CA recovered
duration_ratio=duration_ref/duration_projection
df_expected_CA["revenue_recovered_perct"] = 100 * duration_ratio * df_expected_CA[total_amount_expected] / \
df.groupby(segment)[total_amount].sum().values
df_expected_CA["share_future_revenue_perct"] = 100 * duration_ratio * df_expected_CA[total_amount_expected] / \
df[total_amount].sum()
df_drop_null_pace = df.dropna(subset=[pace_purchase])
df_expected_CA["pace_purchase"] = df_drop_null_pace.groupby(segment)[pace_purchase].mean().values
return df_expected_CA
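# Example usage sketch (illustrative only): projecting sales and summarising them
# by segment, assuming X_test_segment carries the columns named below.
# X_test_segment = project_tickets_CA(X_test_segment, "nb_purchases", "nb_tickets",
#                                     "total_amount", "score_adjusted",
#                                     duration_ref=17, duration_projection=12)
# summary = summary_expected_CA(X_test_segment, segment="quartile",
#                               nb_tickets_expected="nb_tickets_expected",
#                               total_amount_expected="total_amount_expected",
#                               total_amount="total_amount",
#                               pace_purchase="pace_purchase")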
def save_file_s3_ca(File_name, type_of_activity):
"""
Saves a file in S3 storage.
Args:
- File_name (str)
- type_of_activity (str)
"""
image_buffer = io.BytesIO()
plt.savefig(image_buffer, format='png', dpi=120)
image_buffer.seek(0)
PATH = f"projet-bdc2324-team1/2_Output/2_3_Sales_Forecast/{type_of_activity}/"
FILE_PATH_OUT_S3 = PATH + File_name + type_of_activity + '.png'
with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
s3_file.write(image_buffer.read())
plt.close()

utils_segmentation.py (new file, 335 lines)
@@ -0,0 +1,335 @@
# functions for segmentation and associated graphics
# imports needed by the functions below; the s3fs filesystem object `fs` is
# expected to be created by the calling script
import io
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tabulate import tabulate
def load_model(type_of_activity, model):
"""
Loads from S3 storage the optimal parameters of the chosen ML model saved in a pickle file.
Args:
- type_of_activity (str)
- model (str)
Returns:
Model: machine learning model pre-trained with a scikit learn pipeline.
"""
BUCKET = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/standard/{type_of_activity}/{model}/"
filename = model + '.pkl'
file_path = BUCKET + filename
with fs.open(file_path, mode="rb") as f:
model_bytes = f.read()
model = pickle.loads(model_bytes)
return model
def load_test_file(type_of_activity):
"""
Load the test dataset from S3 storage for the type of activity specified.
Args:
- type_of_activity (str)
Returns:
DataFrame: Test dataset.
"""
file_path_test = f"projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_activity}/Test_set.csv"
with fs.open(file_path_test, mode="rb") as file_in:
dataset_test = pd.read_csv(file_in, sep=",")
return dataset_test
def save_file_s3_mp(File_name, type_of_activity):
"""
Save a matplotlib figure to S3 storage to the location assigned for the type of activity specified.
Args:
- File_name (str)
- type_of_activity (str)
Returns:
None
"""
image_buffer = io.BytesIO()
plt.savefig(image_buffer, format='png', dpi=110)
image_buffer.seek(0)
PATH = f"projet-bdc2324-team1/2_Output/2_2_Segmentation_and_Marketing_Personae/{type_of_activity}/"
FILE_PATH_OUT_S3 = PATH + File_name + type_of_activity + '.png'
with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
s3_file.write(image_buffer.read())
plt.close()
def save_txt_file_s3(file_name, type_of_activity, content):
"""
Save a text file to S3 storage to the location assigned for the type of activity specified.
Args:
- file_name (str)
- type_of_activity (str)
- content (str)
Returns:
None
"""
FILE_PATH = f"projet-bdc2324-team1/2_Output/2_2_Segmentation_and_Marketing_Personae/{type_of_activity}/"
FILE_PATH_OUT_S3 = FILE_PATH + file_name + type_of_activity + '.txt'
with fs.open(FILE_PATH_OUT_S3, 'w') as s3_file:
s3_file.write(content)
def df_business_fig(df, segment, list_var) :
"""
Compute business key performance indicators (KPIs) based on segment-wise aggregation of variables.
Args:
- df (DataFrame): The DataFrame containing data.
- segment (str): The column name representing segments.
- list_var (list of str): The list of variable names to be aggregated.
Returns:
DataFrame: The DataFrame containing business KPIs.
"""
df_business_kpi = df.groupby(segment)[list_var].sum().reset_index()
df_business_kpi.insert(1, "size", df.groupby(segment).size().values)
all_var = ["size"] + list_var
df_business_kpi[all_var] = 100 * df_business_kpi[all_var] / df_business_kpi[all_var].sum()
return df_business_kpi
def hist_segment_business_KPIs(df, segment, size, nb_tickets, nb_purchases, total_amount, nb_campaigns, type_of_activity) :
"""
Plot a histogram stacking the relative weight of each segment regarding some key business indicators.
Args:
- df (DataFrame): The DataFrame containing pre-aggregated data about some key business indicators.
- segment (str): The column name representing segments.
- size (str): The column name representing the size.
- nb_tickets (str): The column name representing the number of tickets.
- nb_purchases (str): The column name representing the number of purchases.
- total_amount (str): The column name representing the total amount.
- nb_campaigns (str): The column name representing the number of campaigns.
- type_of_activity (str)
Returns:
None
"""
plt.figure()
df_plot = df[[segment, size, nb_tickets, nb_purchases, total_amount, nb_campaigns]]
x = ["number of\ncustomers", "number of\ntickets", "number of\npurchases", "total\namount",
"number of\ncampaigns"]
bottom = np.zeros(5)
# types of blue color
colors = plt.cm.Blues(np.linspace(0.1, 0.9, 4))
for i in range(4) :
height = list(df_plot.loc[i,size:].values)
plt.bar(x=x, height=height, label = str(df_plot[segment][i]), bottom=bottom, color=colors[i])
bottom+=height
# Adjust margins
plt.subplots_adjust(left = 0.125, right = 0.8, bottom = 0.1, top = 0.9)
plt.legend(title = "segment", loc = "upper right", bbox_to_anchor=(1.2, 1))
plt.ylabel("Fraction represented by the segment (%)")
plt.title(f"Relative weight of each segment regarding business KPIs\nfor {type_of_activity} companies", size=12)
# plt.show()
# def df_segment_mp(df) :
# df_mp = df.groupby("segment")[["gender_female", "gender_male", "gender_other", "country_fr"]].mean().reset_index()
# df_mp.insert(3, "share_known_gender", df_mp["gender_female"]+df_mp["gender_male"])
# df_mp.insert(4, "share_of_women", df_mp["gender_female"]/(df_mp["share_known_gender"]))
# return df_mp
# def df_segment_pb (df) :
# df_pb = df.groupby("segment")[["prop_purchases_internet", "taux_ouverture_mail", "opt_in"]].mean().reset_index()
# return df_pb
def radar_mp_plot(df, categories, index) :
"""
Plot a radar chart describing the marketing personae of the segment associated with the given index, for the given categories.
Args:
- df (DataFrame): The DataFrame containing data about categories describing the marketing personae associated to each segment
- categories (list of str): Names of the columns used as axes of the radar chart.
- index (int): The index (between 0 and 3) identifying the segment. Here, index = number of the segment - 1
Returns:
None
"""
# true values are used to print the true value in parenthesis
tvalues = list(df.loc[index,categories])
max_values = df[categories].max()
# values are true values / max among the 4 segments, which puts each value
# in relation with the values of the other segments:
# if a point reaches the maximal radius, the value is maximal for the segment
# considered, even if it is not equal to 1
values = list(df.loc[index,categories]/max_values)
# values normalized are used to adjust the value around the circle
# for instance if the maximum of values is equal to 0.8, we want the point to be
# at 8/10th of the circle radius, not at the edge
values_normalized = [ max(values) * elt for elt in values]
# Nb of categories
num_categories = len(categories)
angles = np.linspace(0, 2 * np.pi, num_categories, endpoint=False).tolist()
# Initialize graphic
fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))
# we have to draw first a transparent line (alpha=0) of values to adjust the radius of the circle
# which is based on max(value)
# if we don't plot this transparent line, the radius of the circle will be too small
ax.plot(angles + angles[:1], values + values[:1], color='skyblue', alpha=0, linewidth=1.5)
ax.plot(angles + angles[:1], values_normalized + values_normalized[:1], color='black', alpha = 0.5, linewidth=1.2)
# fill the sector
ax.fill(angles, values_normalized, color='orange', alpha=0.4)
# labels
ax.set_yticklabels([])
ax.set_xticks(angles)
ticks = [categories[i].replace("_"," ") + f"\n({round(100 * tvalues[i],2)}%)" for i in range(len(categories))]
ax.set_xticklabels(ticks, color="black")
ax.spines['polar'].set_visible(False)
plt.title(f'Characteristics of the segment {index+1}\n')
# plt.show()
def radar_mp_plot_all(df, type_of_activity) :
"""
Plot exactly the same radar charts as radar_mp_plot, but for all segments.
Args:
- df (DataFrame)
- type_of_activity (str)
Returns:
None
"""
# table summarizing variables relative to marketing personae
df_mp = df.groupby("segment")[["gender_female", "gender_male", "gender_other", "age"]].mean().reset_index()
#df_mp.insert(3, "share_known_gender", df_mp["gender_female"]+df_mp["gender_male"])
df_mp.insert(4, "share_of_women", df_mp["gender_female"]/(df_mp["gender_female"]+df_mp["gender_male"]))
# table relative to purchasing behaviour
df_pb = df.groupby("segment")[["prop_purchases_internet", "taux_ouverture_mail", "opt_in"]].mean().reset_index()
# concatenation of tables to prepare the plot
df_used = pd.concat([df_pb, df_mp[[ 'share_of_women', 'age']]], axis=1)
# rename columns for the plot
df_used = df_used.rename(columns={'taux_ouverture_mail': 'mails_opened', 'prop_purchases_internet': 'purchases_internet'})
# visualization
nb_segments = df_used.shape[0]
categories = list(df_used.drop("segment", axis=1).columns)
var_not_perc = ["age"]
# Initialize graphic
fig, ax = plt.subplots(2,2, figsize=(20, 21), subplot_kw=dict(polar=True))
for index in range(nb_segments) :
row = index // 2 # integer division to get the row number
col = index % 2
# true values are used to print the true value in parenthesis
tvalues = list(df_used.loc[index,categories])
max_values = df_used[categories].max()
# values are true values / max among the 4 segments, which puts each value
# in relation with the values of the other segments:
# if a point reaches the maximal radius, the value is maximal for the segment
# considered, even if it is not equal to 1
values = list(df_used.loc[index,categories]/max_values)
# values normalized are used to adjust the value around the circle
# for instance if the maximum of values is equal to 0.8, we want the point to be
# at 8/10th of the circle radius, not at the edge
values_normalized = [ max(values) * elt for elt in values]
# Nb of categories
num_categories = len(categories)
angles = np.linspace(0, 2 * np.pi, num_categories, endpoint=False).tolist()
# we have to draw first a transparent line (alpha=0) of values to adjust the radius of the circle
# which is based on max(value)
# if we don't plot this transparent line, the radius of the circle will be too small
ax[row, col].plot(angles + angles[:1], values + values[:1], color='skyblue', alpha=0, linewidth=1.5)
ax[row, col].plot(angles + angles[:1], values_normalized + values_normalized[:1], color='black', alpha = 0.5,
linewidth=1.2)
# fill the sector
ax[row, col].fill(angles, values_normalized, color='orange', alpha=0.4, label = index)
# labels
ax[row, col].set_yticklabels([])
ax[row, col].set_xticks(angles)
# define the ticks
values_printed = [str(round(tvalues[i],2)) if categories[i] in var_not_perc else f"{round(100 * tvalues[i],2)}%" for i in range(len(categories))]
ticks = [categories[i].replace("_"," ") + f"\n({values_printed[i]})" for i in range(len(categories))]
ax[row, col].set_xticklabels(ticks, color="black", size = 20)
ax[row, col].spines['polar'].set_visible(False)
ax[row, col].set_title(f'Segment {index+1}\n', size = 24)
fig.suptitle(f"Characteristics of marketing personae of {type_of_activity} companies", size=32)
plt.tight_layout()
# plt.show()
def known_sociodemo_caracteristics(df, type_of_activity) :
"""
Compute the share of non-missing values for some sociodemographic characteristics and save the result as a LaTeX table.
Args:
- df (DataFrame)
- type_of_activity (str)
Returns:
None
"""
table_share_known = df.groupby("segment")[["is_profession_known", "is_zipcode_known", "categorie_age_inconnue", "gender_other"]].mean().mul(100).reset_index()
table_share_known.columns = ['Segment', 'Share of Known Profession (%)', 'Share of Known Zipcode (%)', 'Share of Unknown Age (%)', 'Share of Unknown Gender (%)']
table_share_known= table_share_known.pivot_table(index=None, columns='Segment')
# Round the DataFrame values to one decimal place
table_share_known_rounded = table_share_known.round(1)
# Convert the DataFrame to LaTeX format with the rounded values and the '%' symbol
latex_table = tabulate(table_share_known_rounded, headers='keys', tablefmt='latex_raw', floatfmt=".1f")
latex_table = latex_table.replace('%', '\\%')
save_txt_file_s3("table_known_socio_demo_caracteristics", type_of_activity, latex_table)

utils_stat_desc.py (new file, 467 lines)
@@ -0,0 +1,467 @@
# imports needed by the functions below; the project-level helpers
# (display_input_databases, campaigns_kpi_function, tickets_kpi_function,
# customerplus_kpi_function, targets_KPI) and the s3fs filesystem object `fs`
# are expected to be provided by the calling script
import io
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
def load_files(nb_compagnie):
"""
Load the raw databases of each company, compute the associated KPIs and
return the concatenated customer, campaign, ticket, product and target DataFrames.
"""
customer = pd.DataFrame()
campaigns_brut = pd.DataFrame()
campaigns_kpi = pd.DataFrame()
products = pd.DataFrame()
tickets = pd.DataFrame()
targets = pd.DataFrame()
for directory_path in nb_compagnie:
df_customerplus_clean_0 = display_input_databases(directory_path, file_name = "customerplus_cleaned")
df_campaigns_brut = display_input_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
df_products_purchased_reduced = display_input_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date'])
df_target_information = display_input_databases(directory_path, file_name = "target_information")
df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_brut, max_date=pd.Timestamp.now(tz='UTC'))
df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced)
df_customerplus_clean = customerplus_kpi_function(customerplus_clean = df_customerplus_clean_0)
df_target_KPI = targets_KPI(df_target = df_target_information)
# Merge target KPIs with the customer table and fill missing values with 0
df_target_KPI = pd.merge(df_customerplus_clean_0[['customer_id']], df_target_KPI, how = 'left', on = 'customer_id')
targets_columns = list(df_target_KPI.columns)
targets_columns.remove('customer_id')
df_target_KPI[targets_columns] = df_target_KPI[targets_columns].fillna(0)
# Create company identifier
df_tickets_kpi["number_company"]=int(directory_path)
df_campaigns_brut["number_company"]=int(directory_path)
df_campaigns_kpi["number_company"]=int(directory_path)
df_customerplus_clean["number_company"]=int(directory_path)
df_target_information["number_company"]=int(directory_path)
df_target_KPI["number_company"]=int(directory_path)
# Clean index
df_tickets_kpi["customer_id"]= directory_path + '_' + df_tickets_kpi['customer_id'].astype('str')
df_campaigns_brut["customer_id"]= directory_path + '_' + df_campaigns_brut['customer_id'].astype('str')
df_campaigns_kpi["customer_id"]= directory_path + '_' + df_campaigns_kpi['customer_id'].astype('str')
df_customerplus_clean["customer_id"]= directory_path + '_' + df_customerplus_clean['customer_id'].astype('str')
df_products_purchased_reduced["customer_id"]= directory_path + '_' + df_products_purchased_reduced['customer_id'].astype('str')
# Remove companies' outliers
df_tickets_kpi = remove_outlier_total_amount(df_tickets_kpi)
# harmonize set of customers across databases
customer_id = df_tickets_kpi['customer_id'].to_list()
# filter each DataFrame explicitly (reassigning a loop variable would leave the originals unchanged)
df_campaigns_brut = df_campaigns_brut[df_campaigns_brut['customer_id'].isin(customer_id)]
df_campaigns_kpi = df_campaigns_kpi[df_campaigns_kpi['customer_id'].isin(customer_id)]
df_customerplus_clean = df_customerplus_clean[df_customerplus_clean['customer_id'].isin(customer_id)]
df_target_information = df_target_information[df_target_information['customer_id'].isin(customer_id)]
df_target_KPI["customer_id"]= directory_path + '_' + df_target_KPI['customer_id'].astype('str')
# Concatenation
customer = pd.concat([customer, df_customerplus_clean], ignore_index=True)
campaigns_kpi = pd.concat([campaigns_kpi, df_campaigns_kpi], ignore_index=True)
campaigns_brut = pd.concat([campaigns_brut, df_campaigns_brut], ignore_index=True)
tickets = pd.concat([tickets, df_tickets_kpi], ignore_index=True)
products = pd.concat([products, df_products_purchased_reduced], ignore_index=True)
targets = pd.concat([targets, df_target_KPI], ignore_index=True)
return customer, campaigns_kpi, campaigns_brut, tickets, products, targets
def remove_outlier_total_amount(tickets : pd.DataFrame):
"""
Remove the customers whose total amount spent lies above Q3 + 1.5 * IQR.
"""
Q1 = tickets['total_amount'].quantile(0.25)
Q3 = tickets['total_amount'].quantile(0.75)
IQR = Q3 - Q1
upper = Q3 +1.5*IQR
outliers = tickets[tickets['total_amount'] > upper]['customer_id'].to_list()
tickets = tickets[~tickets['customer_id'].isin(outliers)]
return tickets
def save_file_s3(File_name, type_of_activity):
"""
Save the current matplotlib figure to S3 storage for the type of activity specified.
"""
image_buffer = io.BytesIO()
plt.savefig(image_buffer, format='png', pad_inches=1, bbox_inches="tight", dpi = 150)
image_buffer.seek(0)
FILE_PATH = f"projet-bdc2324-team1/2_Output/2_0_Descriptive_Statistics/{type_of_activity}/"
FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '.png'
with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
s3_file.write(image_buffer.read())
plt.close()
def outlier_detection(tickets : pd.DataFrame, company_list, show_diagram=False):
"""
Detect 'anonymous' customers, identified for each company as the customer with the largest total amount, and optionally plot their share of total revenue.
"""
outlier_list = list()
for company in company_list:
total_amount_share = tickets[tickets['number_company']==int(company)].groupby('customer_id')['total_amount'].sum().reset_index()
total_amount_share['CA'] = total_amount_share['total_amount'].sum()
total_amount_share['share_total_amount'] = total_amount_share['total_amount']/total_amount_share['CA']
total_amount_share_index = total_amount_share.set_index('customer_id')
df_circulaire = total_amount_share_index['total_amount'].sort_values(axis = 0, ascending = False)
#print('df circulaire : ', df_circulaire.head())
top = df_circulaire[:1]
#print('top : ', top)
outlier_list.append(top.index[0])
rest = df_circulaire[1:]
rest_sum = rest.sum()
new_series = pd.concat([top, pd.Series([rest_sum], index=['Autre'])])
if show_diagram:
plt.figure(figsize=(3, 3))
plt.pie(new_series, labels=new_series.index, autopct='%1.1f%%', startangle=140, pctdistance=0.5)
plt.axis('equal')
# plt.title(f'Répartition des montants totaux pour la compagnie {company}')
plt.show()
return outlier_list
def valid_customer_detection(products : pd.DataFrame, campaigns_brut : pd.DataFrame):
"""
Identify customers that fall within the study period (purchase made or campaign received since 2021-05-01).
"""
products_valid = products[products['purchase_date']>="2021-05-01"]
consumer_valid_product = products_valid['customer_id'].to_list()
campaigns_valid = campaigns_brut[campaigns_brut["sent_at"]>="2021-05-01"]
consumer_valid_campaigns = campaigns_valid['customer_id'].to_list()
consumer_valid = consumer_valid_product + consumer_valid_campaigns
return consumer_valid
def identify_purchase_during_target_periode(products : pd.DataFrame):
"""
Identify customers who purchased a ticket during the target period (2022-11-01 to 2023-11-01).
"""
products_target_period = products[(products['purchase_date']>="2022-11-01")
& (products['purchase_date']<="2023-11-01")]
customer_target_period = products_target_period['customer_id'].to_list()
return customer_target_period
def remove_elements(lst, elements_to_remove):
return ''.join([x for x in lst if x not in elements_to_remove])
def compute_nb_clients(customer: pd.DataFrame, type_of_activity: str):
company_nb_clients = customer[customer["purchase_count"]>0].groupby("number_company")["customer_id"].count().reset_index()
plt.figure(figsize=(4,3))
plt.bar(company_nb_clients["number_company"], company_nb_clients["customer_id"]/1000)
plt.xlabel('Company Number')
plt.ylabel("Number of clients (thousands)")
# plt.title(f"Number of clients Across {type_of_activity} Companies")
plt.xticks(company_nb_clients["number_company"], ["{}".format(i) for i in company_nb_clients["number_company"]])
plt.show()
save_file_s3("nb_clients_", type_of_activity)
def maximum_price_paid(customer: pd.DataFrame, type_of_activity: str):
company_max_price = customer.groupby("number_company")["max_price"].max().reset_index()
plt.bar(company_max_price["number_company"], company_max_price["max_price"])
plt.xlabel('Company Number')
plt.ylabel("Maximal price of a ticket Prix")
# plt.title(f"Maximal price of a ticket Across {type_of_activity} Companies")
plt.xticks(company_max_price["number_company"], ["{}".format(i) for i in company_max_price["number_company"]])
plt.show()
save_file_s3("Maximal_price_", type_of_activity)
def target_proportion(customer: pd.DataFrame, type_of_activity: str):
df_y = customer.groupby(["number_company"]).agg({"has_purchased_target_period" : 'sum',
'customer_id' : 'nunique'}).reset_index()
df_y['prop_has_purchased_target_period'] = (df_y["has_purchased_target_period"]/df_y['customer_id'])*100
plt.bar(df_y["number_company"], df_y["prop_has_purchased_target_period"])
plt.xlabel('Company Number')
plt.ylabel('Share (%)')
# plt.title(f'Share of Customers who Bought during the Target Period Across {type_of_activity} Companies')
plt.xticks(df_y["number_company"], ["{}".format(i) for i in df_y["number_company"]])
plt.show()
save_file_s3("share_target_", type_of_activity)
def mailing_consent(customer: pd.DataFrame, type_of_activity: str):
mailing_consent = customer.groupby("number_company")["opt_in"].mean().reset_index()
mailing_consent["opt_in"] *= 100
plt.bar(mailing_consent["number_company"], mailing_consent["opt_in"])
plt.xlabel('Company Number')
plt.ylabel('Mailing Consent (%)')
# plt.title(f'Consent of mailing Across {type_of_activity} Companies')
plt.xticks(mailing_consent["number_company"], ["{}".format(i) for i in mailing_consent["number_company"]])
plt.show()
save_file_s3("mailing_consent_", type_of_activity)
def mailing_consent_by_target(customer: pd.DataFrame, type_of_activity: str):
df_graph = customer.groupby(["number_company", "has_purchased_target_period"])["opt_in"].mean().reset_index()
# Create the grouped barplot
fig, ax = plt.subplots(figsize=(5, 3))
categories = df_graph["number_company"].unique()
bar_width = 0.35
bar_positions = np.arange(len(categories))
# Group the data by label and create the grouped bars
for label in df_graph["has_purchased_target_period"].unique():
label_data = df_graph[df_graph['has_purchased_target_period'] == label]
values = [label_data[label_data['number_company'] == category]['opt_in'].values[0]*100 for category in categories]
label_printed = "Purchase" if label else "No purchase"
ax.bar(bar_positions, values, bar_width, label=label_printed)
# Update bar positions for the next group
bar_positions = [pos + bar_width for pos in bar_positions]
# Add labels, legend, etc.
ax.set_xlabel('Company Number')
ax.set_ylabel('Mailing Consent (%)')
# ax.set_title(f'Consent of mailing according to target Across {type_of_activity} Companies')
ax.set_xticks([pos + bar_width / 2 for pos in np.arange(len(categories))])
ax.set_xticklabels(categories)
ax.legend()
# Display the plot
plt.show()
save_file_s3("mailing_consent_target_", type_of_activity)
def gender_bar(customer: pd.DataFrame, type_of_activity: str):
company_genders = customer.groupby("number_company")[["gender_male", "gender_female", "gender_other"]].mean().reset_index()
company_genders["gender_male"] *= 100
company_genders["gender_female"] *= 100
company_genders["gender_other"] *= 100
# Create the barplot
plt.figure(figsize=(4,3))
plt.bar(company_genders["number_company"], company_genders["gender_male"], label = "Male")
plt.bar(company_genders["number_company"], company_genders["gender_female"],
bottom = company_genders["gender_male"], label = "Female")
plt.bar(company_genders["number_company"], company_genders["gender_other"],
bottom = company_genders["gender_male"] + company_genders["gender_female"], label = "Unknown")
plt.xlabel('Company Number')
plt.ylabel("Frequency (%)")
# plt.title(f"Gender Distribution of Customers Across {type_of_activity} Companies")
plt.legend()
plt.xticks(company_genders["number_company"], ["{}".format(i) for i in company_genders["number_company"]])
plt.show()
save_file_s3("gender_bar_", type_of_activity)
def country_bar(customer: pd.DataFrame, type_of_activity: str):
company_country_fr = customer.groupby("number_company")["country_fr"].mean().reset_index()
company_country_fr["country_fr"] *= 100
plt.figure(figsize=(4,3))
plt.bar(company_country_fr["number_company"], company_country_fr["country_fr"])
plt.xlabel('Company Number')
plt.ylabel("Share of French Customer (%)")
# plt.title(f"Share of French Customer Across {type_of_activity} Companies")
plt.xticks(company_country_fr["number_company"], ["{}".format(i) for i in company_country_fr["number_company"]])
plt.show()
save_file_s3("country_bar_", type_of_activity)
def lazy_customer_plot(campaigns_kpi: pd.DataFrame, type_of_activity: str):
company_lazy_customers = campaigns_kpi.groupby("number_company")[["nb_campaigns", "taux_ouverture_mail"]].mean().reset_index()
company_lazy_customers["taux_ouverture_mail"] *= 100
# Initialize the figure
fig, ax1 = plt.subplots(figsize=(6, 3))
width = 0.4
x = range(len(company_lazy_customers))
# Plot the bars for "nb_campaigns" on the first y-axis
ax1.bar([i - width/2 for i in x], company_lazy_customers['nb_campaigns'], width=width, align='center', label='Amount of Campaigns', color = 'steelblue')
# Set labels and title for the first y-axis
ax1.set_ylabel('Number of Mails Received', color='steelblue')
ax1.tick_params(axis='y', labelcolor='steelblue')
# Create another y-axis for "taux_ouverture_mail"
ax2 = ax1.twinx()
# Plot the bars for "taux_ouverture_mail" on the second y-axis
ax2.bar([i + width/2 for i in x], company_lazy_customers['taux_ouverture_mail'], width=width, align='center', label='Open Mail Rate', color = 'darkorange')
# Set labels and title for the second y-axis
ax2.set_ylabel('Open Mail Rate (%)', color='darkorange')
ax2.tick_params(axis='y', labelcolor='darkorange')
# Set x-axis ticks and labels
ax1.set_xticks(x)
ax1.set_xticklabels(company_lazy_customers['number_company'])
plt.show()
save_file_s3("lazy_customer_", type_of_activity)
def campaigns_effectiveness(customer: pd.DataFrame, type_of_activity: str):
campaigns_effectiveness = customer.groupby(["number_company", "has_purchased_target_period"])["opt_in"].mean().reset_index()
fig, ax = plt.subplots(figsize=(5, 3))
categories = campaigns_effectiveness["number_company"].unique()
bar_width = 0.35
bar_positions = np.arange(len(categories))
# Group the data by label and create the grouped bars
for label in campaigns_effectiveness["has_purchased_target_period"].unique():
label_data = campaigns_effectiveness[campaigns_effectiveness['has_purchased_target_period'] == label]
values = [label_data[label_data['number_company'] == category]['opt_in'].values[0]*100 for category in categories]
label_printed = "Purchase" if label else "No purchase"
ax.bar(bar_positions, values, bar_width, label=label_printed)
# Update bar positions for the next group
bar_positions = [pos + bar_width for pos in bar_positions]
# Add labels, legend, etc.
ax.set_xlabel('Company Number')
ax.set_ylabel('Share of Consent (%)')
# ax.set_title(f"Proportion of customers who have given their consent to receive emails, by customer class ({type_of_activity} companies)")
ax.set_xticks([pos + bar_width / 2 for pos in np.arange(len(categories))])
ax.set_xticklabels(categories)
ax.legend()
plt.show()
save_file_s3("campaigns_effectiveness_", type_of_activity)
def sale_dynamics(products : pd.DataFrame, campaigns_brut : pd.DataFrame, type_of_activity):
purchase_min = products.groupby(['customer_id'])['purchase_date'].min().reset_index()
purchase_min.rename(columns = {'purchase_date' : 'first_purchase_event'}, inplace = True)
purchase_min['first_purchase_event'] = pd.to_datetime(purchase_min['first_purchase_event'])
purchase_min['first_purchase_month'] = pd.to_datetime(purchase_min['first_purchase_event'].dt.strftime('%Y-%m'))
# Month of the first email received
first_mail_received = campaigns_brut.groupby('customer_id')['sent_at'].min().reset_index()
first_mail_received.rename(columns = {'sent_at' : 'first_email_reception'}, inplace = True)
first_mail_received['first_email_reception'] = pd.to_datetime(first_mail_received['first_email_reception'])
first_mail_received['first_email_month'] = pd.to_datetime(first_mail_received['first_email_reception'].dt.strftime('%Y-%m'))
# Merge
known_customer = pd.merge(purchase_min[['customer_id', 'first_purchase_month']],
first_mail_received[['customer_id', 'first_email_month']], on = 'customer_id', how = 'outer')
# Month from which the customer is considered as known
known_customer['known_date'] = pd.to_datetime(known_customer[['first_email_month', 'first_purchase_month']].min(axis = 1), utc = True, format = 'ISO8601')
# Number of purchases per month
purchases_count = pd.merge(products[['customer_id', 'purchase_id', 'purchase_date']].drop_duplicates(), known_customer[['customer_id', 'known_date']], on = ['customer_id'], how = 'inner')
purchases_count['is_customer_known'] = purchases_count['purchase_date'] > purchases_count['known_date'] + pd.DateOffset(months=1)
purchases_count['purchase_date_month'] = pd.to_datetime(purchases_count['purchase_date'].dt.strftime('%Y-%m'))
purchases_count = purchases_count[purchases_count['customer_id'] != 1]
# Number of purchases per month by customer type
nb_purchases_graph = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['purchase_id'].count().reset_index()
nb_purchases_graph.rename(columns = {'purchase_id' : 'nb_purchases'}, inplace = True)
nb_purchases_graph_2 = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['customer_id'].nunique().reset_index()
nb_purchases_graph_2.rename(columns = {'customer_id' : 'nb_new_customer'}, inplace = True)
# Plot of the number of purchases
purchases_graph = nb_purchases_graph
purchases_graph_used = purchases_graph[purchases_graph["purchase_date_month"] >= datetime(2021,3,1)]
purchases_graph_used_0 = purchases_graph_used[purchases_graph_used["is_customer_known"]==False]
purchases_graph_used_1 = purchases_graph_used[purchases_graph_used["is_customer_known"]==True]
merged_data = pd.merge(purchases_graph_used_0, purchases_graph_used_1, on="purchase_date_month", suffixes=("_new", "_old"))
plt.figure(figsize=(5.5,4))
plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_new"], width=12, label="New Customers")
plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_old"],
bottom=merged_data["nb_purchases_new"], width=12, label="Existing Customers")
# format the x-axis to display only month and year
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b%y'))
plt.xlabel('Month')
plt.ylabel("Number of Sales")
# plt.title(f"Number of Sales Across {type_of_activity} Companies")
plt.legend()
plt.show()
save_file_s3("sale_dynamics_", type_of_activity)
def tickets_internet(tickets: pd.DataFrame, type_of_activity: str):
nb_tickets_internet = tickets.groupby("number_company")['prop_purchases_internet'].mean().reset_index()
nb_tickets_internet['prop_purchases_internet'] *=100
plt.bar(nb_tickets_internet["number_company"], nb_tickets_internet["prop_purchases_internet"])
plt.xlabel('Company Number')
plt.ylabel("Share of Purchases Bought Online (%)")
# plt.title(f"Share of Online Purchases Across {type_of_activity} Companies")
plt.xticks(nb_tickets_internet["number_company"], ["{}".format(i) for i in nb_tickets_internet["number_company"]])
plt.show()
save_file_s3("tickets_internet_", type_of_activity)
def already_bought_online(tickets: pd.DataFrame, type_of_activity: str):
nb_consumers_online = (tickets.groupby("number_company").agg({'achat_internet' : 'sum',
'customer_id' : 'nunique'}
).reset_index())
nb_consumers_online["Share_consumers_internet"] = (nb_consumers_online["achat_internet"]/ nb_consumers_online["customer_id"])*100
plt.bar(nb_consumers_online["number_company"], nb_consumers_online["Share_consumers_internet"])
plt.xlabel('Company Number')
plt.ylabel("Share of Customer who Bought Online at least once (%)")
# plt.title(f"Share of Customer who Bought Online at least once Across {type_of_activity} Companies")
plt.xticks(nb_consumers_online["number_company"], ["{}".format(i) for i in nb_consumers_online["number_company"]])
plt.show()
save_file_s3("First_buy_internet_", type_of_activity)
def box_plot_price_tickets(tickets: pd.DataFrame, type_of_activity: str):
price_tickets = tickets[(tickets['total_amount'] > 0)]
plt.figure(figsize=(4,3))
sns.boxplot(data=price_tickets, y="total_amount", x="number_company", showfliers=False, showmeans=True)
# plt.title(f"Box plot of price tickets Across {type_of_activity} Companies")
plt.xlabel('Company Number')
plt.ylabel("Total Amount Spent")
plt.show()
save_file_s3("box_plot_price_tickets_", type_of_activity)
def target_description(targets : pd.DataFrame, type_of_activity: str):
describe_target = targets.groupby('number_company').agg(
prop_target_jeune=('target_jeune', lambda x: (x.sum() / x.count())*100),
prop_target_scolaire=('target_scolaire', lambda x: (x.sum() / x.count())*100),
prop_target_entreprise=('target_entreprise', lambda x: (x.sum() / x.count())*100),
prop_target_famille=('target_famille', lambda x: (x.sum() / x.count())*100),
prop_target_optin=('target_optin', lambda x: (x.sum() / x.count())*100),
prop_target_optout=('target_optout', lambda x: (x.sum() / x.count())*100),
prop_target_newsletter=('target_newsletter', lambda x: (x.sum() / x.count())*100),
prop_target_abonne=('target_abonne', lambda x: (x.sum() / x.count())*100))
plot = describe_target.plot.bar()
# Adding a title
# plot.set_title(f"Distribution of Targets by Category for {type_of_activity} companies")
# Adding labels for x and y axes
plot.set_xlabel("Company Number")
plot.set_ylabel("Target Proportion")
plot.set_xticklabels(plot.get_xticklabels(), rotation=0, horizontalalignment='center')
# Adding a legend
plot.legend(["Youth", "School", "Enterprise", "Family", "Optin", "Optout", "Newsletter", "Subscriber"], title="Target Category")
save_file_s3("target_category_proportion_", type_of_activity)