Merge branch 'main' into generalization

Alexis REVELLE 2024-03-18 16:21:42 +00:00
commit 6ac62d9957
8 changed files with 10818 additions and 1829 deletions


@@ -30,7 +30,7 @@ def export_dataset(df, output_name):
     df.to_csv(file_out, index = False)
 ## 1 - Cleaning of the datasets
-for tenant_id in ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "101"]:
+for tenant_id in ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14"]: #, "101"
     # Timer
     start = time.time()


@@ -22,52 +22,12 @@ exec(open('0_KPI_functions.py').read())
 warnings.filterwarnings('ignore')
-def display_covering_time(df, company, datecover):
-    """
-    This function draws the time coverage of each company
-    """
-    min_date = df['purchase_date'].min().strftime("%Y-%m-%d")
-    max_date = df['purchase_date'].max().strftime("%Y-%m-%d")
-    datecover[company] = [datetime.strptime(min_date, "%Y-%m-%d") + timedelta(days=x) for x in range((datetime.strptime(max_date, "%Y-%m-%d") - datetime.strptime(min_date, "%Y-%m-%d")).days)]
-    print(f'Coverage of company {company}: {min_date} - {max_date}')
-    return datecover
-
-def compute_time_intersection(datecover):
-    """
-    This function returns the time coverage for all companies
-    """
-    timestamps_sets = [set(timestamps) for timestamps in datecover.values()]
-    intersection = set.intersection(*timestamps_sets)
-    intersection_list = list(intersection)
-    formated_dates = [dt.strftime("%Y-%m-%d") for dt in intersection_list]
-    return sorted(formated_dates)
-
-def df_coverage_modelization(sport, coverage_features = 0.7):
-    """
-    This function returns start_date, end_of_features and final dates
-    that help to construct train and test datasets
-    """
-    datecover = {}
-    for company in sport:
-        df_products_purchased_reduced = display_databases(company, file_name = "products_purchased_reduced",
-                                                          datetime_col = ['purchase_date'])
-        datecover = display_covering_time(df_products_purchased_reduced, company, datecover)
-        #print(datecover.keys())
-    dt_coverage = compute_time_intersection(datecover)
-    start_date = dt_coverage[0]
-    end_of_features = dt_coverage[int(0.7 * len(dt_coverage))]
-    final_date = dt_coverage[-1]
-    return start_date, end_of_features, final_date
-
 def dataset_construction(min_date, end_features_date, max_date, directory_path):
     # Import customerplus
-    df_customerplus_clean_0 = display_databases(directory_path, file_name = "customerplus_cleaned")
-    df_campaigns_information = display_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
-    df_products_purchased_reduced = display_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date'])
+    df_customerplus_clean_0 = display_input_databases(directory_path, file_name = "customerplus_cleaned")
+    df_campaigns_information = display_input_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
+    df_products_purchased_reduced = display_input_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date'])
     # if directory_path == "101":
     #     df_products_purchased_reduced_1 = display_databases(directory_path, file_name = "products_purchased_reduced_1", datetime_col = ['purchase_date'])
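For context on the helpers removed above: they build, per company, the set of days covered by its purchase history, intersect those sets across companies, and cut the shared window at 70% to separate the feature period from the target period. A minimal self-contained sketch of that logic, using hypothetical coverage windows instead of the S3 data:

```python
from datetime import datetime, timedelta

# Hypothetical coverage windows per company (the real code derives them
# from each company's min/max purchase_date)
windows = {"1": ("2021-01-01", "2022-12-31"), "2": ("2021-06-01", "2023-06-30")}

datecover = {}
for company, (lo, hi) in windows.items():
    start = datetime.strptime(lo, "%Y-%m-%d")
    end = datetime.strptime(hi, "%Y-%m-%d")
    # one entry per day covered by this company, as in display_covering_time
    datecover[company] = [start + timedelta(days=x) for x in range((end - start).days)]

# Shared coverage = intersection of the per-company day sets, sorted
common = sorted(set.intersection(*(set(days) for days in datecover.values())))

start_date = common[0]                            # first shared day
end_of_features = common[int(0.7 * len(common))]  # 70% cut: end of the feature period
final_date = common[-1]                           # last shared day
print(start_date.date(), end_of_features.date(), final_date.date())
```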
@@ -90,7 +50,7 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path):
     # Merge everything and create the KPIs
     # KPIs on advertising campaigns
-    df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information)
+    df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information, max_date = end_features_date)
     # KPIs on purchasing behavior
     df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_features)


@@ -74,7 +74,7 @@ def preprocessing_customerplus(directory_path):
     cleaning_date(customerplus_copy, 'last_visiting_date')
     # Variable selection
-    customerplus_copy.drop(['lastname', 'firstname', 'birthdate', 'profession', 'language', 'age', 'email', 'civility', 'note', 'extra', 'reference', 'extra_field', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode', 'last_visiting_date'], axis = 1, inplace=True)
+    customerplus_copy.drop(['lastname', 'firstname', 'birthdate', 'language', 'email', 'civility', 'note', 'extra', 'reference', 'extra_field', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'mcp_contact_id', 'last_visiting_date', 'deleted_at'], axis = 1, inplace=True)
     customerplus_copy.rename(columns = {'id' : 'customer_id'}, inplace = True)
     return customerplus_copy


@@ -3,7 +3,7 @@
 def custom_date_parser(date_string):
     return pd.to_datetime(date_string, utc = True, format = 'ISO8601')
-def display_databases(directory_path, file_name, datetime_col = None):
+def display_input_databases(directory_path, file_name, datetime_col = None):
     """
     This function returns the file from s3 storage
     """
@@ -13,14 +13,16 @@ def display_databases(directory_path, file_name, datetime_col = None):
     df = pd.read_csv(file_in, sep=",", parse_dates = datetime_col, date_parser=custom_date_parser)
     return df
-def campaigns_kpi_function(campaigns_information = None):
+def campaigns_kpi_function(campaigns_information = None, max_date = None):
     # Number of email campaigns
     nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()
     nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)
     # Average opening time (in hours)
-    campaigns_information['time_to_open'] = pd.to_datetime(campaigns_information['opened_at'], utc = True, format = 'ISO8601') - pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601')
+    campaigns_information['time_to_open'] = (pd.to_datetime(campaigns_information['opened_at'], utc = True, format = 'ISO8601') - pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h')
+    campaigns_information['time_to_open'] = campaigns_information['time_to_open'].fillna((pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601') - pd.to_datetime(max_date, utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h'))
     time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()
     # Number of emails opened
@@ -33,8 +35,11 @@ def campaigns_kpi_function(campaigns_information = None):
     campaigns_reduced = pd.merge(nb_campaigns, opened_campaign, on = 'customer_id', how = 'left')
     campaigns_reduced = pd.merge(campaigns_reduced, time_to_open, on = 'customer_id', how = 'left')
+    # Email open rate
+    campaigns_reduced['taux_ouverture_mail'] = campaigns_reduced['nb_campaigns_opened'] / campaigns_reduced['nb_campaigns']
     # Fill NaN values
-    campaigns_reduced[['nb_campaigns', 'nb_campaigns_opened']] = campaigns_reduced[['nb_campaigns', 'nb_campaigns_opened']].fillna(0)
+    campaigns_reduced[['nb_campaigns', 'nb_campaigns_opened', 'taux_ouverture_mail']] = campaigns_reduced[['nb_campaigns', 'nb_campaigns_opened', 'taux_ouverture_mail']].fillna(0)
     # Fill the NaT values: time_to_open (??)
     return campaigns_reduced
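The two lines added above convert time_to_open to a float number of hours and backfill never-opened campaigns relative to max_date. A toy, self-contained illustration of the same pattern (data made up for the example):

```python
import numpy as np
import pandas as pd

mails = pd.DataFrame({
    "opened_at":    ["2023-01-01 12:00:00+00:00", None],
    "delivered_at": ["2023-01-01 10:00:00+00:00", "2023-01-05 10:00:00+00:00"],
})
max_date = "2023-01-10 10:00:00+00:00"

opened = pd.to_datetime(mails["opened_at"], utc=True)
delivered = pd.to_datetime(mails["delivered_at"], utc=True)

# Dividing a timedelta Series by np.timedelta64(1, 'h') yields float hours
mails["time_to_open"] = (opened - delivered) / np.timedelta64(1, "h")

# Unopened mails fall back to delivered_at - max_date (a negative number
# of hours), mirroring the fillna in the diff
mails["time_to_open"] = mails["time_to_open"].fillna(
    (delivered - pd.to_datetime(max_date, utc=True)) / np.timedelta64(1, "h"))

print(mails["time_to_open"].tolist())  # [2.0, -120.0]
```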
@@ -49,33 +54,20 @@ def tickets_kpi_function(tickets_information = None):
     tickets_information_copy['vente_internet'] = tickets_information_copy['supplier_name'].fillna('').str.contains('|'.join(liste_mots), case=False).astype(int)
     # Proportion of online sales
-    prop_vente_internet = tickets_information_copy[tickets_information_copy['vente_internet'] == 1].groupby(['customer_id'])['ticket_id'].count().reset_index()
-    prop_vente_internet.rename(columns = {'ticket_id' : 'nb_tickets_internet'}, inplace = True)
-    # Average amount
-    # avg_amount = (tickets_information_copy.groupby(["event_type_id", 'name_event_types'])
-    #               .agg({"amount" : "mean"}).reset_index()
-    #               .rename(columns = {'amount' : 'avg_amount'}))
-    # Mixed purchasing-behavior KPIs
+    prop_vente_internet = tickets_information_copy[tickets_information_copy['vente_internet'] == 1].groupby(['customer_id'])['purchase_id'].nunique().reset_index()
+    prop_vente_internet.rename(columns = {'purchase_id' : 'nb_purchases_internet'}, inplace = True)
     tickets_kpi = (tickets_information_copy[['customer_id', 'purchase_id' ,'ticket_id','supplier_name', 'purchase_date', 'amount', 'vente_internet']]
                    .groupby(['customer_id'])
-                   .agg({'ticket_id': 'count',
-                         'purchase_id' : 'nunique',
-                         'amount' : 'sum',
-                         'supplier_name': 'nunique',
-                         'vente_internet' : 'max',
-                         'purchase_date' : ['min', 'max']})
-                   .reset_index()
-                   )
-    tickets_kpi.columns = tickets_kpi.columns.map('_'.join)
-    tickets_kpi.rename(columns = {'ticket_id_count' : 'nb_tickets',
-                                  'purchase_id_nunique' : 'nb_purchases',
-                                  'amount_sum' : 'total_amount',
-                                  'supplier_name_nunique' : 'nb_suppliers',
-                                  'customer_id_' : 'customer_id'}, inplace = True)
+                   .agg(nb_tickets=('ticket_id', 'nunique'),
+                        nb_purchases=('purchase_id', 'nunique'),
+                        total_amount=('amount', 'sum'),
+                        nb_suppliers=('supplier_name', 'nunique'),
+                        achat_internet=('vente_internet', 'max'),
+                        purchase_date_min=('purchase_date', 'min'),
+                        purchase_date_max=('purchase_date', 'max'))
+                   .reset_index())
     tickets_kpi['time_between_purchase'] = tickets_kpi['purchase_date_max'] - tickets_kpi['purchase_date_min']
     tickets_kpi['time_between_purchase'] = tickets_kpi['time_between_purchase'] / np.timedelta64(1, 'D') # In days
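The .agg rewrite above replaces a dict-based aggregation, whose MultiIndex columns then had to be flattened with '_'.join and renamed, with pandas named aggregation, which names the output columns directly. A self-contained sketch on toy data:

```python
import pandas as pd

tickets = pd.DataFrame({
    "customer_id": [1, 1, 2],
    "ticket_id":   [10, 11, 12],
    "purchase_id": [100, 100, 200],
    "amount":      [5.0, 7.5, 3.0],
})

# Named aggregation: keyword = (source column, aggregation function),
# so there is no MultiIndex to flatten afterwards
kpi = (tickets.groupby("customer_id")
              .agg(nb_tickets=("ticket_id", "nunique"),
                   nb_purchases=("purchase_id", "nunique"),
                   total_amount=("amount", "sum"))
              .reset_index())
print(kpi)
#    customer_id  nb_tickets  nb_purchases  total_amount
# 0            1           2             1          12.5
# 1            2           1             1           3.0
```

Note that switching ticket_id from 'count' to 'nunique' also deduplicates repeated ticket rows instead of counting them twice.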
@@ -85,22 +77,17 @@ def tickets_kpi_function(tickets_information = None):
     tickets_kpi['purchase_date_max'] = (max_date - tickets_kpi['purchase_date_max']) / np.timedelta64(1, 'D')
     tickets_kpi['purchase_date_min'] = (max_date - tickets_kpi['purchase_date_min']) / np.timedelta64(1, 'D')
-    # Proportion of internet tickets
     tickets_kpi = tickets_kpi.merge(prop_vente_internet, on = ['customer_id'], how = 'left')
-    tickets_kpi['nb_tickets_internet'] = tickets_kpi['nb_tickets_internet'].fillna(0)
-    # tickets_kpi = tickets_kpi.merge(avg_amount, how='left', on= 'event_type_id')
-    # Share of tickets paid online, by company
-    # tickets_kpi["Taux_ticket_internet"] = tickets_kpi["nb_tickets_internet"]*100 / tickets_kpi["nb_tickets"]
-    # tickets_kpi['Taux_ticket_internet'] = tickets_kpi['Taux_ticket_internet'].fillna(0)
+    tickets_kpi['nb_purchases_internet'] = tickets_kpi['nb_purchases_internet'].fillna(0)
+    tickets_kpi['prop_purchases_internet'] = tickets_kpi['nb_purchases_internet'] / tickets_kpi['nb_purchases']
     return tickets_kpi
 def customerplus_kpi_function(customerplus_clean = None):
     # KPIs on socio-demographic data
-    ## Gender
+    # Gender
     customerplus_clean["gender_label"] = customerplus_clean["gender"].map({
         0: 'female',
         1: 'male',
@@ -109,9 +96,28 @@ def customerplus_kpi_function(customerplus_clean = None):
     gender_dummies = pd.get_dummies(customerplus_clean["gender_label"], prefix='gender').astype(int)
     customerplus_clean = pd.concat([customerplus_clean, gender_dummies], axis=1)
-    ## Indicator: does the individual live in France
+    # Age
+    customerplus_clean['categorie_age_0_10'] = ((customerplus_clean['age'] >= 0) & (customerplus_clean['age'] < 10)).astype(int)
+    customerplus_clean['categorie_age_10_20'] = ((customerplus_clean['age'] >= 10) & (customerplus_clean['age'] < 20)).astype(int)
+    customerplus_clean['categorie_age_20_30'] = ((customerplus_clean['age'] >= 20) & (customerplus_clean['age'] < 30)).astype(int)
+    customerplus_clean['categorie_age_30_40'] = ((customerplus_clean['age'] >= 30) & (customerplus_clean['age'] < 40)).astype(int)
+    customerplus_clean['categorie_age_40_50'] = ((customerplus_clean['age'] >= 40) & (customerplus_clean['age'] < 50)).astype(int)
+    customerplus_clean['categorie_age_50_60'] = ((customerplus_clean['age'] >= 50) & (customerplus_clean['age'] < 60)).astype(int)
+    customerplus_clean['categorie_age_60_70'] = ((customerplus_clean['age'] >= 60) & (customerplus_clean['age'] < 70)).astype(int)
+    customerplus_clean['categorie_age_70_80'] = ((customerplus_clean['age'] >= 70) & (customerplus_clean['age'] < 80)).astype(int)
+    customerplus_clean['categorie_age_plus_80'] = (customerplus_clean['age'] >= 80).astype(int)
+    customerplus_clean['categorie_age_inconnue'] = customerplus_clean['age'].apply(lambda x: 1 if pd.isna(x) else 0)
+    # Mailing consent (opt-in)
+    customerplus_clean['opt_in'] = customerplus_clean['opt_in'].astype(int)
+    # Indicator: does the individual live in France
     customerplus_clean["country_fr"] = customerplus_clean["country"].apply(lambda x : int(x=="fr") if pd.notna(x) else np.nan)
+    customerplus_clean['is_profession_known'] = customerplus_clean['profession'].notna().astype(int)
+    customerplus_clean['is_zipcode_known'] = customerplus_clean['zipcode'].notna().astype(int)
     # Dummy if the customer has a structure id (tags)
     # customerplus_clean['has_tags'] = customerplus_clean['structure_id'].apply(lambda x: 1 if not pd.isna(x) else 0)
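The age indicators added above are written out bin by bin; an equivalent, more compact formulation (a sketch, not the commit's code) uses pd.cut with right-open bins plus get_dummies:

```python
import numpy as np
import pandas as pd

customers = pd.DataFrame({"age": [5, 34, 82, np.nan]})

bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, np.inf]
labels = ["0_10", "10_20", "20_30", "30_40", "40_50",
          "50_60", "60_70", "70_80", "plus_80"]

# right=False gives [lo, hi) intervals, matching the (age >= lo) & (age < hi) tests
cat = pd.cut(customers["age"], bins=bins, labels=labels, right=False)
dummies = pd.get_dummies(cat, prefix="categorie_age").astype(int)

# NaN ages fall into no bin; flag them separately, as categorie_age_inconnue does
dummies["categorie_age_inconnue"] = customers["age"].isna().astype(int)
print(dummies.sum().to_dict())
```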


@@ -524,6 +524,65 @@
     "export_in_temporary(target_agg, 'Target_kpi_concatenate')"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cb6f06e6-78de-4b8d-a103-8366eff0493a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "v"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c5e864b1-adad-4267-b956-3f7ef371d677",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "def display_covering_time(df, company, datecover):\n",
+    "    \"\"\"\n",
+    "    This function draws the time coverage of each company\n",
+    "    \"\"\"\n",
+    "    min_date = df['purchase_date'].min().strftime(\"%Y-%m-%d\")\n",
+    "    max_date = df['purchase_date'].max().strftime(\"%Y-%m-%d\")\n",
+    "    datecover[company] = [datetime.strptime(min_date, \"%Y-%m-%d\") + timedelta(days=x) for x in range((datetime.strptime(max_date, \"%Y-%m-%d\") - datetime.strptime(min_date, \"%Y-%m-%d\")).days)]\n",
+    "    print(f'Coverage of company {company}: {min_date} - {max_date}')\n",
+    "    return datecover\n",
+    "\n",
+    "\n",
+    "def compute_time_intersection(datecover):\n",
+    "    \"\"\"\n",
+    "    This function returns the time coverage for all companies\n",
+    "    \"\"\"\n",
+    "    timestamps_sets = [set(timestamps) for timestamps in datecover.values()]\n",
+    "    intersection = set.intersection(*timestamps_sets)\n",
+    "    intersection_list = list(intersection)\n",
+    "    formated_dates = [dt.strftime(\"%Y-%m-%d\") for dt in intersection_list]\n",
+    "    return sorted(formated_dates)\n",
+    "\n",
+    "\n",
+    "def df_coverage_modelization(sport, coverage_features = 0.7):\n",
+    "    \"\"\"\n",
+    "    This function returns start_date, end_of_features and final dates\n",
+    "    that help to construct train and test datasets\n",
+    "    \"\"\"\n",
+    "    datecover = {}\n",
+    "    for company in sport:\n",
+    "        df_products_purchased_reduced = display_input_databases(company, file_name = \"products_purchased_reduced\",\n",
+    "                                                                datetime_col = ['purchase_date'])\n",
+    "        datecover = display_covering_time(df_products_purchased_reduced, company, datecover)\n",
+    "        #print(datecover.keys())\n",
+    "    dt_coverage = compute_time_intersection(datecover)\n",
+    "    start_date = dt_coverage[0]\n",
+    "    end_of_features = dt_coverage[int(0.7 * len(dt_coverage))]\n",
+    "    final_date = dt_coverage[-1]\n",
+    "    return start_date, end_of_features, final_date\n",
+    "    "
+   ]
+  },
  {
   "cell_type": "markdown",
   "id": "2435097a-95a5-43e1-84d0-7f6b701441ba",

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long