correction, renommer, remplir NaN pour tickets et mail

This commit is contained in:
Antoine JOUBREL 2024-03-14 22:34:36 +00:00
parent ac6a3b365f
commit dc5e3d0df1
2 changed files with 33 additions and 85 deletions

View File

@ -22,52 +22,12 @@ exec(open('0_KPI_functions.py').read())
warnings.filterwarnings('ignore')
def display_covering_time(df, company, datecover):
    """
    Record the time coverage of one company into `datecover` and return it.

    The company's entry becomes the list of daily datetimes from its first
    purchase date up to (but excluding) its last purchase date; timestamps
    are truncated to day precision via a strftime/strptime round trip.
    """
    min_date = df['purchase_date'].min().strftime("%Y-%m-%d")
    max_date = df['purchase_date'].max().strftime("%Y-%m-%d")
    # Round-trip through strings drops the time-of-day component.
    first_day = datetime.strptime(min_date, "%Y-%m-%d")
    last_day = datetime.strptime(max_date, "%Y-%m-%d")
    nb_days = (last_day - first_day).days
    # NOTE: range(nb_days) excludes last_day itself (half-open interval).
    datecover[company] = [first_day + timedelta(days=offset) for offset in range(nb_days)]
    print(f'Couverture Company {company} : {min_date} - {max_date}')
    return datecover
def compute_time_intersection(datecover):
    """
    Return the dates covered by every company in `datecover`, as a sorted
    list of "YYYY-MM-DD" strings.
    """
    # Intersect the per-company date sets in one pass.
    common_dates = set.intersection(*(set(timestamps) for timestamps in datecover.values()))
    return sorted(day.strftime("%Y-%m-%d") for day in common_dates)
def df_coverage_modelization(sport, coverage_features = 0.7):
    """
    Compute the date boundaries used to build train/test datasets.

    Parameters
    ----------
    sport : iterable of company identifiers to load purchase data for.
    coverage_features : float in (0, 1) — fraction of the common coverage
        period used for feature construction (the rest is the target window).

    Returns
    -------
    (start_date, end_of_features, final_date) : "YYYY-MM-DD" strings taken
    from the time coverage shared by all companies.
    """
    datecover = {}
    for company in sport:
        # BUGFIX: call the renamed loader (display_databases was renamed to
        # display_input_databases in this commit).
        df_products_purchased_reduced = display_input_databases(company, file_name = "products_purchased_reduced",
                                                                datetime_col = ['purchase_date'])
        datecover = display_covering_time(df_products_purchased_reduced, company, datecover)
    dt_coverage = compute_time_intersection(datecover)
    start_date = dt_coverage[0]
    # BUGFIX: use the coverage_features parameter instead of a hard-coded 0.7,
    # and clamp the index so coverage_features == 1.0 cannot overflow the list.
    split_index = min(int(coverage_features * len(dt_coverage)), len(dt_coverage) - 1)
    end_of_features = dt_coverage[split_index]
    final_date = dt_coverage[-1]
    return start_date, end_of_features, final_date
def dataset_construction(min_date, end_features_date, max_date, directory_path):
# Import customerplus
df_customerplus_clean_0 = display_databases(directory_path, file_name = "customerplus_cleaned")
df_campaigns_information = display_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
df_products_purchased_reduced = display_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date'])
df_customerplus_clean_0 = display_input_databases(directory_path, file_name = "customerplus_cleaned")
df_campaigns_information = display_input_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
df_products_purchased_reduced = display_input_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date'])
# if directory_path == "101":
# df_products_purchased_reduced_1 = display_databases(directory_path, file_name = "products_purchased_reduced_1", datetime_col = ['purchase_date'])
@ -90,7 +50,7 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path):
# Fusion de l'ensemble et creation des KPI
# KPI sur les campagnes publicitaires
df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information)
df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information, max_date = end_features_date)
# KPI sur le comportement d'achat
df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_features)

View File

@ -3,7 +3,7 @@
def custom_date_parser(date_string):
    """Parse an ISO-8601 date string into a timezone-aware (UTC) pandas Timestamp."""
    parsed = pd.to_datetime(date_string, format = 'ISO8601', utc = True)
    return parsed
def display_databases(directory_path, file_name, datetime_col = None):
def display_input_databases(directory_path, file_name, datetime_col = None):
"""
This function returns the file from s3 storage
"""
@ -13,14 +13,16 @@ def display_databases(directory_path, file_name, datetime_col = None):
df = pd.read_csv(file_in, sep=",", parse_dates = datetime_col, date_parser=custom_date_parser)
return df
def campaigns_kpi_function(campaigns_information = None):
def campaigns_kpi_function(campaigns_information = None, max_date = None):
# Nombre de campagnes de mails
nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()
nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)
# Temps d'ouverture moyen (en minutes)
campaigns_information['time_to_open'] = pd.to_datetime(campaigns_information['opened_at'], utc = True, format = 'ISO8601') - pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601')
campaigns_information['time_to_open'] = (pd.to_datetime(campaigns_information['opened_at'], utc = True, format = 'ISO8601') - pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h')
campaigns_information['time_to_open'] = campaigns_information['time_to_open'].fillna((pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601') - pd.to_datetime(max_date, utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h'))
time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()
# Nombre de mail ouvert
@ -33,8 +35,11 @@ def campaigns_kpi_function(campaigns_information = None):
campaigns_reduced = pd.merge(nb_campaigns, opened_campaign, on = 'customer_id', how = 'left')
campaigns_reduced = pd.merge(campaigns_reduced, time_to_open, on = 'customer_id', how = 'left')
# Taux de mails ouvert
campaigns_reduced['taux_ouverture_mail'] = campaigns_reduced['nb_campaigns_opened'] / campaigns_reduced['nb_campaigns']
# Fill NaN values
campaigns_reduced[['nb_campaigns', 'nb_campaigns_opened']] = campaigns_reduced[['nb_campaigns', 'nb_campaigns_opened']].fillna(0)
campaigns_reduced[['nb_campaigns', 'nb_campaigns_opened', 'taux_ouverture_mail']] = campaigns_reduced[['nb_campaigns', 'nb_campaigns_opened', 'taux_ouverture_mail']].fillna(0)
# Remplir les NaT : time_to_open (??)
return campaigns_reduced
@ -49,33 +54,20 @@ def tickets_kpi_function(tickets_information = None):
tickets_information_copy['vente_internet'] = tickets_information_copy['supplier_name'].fillna('').str.contains('|'.join(liste_mots), case=False).astype(int)
# Proportion de vente en ligne
prop_vente_internet = tickets_information_copy[tickets_information_copy['vente_internet'] == 1].groupby(['customer_id'])['ticket_id'].count().reset_index()
prop_vente_internet.rename(columns = {'ticket_id' : 'nb_tickets_internet'}, inplace = True)
# Average amount
# avg_amount = (tickets_information_copy.groupby(["event_type_id", 'name_event_types'])
# .agg({"amount" : "mean"}).reset_index()
# .rename(columns = {'amount' : 'avg_amount'}))
prop_vente_internet = tickets_information_copy[tickets_information_copy['vente_internet'] == 1].groupby(['customer_id'])['purchase_id'].nunique().reset_index()
prop_vente_internet.rename(columns = {'purchase_id' : 'nb_purchases_internet'}, inplace = True)
# Mixte KPI comportement achat
tickets_kpi = (tickets_information_copy[['customer_id', 'purchase_id' ,'ticket_id','supplier_name', 'purchase_date', 'amount', 'vente_internet']]
.groupby(['customer_id'])
.agg({'ticket_id': 'count',
'purchase_id' : 'nunique',
'amount' : 'sum',
'supplier_name': 'nunique',
'vente_internet' : 'max',
'purchase_date' : ['min', 'max']})
.reset_index()
)
tickets_kpi.columns = tickets_kpi.columns.map('_'.join)
tickets_kpi.rename(columns = {'ticket_id_count' : 'nb_tickets',
'purchase_id_nunique' : 'nb_purchases',
'amount_sum' : 'total_amount',
'supplier_name_nunique' : 'nb_suppliers',
'customer_id_' : 'customer_id'}, inplace = True)
.agg(nb_tickets=('ticket_id', 'nunique'),
nb_purchases=('purchase_id', 'nunique'),
total_amount=('amount', 'sum'),
nb_suppliers=('supplier_name', 'nunique'),
achat_internet=('vente_internet', 'max'),
purchase_date_min=('purchase_date', 'min'),
purchase_date_max=('purchase_date', 'max'))
.reset_index())
tickets_kpi['time_between_purchase'] = tickets_kpi['purchase_date_max'] - tickets_kpi['purchase_date_min']
tickets_kpi['time_between_purchase'] = tickets_kpi['time_between_purchase'] / np.timedelta64(1, 'D') # En nombre de jours
@ -85,27 +77,23 @@ def tickets_kpi_function(tickets_information = None):
tickets_kpi['purchase_date_max'] = (max_date - tickets_kpi['purchase_date_max']) / np.timedelta64(1, 'D')
tickets_kpi['purchase_date_min'] = (max_date - tickets_kpi['purchase_date_min']) / np.timedelta64(1, 'D')
# Proportion de ticket internet
tickets_kpi = tickets_kpi.merge(prop_vente_internet, on = ['customer_id'], how = 'left')
tickets_kpi['nb_tickets_internet'] = tickets_kpi['nb_tickets_internet'].fillna(0)
# tickets_kpi = tickets_kpi.merge(avg_amount, how='left', on= 'event_type_id')
#Taux de ticket payé par internet selon les compagnies
#tickets_kpi["Taux_ticket_internet"] = tickets_kpi["nb_tickets_internet"]*100 / tickets_kpi["nb_tickets"]
#tickets_kpi['Taux_ticket_internet'] = tickets_kpi['Taux_ticket_internet'].fillna(0)
tickets_kpi['nb_purchases_internet'] = tickets_kpi['nb_purchases_internet'].fillna(0)
tickets_kpi['prop_purchases_internet'] = tickets_kpi['nb_purchases_internet'] / tickets_kpi['nb_purchases']
return tickets_kpi
def customerplus_kpi_function(customerplus_clean = None):
# KPI sur les données socio-demographique
## Le genre
# Le genre
customerplus_clean["gender_label"] = customerplus_clean["gender"].map({
0: 'female',
1: 'male',
2: 'other'
})
gender_dummies = pd.get_dummies(customerplus_clean["gender_label"], prefix='gender').astype(int)
customerplus_clean = pd.concat([customerplus_clean, gender_dummies], axis=1)