# Source listing metadata (not part of the program): 107 lines, 5.7 KiB, Python
# KPI construction functions
|
|
|
|
def custom_date_parser(date_string):
    """
    Parse ISO 8601 date text into timezone-aware UTC datetimes.

    The argument is handed to ``pd.to_datetime`` unchanged, with
    ``utc=True`` and ``format='ISO8601'``.
    """
    parsed = pd.to_datetime(date_string, format='ISO8601', utc=True)
    return parsed
|
|
|
|
def display_input_databases(directory_path, file_name, datetime_col=None):
    """
    Load one company input CSV from the S3 storage and return it.

    The file is read from
    ``projet-bdc2324-team1/0_Input/Company_<directory_path>/<file_name>.csv``
    through the module-level filesystem object ``fs``. The resolved path
    is printed before reading.

    Parameters
    ----------
    directory_path :
        Suffix appended to ``Company_`` to build the directory name.
    file_name :
        File name without the ``.csv`` extension.
    datetime_col : str, list of column names / positions, or None
        Columns to convert to timezone-aware UTC datetimes with
        ``custom_date_parser``. ``None`` (default) or a boolean converts
        nothing. Dict / nested-list forms that combine several columns
        are not supported.

    Returns
    -------
    pandas.DataFrame
        The content of the CSV file.

    Raises
    ------
    ValueError
        If a requested datetime column is not present in the file.
    """
    file_path = f"projet-bdc2324-team1/0_Input/Company_{directory_path}/{file_name}.csv"
    print("File path : ", file_path)

    # Normalise the requested datetime columns to a plain list.
    if datetime_col is None or isinstance(datetime_col, bool):
        date_columns = []
    elif isinstance(datetime_col, str):
        date_columns = [datetime_col]
    else:
        date_columns = list(datetime_col)

    with fs.open(file_path, mode="rb") as file_in:
        df = pd.read_csv(file_in, sep=",")

    # ``date_parser`` is deprecated in ``read_csv`` since pandas 2.0; the
    # documented replacement is to read the raw values and convert the
    # columns afterwards.
    for col in date_columns:
        # Integer entries are treated as column positions.
        name = df.columns[col] if isinstance(col, int) else col
        if name not in df.columns:
            raise ValueError(f"Missing column provided to 'datetime_col': '{name}'")
        df[name] = custom_date_parser(df[name])

    return df
|
|
|
|
def campaigns_kpi_function(campaigns_information=None, max_date=None):
    """
    Build per-customer e-mail campaign KPIs.

    Parameters
    ----------
    campaigns_information : pandas.DataFrame
        Must contain the columns ``customer_id``, ``campaign_name``,
        ``delivered_at`` and ``opened_at`` (dates in ISO 8601 form).
        The frame is not modified.
    max_date : optional
        Reference date used to fill ``time_to_open`` for rows without an
        ``opened_at`` value. When ``None`` those rows are left missing.

    Returns
    -------
    pandas.DataFrame
        One row per ``customer_id`` with the columns ``nb_campaigns``,
        ``nb_campaigns_opened``, ``time_to_open`` (mean, in hours) and
        ``taux_ouverture_mail`` (opened / received). Missing counts and
        rates are replaced by 0; ``time_to_open`` may remain NaN.
    """
    # Work on a copy so the caller's DataFrame does not gain a
    # 'time_to_open' column as a side effect.
    campaigns = campaigns_information.copy()

    one_hour = np.timedelta64(1, 'h')
    delivered_at = pd.to_datetime(campaigns['delivered_at'], utc=True, format='ISO8601')
    opened_at = pd.to_datetime(campaigns['opened_at'], utc=True, format='ISO8601')

    # Number of e-mail campaigns per customer
    nb_campaigns = (
        campaigns.groupby('customer_id')['campaign_name']
        .count()
        .reset_index(name='nb_campaigns')
    )

    # Mean time to open. The value is expressed in hours (the division is
    # by one hour), although the original comment announced minutes.
    campaigns['time_to_open'] = (opened_at - delivered_at) / one_hour
    if max_date is not None:
        # NOTE(review): the fill value is delivered_at - max_date, which is
        # negative whenever max_date is later than the delivery date.
        # Kept as in the original; confirm whether max_date - delivered_at
        # was intended.
        reference = pd.to_datetime(max_date, utc=True, format='ISO8601')
        fill_value = (delivered_at - reference) / one_hour
        campaigns['time_to_open'] = campaigns['time_to_open'].fillna(fill_value)

    time_to_open = (
        campaigns[['customer_id', 'time_to_open']]
        .groupby('customer_id')
        .mean()
        .reset_index()
    )

    # Number of opened e-mails per customer. Rows are selected with a
    # boolean mask instead of an in-place dropna on a slice.
    opened_rows = campaigns.loc[campaigns['opened_at'].notna(), ['customer_id', 'campaign_name']]
    opened_campaign = (
        opened_rows.groupby('customer_id')['campaign_name']
        .count()
        .reset_index(name='nb_campaigns_opened')
    )

    # Merge the indicators
    campaigns_reduced = pd.merge(nb_campaigns, opened_campaign, on='customer_id', how='left')
    campaigns_reduced = pd.merge(campaigns_reduced, time_to_open, on='customer_id', how='left')

    # E-mail opening rate
    campaigns_reduced['taux_ouverture_mail'] = (
        campaigns_reduced['nb_campaigns_opened'] / campaigns_reduced['nb_campaigns']
    )

    # Customers without any opened e-mail get 0 instead of NaN.
    count_columns = ['nb_campaigns', 'nb_campaigns_opened', 'taux_ouverture_mail']
    campaigns_reduced[count_columns] = campaigns_reduced[count_columns].fillna(0)
    # TODO: decide how to fill missing time_to_open values.

    return campaigns_reduced
|
|
|
|
|
|
def tickets_kpi_function(tickets_information=None):
    """
    Aggregate ticket data into one row of purchase KPIs per customer.

    The input frame is copied first, so it is left untouched. It must
    provide the columns ``customer_id``, ``purchase_id``, ``ticket_id``,
    ``supplier_name``, ``purchase_date`` and ``amount``.

    The result has the columns ``nb_tickets``, ``nb_purchases``,
    ``total_amount``, ``nb_suppliers``, ``achat_internet``,
    ``purchase_date_min``, ``purchase_date_max`` (both as a number of days
    before the latest purchase date in the data),
    ``time_between_purchase`` (days between first and last purchase),
    ``nb_purchases_internet`` and ``prop_purchases_internet``.
    """
    tickets = tickets_information.copy()
    one_day = np.timedelta64(1, 'D')

    # Flag online sales from the supplier name ("vad" = vente à distance,
    # i.e. distance selling).
    online_keywords = ['en ligne', 'internet', 'web', 'net', 'vad', 'online']
    supplier_names = tickets['supplier_name'].fillna('')
    is_online = supplier_names.str.contains('|'.join(online_keywords), case=False)
    tickets['vente_internet'] = is_online.astype(int)

    # Number of distinct online purchases per customer
    online_rows = tickets[tickets['vente_internet'] == 1]
    online_purchases = (
        online_rows.groupby(['customer_id'])['purchase_id']
        .nunique()
        .reset_index()
        .rename(columns={'purchase_id': 'nb_purchases_internet'})
    )

    # Purchase behaviour indicators per customer
    kept_columns = ['customer_id', 'purchase_id', 'ticket_id', 'supplier_name',
                    'purchase_date', 'amount', 'vente_internet']
    per_customer = tickets[kept_columns].groupby(['customer_id'])
    kpi = per_customer.agg(
        nb_tickets=('ticket_id', 'nunique'),
        nb_purchases=('purchase_id', 'nunique'),
        total_amount=('amount', 'sum'),
        nb_suppliers=('supplier_name', 'nunique'),
        achat_internet=('vente_internet', 'max'),
        purchase_date_min=('purchase_date', 'min'),
        purchase_date_max=('purchase_date', 'max'),
    ).reset_index()

    # Compute every date-derived value from the raw dates before the date
    # columns are overwritten with day counts.
    first_purchase = kpi['purchase_date_min']
    last_purchase = kpi['purchase_date_max']
    latest_date = last_purchase.max()
    days_between = (last_purchase - first_purchase) / one_day
    days_since_last = (latest_date - last_purchase) / one_day
    days_since_first = (latest_date - first_purchase) / one_day

    kpi['time_between_purchase'] = days_between
    kpi['purchase_date_max'] = days_since_last
    kpi['purchase_date_min'] = days_since_first

    # Share of purchases made online
    kpi = kpi.merge(online_purchases, on=['customer_id'], how='left')
    kpi['nb_purchases_internet'] = kpi['nb_purchases_internet'].fillna(0)
    kpi['prop_purchases_internet'] = kpi['nb_purchases_internet'] / kpi['nb_purchases']

    return kpi
|
|
|
|
def customerplus_kpi_function(customerplus_clean=None):
    """
    Add socio-demographic indicator columns to the customer table.

    Parameters
    ----------
    customerplus_clean : pandas.DataFrame
        Must contain the columns ``gender`` (codes 0, 1, 2) and
        ``country``. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of the input with these extra columns:

        * ``gender_label``: 'female' / 'male' / 'other' mapped from the
          codes 0 / 1 / 2 (any other code gives NaN);
        * one 0/1 column ``gender_<label>`` per label present in the data;
        * ``country_fr``: 1 when ``country`` equals "fr", 0 otherwise,
          NaN when ``country`` is missing.
    """
    # Work on a copy: assigning 'gender_label' directly on the argument
    # would add that column to the caller's DataFrame.
    customers = customerplus_clean.copy()

    # Gender
    gender_labels = {0: 'female', 1: 'male', 2: 'other'}
    customers["gender_label"] = customers["gender"].map(gender_labels)

    # NOTE: get_dummies only creates columns for labels that occur in the
    # data, so the set of gender_* columns depends on the input.
    gender_dummies = pd.get_dummies(customers["gender_label"], prefix='gender').astype(int)
    customers = pd.concat([customers, gender_dummies], axis=1)

    # Indicator: the customer lives in France (missing country stays NaN)
    customers["country_fr"] = customers["country"].apply(
        lambda x: int(x == "fr") if pd.notna(x) else np.nan
    )

    # TODO: add a dummy telling whether the customer has a structure id
    # (tags); currently disabled.

    return customers
|
|
|