# KPI construction functions


def custom_date_parser(date_string):
    """Parse ISO 8601 date strings into UTC datetimes with ``pd.to_datetime``."""
    return pd.to_datetime(date_string, utc=True, format='ISO8601')


def display_databases(directory_path, file_name, datetime_col=None):
    """Load one company CSV file from the S3 storage into a DataFrame.

    Parameters
    ----------
    directory_path : str
        Company identifier, inserted after ``Company_`` in the file path.
    file_name : str
        File name without the ``.csv`` extension.
    datetime_col : optional
        Forwarded to ``pd.read_csv`` as ``parse_dates``; the selected columns
        are parsed with ``custom_date_parser``.

    Returns
    -------
    pd.DataFrame
        The content of the CSV file.
    """
    file_path = f"projet-bdc2324-team1/0_Input/Company_{directory_path}/{file_name}.csv"
    print("File path : ", file_path)

    # `fs` is a module-level filesystem object defined outside this block
    # (presumably an s3fs filesystem -- confirm against the file's setup code).
    # NOTE(review): `date_parser` is deprecated since pandas 2.0 in favour of
    # `date_format` or an explicit `pd.to_datetime` after reading; it is kept
    # here so that every accepted form of `datetime_col` keeps working.
    with fs.open(file_path, mode="rb") as file_in:
        df = pd.read_csv(
            file_in,
            sep=",",
            parse_dates=datetime_col,
            date_parser=custom_date_parser,
        )
    return df


def campaigns_kpi_function(campaigns_information=None):
    """Aggregate email-campaign rows into one row of KPIs per customer.

    The input frame is left untouched (no column is added to it).

    Parameters
    ----------
    campaigns_information : pd.DataFrame
        Must contain the columns ``customer_id``, ``campaign_name``,
        ``opened_at`` and ``delivered_at``.

    Returns
    -------
    pd.DataFrame
        Columns ``customer_id``, ``nb_campaigns`` (non-null campaign names),
        ``nb_campaigns_opened`` (same count restricted to rows with a non-null
        ``opened_at``, 0 when there are none) and ``time_to_open`` (mean
        ``opened_at - delivered_at`` as a timedelta, NaT when nothing was
        opened).
    """
    # Number of email campaigns per customer
    nb_campaigns = (
        campaigns_information[['customer_id', 'campaign_name']]
        .groupby('customer_id')
        .count()
        .reset_index()
        .rename(columns={'campaign_name': 'nb_campaigns'})
    )

    # Mean time between delivery and opening. The delta is computed on a
    # separate frame so the caller's DataFrame is not mutated.
    opened_at = pd.to_datetime(campaigns_information['opened_at'], utc=True, format='ISO8601')
    delivered_at = pd.to_datetime(campaigns_information['delivered_at'], utc=True, format='ISO8601')
    time_to_open = (
        campaigns_information[['customer_id']]
        .assign(time_to_open=opened_at - delivered_at)
        .groupby('customer_id')
        .mean()
        .reset_index()
    )

    # Number of opened emails per customer (rows whose opened_at is set)
    was_opened = campaigns_information['opened_at'].notna()
    opened_campaign = (
        campaigns_information.loc[was_opened, ['customer_id', 'campaign_name']]
        .groupby('customer_id')
        .count()
        .reset_index()
        .rename(columns={'campaign_name': 'nb_campaigns_opened'})
    )

    # Merge the indicators, keeping every customer that received a campaign
    campaigns_reduced = pd.merge(nb_campaigns, opened_campaign, on='customer_id', how='left')
    campaigns_reduced = pd.merge(campaigns_reduced, time_to_open, on='customer_id', how='left')

    # Customers without any opened email get a count of 0
    count_columns = ['nb_campaigns', 'nb_campaigns_opened']
    campaigns_reduced[count_columns] = campaigns_reduced[count_columns].fillna(0)

    # TODO: NaT values of time_to_open are left unfilled (open question in
    # the original code).
    return campaigns_reduced


def tickets_kpi_function(tickets_information=None):
    """Aggregate ticket rows into one row of purchase KPIs per customer.

    The input frame is left untouched.

    Parameters
    ----------
    tickets_information : pd.DataFrame
        Must contain ``customer_id``, ``purchase_id``, ``ticket_id``,
        ``supplier_name``, ``purchase_date`` and ``amount``.
        ``purchase_date`` has to support datetime subtraction.

    Returns
    -------
    pd.DataFrame
        Columns ``customer_id``, ``nb_tickets``, ``nb_purchases``,
        ``total_amount``, ``nb_suppliers``, ``vente_internet_max``,
        ``purchase_date_min``, ``purchase_date_max`` (both expressed as days
        before the most recent purchase of the whole frame),
        ``time_between_purchase`` (days between first and last purchase) and
        ``nb_tickets_internet``.
    """
    one_day = np.timedelta64(1, 'D')

    # Dummy: online sales channel, detected from keywords in the supplier name
    liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online']  # vad = "vente à distance" (distance selling)
    is_online = (
        tickets_information['supplier_name']
        .fillna('')
        .str.contains('|'.join(liste_mots), case=False)
        .astype(int)
    )
    # assign() returns a new frame, so the caller's data is not modified
    tickets = tickets_information.assign(vente_internet=is_online)

    # Number of tickets bought online per customer
    prop_vente_internet = (
        tickets[tickets['vente_internet'] == 1]
        .groupby(['customer_id'])['ticket_id']
        .count()
        .reset_index()
        .rename(columns={'ticket_id': 'nb_tickets_internet'})
    )

    # NOTE: an average-amount-per-event-type KPI was drafted here in the
    # original code but is disabled.

    tickets_kpi = (
        tickets
        .groupby(['customer_id'])
        .agg(
            nb_tickets=('ticket_id', 'count'),
            nb_purchases=('purchase_id', 'nunique'),
            total_amount=('amount', 'sum'),
            nb_suppliers=('supplier_name', 'nunique'),
            vente_internet_max=('vente_internet', 'max'),
            purchase_date_min=('purchase_date', 'min'),
            purchase_date_max=('purchase_date', 'max'),
        )
        .reset_index()
    )

    # Time between first and last purchase, in days
    tickets_kpi['time_between_purchase'] = (
        tickets_kpi['purchase_date_max'] - tickets_kpi['purchase_date_min']
    ) / one_day

    # Convert the dates to numbers: days elapsed before the latest purchase
    max_date = tickets_kpi['purchase_date_max'].max()
    tickets_kpi['purchase_date_max'] = (max_date - tickets_kpi['purchase_date_max']) / one_day
    tickets_kpi['purchase_date_min'] = (max_date - tickets_kpi['purchase_date_min']) / one_day

    tickets_kpi = tickets_kpi.merge(prop_vente_internet, on=['customer_id'], how='left')
    tickets_kpi['nb_tickets_internet'] = tickets_kpi['nb_tickets_internet'].fillna(0)

    # NOTE: a per-customer rate of tickets paid online
    # (nb_tickets_internet * 100 / nb_tickets) was drafted here in the
    # original code but is disabled.
    return tickets_kpi


def customerplus_kpi_function(customerplus_clean=None):
    """Add socio-demographic indicator columns to the customer table.

    The input frame is left untouched; a new frame is returned.

    Parameters
    ----------
    customerplus_clean : pd.DataFrame
        Must contain the columns ``gender`` (0, 1 or 2) and ``country``.

    Returns
    -------
    pd.DataFrame
        The input columns plus ``gender_label``, one ``gender_<label>`` 0/1
        column per label present in the data, and ``country_fr`` (1 when
        ``country`` equals ``"fr"``, 0 otherwise, NaN when it is missing).
    """
    # Work on a copy so the caller's DataFrame does not gain a column
    customers = customerplus_clean.copy()

    # Gender: readable label, then one dummy column per label
    customers["gender_label"] = customers["gender"].map({
        0: 'female',
        1: 'male',
        2: 'other',
    })
    gender_dummies = pd.get_dummies(customers["gender_label"], prefix='gender').astype(int)
    customers = pd.concat([customers, gender_dummies], axis=1)

    # Indicator: the customer lives in France (missing country stays NaN)
    customers["country_fr"] = customers["country"].apply(
        lambda x: int(x == "fr") if pd.notna(x) else np.nan
    )

    # NOTE: a `has_tags` dummy (customer has a structure id) was drafted here
    # in the original code but is disabled.
    return customers