diff --git a/0_2_Dataset_construction.py b/0_2_Dataset_construction.py
index 9d246cd..6881072 100644
--- a/0_2_Dataset_construction.py
+++ b/0_2_Dataset_construction.py
@@ -22,52 +22,12 @@ exec(open('0_KPI_functions.py').read())
 warnings.filterwarnings('ignore')
 
-def display_covering_time(df, company, datecover):
-    """
-    This function draws the time coverage of each company
-    """
-    min_date = df['purchase_date'].min().strftime("%Y-%m-%d")
-    max_date = df['purchase_date'].max().strftime("%Y-%m-%d")
-    datecover[company] = [datetime.strptime(min_date, "%Y-%m-%d") + timedelta(days=x) for x in range((datetime.strptime(max_date, "%Y-%m-%d") - datetime.strptime(min_date, "%Y-%m-%d")).days)]
-    print(f'Coverage for company {company}: {min_date} - {max_date}')
-    return datecover
-
-
-def compute_time_intersection(datecover):
-    """
-    This function returns the time coverage for all companies
-    """
-    timestamps_sets = [set(timestamps) for timestamps in datecover.values()]
-    intersection = set.intersection(*timestamps_sets)
-    intersection_list = list(intersection)
-    formated_dates = [dt.strftime("%Y-%m-%d") for dt in intersection_list]
-    return sorted(formated_dates)
-
-
-def df_coverage_modelization(sport, coverage_features = 0.7):
-    """
-    This function returns start_date, end_of_features and final dates
-    that help to construct train and test datasets
-    """
-    datecover = {}
-    for company in sport:
-        df_products_purchased_reduced = display_databases(company, file_name = "products_purchased_reduced",
-                                                          datetime_col = ['purchase_date'])
-        datecover = display_covering_time(df_products_purchased_reduced, company, datecover)
-    #print(datecover.keys())
-    dt_coverage = compute_time_intersection(datecover)
-    start_date = dt_coverage[0]
-    end_of_features = dt_coverage[int(0.7 * len(dt_coverage))]
-    final_date = dt_coverage[-1]
-    return start_date, end_of_features, final_date
-
-
 def dataset_construction(min_date, end_features_date, max_date, directory_path):
 
     # Import customerplus
-    df_customerplus_clean_0 = display_databases(directory_path, file_name = "customerplus_cleaned")
-    df_campaigns_information = display_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
-    df_products_purchased_reduced = display_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date'])
+    df_customerplus_clean_0 = display_input_databases(directory_path, file_name = "customerplus_cleaned")
+    df_campaigns_information = display_input_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
+    df_products_purchased_reduced = display_input_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date'])
 
     # if directory_path == "101":
     #     df_products_purchased_reduced_1 = display_databases(directory_path, file_name = "products_purchased_reduced_1", datetime_col = ['purchase_date'])
@@ -90,7 +50,7 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path):
     # Merge everything and create the KPIs
 
     # KPIs on email campaigns
-    df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information)
+    df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information, max_date = end_features_date)
 
     # KPIs on purchase behaviour
     df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_features)
diff --git a/0_KPI_functions.py b/0_KPI_functions.py
index f991ced..26b6814 100644
--- a/0_KPI_functions.py
+++ b/0_KPI_functions.py
@@ -3,7 +3,7 @@
 def custom_date_parser(date_string):
     return pd.to_datetime(date_string, utc = True, format = 'ISO8601')
 
-def display_databases(directory_path, file_name, datetime_col = None):
+def display_input_databases(directory_path, file_name, datetime_col = None):
     """
    This function returns the file from s3 storage
    """
@@ -13,17 +13,19 @@ def display_databases(directory_path, file_name, datetime_col = None):
     df = pd.read_csv(file_in, sep=",", parse_dates = datetime_col, date_parser=custom_date_parser)
     return df
 
-def campaigns_kpi_function(campaigns_information = None):
+def campaigns_kpi_function(campaigns_information = None, max_date = None):
     # Number of email campaigns
     nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()
     nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True)
 
     # Average time to open (in hours)
-    campaigns_information['time_to_open'] = pd.to_datetime(campaigns_information['opened_at'], utc = True, format = 'ISO8601') - pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601')
+    campaigns_information['time_to_open'] = (pd.to_datetime(campaigns_information['opened_at'], utc = True, format = 'ISO8601') - pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h')
+    campaigns_information['time_to_open'] = campaigns_information['time_to_open'].fillna((pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601') - pd.to_datetime(max_date, utc = True, format = 'ISO8601')) / np.timedelta64(1, 'h'))
+
     time_to_open = campaigns_information[['customer_id', 'time_to_open']].groupby('customer_id').mean().reset_index()
 
-    # Number of opened emails
+    # Number of opened emails
     opened_campaign = campaigns_information[['customer_id', 'campaign_name', 'opened_at']]
     opened_campaign.dropna(subset=['opened_at'], inplace=True)
     opened_campaign = opened_campaign[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index()
@@ -33,8 +35,11 @@
     campaigns_reduced = pd.merge(nb_campaigns, opened_campaign, on = 'customer_id', how = 'left')
     campaigns_reduced = pd.merge(campaigns_reduced, time_to_open, on = 'customer_id', how = 'left')
 
+    # Email open rate
+    campaigns_reduced['taux_ouverture_mail'] = campaigns_reduced['nb_campaigns_opened'] / campaigns_reduced['nb_campaigns']
+
     # Fill NaN values
-    campaigns_reduced[['nb_campaigns', 'nb_campaigns_opened']] = campaigns_reduced[['nb_campaigns', 'nb_campaigns_opened']].fillna(0)
+    campaigns_reduced[['nb_campaigns', 'nb_campaigns_opened', 'taux_ouverture_mail']] = campaigns_reduced[['nb_campaigns', 'nb_campaigns_opened', 'taux_ouverture_mail']].fillna(0)
 
     # TODO: fill the remaining NaT values in time_to_open (?)
     return campaigns_reduced
@@ -49,34 +54,21 @@ def tickets_kpi_function(tickets_information = None):
     tickets_information_copy['vente_internet'] = tickets_information_copy['supplier_name'].fillna('').str.contains('|'.join(liste_mots), case=False).astype(int)
 
     # Proportion of online sales
-    prop_vente_internet = tickets_information_copy[tickets_information_copy['vente_internet'] == 1].groupby(['customer_id'])['ticket_id'].count().reset_index()
-    prop_vente_internet.rename(columns = {'ticket_id' : 'nb_tickets_internet'}, inplace = True)
+    prop_vente_internet = tickets_information_copy[tickets_information_copy['vente_internet'] == 1].groupby(['customer_id'])['purchase_id'].nunique().reset_index()
+    prop_vente_internet.rename(columns = {'purchase_id' : 'nb_purchases_internet'}, inplace = True)
 
-    # Average amount
-    # avg_amount = (tickets_information_copy.groupby(["event_type_id", 'name_event_types'])
-    #               .agg({"amount" : "mean"}).reset_index()
-    #               .rename(columns = {'amount' : 'avg_amount'}))
-
-
+    # Combined purchase-behaviour KPIs
     tickets_kpi = (tickets_information_copy[['customer_id', 'purchase_id' ,'ticket_id','supplier_name', 'purchase_date', 'amount', 'vente_internet']]
                    .groupby(['customer_id'])
-                   .agg({'ticket_id': 'count',
-                         'purchase_id' : 'nunique',
-                         'amount' : 'sum',
-                         'supplier_name': 'nunique',
-                         'vente_internet' : 'max',
-                         'purchase_date' : ['min', 'max']})
-                   .reset_index()
-                   )
-
-    tickets_kpi.columns = tickets_kpi.columns.map('_'.join)
-
-    tickets_kpi.rename(columns = {'ticket_id_count' : 'nb_tickets',
-                                  'purchase_id_nunique' : 'nb_purchases',
-                                  'amount_sum' : 'total_amount',
-                                  'supplier_name_nunique' : 'nb_suppliers',
-                                  'customer_id_' : 'customer_id'}, inplace = True)
-
+                   .agg(nb_tickets=('ticket_id', 'nunique'),
+                        nb_purchases=('purchase_id', 'nunique'),
+                        total_amount=('amount', 'sum'),
+                        nb_suppliers=('supplier_name', 'nunique'),
+                        achat_internet=('vente_internet', 'max'),
+                        purchase_date_min=('purchase_date', 'min'),
+                        purchase_date_max=('purchase_date', 'max'))
+                   .reset_index())
+
     tickets_kpi['time_between_purchase'] = tickets_kpi['purchase_date_max'] - tickets_kpi['purchase_date_min']
     tickets_kpi['time_between_purchase'] = tickets_kpi['time_between_purchase'] / np.timedelta64(1, 'D') # in days
@@ -85,27 +77,23 @@
     tickets_kpi['purchase_date_max'] = (max_date - tickets_kpi['purchase_date_max']) / np.timedelta64(1, 'D')
     tickets_kpi['purchase_date_min'] = (max_date - tickets_kpi['purchase_date_min']) / np.timedelta64(1, 'D')
 
-
+    # Proportion of purchases made online
     tickets_kpi = tickets_kpi.merge(prop_vente_internet, on = ['customer_id'], how = 'left')
-    tickets_kpi['nb_tickets_internet'] = tickets_kpi['nb_tickets_internet'].fillna(0)
-
-    # tickets_kpi = tickets_kpi.merge(avg_amount, how='left', on= 'event_type_id')
-
-    # Share of tickets paid online, by company
-
-    # tickets_kpi["Taux_ticket_internet"] = tickets_kpi["nb_tickets_internet"]*100 / tickets_kpi["nb_tickets"]
-    # tickets_kpi['Taux_ticket_internet'] = tickets_kpi['Taux_ticket_internet'].fillna(0)
+    tickets_kpi['nb_purchases_internet'] = tickets_kpi['nb_purchases_internet'].fillna(0)
+    tickets_kpi['prop_purchases_internet'] = tickets_kpi['nb_purchases_internet'] / tickets_kpi['nb_purchases']
 
     return tickets_kpi
 
 def customerplus_kpi_function(customerplus_clean = None):
-    # KPIs on socio-demographic data
-    ## Gender
+    # KPIs on socio-demographic data
+
+    # Gender
     customerplus_clean["gender_label"] = customerplus_clean["gender"].map({
         0: 'female',
         1: 'male',
         2: 'other'
     })
+
     gender_dummies = pd.get_dummies(customerplus_clean["gender_label"], prefix='gender').astype(int)
     customerplus_clean = pd.concat([customerplus_clean, gender_dummies], axis=1)
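
Behaviour sketch for the reworked campaigns_kpi_function (illustration only, not
part of the patch; the two-row frame and the cutoff date below are made-up sample
data): time_to_open is now a float in hours rather than a timedelta, and an email
that was never opened is filled with delivered_at - max_date, i.e. a negative
number of hours relative to the feature-window cutoff.

    import pandas as pd

    campaigns = pd.DataFrame({
        'customer_id':   [1, 1],
        'campaign_name': ['a', 'b'],
        'opened_at':     ['2023-01-01T12:00:00Z', None],  # second email never opened
        'delivered_at':  ['2023-01-01T10:00:00Z', '2023-01-01T10:00:00Z'],
    })
    kpi = campaigns_kpi_function(campaigns_information = campaigns,
                                 max_date = '2023-01-02T10:00:00Z')
    # opened email: +2 h; unopened email: delivered_at minus the cutoff = -24 h
    # -> mean time_to_open = (2 - 24) / 2 = -11 h, taux_ouverture_mail = 1/2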
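The rewritten tickets aggregation is a behaviour change as well as a cleanup:
ticket_id is now aggregated with 'nunique' instead of 'count', so duplicated
ticket rows no longer inflate nb_tickets, and pandas named aggregation yields
flat column names directly, which is why the old MultiIndex join and rename
steps could be dropped. A minimal sketch with made-up rows:

    import pandas as pd

    tickets = pd.DataFrame({
        'customer_id': [1, 1, 1],
        'ticket_id':   [10, 10, 11],   # ticket 10 appears on two rows
        'purchase_id': [100, 100, 101],
        'amount':      [5.0, 5.0, 7.0],
    })
    agg = (tickets.groupby('customer_id')
                  .agg(nb_tickets = ('ticket_id', 'nunique'),
                       nb_purchases = ('purchase_id', 'nunique'),
                       total_amount = ('amount', 'sum'))
                  .reset_index())
    # nb_tickets == 2 (the old 'count' gave 3); nb_purchases == 2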
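On the caller side, dataset_construction now threads the end of the feature
window into campaigns_kpi_function as max_date, so unopened-email recency is
measured against the same cutoff that separates the feature window from the
target window. A usage sketch (the dates are placeholders, and "101" is just
the company directory id mentioned in the commented-out block above):

    start_date, end_of_features, final_date = "2021-10-01", "2023-01-01", "2023-10-01"
    dataset_construction(min_date = start_date,
                         end_features_date = end_of_features,
                         max_date = final_date,
                         directory_path = "101")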