# Business Data Challenge - Team 1

In [1]:
import pandas as pd
import numpy as np
import os
import s3fs
import re
import warnings

Configuration de l'accès aux données

In [2]:
# Build the S3 endpoint URL from the environment and open a filesystem handle on it
S3_ENDPOINT_URL = f"https://{os.environ['AWS_S3_ENDPOINT']}"
fs = s3fs.S3FileSystem(client_kwargs=dict(endpoint_url=S3_ENDPOINT_URL))

In [3]:
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides any pandas warning later cells may emit — consider narrowing it.
warnings.filterwarnings('ignore')

# Exemple sur Company 1

## Chargement données

In [4]:
# Folder "1" of the raw-data bucket, and the list of objects found in it
BUCKET = "bdc2324-data/1"
liste_database = fs.ls(BUCKET)

In [5]:
# Load every CSV listed in liste_database into a module-level DataFrame named
# "df<client>_<table>" (for instance df1_customersplus for ".../1/1customersplus.csv").

files_path = liste_database

# The client number is the path component that follows the bucket name
client_number = files_path[0].split("/")[1]
df_prefix = "df" + str(client_number) + "_"

# Expected object name: .../<digits>/<digits><table>.csv ; group 3 is the table name
csv_name_pattern = re.compile(r'\/(\d+)\/(\d+)([a-zA-Z_]+)\.csv$')

for current_path in files_path:
    name_match = csv_name_pattern.search(current_path)
    if name_match is None:
        # Not a "<digits><table>.csv" object: skip it instead of crashing on .group()
        print("Skipped (unexpected name) : ", current_path)
        continue
    with fs.open(current_path, mode="rb") as file_in:
        df = pd.read_csv(file_in)
    # the pattern of the name is df1xxx
    nom_dataframe = df_prefix + name_match.group(3)
    # Later cells refer to these frames by name (df1_tickets, df1_purchases, ...)
    globals()[nom_dataframe] = df

## Cleaning functions

In [6]:
def cleaning_date(df, column_name):
    """
    Parse one column of `df` as timezone-aware (UTC) datetimes, in place.

    Parameters:
    - df: DataFrame
        Frame holding the column to convert; it is modified in place.
    - column_name: str
        Name of the column to convert. Values are parsed with format='ISO8601'.

    Returns:
    - DataFrame
        The same `df` object, with `column_name` replaced by the parsed values.
    """
    parsed_values = pd.to_datetime(df[column_name], format = 'ISO8601', utc = True)
    df[column_name] = parsed_values
    return df

## Preprocessing

## customer_plus

In [7]:
def preprocessing_customerplus(customerplus = None):
    """
    Build the cleaned customer table from the raw customer frame.

    Parameters:
    - customerplus: DataFrame
        Raw customer table. It is not modified: the function works on a copy.

    Returns:
    - DataFrame
        Copy of the input where 'first_buying_date' is parsed as UTC datetimes,
        the columns listed in `dropped_columns` are removed and 'id' is renamed
        to 'customer_id'.

    Raises:
    - KeyError if one of the expected columns is missing from the input.
    """
    dropped_columns = ['lastname', 'firstname', 'email', 'civility', 'note', 'created_at', 'updated_at',
                       'deleted_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload',
                       'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode',
                       'last_visiting_date']

    customerplus_copy = customerplus.copy()

    # Only 'first_buying_date' is parsed: 'last_visiting_date' belongs to
    # dropped_columns, so converting it first would be wasted work.
    cleaning_date(customerplus_copy, 'first_buying_date')

    # Variable selection, then explicit name for the customer key
    return (customerplus_copy
            .drop(columns = dropped_columns)
            .rename(columns = {'id' : 'customer_id'}))


In [8]:
# Cleaned customer table, built from the df1_customersplus frame loaded above
df1_customerplus_clean = preprocessing_customerplus(df1_customersplus)

## Ticket area

In [9]:
# Fonction de nettoyage et selection
def preprocessing_tickets_area(tickets = None, purchases = None, suppliers = None, type_ofs = None):
    """
    Build one row per ticket, enriched with its supplier, ticket type and purchase.

    None of the input frames is modified: every table is reduced to the columns
    it needs as a new frame before being cleaned.

    Parameters:
    - tickets: DataFrame with 'id', 'purchase_id', 'product_id', 'is_from_subscription',
      'type_of' and 'supplier_id'.
    - purchases: DataFrame with 'id', 'purchase_date' and 'customer_id'.
    - suppliers: DataFrame with 'id' and 'name'.
    - type_ofs: DataFrame with 'id', 'name' and 'children'.

    Returns:
    - DataFrame with columns ticket_id, purchase_id, product_id, is_from_subscription,
      supplier_name, type_of_ticket_name, children, purchase_date, customer_id.
      All joins are inner joins, so a ticket without a matching supplier, ticket
      type or purchase is not returned.
    """
    # Ticket table
    tickets = (tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]
               .rename(columns = {'id' : 'ticket_id'}))

    # Supplier table; a missing name becomes an empty string
    suppliers = (suppliers[['id', 'name']]
                 .rename(columns = {'name' : 'supplier_name'})
                 .assign(supplier_name = lambda frame: frame['supplier_name'].fillna('')))

    # Ticket type table
    type_ofs = type_ofs[['id', 'name', 'children']].rename(columns = {'name' : 'type_of_ticket_name'})

    # Purchase table: select first, then parse the date on the copy so that the
    # caller's frame is left untouched
    purchases = cleaning_date(purchases[['id', 'purchase_date', 'customer_id']].copy(), 'purchase_date')

    # Merge with suppliers
    ticket_information = (pd.merge(tickets, suppliers, left_on = 'supplier_id', right_on = 'id', how = 'inner')
                          .drop(columns = ['supplier_id', 'id']))

    # Merge with ticket types
    ticket_information = (pd.merge(ticket_information, type_ofs, left_on = 'type_of', right_on = 'id', how = 'inner')
                          .drop(columns = ['type_of', 'id']))

    # Merge with purchases
    ticket_information = (pd.merge(ticket_information, purchases, left_on = 'purchase_id', right_on = 'id', how = 'inner')
                          .drop(columns = ['id']))

    return ticket_information

In [10]:
# Ticket-level table built from the four df1_* frames loaded above
df1_ticket_information = preprocessing_tickets_area(tickets = df1_tickets, purchases = df1_purchases, suppliers = df1_suppliers, type_ofs = df1_type_ofs)

In [11]:
# Preview of the first rows of the ticket table
df1_ticket_information.head()

Unnamed: 0,ticket_id,purchase_id,product_id,is_from_subscription,supplier_name,type_of_ticket_name,children,purchase_date,customer_id
0,13070859,5107462,225251,False,vente en ligne,Atelier,pricing_formula,2018-12-28 14:47:50+00:00,48187
1,13070860,5107462,224914,False,vente en ligne,Atelier,pricing_formula,2018-12-28 14:47:50+00:00,48187
2,13070861,5107462,224914,False,vente en ligne,Atelier,pricing_formula,2018-12-28 14:47:50+00:00,48187
3,13070862,5107462,224914,False,vente en ligne,Atelier,pricing_formula,2018-12-28 14:47:50+00:00,48187
4,13070863,5107462,224914,False,vente en ligne,Atelier,pricing_formula,2018-12-28 14:47:50+00:00,48187


## Target area

In [12]:
def preprocessing_target_area(targets = None, target_types = None, customer_target_mappings = None):
    """
    Build one row per customer/target mapping, enriched with the target and its type.

    Parameters:
    - targets: DataFrame with 'id', 'target_type_id' and 'name'.
    - target_types: DataFrame with 'id', 'is_import' and 'name'.
    - customer_target_mappings: DataFrame with 'id', 'customer_id' and 'target_id'.

    Returns:
    - DataFrame with columns id, customer_id, target_name, target_type_is_import,
      target_type_name. Both joins are inner joins.
    """
    # Keep the useful columns of each table and give them unambiguous names
    targets_slim = targets[["id", "target_type_id", "name"]]
    targets_slim = targets_slim.rename(columns = {'id' : 'target_id', 'name' : 'target_name'})
    types_slim = target_types[["id", "is_import", "name"]].add_prefix("target_type_")
    mappings_slim = customer_target_mappings[["id", "customer_id", "target_id"]]

    # Attach its type to every target; the join key is not needed afterwards
    typed_targets = (targets_slim
                     .merge(types_slim, on = 'target_type_id', how = 'inner')
                     .drop(columns = ['target_type_id']))

    # Attach the typed target to every customer/target mapping
    return (mappings_slim
            .merge(typed_targets, on = 'target_id', how = 'inner')
            .drop(columns = ['target_id']))

In [13]:
# Customer/target table built from the three df1_* target frames loaded above
df1_target_information = preprocessing_target_area(targets = df1_targets, target_types = df1_target_types, customer_target_mappings = df1_customer_target_mappings)

## Campaigns area

In [14]:
def preprocessing_campaigns_area(campaign_stats = None, campaigns = None):
    """
    Build one row per campaign statistic, enriched with the campaign it belongs to.

    Parameters:
    - campaign_stats: DataFrame with 'id', 'campaign_id', 'customer_id', 'opened_at',
      'sent_at' and 'delivered_at'.
    - campaigns: DataFrame with 'id', 'name', 'service_id' and 'sent_at'.

    Returns:
    - DataFrame with columns id, customer_id, opened_at, sent_at, delivered_at,
      campaign_name, campaign_service_id, campaign_sent_at. Date columns are UTC
      datetimes. The join is a left join from campaign_stats.
    """
    # campaign_stats cleaning: explicit copy, so the dates are written into our own
    # frame and not into a column selection of the caller's frame
    campaign_stats = campaign_stats[["id", "campaign_id", "customer_id", "opened_at", "sent_at", "delivered_at"]].copy()
    for date_column in ('opened_at', 'sent_at', 'delivered_at'):
        cleaning_date(campaign_stats, date_column)

    # campaigns cleaning: every kept column gets the "campaign_" prefix
    campaigns = campaigns[["id", "name", "service_id", "sent_at"]].add_prefix("campaign_")
    cleaning_date(campaigns, 'campaign_sent_at')

    # Merge, then drop the join key
    campaigns_full = pd.merge(campaign_stats, campaigns, on = "campaign_id", how = "left")
    return campaigns_full.drop(columns = ['campaign_id'])

In [15]:
# Campaign table built from the df1_campaign_stats and df1_campaigns frames loaded above
df1_campaigns_information = preprocessing_campaigns_area(campaign_stats = df1_campaign_stats, campaigns = df1_campaigns)

In [16]:
# Preview of the first rows of the campaign table
df1_campaigns_information.head()

Unnamed: 0,id,customer_id,opened_at,sent_at,delivered_at,campaign_name,campaign_service_id,campaign_sent_at
0,19793,112597,NaT,2021-03-28 16:01:09+00:00,2021-03-28 16:24:18+00:00,"Le Mucem chez vous, gardons le lien #22",404,2021-03-27 23:00:00+00:00
1,14211,113666,NaT,2021-03-28 16:01:09+00:00,2021-03-28 16:21:02+00:00,"Le Mucem chez vous, gardons le lien #22",404,2021-03-27 23:00:00+00:00
2,13150,280561,NaT,2021-03-28 16:00:59+00:00,2021-03-28 16:08:45+00:00,"Le Mucem chez vous, gardons le lien #22",404,2021-03-27 23:00:00+00:00
3,7073,101007,2021-03-28 18:11:06+00:00,2021-03-28 16:00:59+00:00,2021-03-28 16:09:47+00:00,"Le Mucem chez vous, gardons le lien #22",404,2021-03-27 23:00:00+00:00
4,5175,103972,NaT,2021-03-28 16:01:06+00:00,2021-03-28 16:05:03+00:00,"Le Mucem chez vous, gardons le lien #22",404,2021-03-27 23:00:00+00:00


## Product area

Some useful functions

In [17]:
# Bucket and folder read by display_databases.
# NOTE(review): BUCKET was "bdc2324-data/1" in an earlier cell; from here on it only holds the bucket name.
BUCKET = "bdc2324-data"
directory_path = '1'

In [18]:
def display_databases(file_name):
    """
    Read one CSV file of the BUCKET/directory_path folder from S3 and return it as a DataFrame.

    The path that is read and the shape of the loaded frame are printed.
    """
    file_path = "/".join([BUCKET, directory_path, file_name])
    print("File path : ", file_path)
    with fs.open(file_path, mode="rb") as file_in:
        loaded = pd.read_csv(file_in, sep=",")

    print("Shape : ", loaded.shape)
    return loaded


def remove_horodates(df):
    """
    Return `df` without its 'created_at' and 'updated_at' timestamp columns.

    The input frame is left unchanged; a KeyError is raised if a column is missing.
    """
    timestamp_columns = ["created_at", "updated_at"]
    return df.drop(columns = timestamp_columns)


def order_columns_id(df):
    """
    Return `df` with every column whose name contains 'id' moved to the front.

    The test is a plain substring match, so names such as 'identifier' or
    'fidelity_delay' are moved to the front as well. The relative order of the
    columns is kept inside each of the two groups.
    """
    marker = 'id'
    front, back = [], []
    for column in df.columns:
        (front if marker in column else back).append(column)
    return df[front + back]


def process_df_2(df):
    """
    Drop the timestamp columns of `df`, then move its id columns to the front.

    The number of remaining columns and the final column index are printed.
    """
    without_timestamps = remove_horodates(df)
    print("Number of columns : ", len(without_timestamps.columns))
    reordered = order_columns_id(without_timestamps)
    print("Columns : ", reordered.columns)
    return reordered

def load_dataset(name):
    """
    Load one CSV file through display_databases and tidy it with process_df_2.

    The 'identifier' column is removed from the result when the table has one.
    """
    dataset = process_df_2(display_databases(name))
    if 'identifier' not in dataset.columns:
        return dataset
    return dataset.drop(columns = 'identifier')

Create theme tables

In [19]:
def create_products_table():
    """
    Build the products theme table: products enriched with their category and
    with the 'type_of_id' of that category.

    The three CSV files are read with load_dataset and joined with left merges
    starting from the products table.
    """
    # First merge: products and categories
    print("first merge products and categories")
    raw_products = load_dataset("1products.csv")
    raw_categories = load_dataset("1categories.csv")

    # Columns that are not used downstream
    products = raw_products.drop(columns = ['apply_price', 'extra_field', 'amount_consumption'])
    categories = raw_categories.drop(columns = ['extra_field', 'quota'])

    merged = pd.merge(products, categories, how = 'left',
                      left_on = 'category_id', right_on = 'id',
                      suffixes = ('_products', '_categories'))
    merged = merged.rename(columns = {"name" : "name_categories"})

    # Second merge: add the type of each category
    print("Second merge products_theme and type of categories")
    type_of_categories = load_dataset("1type_of_categories.csv").drop(columns = 'id')
    merged = merged.merge(type_of_categories, how = 'left', on = 'category_id')

    # The category id is already carried by 'category_id'
    return order_columns_id(merged.drop(columns = ['id_categories']))


def create_events_table():
    """
    Build the events theme table: events enriched with their season, event type
    and facility.

    The four CSV files are read with load_dataset and joined with left merges
    starting from the events table.
    """
    def attach_lookup(theme, lookup, key, suffixes, renames):
        # Left-join a lookup table on its 'id', rename columns, then drop the lookup 'id'
        joined = theme.merge(lookup, how = 'left', left_on = key, right_on = 'id', suffixes = suffixes)
        return joined.rename(columns = renames).drop(columns = 'id')

    # First merge: events and seasons
    print("first merge events and seasons : ")
    raw_events = load_dataset("1events.csv")
    raw_seasons = load_dataset("1seasons.csv")

    # Columns that are not used downstream
    events = raw_events.drop(columns = ['manual_added', 'is_display'])
    seasons = raw_seasons.drop(columns = ['start_date_time'])

    events_theme = events.merge(seasons, how = 'left', left_on = 'season_id', right_on = 'id',
                                suffixes = ('_events', '_seasons'))

    # Second merge: event types
    print("Secondly merge events_theme and event_types : ")
    event_types = load_dataset("1event_types.csv").drop(columns = ['fidelity_delay'])
    events_theme = attach_lookup(events_theme, event_types, 'event_type_id',
                                 ('_events', '_event_type'),
                                 {"name" : "name_event_types"})

    # Third merge: facilities
    print("thirdly merge events_theme and facilities : ")
    facilities = load_dataset("1facilities.csv").drop(columns = ['fixed_capacity'])
    events_theme = attach_lookup(events_theme, facilities, 'facility_id',
                                 ('_events', '_facility'),
                                 {"name" : "name_facilities", "id_events" : "event_id"})

    # The season id is already carried by 'season_id'
    return order_columns_id(events_theme.drop(columns = ['id_seasons']))


def create_representations_table():
    """
    Build the representations theme table: representations left-joined with
    their per-category capacity rows.

    Both CSV files are read with load_dataset.
    """
    unused_representation_columns = ['serial', 'open', 'satisfaction', 'is_display', 'expected_filling',
                                     'max_filling', 'extra_field', 'start_date_time', 'end_date_time', 'name',
                                     'representation_type_id']
    representations = load_dataset("1representations.csv").drop(columns = unused_representation_columns)

    capacities = (load_dataset("1representation_category_capacities.csv")
                  .drop(columns = ['expected_filling', 'max_filling']))

    merged = pd.merge(representations, capacities, how = 'left',
                      left_on = 'id', right_on = 'representation_id',
                      suffixes = ('_representation', '_representation_cap'))

    # The representation id is already carried by 'representation_id'
    return order_columns_id(merged.drop(columns = ["id_representation"]))

In [20]:
# Build the products theme table (read by uniform_product_df) and preview it
products_theme = create_products_table()
products_theme.head()

first merge products and categories
File path :  bdc2324-data/1/1products.csv
Shape :  (94803, 14)
Number of columns :  12
Columns :  Index(['id', 'representation_id', 'pricing_formula_id', 'category_id',
       'products_group_id', 'product_pack_id', 'identifier', 'amount',
       'is_full_price', 'apply_price', 'extra_field', 'amount_consumption'],
      dtype='object')
File path :  bdc2324-data/1/1categories.csv
Shape :  (27, 7)
Number of columns :  5
Columns :  Index(['id', 'identifier', 'name', 'extra_field', 'quota'], dtype='object')
Second merge products_theme and type of categories
File path :  bdc2324-data/1/1type_of_categories.csv
Shape :  (5, 6)
Number of columns :  4
Columns :  Index(['id', 'type_of_id', 'category_id', 'identifier'], dtype='object')


Unnamed: 0,id_products,representation_id,pricing_formula_id,category_id,products_group_id,product_pack_id,type_of_id,amount,is_full_price,name_categories
0,10682,914,114,41,10655,1,,9.0,False,indiv activité tr
1,478,273,131,1,471,1,12.0,9.5,False,indiv entrées tp
2,20873,275,137,1,20825,1,12.0,11.5,False,indiv entrées tp
3,157142,82519,9,5,156773,1,,8.0,False,indiv entrées tr
4,1341,9,93,1,1175,1,12.0,8.5,False,indiv entrées tp


In [21]:
# Build the events theme table (read by uniform_product_df) and preview it
events_theme= create_events_table()
events_theme.head()

first merge events and seasons : 
File path :  bdc2324-data/1/1events.csv
Shape :  (1232, 12)
Number of columns :  10
Columns :  Index(['id', 'season_id', 'facility_id', 'event_type_id', 'event_type_key_id',
       'facility_key_id', 'identifier', 'name', 'manual_added', 'is_display'],
      dtype='object')
File path :  bdc2324-data/1/1seasons.csv
Shape :  (13, 6)
Number of columns :  4
Columns :  Index(['id', 'identifier', 'name', 'start_date_time'], dtype='object')
Secondly merge events_theme and event_types : 
File path :  bdc2324-data/1/1event_types.csv
Shape :  (9, 6)
Number of columns :  4
Columns :  Index(['id', 'fidelity_delay', 'identifier', 'name'], dtype='object')
thirdly merge events_theme and facilities : 
File path :  bdc2324-data/1/1facilities.csv
Shape :  (2, 7)
Number of columns :  5
Columns :  Index(['id', 'street_id', 'identifier', 'name', 'fixed_capacity'], dtype='object')


Unnamed: 0,event_id,season_id,facility_id,event_type_id,event_type_key_id,facility_key_id,street_id,name_events,name_seasons,name_event_types,name_facilities
0,192,16,1,4,4,1,1,frontières,2018,spectacle vivant,mucem
1,30329,2767,1,5,5,1,1,visite guidée une autre histoire du monde (1h00),2023,offre muséale groupe,mucem
2,161,16,1,2,2,1,1,visite contée les chercheurs d'or indiv,2018,offre muséale individuel,mucem
3,5957,582,1,4,4,1,1,we dreamt of utopia and we woke up screaming.,2021,spectacle vivant,mucem
4,8337,582,1,4,4,1,1,jeff koons épisodes 4,2021,spectacle vivant,mucem


In [22]:
# Build the representations theme table (read by uniform_product_df) and preview it
representation_theme = create_representations_table()
representation_theme.head()

File path :  bdc2324-data/1/1representations.csv
Shape :  (36095, 16)
Number of columns :  14
Columns :  Index(['id', 'event_id', 'representation_type_id', 'identifier', 'serial',
       'start_date_time', 'open', 'satisfaction', 'end_date_time', 'name',
       'is_display', 'expected_filling', 'max_filling', 'extra_field'],
      dtype='object')
File path :  bdc2324-data/1/1representation_category_capacities.csv
Shape :  (65241, 7)
Number of columns :  5
Columns :  Index(['id', 'representation_id', 'category_id', 'expected_filling',
       'max_filling'],
      dtype='object')


Unnamed: 0,event_id,id_representation_cap,representation_id,category_id
0,12384,123058,84820,2
1,37,2514,269,2
2,37,384,269,5
3,37,2515,269,10
4,37,383,269,1


Create uniform product database 

In [23]:
def uniform_product_df():
    """
    Merge the three theme tables into a single product table.

    Reads the module-level frames products_theme, representation_theme and
    events_theme (their column indexes are printed first). Products are
    left-joined with representations on (representation_id, category_id), then
    with events on event_id. 'type_of_id' is removed from the result.
    """
    print("Products theme columns : ", products_theme.columns)
    print("\n Representation theme columns : ", representation_theme.columns)
    print("\n Events theme columns : ", events_theme.columns)

    with_representations = pd.merge(products_theme, representation_theme, how = 'left',
                                    on = ["representation_id", "category_id"])

    with_events = pd.merge(with_representations, events_theme, how = 'left', on = 'event_id',
                           suffixes = ("_representation", "_event"))

    # Id columns first, then remove the one column that is not used downstream
    return order_columns_id(with_events).drop(columns = ['type_of_id'])

In [24]:
# Build the unified product table and preview it
products_global = uniform_product_df()
products_global.head()

Products theme columns :  Index(['id_products', 'representation_id', 'pricing_formula_id', 'category_id',
       'products_group_id', 'product_pack_id', 'type_of_id', 'amount',
       'is_full_price', 'name_categories'],
      dtype='object')

 Representation theme columns :  Index(['event_id', 'id_representation_cap', 'representation_id',
       'category_id'],
      dtype='object')

 Events theme columns :  Index(['event_id', 'season_id', 'facility_id', 'event_type_id',
       'event_type_key_id', 'facility_key_id', 'street_id', 'name_events',
       'name_seasons', 'name_event_types', 'name_facilities'],
      dtype='object')


Unnamed: 0,id_products,representation_id,pricing_formula_id,category_id,products_group_id,product_pack_id,event_id,id_representation_cap,season_id,facility_id,...,event_type_key_id,facility_key_id,street_id,amount,is_full_price,name_categories,name_events,name_seasons,name_event_types,name_facilities
0,10682,914,114,41,10655,1,132,8789,4,1,...,5,1,1,9.0,False,indiv activité tr,"visite-jeu ""le classico des minots"" (1h30)",2017.0,offre muséale individuel,mucem
1,478,273,131,1,471,1,37,390,2,1,...,2,1,1,9.5,False,indiv entrées tp,billet mucem picasso,2016.0,offre muséale individuel,mucem
2,20873,275,137,1,20825,1,37,395,2,1,...,2,1,1,11.5,False,indiv entrées tp,billet mucem picasso,2016.0,offre muséale individuel,mucem
3,157142,82519,9,5,156773,1,12365,120199,1754,1,...,4,1,1,8.0,False,indiv entrées tr,,,offre muséale individuel,mucem
4,1341,9,93,1,1175,1,8,21,4,1,...,6,1,1,8.5,False,indiv entrées tp,non défini,2017.0,non défini,mucem


In [26]:
# Attach the product description to every ticket (inner join: tickets without a matching product are dropped)
df1_products_purchased = df1_ticket_information.merge(
    products_global, how = 'inner', left_on = 'product_id', right_on = 'id_products')

# Keep only the variables of interest
df1_products_purchased_reduced = df1_products_purchased[[
    'ticket_id', 'customer_id', 'purchase_id' ,'event_type_id', 'supplier_name',
    'purchase_date', 'type_of_ticket_name', 'amount', 'children', 'is_full_price',
    'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons',
]]

# Construction des variables explicatives

## KPI campaigns

In [27]:
def campaigns_kpi_function(campaigns_information = None):
    """
    Compute per-customer e-mail campaign indicators.

    The input frame is not modified.

    Parameters:
    - campaigns_information: DataFrame with 'customer_id', 'campaign_name',
      'opened_at' and 'delivered_at' (the last two must support subtraction).

    Returns:
    - DataFrame with one row per customer_id and the columns
        nb_campaigns         count of non-null campaign_name values
        nb_campaigns_opened  same count restricted to rows with a non-null
                             opened_at; 0 when the customer opened nothing
        time_to_open         mean of (opened_at - delivered_at); NaT when the
                             customer opened nothing
    """
    # Number of e-mail campaigns per customer
    nb_campaigns = (campaigns_information
                    .groupby('customer_id')['campaign_name'].count()
                    .rename('nb_campaigns')
                    .reset_index())

    # Mean time to open; computed on a new frame so the caller's frame gets no extra column
    time_to_open = (campaigns_information[['customer_id']]
                    .assign(time_to_open = campaigns_information['opened_at'] - campaigns_information['delivered_at'])
                    .groupby('customer_id').mean()
                    .reset_index())

    # Number of opened e-mails per customer
    opened_campaign = (campaigns_information
                       .dropna(subset = ['opened_at'])
                       .groupby('customer_id')['campaign_name'].count()
                       .rename('nb_campaigns_opened')
                       .reset_index())

    # Merge the indicators
    campaigns_reduced = pd.merge(nb_campaigns, opened_campaign, on = 'customer_id', how = 'left')
    campaigns_reduced = pd.merge(campaigns_reduced, time_to_open, on = 'customer_id', how = 'left')

    # Customers absent from opened_campaign opened nothing: store 0 instead of NaN.
    # fillna returns a new Series, so the result has to be assigned back.
    campaigns_reduced['nb_campaigns_opened'] = campaigns_reduced['nb_campaigns_opened'].fillna(0)

    # time_to_open is left as NaT for customers who never opened an e-mail

    return campaigns_reduced
    

In [28]:
# Per-customer campaign indicators computed from the campaign table
df1_campaigns_kpi = campaigns_kpi_function(campaigns_information = df1_campaigns_information) 

In [29]:
# Preview of the first rows of the campaign indicators
df1_campaigns_kpi.head()

Unnamed: 0,customer_id,nb_campaigns,nb_campaigns_opened,time_to_open
0,2,4,0.0,NaT
1,3,222,124.0,1 days 00:28:30.169354838
2,4,7,7.0,1 days 04:31:01.428571428
3,5,4,0.0,NaT
4,6,20,0.0,NaT


## KPI tickets

In [30]:
# Distinct event type names found among the purchased products
df1_products_purchased_reduced['name_event_types'].unique()

array(['spectacle vivant', 'offre muséale individuel', 'formule adhésion',
       'offre muséale groupe'], dtype=object)

In [31]:
# Number of distinct event types attended by each customer
nb_event_types = df1_products_purchased_reduced.groupby('customer_id')[['name_event_types']].nunique()

In [34]:
def tickets_kpi_function(tickets_information = None):
    """
    Compute purchasing indicators per (customer_id, event_type_id).

    The input frame is not modified: the function works on a copy.

    Parameters:
    - tickets_information: DataFrame with 'ticket_id', 'customer_id', 'purchase_id',
      'event_type_id', 'supplier_name', 'purchase_date' (datetime) and 'amount'.

    Returns:
    - DataFrame with one row per (customer_id, event_type_id) and the columns
        nb_tickets             count of ticket_id
        nb_purchases           number of distinct purchase_id
        total_amount           sum of amount
        nb_suppliers           number of distinct supplier_name
        vente_internet_max     1 if at least one ticket was sold online, else 0
        purchase_date_min      days between the group's first purchase and the most
                               recent purchase of the whole frame
        purchase_date_max      same, for the group's last purchase
        time_between_purchase  days between the group's first and last purchase
        nb_tickets_internet    number of tickets sold online (0.0 when none)
    """
    tickets_information_copy = tickets_information.copy()

    # Dummy: online sales channel, detected from the supplier name.
    # na=False makes a missing supplier name count as "not online" instead of
    # breaking the conversion to int.
    liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online'] # vad = "vente à distance" (distance selling)
    tickets_information_copy['vente_internet'] = (tickets_information_copy['supplier_name']
                                                  .str.contains('|'.join(liste_mots), case=False, na=False)
                                                  .astype(int))

    group_keys = ['customer_id', 'event_type_id']

    # Number of tickets sold online per group (groups without any are absent here)
    nb_tickets_internet = (tickets_information_copy[tickets_information_copy['vente_internet'] == 1]
                           .groupby(group_keys)['ticket_id'].count()
                           .rename('nb_tickets_internet')
                           .reset_index())

    tickets_kpi = (tickets_information_copy
                   .groupby(group_keys)
                   .agg(nb_tickets = ('ticket_id', 'count'),
                        nb_purchases = ('purchase_id', 'nunique'),
                        total_amount = ('amount', 'sum'),
                        nb_suppliers = ('supplier_name', 'nunique'),
                        vente_internet_max = ('vente_internet', 'max'),
                        purchase_date_min = ('purchase_date', 'min'),
                        purchase_date_max = ('purchase_date', 'max'))
                   .reset_index())

    one_day = np.timedelta64(1, 'D')

    # Days between the first and the last purchase of the group
    tickets_kpi['time_between_purchase'] = (tickets_kpi['purchase_date_max'] - tickets_kpi['purchase_date_min']) / one_day

    # Turn both dates into a number of days before the most recent purchase of the frame
    max_date = tickets_kpi['purchase_date_max'].max()
    tickets_kpi['purchase_date_max'] = (max_date - tickets_kpi['purchase_date_max']) / one_day
    tickets_kpi['purchase_date_min'] = (max_date - tickets_kpi['purchase_date_min']) / one_day

    # Groups without any online ticket get 0
    tickets_kpi = tickets_kpi.merge(nb_tickets_internet, on = group_keys, how = 'left')
    tickets_kpi['nb_tickets_internet'] = tickets_kpi['nb_tickets_internet'].fillna(0)

    return tickets_kpi
    

In [35]:
# Purchasing indicators per (customer, event type)
df1_tickets_kpi = tickets_kpi_function(tickets_information = df1_products_purchased_reduced)

In [36]:
# Preview of the first rows of the purchasing indicators
df1_tickets_kpi.head()

Unnamed: 0,customer_id,event_type_id,nb_tickets,nb_purchases,total_amount,nb_suppliers,vente_internet_max,purchase_date_min,purchase_date_max,time_between_purchase,nb_tickets_internet
0,1,2,384226,194790,2686540.5,7,1,3262.190868,4.179306,3258.011562,51.0
1,1,4,453242,228945,3248965.5,6,1,3698.198229,5.22184,3692.976389,2988.0
2,1,5,201750,107110,1459190.0,6,1,3803.369792,0.146331,3803.223461,9.0
3,1,6,217356,111786,1435871.5,5,1,2502.715509,1408.715532,1093.999977,5.0
4,2,2,143,143,0.0,1,0,2041.274549,1340.30816,700.966389,0.0


In [37]:
# Export the purchasing indicators to 'projet-bdc2324-team1'
BUCKET_OUT = "projet-bdc2324-team1"
FILE_KEY_OUT_S3 = "0_Temp/Company 1 - Purchasing behaviour.csv"
FILE_PATH_OUT_S3 = f"{BUCKET_OUT}/{FILE_KEY_OUT_S3}"

with fs.open(FILE_PATH_OUT_S3, mode = 'w') as file_out:
    df1_tickets_kpi.to_csv(file_out, index = False)

## Alexis' work

In [39]:
# Preview of the purchasing indicators before enrichment
df1_tickets_kpi.head()

Unnamed: 0,customer_id,event_type_id,nb_tickets,nb_purchases,total_amount,nb_suppliers,vente_internet_max,purchase_date_min,purchase_date_max,time_between_purchase,nb_tickets_internet
0,1,2,384226,194790,2686540.5,7,1,3262.190868,4.179306,3258.011562,51.0
1,1,4,453242,228945,3248965.5,6,1,3698.198229,5.22184,3692.976389,2988.0
2,1,5,201750,107110,1459190.0,6,1,3803.369792,0.146331,3803.223461,9.0
3,1,6,217356,111786,1435871.5,5,1,2502.715509,1408.715532,1093.999977,5.0
4,2,2,143,143,0.0,1,0,2041.274549,1340.30816,700.966389,0.0


In [40]:
# Mean ticket amount of each event type, computed over all tickets
avg_amount = (df1_products_purchased_reduced
              .groupby(["event_type_id", 'name_event_types'])['amount']
              .mean()
              .rename('avg_amount')
              .reset_index())

avg_amount

Unnamed: 0,event_type_id,name_event_types,avg_amount
0,2,offre muséale individuel,6.150659
1,4,spectacle vivant,7.762474
2,5,offre muséale groupe,4.452618
3,6,formule adhésion,6.439463


In [41]:
# Attach the event type name and its mean ticket amount to each KPI row.
# Columns added by a previous run of this cell are dropped first, so that re-running
# it does not create suffixed duplicates (name_event_types_x, avg_amount_x, ...).
df1_tickets_kpi = (df1_tickets_kpi
                   .drop(columns = ['name_event_types', 'avg_amount'], errors = 'ignore')
                   .merge(avg_amount, how = 'left', on = 'event_type_id'))
df1_tickets_kpi.head()

Unnamed: 0,customer_id,event_type_id,nb_tickets,nb_purchases,total_amount,nb_suppliers,vente_internet_max,purchase_date_min,purchase_date_max,time_between_purchase,nb_tickets_internet,name_event_types,avg_amount
0,1,2,384226,194790,2686540.5,7,1,3262.190868,4.179306,3258.011562,51.0,offre muséale individuel,6.150659
1,1,4,453242,228945,3248965.5,6,1,3698.198229,5.22184,3692.976389,2988.0,spectacle vivant,7.762474
2,1,5,201750,107110,1459190.0,6,1,3803.369792,0.146331,3803.223461,9.0,offre muséale groupe,4.452618
3,1,6,217356,111786,1435871.5,5,1,2502.715509,1408.715532,1093.999977,5.0,formule adhésion,6.439463
4,2,2,143,143,0.0,1,0,2041.274549,1340.30816,700.966389,0.0,offre muséale individuel,6.150659


In [42]:
# Display the cleaned customer table
df1_customerplus_clean

Unnamed: 0,customer_id,birthdate,street_id,is_partner,gender,is_email_true,opt_in,structure_id,profession,language,...,fidelity,average_purchase_delay,average_price_basket,average_ticket_basket,total_price,purchase_count,first_buying_date,country,age,tenant_id
0,12751,,2,False,1,True,True,,,,...,0,,,,,0,NaT,fr,,1311
1,12825,,2,False,2,True,True,,,,...,0,,,,,0,NaT,fr,,1311
2,11261,,2,False,1,True,True,,,,...,0,,,,,0,NaT,fr,,1311
3,13071,,2,False,2,True,True,,,,...,0,,,,,0,NaT,fr,,1311
4,653061,,10,False,2,True,False,,,,...,0,,,,,0,NaT,,,1311
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151861,295252,,10,False,2,True,False,,,,...,0,,,,,0,NaT,,,1311
151862,295271,,10,False,2,True,False,,,,...,0,,,,,0,NaT,,,1311
151863,295275,,10,False,2,True,False,,,,...,0,,,,,0,NaT,,,1311
151864,295366,,2,False,2,True,False,,,,...,1,3.0,33.0,3.0,33.0,1,2021-05-26 17:20:37+00:00,fr,,1311


In [43]:
# Add the purchasing indicators to every customer (left join: customers without
# tickets are kept), ordered by customer id
df1_customer = pd.merge(df1_customerplus_clean, df1_tickets_kpi, how = "left", on = 'customer_id')
df1_customer = df1_customer.sort_values(by = 'customer_id')
df1_customer.head()

Unnamed: 0,customer_id,birthdate,street_id,is_partner,gender,is_email_true,opt_in,structure_id,profession,language,...,nb_purchases,total_amount,nb_suppliers,vente_internet_max,purchase_date_min,purchase_date_max,time_between_purchase,nb_tickets_internet,name_event_types,avg_amount
59897,1,,2,False,2,True,False,,,,...,194790.0,2686540.5,7.0,1.0,3262.190868,4.179306,3258.011562,51.0,offre muséale individuel,6.150659
59900,1,,2,False,2,True,False,,,,...,111786.0,1435871.5,5.0,1.0,2502.715509,1408.715532,1093.999977,5.0,formule adhésion,6.439463
59898,1,,2,False,2,True,False,,,,...,228945.0,3248965.5,6.0,1.0,3698.198229,5.22184,3692.976389,2988.0,spectacle vivant,7.762474
59899,1,,2,False,2,True,False,,,,...,107110.0,1459190.0,6.0,1.0,3803.369792,0.146331,3803.223461,9.0,offre muséale groupe,4.452618
134695,2,,2,False,1,True,True,,,,...,164.0,0.0,1.0,0.0,1705.261192,1456.333715,248.927477,0.0,formule adhésion,6.439463


In [44]:
# Add the campaign indicators to every row of the customer table (left join)
df1_customer = pd.merge(df1_customer, df1_campaigns_kpi, on = 'customer_id', how = 'left')
df1_customer.head()

Unnamed: 0,customer_id,birthdate,street_id,is_partner,gender,is_email_true,opt_in,structure_id,profession,language,...,vente_internet_max,purchase_date_min,purchase_date_max,time_between_purchase,nb_tickets_internet,name_event_types,avg_amount,nb_campaigns,nb_campaigns_opened,time_to_open
0,1,,2,False,2,True,False,,,,...,1.0,3262.190868,4.179306,3258.011562,51.0,offre muséale individuel,6.150659,,,NaT
1,1,,2,False,2,True,False,,,,...,1.0,2502.715509,1408.715532,1093.999977,5.0,formule adhésion,6.439463,,,NaT
2,1,,2,False,2,True,False,,,,...,1.0,3698.198229,5.22184,3692.976389,2988.0,spectacle vivant,7.762474,,,NaT
3,1,,2,False,2,True,False,,,,...,1.0,3803.369792,0.146331,3803.223461,9.0,offre muséale groupe,4.452618,,,NaT
4,2,,2,False,1,True,True,,,,...,0.0,1705.261192,1456.333715,248.927477,0.0,formule adhésion,6.439463,4.0,0.0,NaT


In [45]:
# Export the customer / event-type table to 'projet-bdc2324-team1'
BUCKET_OUT = "projet-bdc2324-team1"
FILE_KEY_OUT_S3 = "0_Temp/Company 1 - customer_event.csv"
FILE_PATH_OUT_S3 = f"{BUCKET_OUT}/{FILE_KEY_OUT_S3}"

with fs.open(FILE_PATH_OUT_S3, mode = 'w') as file_out:
    df1_customer.to_csv(file_out, index = False)

## End of Alexis' work

In [46]:
# Fusion avec KPI campaigns liés au customer
#df1_customer = pd.merge(df1_customerplus_clean, df1_campaigns_kpi, on = 'customer_id', how = 'left')
#df1_customer.head()

In [None]:
# Merge the customer table with a per-customer frame named nb_tickets, then preview the result.
# NOTE(review): `nb_tickets` is not defined in any visible cell, so this cell would raise
# NameError on a fresh kernel unless it is defined elsewhere — confirm which frame was intended.
df1_customer_product = pd.merge(df1_customer, nb_tickets, on = 'customer_id', how = 'left')
print("shape : ", df1_customer_product.shape)
df1_customer_product.head()

In [None]:
# Write the merged table to a CSV file in the current working directory
df1_customer_product.to_csv("customer_product.csv", index = False)

# Fusion des bases locales

In [63]:
# Merge the campaign indicators onto the cleaned customer table (left join)
df1_customer = df1_customerplus_clean.merge(df1_campaigns_kpi, on = 'customer_id', how = 'left')

# Customers without any campaign row get 0 for both counters
campaign_counters = ['nb_campaigns', 'nb_campaigns_opened']
df1_customer[campaign_counters] = df1_customer[campaign_counters].fillna(0)

In [64]:
# Preview of the customer table with campaign indicators
df1_customer.head()

Unnamed: 0,customer_id,birthdate,street_id,is_partner,gender,is_email_true,opt_in,structure_id,profession,language,...,average_ticket_basket,total_price,purchase_count,first_buying_date,country,age,tenant_id,nb_campaigns,nb_campaigns_opened,time_to_open
0,12751,,2,False,1,True,True,,,,...,,,0,NaT,fr,,1311,0.0,0.0,NaT
1,12825,,2,False,2,True,True,,,,...,,,0,NaT,fr,,1311,0.0,0.0,NaT
2,11261,,2,False,1,True,True,,,,...,,,0,NaT,fr,,1311,0.0,0.0,NaT
3,13071,,2,False,2,True,True,,,,...,,,0,NaT,fr,,1311,0.0,0.0,NaT
4,653061,,10,False,2,True,False,,,,...,,,0,NaT,,,1311,80.0,2.0,0 days 19:53:02.500000


In [67]:
# Merge the purchasing indicators with the customer table (outer join: rows of both sides are kept)
df1_customer_product = df1_tickets_kpi.merge(df1_customer, on = 'customer_id', how = 'outer')

# Customers without any ticket get 0 for the purchasing counters
purchase_counters = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',
                     'vente_internet_max', 'nb_tickets_internet']
df1_customer_product[purchase_counters] = df1_customer_product[purchase_counters].fillna(0)

In [66]:
# Display the final customer / product table
df1_customer_product

Index(['customer_id', 'event_type_id', 'nb_tickets', 'nb_purchases',
       'total_amount', 'nb_suppliers', 'vente_internet_max',
       'purchase_date_min', 'purchase_date_max', 'time_between_purchase',
       'nb_tickets_internet', 'name_event_types', 'avg_amount', 'birthdate',
       'street_id', 'is_partner', 'gender', 'is_email_true', 'opt_in',
       'structure_id', 'profession', 'language', 'mcp_contact_id',
       'last_buying_date', 'max_price', 'ticket_sum', 'average_price',
       'fidelity', 'average_purchase_delay', 'average_price_basket',
       'average_ticket_basket', 'total_price', 'purchase_count',
       'first_buying_date', 'country', 'age', 'tenant_id', 'nb_campaigns',
       'nb_campaigns_opened', 'time_to_open'],
      dtype='object')

In [68]:
# Export the segmentation base to 'projet-bdc2324-team1'
BUCKET_OUT = "projet-bdc2324-team1"
FILE_KEY_OUT_S3 = "1_Output/Company 1 - Segmentation base.csv"
FILE_PATH_OUT_S3 = f"{BUCKET_OUT}/{FILE_KEY_OUT_S3}"

with fs.open(FILE_PATH_OUT_S3, mode = 'w') as file_out:
    df1_customer_product.to_csv(file_out, index = False)