In [114]:
import pandas as pd
import os
import s3fs
import warnings
from datetime import date, timedelta, datetime
import numpy as np

exec(open('../0_KPI_functions.py').read())

In [33]:
# Ignore warning
warnings.filterwarnings('ignore')

In [34]:
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

BUCKET = "bdc2324-data"
fs.ls(BUCKET)

['bdc2324-data/1',
 'bdc2324-data/10',
 'bdc2324-data/101',
 'bdc2324-data/11',
 'bdc2324-data/12',
 'bdc2324-data/13',
 'bdc2324-data/14',
 'bdc2324-data/2',
 'bdc2324-data/3',
 'bdc2324-data/4',
 'bdc2324-data/5',
 'bdc2324-data/6',
 'bdc2324-data/7',
 'bdc2324-data/8',
 'bdc2324-data/9']

## Look at the time sequence of each company and compute inter time coverage

In [73]:
sport = ['5', '6', '7', '8', '9']

In [90]:
def display_covering_time(df, company, datecover):
    """
    This function draws the time coverage of each company
    """
    min_date = df['purchase_date'].min().strftime("%Y-%m-%d")
    max_date = df['purchase_date'].max().strftime("%Y-%m-%d")
    datecover[company] = [datetime.strptime(min_date, "%Y-%m-%d") + timedelta(days=x) for x in range((datetime.strptime(max_date, "%Y-%m-%d") - datetime.strptime(min_date, "%Y-%m-%d")).days)]
    print(f'Couverture Company {company} : {min_date} - {max_date}')
    return datecover

In [91]:
def compute_time_intersection(datecover):
    timestamps_sets = [set(timestamps) for timestamps in datecover.values()]
    intersection = set.intersection(*timestamps_sets)
    intersection_list = list(intersection)
    formated_dates = [dt.strftime("%Y-%m-%d") for dt in intersection_list]
    return sorted(formated_dates)

In [93]:
def df_coverage_modelization(sport, coverage_train = 0.7):
    """
    This function returns start_date, end_of_features and final dates
    that help to construct train and test datasets
    """
    datecover = {}
    for company in sport:
        df_products_purchased_reduced = display_databases(company, file_name = "products_purchased_reduced",
                                                          datetime_col = ['purchase_date'])
        datecover = display_covering_time(df_products_purchased_reduced, company, datecover)
    #print(datecover.keys())
    dt_coverage = compute_time_intersection(datecover)
    start_date = dt_coverage[0]
    end_of_features = dt_coverage[int(0.7 * len(dt_coverage))]
    final_date = dt_coverage[-1]
    return start_date, end_of_features, final_date
    

In [94]:
start_date, end_of_features, final_date = df_coverage_modelization(sport, coverage_train = 0.7)
print(start_date, end_of_features, final_date )

File path :  projet-bdc2324-team1/0_Input/Company_5/products_purchased_reduced.csv
Couverture Company 5 : 2019-04-15 - 2023-11-09
File path :  projet-bdc2324-team1/0_Input/Company_6/products_purchased_reduced.csv
Couverture Company 6 : 2018-06-28 - 2023-11-08
File path :  projet-bdc2324-team1/0_Input/Company_7/products_purchased_reduced.csv
Couverture Company 7 : 2015-02-10 - 2023-11-08
File path :  projet-bdc2324-team1/0_Input/Company_8/products_purchased_reduced.csv
Couverture Company 8 : 2010-09-28 - 2023-11-08
File path :  projet-bdc2324-team1/0_Input/Company_9/products_purchased_reduced.csv
Couverture Company 9 : 2014-09-22 - 2023-10-24
dict_keys(['5', '6', '7', '8', '9'])
2019-04-15 2022-06-15 2023-10-23


## Look at common database between Sport companies

In [101]:
companies = fs.ls(BUCKET)
companies = [company for company in companies if any(company.endswith(end) for end in sport)]
companies

['bdc2324-data/5',
 'bdc2324-data/6',
 'bdc2324-data/7',
 'bdc2324-data/8',
 'bdc2324-data/9']

In [107]:
companies_database = {}

for company in companies:
    companies_database[company.split('/')[-1]] = [file.split('/')[-1].replace(company.split('/')[-1], '') for file in fs.ls(company)] 

all_database = companies_database[max(companies_database, key=lambda x: len(companies_database[x]))]
print("Number of databases : ",len(all_database))

data_in_common = set(all_database)

for key in companies_database:
    diff_database = data_in_common.symmetric_difference(companies_database[key])
    data_in_common = data_in_common - diff_database

print("Number of common databases : ",len(data_in_common))

Number of databases :  30
Number of common databases :  23


In [121]:
data_in_common

{'campaign_stats.csv',
 'campaigns.csv',
 'categories.csv',
 'countries.csv',
 'currencies.csv',
 'customer_target_mappings.csv',
 'customersplus.csv',
 'event_types.csv',
 'events.csv',
 'facilities.csv',
 'link_stats.csv',
 'pricing_formulas.csv',
 'product_packs.csv',
 'products.csv',
 'products_groups.csv',
 'purchases.csv',
 'representation_category_capacities.csv',
 'representations.csv',
 'seasons.csv',
 'suppliers.csv',
 'target_types.csv',
 'targets.csv',
 'tickets.csv'}

## Investigate errors from data construction for company 6

In [108]:
directory_path = '6'

In [143]:
df_customerplus_clean = display_databases(directory_path, file_name = "customerplus_cleaned")
df_campaigns_information = display_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
df_products_purchased_reduced = display_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date'])

File path :  projet-bdc2324-team1/0_Input/Company_6/customerplus_cleaned.csv
File path :  projet-bdc2324-team1/0_Input/Company_6/campaigns_information.csv
File path :  projet-bdc2324-team1/0_Input/Company_6/products_purchased_reduced.csv


In [144]:
max_date =  pd.to_datetime(final_date, utc = True, format = 'ISO8601') 
end_features_date = pd.to_datetime(end_of_features, utc = True, format = 'ISO8601')
min_date = pd.to_datetime(start_date, utc = True, format = 'ISO8601')

In [128]:
df_campaigns_information = display_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])

File path :  projet-bdc2324-team1/0_Input/Company_6/campaigns_information.csv


In [133]:
end_features_date

Timestamp('2022-06-15 00:00:00+0000', tz='UTC')

In [136]:
print("Shape campaigns_information : ", df_campaigns_information.shape)
df_campaigns_information.head()

Shape campaigns_information :  (1333010, 8)


Unnamed: 0,id,customer_id,opened_at,sent_at,delivered_at,campaign_name,campaign_service_id,campaign_sent_at
0,1,38,NaT,2022-08-02 18:31:33+00:00,,Adhérents non ré-engagés,15,2022-08-02 18:31:36+00:00
1,2,26135,NaT,2022-08-02 18:31:34+00:00,,Adhérents non ré-engagés,15,2022-08-02 18:31:36+00:00
2,3,3876,NaT,2022-08-02 18:31:35+00:00,,Adhérents non ré-engagés,15,2022-08-02 18:31:36+00:00
3,4,26226,NaT,2022-08-02 18:31:35+00:00,,Adhérents non ré-engagés,15,2022-08-02 18:31:36+00:00
4,5,25349,NaT,2022-08-02 18:31:34+00:00,,Adhérents non ré-engagés,15,2022-08-02 18:31:36+00:00


In [134]:
df_campaigns_information['sent_at'].min()

Timestamp('2022-08-02 18:31:33+0000', tz='UTC')

In [137]:
df_campaigns_information['sent_at'].max()

Timestamp('2023-11-07 10:08:16+0000', tz='UTC')

In [127]:
#Filtre de la base df_campaigns_information
df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] <= end_features_date) & (df_campaigns_information['sent_at'] >= min_date)]
df_campaigns_information

Unnamed: 0,id,customer_id,opened_at,sent_at,delivered_at,campaign_name,campaign_service_id,campaign_sent_at


In [145]:
#Filtre de la base df_products_purchased_reduced
df_products_purchased_reduced = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]
df_products_purchased_reduced.head()

Unnamed: 0,ticket_id,customer_id,purchase_id,event_type_id,supplier_name,purchase_date,amount,is_full_price,name_event_types,name_facilities,name_categories,name_events,name_seasons
49,91401,108392,1259025.0,4,caisse,2022-02-27 13:44:10.690000+00:00,0.0,False,ligue 1 uber eats,stade de l'aube,honneur basse,olympique de marseille,saison 2021-2022
117,535527,31304,136629.0,4,adhésion,2022-04-28 15:47:52.790000+00:00,0.0,False,ligue 1 uber eats,stade de l'aube,honneur basse,ac ajaccio,saison 2022-2023
274,547400,192,140477.0,4,adhésion,2022-04-28 15:47:54.053000+00:00,0.0,False,ligue 1 uber eats,stade de l'aube,honneur basse,rc strasbourg,saison 2022-2023
304,84413,31388,20259.0,4,adhésion,2021-08-03 13:45:01.603000+00:00,0.0,False,ligue 1 uber eats,stade de l'aube,vitoux haute,olympique de marseille,saison 2021-2022
311,407271,3265,90527.0,4,web [adhésion],2022-05-26 09:15:40.993000+00:00,0.0,False,ligue 1 uber eats,stade de l'aube,champagne basse,stade brestois 29,saison 2022-2023


In [150]:
df_products_purchased_reduced["supplier_name"].unique()

array(['caisse', 'adhésion', 'web [adhésion]', 'web [grand public]',
       'itr ticketmaster', 'itr fnac', nan, 'decathlon', 'boutique web',
       'boutique officielle'], dtype=object)

In [151]:
# KPI sur le comportement d'achat
tickets_information_copy = df_products_purchased_reduced.copy()
# Dummy : Canal de vente en ligne
liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online'] # vad = vente à distance
tickets_information_copy['vente_internet'] = tickets_information_copy['supplier_name'].fillna('').str.contains('|'.join(liste_mots), case=False).astype(int)

tickets_information_copy['vente_internet'] corrected by handling na