## Notebook Alexis

In [None]:
import pandas as pd
import os
import s3fs

In [None]:
# Build the S3 filesystem handle; the endpoint host is read from the environment.
S3_ENDPOINT_URL = f"https://{os.environ['AWS_S3_ENDPOINT']}"
fs = s3fs.S3FileSystem(client_kwargs=dict(endpoint_url=S3_ENDPOINT_URL))

# Name of the bucket explored in this notebook; list its top-level entries.
BUCKET = "bdc2324-data"
fs.ls(BUCKET)

###  I. Analyse fichier 8

This section describes the databases associated with company 8. 

In [None]:
directory_path = '8'

In [None]:
# List every object stored under the selected company's folder in the bucket.

objects = fs.ls(f'{BUCKET}/{directory_path}')

# Print one path per line so the available tables are easy to scan.
for file in objects:
    print(file)

In [None]:
def display_databases(file_name):
    """
    Load a CSV file of the current company from S3 into a DataFrame.

    The object is looked up under ``BUCKET/directory_path`` (both are
    module-level variables). The resolved path and the shape of the
    resulting table are printed.

    :param file_name: name of the CSV object, e.g. "8campaigns.csv"
    :return: the parsed pandas DataFrame
    """
    file_path = "/".join((BUCKET, directory_path, file_name))
    print("File path : ", file_path)

    with fs.open(file_path, mode="rb") as stream:
        table = pd.read_csv(stream, sep=",")

    print("Shape : ", table.shape)
    return table
    

#### Look at campaigns files

In [None]:
campaigns = display_databases("8campaigns.csv")
campaigns.head()

In [None]:
campaign_stats = display_databases("8campaign_stats.csv")
campaign_stats.head()

#### Look at links files

There is no links file for this company, only the link_stats file.

In [None]:
links_stats = display_databases("8link_stats.csv")
links_stats.head()

#### Analyse Customersplus file

In [None]:
# Read the customersplus table of the selected company straight from S3.
file_name = "8customersplus.csv"
file_path = "/".join((BUCKET, directory_path, file_name))
print(file_path)
with fs.open(file_path, mode="rb") as file_in:
    customersplus = pd.read_csv(file_in, sep=",")

# Preview the first rows.
customersplus.head()

#### Analyse Structures files

In [None]:
# Try to read the structures table; this company may not provide one.
file_name = "8structures.csv"
file_path = BUCKET + "/" + directory_path + "/" + file_name
print(file_path)
try:
    with fs.open(file_path, mode="rb") as file_in:
        structures = pd.read_csv(file_in, sep=",")
except FileNotFoundError:
    # Only a missing object means "no structures database"; any other failure
    # (credentials, network, malformed CSV) should surface instead of being hidden.
    print("No structures database")

For Stade Français, there are no structures, tags or structure_tag_mapping databases.

#### Analyze Target databases

In [None]:
# Read the customer/target mapping table, tolerating its absence.
file_name = "8customer_target_mappings.csv"
file_path = BUCKET + "/" + directory_path + "/" + file_name
print(file_path)
try:
    with fs.open(file_path, mode="rb") as file_in:
        customer_targets = pd.read_csv(file_in, sep=",")
except FileNotFoundError:
    # Missing table: fall back to an empty frame so the lines below do not hit
    # a NameError (or silently reuse a stale variable from an earlier run).
    print("No such database in s3")
    customer_targets = pd.DataFrame()

print("Shape : ", customer_targets.shape)
customer_targets.head()

In [None]:
# Read the targets table, tolerating its absence.
file_name = "8targets.csv"
file_path = BUCKET + "/" + directory_path + "/" + file_name
print(file_path)
try:
    with fs.open(file_path, mode="rb") as file_in:
        targets = pd.read_csv(file_in, sep=",")
except FileNotFoundError:
    # Missing table: fall back to an empty frame so the lines below do not hit
    # a NameError (or silently reuse a stale variable from an earlier run).
    print("No such database in s3")
    targets = pd.DataFrame()

print("Shape : ", targets.shape)
targets.head()

In [None]:
# Read the target types table, tolerating its absence.
file_name = "8target_types.csv"
file_path = BUCKET + "/" + directory_path + "/" + file_name
print(file_path)
try:
    with fs.open(file_path, mode="rb") as file_in:
        target_types = pd.read_csv(file_in, sep=",")
except FileNotFoundError:
    # Missing table: fall back to an empty frame so the lines below do not hit
    # a NameError (or silently reuse a stale variable from an earlier run).
    print("No such database in s3")
    target_types = pd.DataFrame()

print("Shape : ", target_types.shape)
target_types.head()

#### Analyze consumption files

Meaning consumptions.csv, suppliers.csv, tickets.csv and purchases.csv

However, there is no consumptions.csv file

In [None]:
purchases = display_databases("8purchases.csv")
purchases.head()

In [None]:
tickets = display_databases("8tickets.csv")
tickets.head()

In [None]:
suppliers = display_databases("8suppliers.csv")
suppliers.head()

#### Analyse product file

In [None]:
products = display_databases("8products.csv")
products.head()

#### Analyze pricing files

Meaning pricing_formulas.csv and type_of_pricing_formulas

In [None]:
pricing_formulas = display_databases("8pricing_formulas.csv")
pricing_formulas.head()

In [None]:
type_pricing_formulas = display_databases("8type_of_pricing_formulas.csv")
type_pricing_formulas.head()

#### Analyze type of products

Meaning categories.csv, type_of_categories.csv

In [None]:
categories = display_databases("8categories.csv")
categories.head()

In [None]:
type_categories = display_databases("8type_of_categories.csv")
type_categories.head()

#### Analyze type of representations

Meaning representation_category_capacities.csv, representations.csv, representations_types.csv

however there is no representation_types database

In [None]:
representation_category_capacities = display_databases("8representation_category_capacities.csv")
representation_category_capacities.head()

In [None]:
representations = display_databases("8representations.csv")
representations.head()

In [None]:
#representation_type = display_databases("8representation_types.csv")

#### Analyze type of events

Meaning events.csv, event_types.csv, seasons.csv and facilities.csv

In [None]:
events = display_databases("8events.csv")
events.head()

In [None]:
event_types = display_databases("8event_types.csv")
event_types.head()

In [None]:
seasons = display_databases("8seasons.csv")
seasons.head()

In [None]:
facilities = display_databases("8facilities.csv")
facilities.head()

#### Analyze annexe databases

Meaning contributions.csv, contribution_sites.csv, currencies.csv, countries.csv and type_ofs.csv

## II. Identify Commons Datasets

From the analysis of the 8th company, we notice that some databases do not exist. Therefore, in order to construct a uniform database for all companies, we should first identify the databases that are common to all companies.

In [None]:
## We first construct a dictionary listing the datasets available for each company

companies = fs.ls(BUCKET)
companies_database = {}

for company in companies:
    # Last path component of the listing entry, used as the company id.
    company_id = company.split('/')[-1]
    # Strip the company id only when it is the leading prefix of the file name;
    # str.replace would also remove the same characters from the middle of a name.
    companies_database[company_id] = [
        name[len(company_id):] if name.startswith(company_id) else name
        for name in (path.split('/')[-1] for path in fs.ls(company))
    ]


In [None]:
# Then we create the list of every database name seen in at least one company.
# The union is used rather than the longest per-company list, because the
# company with the most files is not guaranteed to contain every other
# company's datasets.

all_database = sorted(set().union(*companies_database.values()))
print("Number of databases : ",len(all_database))

In [None]:
## Keep only the databases that every company provides

data_in_common = set(all_database)

print(len(data_in_common))

# Intersect with each company's file list in turn; this is what the former
# "subtract the symmetric difference" step computed, written directly.
for company_files in companies_database.values():
    data_in_common = data_in_common.intersection(company_files)

print(len(data_in_common))
    

## Create Universal database

We will first create a procedure to clean the datasets of a company and then merge them. Hence, we will be able to replicate this procedure for all companies and create a universal database.

Let's first create our procedure for company 1 and the datasets belonging to the products theme.

In [None]:
directory_path = '1'

In [None]:
theme_products = ["products.csv" ,"categories.csv", "type_of_categories.csv"]

In [None]:
def remove_horodates(df):
    """
    Remove the timestamp columns ``created_at`` and ``updated_at``.

    Columns that are absent are skipped instead of raising, so the function
    can be applied to any table regardless of whether it carries timestamps.

    :param df: input DataFrame (left unmodified)
    :return: a new DataFrame without the timestamp columns
    """
    return df.drop(columns=["created_at", "updated_at"], errors="ignore")

In [None]:
def order_columns_id(df):
    """
    Move the id columns to the front of the DataFrame to make it easier to read.

    A column counts as an id column when ``id`` is one of its
    underscore-separated parts (``id``, ``customer_id``, ``id_products``).
    A plain substring test would also catch unrelated names such as
    ``identifier`` or ``valid``. Relative order is preserved inside both groups.

    :param df: input DataFrame
    :return: DataFrame with the same columns, id columns first
    """
    def is_id_column(col):
        # str() keeps the check safe for non-string column labels.
        return "id" in str(col).split("_")

    id_columns = [col for col in df.columns if is_id_column(col)]
    remaining_col = [col for col in df.columns if not is_id_column(col)]
    return df[id_columns + remaining_col]

In [None]:
def percent_na(df):
    """
    Compute, for every column, the share of missing values as a percentage.

    :param df: DataFrame to inspect
    :return: Series indexed by column name holding the percentage of NA values
    """
    na_counts = df.isna().sum()
    row_count = len(df)
    return na_counts * 100 / row_count

In [None]:
def process_df(df):
    """
    Clean a raw table and print a short summary of the result.

    The table goes through remove_horodates and order_columns_id; the column
    count, the column names and the per-column NA percentage are printed.

    :param df: raw DataFrame
    :return: the cleaned DataFrame
    """
    without_timestamps = remove_horodates(df)
    print("Number of columns : ", len(without_timestamps.columns))

    reordered = order_columns_id(without_timestamps)
    print("Columns : ", reordered.columns)
    print("Percent of NA for each column : ", percent_na(reordered))
    return reordered

#### Deep analysis of products.csv

In [None]:
products = display_databases("1products.csv")
print("Number of columns : ", len(products.columns))
products.head()

In [None]:
products = remove_horodates(products)
print("Number of columns : ", len(products.columns))
products = order_columns_id(products)
print("Columns : ", products.columns)
products.head()

In [None]:
print(products.dtypes)

In [None]:
percent_missing = products.isna().sum() * 100 / len(products)
print(percent_missing)

#### Deep analysis of categories.csv

In [None]:
name_dataset = '1categories.csv'

In [None]:
df = display_databases(name_dataset)
print("Number of columns : ", len(df.columns))
df.head()

In [None]:
df = process_df(df)
df.head()

In [None]:
df.dtypes

#### Deep analysis of type_of_categories.csv

#### Deep analysis of representation_category_capacities.csv

#### Deep analysis of representations.csv

#### Deep analysis of events.csv

In [None]:
name_dataset = '1events.csv'

In [None]:
df = display_databases(name_dataset)
print("Number of columns : ", len(df.columns))
df.head()

In [None]:
df = process_df(df)
df.head()

In [None]:
df.dtypes

#### Deep analysis of event_types.csv

In [None]:
name_dataset = '1event_types.csv'

In [None]:
df = display_databases(name_dataset)
print("Number of columns : ", len(df.columns))
df.head()

In [None]:
df = process_df(df)
df.head()

In [None]:
df.dtypes

#### Deep analysis of seasons.csv

In [None]:
name_dataset = '1seasons.csv'

In [None]:
df = display_databases(name_dataset)
print("Number of columns : ", len(df.columns))
df.head()

In [None]:
df = process_df(df)
df.head()

In [None]:
df.dtypes

#### Deep Analysis of facilities.csv

In [None]:
name_dataset = '1facilities.csv'

In [None]:
df = display_databases(name_dataset)
print("Number of columns : ", len(df.columns))
df.head()

In [None]:
df = process_df(df)
df.head()

In [None]:
df.dtypes

## Merge

In [None]:
def process_df_2(df):
    """
    Clean a raw table and print its column count and column names.

    Same cleaning as process_df (remove_horodates, then order_columns_id)
    but without printing the NA percentages.

    :param df: raw DataFrame
    :return: the cleaned DataFrame
    """
    without_timestamps = remove_horodates(df)
    print("Number of columns : ", len(without_timestamps.columns))

    reordered = order_columns_id(without_timestamps)
    print("Columns : ", reordered.columns)
    return reordered

In [None]:
def load_dataset(name):
    """
    Read a table from S3 and apply the standard cleaning steps.

    :param name: file name of the table, e.g. "1products.csv"
    :return: cleaned DataFrame, without the 'identifier' column if it was present
    """
    cleaned = process_df_2(display_databases(name))
    # Columns containing missing values are kept; only 'identifier' is discarded.
    if 'identifier' not in cleaned.columns:
        return cleaned
    return cleaned.drop(columns = 'identifier')

### Products Table

In [None]:
def create_products_table():
    """
    Build the products table: products left-joined with categories, then with
    type_of_categories.

    File names are prefixed with the module-level ``directory_path`` (the
    company currently selected) instead of a hard-coded "1", so the function
    keeps working when another company folder is selected.

    :return: merged DataFrame with the id columns placed first
    """
    prefix = directory_path

    # first merge products and categories
    print("first merge products and categories")
    products = load_dataset(f"{prefix}products.csv")
    categories = load_dataset(f"{prefix}categories.csv")
    products_theme = products.merge(categories, how = 'left', left_on = 'category_id',
                                    right_on = 'id', suffixes=('_products', '_categories'))
    # NOTE(review): assumes an unsuffixed 'name' column comes from categories — confirm
    products_theme = products_theme.rename(columns = {"name" : "name_categories"})

    # Second merge products_theme and type of categories
    print("Second merge products_theme and type of categories")
    type_of_categories = load_dataset(f"{prefix}type_of_categories.csv")
    # The table's own id is not needed; the join key is category_id.
    type_of_categories = type_of_categories.drop(columns = 'id')
    products_theme = products_theme.merge(type_of_categories, how = 'left', on = 'category_id')

    # Index cleaning: id_categories duplicates category_id after the first join
    products_theme = products_theme.drop(columns = ['id_categories'])
    products_theme = order_columns_id(products_theme)

    return products_theme

In [None]:
products_theme = create_products_table()
products_theme.head()

### Events Table

In [None]:
def create_events_table():
    """
    Build the events table: events left-joined with seasons, event_types and
    facilities.

    File names are prefixed with the module-level ``directory_path`` (the
    company currently selected) instead of a hard-coded "1", so the function
    keeps working when another company folder is selected.

    :return: merged DataFrame with the id columns placed first
    """
    prefix = directory_path

    # first merge events and seasons :
    print("first merge events and seasons : ")
    events = load_dataset(f"{prefix}events.csv")
    seasons = load_dataset(f"{prefix}seasons.csv")
    events_theme = events.merge(seasons, how = 'left', left_on = 'season_id', right_on = 'id',
                                suffixes=('_events', '_seasons'))

    # Secondly merge events_theme and event_types
    print("Secondly merge events_theme and event_types : ")
    event_types = load_dataset(f"{prefix}event_types.csv")

    events_theme = events_theme.merge(event_types, how = 'left', left_on = 'event_type_id', right_on = 'id',
                                      suffixes=('_events', '_event_type'))
    # NOTE(review): assumes the unsuffixed 'name' and 'id' columns come from event_types — confirm
    events_theme = events_theme.rename(columns = {"name" : "name_event_types"})
    events_theme = events_theme.drop(columns = 'id')

    # thirdly merge events_theme and facilities
    print("thirdly merge events_theme and facilities : ")
    facilities = load_dataset(f"{prefix}facilities.csv")
    events_theme = events_theme.merge(facilities, how = 'left', left_on = 'facility_id', right_on = 'id',
                                      suffixes=('_events', '_facility'))
    # NOTE(review): "name_facilties" is misspelled but kept as is, since code
    # outside this block may already rely on that column name.
    events_theme = events_theme.rename(columns = {"name" : "name_facilties"})
    events_theme = events_theme.drop(columns = 'id')

    # Index cleaning: id_seasons duplicates season_id after the first join
    events_theme = events_theme.drop(columns = ['id_seasons'])
    events_theme = order_columns_id(events_theme)
    return events_theme

In [None]:
events_theme= create_events_table()
events_theme.head()

## Representations_Table

In [None]:
def create_representations_table():
    """
    Build the representations table: representations left-joined with
    representation_category_capacities.

    File names are prefixed with the module-level ``directory_path`` (the
    company currently selected) instead of a hard-coded "1", so the function
    keeps working when another company folder is selected.

    :return: merged DataFrame with the id columns placed first
    """
    prefix = directory_path

    representations = load_dataset(f"{prefix}representations.csv")
    representations_capacity = load_dataset(f"{prefix}representation_category_capacities.csv")

    representations_theme = representations.merge(representations_capacity, how='left',
                                                  left_on='id', right_on='representation_id',
                                                  suffixes=('_representation', '_representation_cap'))
    # index cleaning: representation_id is the join key taken from the capacities table
    representations_theme = representations_theme.drop(columns = ["representation_id"])
    representations_theme = order_columns_id(representations_theme)
    return representations_theme

In [None]:
rep = create_representations_table()
rep.head()