## Notebook Alexis

In [1]:
import pandas as pd
import os
import s3fs

In [2]:
# Build the S3 filesystem handle from the endpoint exposed in the environment
S3_ENDPOINT_URL = f"https://{os.environ['AWS_S3_ENDPOINT']}"
fs = s3fs.S3FileSystem(client_kwargs=dict(endpoint_url=S3_ENDPOINT_URL))

# Bucket explored throughout the notebook; list its top-level entries
BUCKET = "bdc2324-data"
fs.ls(BUCKET)

['bdc2324-data/1',
 'bdc2324-data/10',
 'bdc2324-data/101',
 'bdc2324-data/11',
 'bdc2324-data/12',
 'bdc2324-data/13',
 'bdc2324-data/14',
 'bdc2324-data/2',
 'bdc2324-data/3',
 'bdc2324-data/4',
 'bdc2324-data/5',
 'bdc2324-data/6',
 'bdc2324-data/7',
 'bdc2324-data/8',
 'bdc2324-data/9']

### I. Analysis of company 8 files

This section describes the databases associated with company 8. 

In [None]:
# Sub-directory of BUCKET analysed in this section
directory_path = "8"

In [None]:
# Print every object stored under the selected directory, one path per line
objects = fs.ls("/".join([BUCKET, directory_path]))

for object_path in objects:
    print(object_path)

In [None]:
def display_databases(file_name, directory=None, sep=","):
    """
    Load one CSV file from the S3 bucket into a DataFrame.

    The object path is built as ``BUCKET/<directory>/<file_name>``. The path
    and the shape of the loaded frame are printed as a side effect.

    Parameters
    ----------
    file_name : str
        Name of the CSV object inside the directory, e.g. "8campaigns.csv".
    directory : str, optional
        Directory under BUCKET to read from. When omitted, the notebook-level
        ``directory_path`` is used, which is the original behaviour.
    sep : str, optional
        Field delimiter forwarded to ``pd.read_csv``. Defaults to ",".

    Returns
    -------
    pandas.DataFrame
        Content of the CSV file.

    Raises
    ------
    Nothing is caught here: errors raised by ``fs.open`` or ``pd.read_csv``
    propagate to the caller.
    """
    # Resolve the global lazily so a later change of directory_path is honoured
    if directory is None:
        directory = directory_path
    file_path = f"{BUCKET}/{directory}/{file_name}"
    print("File path : ", file_path)
    with fs.open(file_path, mode="rb") as file_in:
        df = pd.read_csv(file_in, sep=sep)

    print("Shape : ", df.shape)
    return df
    

#### Look at campaigns files

In [None]:
# File prefix follows directory_path instead of a hard-coded "8"
campaigns = display_databases(f"{directory_path}campaigns.csv")
campaigns.head()

In [None]:
# File prefix follows directory_path instead of a hard-coded "8"
campaign_stats = display_databases(f"{directory_path}campaign_stats.csv")
campaign_stats.head()

#### Look at links files

There is no links file for this company, only the link_stats file.

In [None]:
# File prefix follows directory_path instead of a hard-coded "8"
links_stats = display_databases(f"{directory_path}link_stats.csv")
links_stats.head()

#### Analyse Customersplus file

In [None]:
# Load through the shared helper, like the other tables of this section,
# rather than repeating the path-building and fs.open logic inline.
# The file prefix follows directory_path instead of a hard-coded "8".
customersplus = display_databases(f"{directory_path}customersplus.csv")

customersplus.head()

#### Analyse Structures files

In [None]:
# The load is guarded because the structures object may not exist in the bucket
try:
    structures = display_databases(f"{directory_path}structures.csv")
except FileNotFoundError:
    # Only a missing object is expected here; any other error should surface.
    # The name is still defined so later cells can test for absence.
    structures = None
    print("No structures database")

For Stade Français, there are no structures, tags or structure_tag_mapping databases.

#### Analyze Target databases

In [None]:
# The load is guarded because this object may not exist in the bucket
try:
    customer_targets = display_databases(f"{directory_path}customer_target_mappings.csv")
except FileNotFoundError:
    # Define the name explicitly so a failed load cannot fall through to a
    # NameError (fresh kernel) or to a stale frame left by an earlier run.
    customer_targets = None
    print("No such database in s3")

# Preview when loaded; evaluates to None (no cell output) otherwise
None if customer_targets is None else customer_targets.head()

In [None]:
# The load is guarded because this object may not exist in the bucket
try:
    targets = display_databases(f"{directory_path}targets.csv")
except FileNotFoundError:
    # Define the name explicitly so a failed load cannot fall through to a
    # NameError (fresh kernel) or to a stale frame left by an earlier run.
    targets = None
    print("No such database in s3")

# Preview when loaded; evaluates to None (no cell output) otherwise
None if targets is None else targets.head()

In [None]:
# The load is guarded because this object may not exist in the bucket
try:
    target_types = display_databases(f"{directory_path}target_types.csv")
except FileNotFoundError:
    # Define the name explicitly so a failed load cannot fall through to a
    # NameError (fresh kernel) or to a stale frame left by an earlier run.
    target_types = None
    print("No such database in s3")

# Preview when loaded; evaluates to None (no cell output) otherwise
None if target_types is None else target_types.head()

#### Analyze consumption files

Meaning consumptions.csv, suppliers.csv, tickets.csv and purchases.csv

However, there is no consumptions.csv file

In [None]:
# File prefix follows directory_path instead of a hard-coded "8"
purchases = display_databases(f"{directory_path}purchases.csv")
purchases.head()

In [None]:
# File prefix follows directory_path instead of a hard-coded "8"
tickets = display_databases(f"{directory_path}tickets.csv")
tickets.head()

In [None]:
# File prefix follows directory_path instead of a hard-coded "8"
suppliers = display_databases(f"{directory_path}suppliers.csv")
suppliers.head()

#### Analyse product file

In [None]:
# File prefix follows directory_path instead of a hard-coded "8"
products = display_databases(f"{directory_path}products.csv")
products.head()

#### Analyze pricing files

Meaning pricing_formulas.csv and type_of_pricing_formulas.csv

In [None]:
# File prefix follows directory_path instead of a hard-coded "8"
pricing_formulas = display_databases(f"{directory_path}pricing_formulas.csv")
pricing_formulas.head()

In [None]:
# File prefix follows directory_path instead of a hard-coded "8"
type_pricing_formulas = display_databases(f"{directory_path}type_of_pricing_formulas.csv")
type_pricing_formulas.head()

#### Analyze type of products

Meaning categories.csv, type_of_categories.csv

In [None]:
# File prefix follows directory_path instead of a hard-coded "8"
categories = display_databases(f"{directory_path}categories.csv")
categories.head()

In [None]:
# File prefix follows directory_path instead of a hard-coded "8"
type_categories = display_databases(f"{directory_path}type_of_categories.csv")
type_categories.head()

#### Analyze type of representations

Meaning representation_category_capacities.csv, representations.csv and representation_types.csv

However, there is no representation_types.csv file for this company.

In [None]:
# File prefix follows directory_path instead of a hard-coded "8"
representation_category_capacities = display_databases(
    f"{directory_path}representation_category_capacities.csv"
)
representation_category_capacities.head()

In [None]:
# File prefix follows directory_path instead of a hard-coded "8"
representations = display_databases(f"{directory_path}representations.csv")
representations.head()

In [None]:
# Left disabled: there is no representation_types.csv file for company 8
# representation_type = display_databases("8representation_types.csv")

#### Analyze type of events

Meaning events.csv, event_types.csv, seasons.csv and facilities.csv

In [None]:
# File prefix follows directory_path instead of a hard-coded "8"
events = display_databases(f"{directory_path}events.csv")
events.head()

In [None]:
# File prefix follows directory_path instead of a hard-coded "8"
event_types = display_databases(f"{directory_path}event_types.csv")
event_types.head()

In [None]:
# File prefix follows directory_path instead of a hard-coded "8"
seasons = display_databases(f"{directory_path}seasons.csv")
seasons.head()

In [None]:
# File prefix follows directory_path instead of a hard-coded "8"
facilities = display_databases(f"{directory_path}facilities.csv")
facilities.head()

#### Analyze annexe databases

Meaning contributions.csv, contribution_sites.csv, currencies.csv, countries.csv and type_ofs.csv

## II. Identify Commons Datasets

From the analysis of the 8th company, we notice that some databases do not exist. Therefore, in order to construct a uniform database for all companies, we should first identify the databases that are common to all companies.

In [18]:
## We first build a dictionary mapping each company id to the list of its dataset names

def _strip_company_prefix(file_name, company_id):
    """Return file_name without its leading company id, if it starts with one."""
    # Only the leading occurrence is removed: str.replace would also delete the
    # same characters wherever they appear later in the file name.
    if file_name.startswith(company_id):
        return file_name[len(company_id):]
    return file_name


companies = fs.ls(BUCKET)
companies_database = {}

for company in companies:
    # Last path component of the listed entry is used as the company id
    company_id = company.split('/')[-1]
    companies_database[company_id] = [
        _strip_company_prefix(entry.split('/')[-1], company_id)
        for entry in fs.ls(company)
    ]


In [24]:
# Then we build the list of every database name seen for at least one company.
# The union is used rather than the longest per-company list, because the
# company with the most files is not guaranteed to hold every database.
# It also yields an empty list (instead of an error) when no company is listed.
all_database = sorted(set().union(*companies_database.values()))
print("Number of databases : ", len(all_database))

Number of databases :  30


In [39]:
## We then build the set of databases shared by every company

data_in_common = set(all_database)

print(len(data_in_common))

for company_files in companies_database.values():
    # Keep only the names this company also has (plain set intersection)
    data_in_common = data_in_common.intersection(company_files)

print(len(data_in_common))
    

30
23


## Create Universal database