## Notebook Alexis

In [1]:
import pandas as pd
import os
import s3fs

In [2]:
# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

BUCKET = "bdc2324-data"
fs.ls(BUCKET)

['bdc2324-data/1',
 'bdc2324-data/10',
 'bdc2324-data/101',
 'bdc2324-data/11',
 'bdc2324-data/12',
 'bdc2324-data/13',
 'bdc2324-data/14',
 'bdc2324-data/2',
 'bdc2324-data/3',
 'bdc2324-data/4',
 'bdc2324-data/5',
 'bdc2324-data/6',
 'bdc2324-data/7',
 'bdc2324-data/8',
 'bdc2324-data/9']

###  I. Analyse fichier 8

This section describes the databases associated with company 8. 

In [3]:
directory_path = '8'

In [4]:
# check the files in the directory

objects = fs.ls(f'{BUCKET}/{directory_path}')

for file in objects:
    print(file)

bdc2324-data/8/8campaign_stats.csv
bdc2324-data/8/8campaigns.csv
bdc2324-data/8/8categories.csv
bdc2324-data/8/8countries.csv
bdc2324-data/8/8currencies.csv
bdc2324-data/8/8customer_target_mappings.csv
bdc2324-data/8/8customersplus.csv
bdc2324-data/8/8event_types.csv
bdc2324-data/8/8events.csv
bdc2324-data/8/8facilities.csv
bdc2324-data/8/8link_stats.csv
bdc2324-data/8/8pricing_formulas.csv
bdc2324-data/8/8product_packs.csv
bdc2324-data/8/8products.csv
bdc2324-data/8/8products_groups.csv
bdc2324-data/8/8purchases.csv
bdc2324-data/8/8representation_category_capacities.csv
bdc2324-data/8/8representations.csv
bdc2324-data/8/8seasons.csv
bdc2324-data/8/8suppliers.csv
bdc2324-data/8/8target_types.csv
bdc2324-data/8/8targets.csv
bdc2324-data/8/8tickets.csv
bdc2324-data/8/8type_of_categories.csv
bdc2324-data/8/8type_of_pricing_formulas.csv
bdc2324-data/8/8type_ofs.csv


In [5]:
def display_databases(file_name):
    """
    This function returns the file from s3 storage
    """
    file_path = BUCKET + "/" + directory_path + "/" + file_name
    print("File path : ", file_path)
    with fs.open(file_path, mode="rb") as file_in:
        df = pd.read_csv(file_in, sep=",")
        
    print("Shape : ", df.shape)
    return df
    

#### Lookt at campaigns files

In [6]:
campaigns = display_databases("8campaigns.csv")
campaigns.head()

File path :  bdc2324-data/8/8campaigns.csv
Shape :  (1689, 11)


Unnamed: 0,id,name,service_id,created_at,updated_at,process_id,report_url,category,to_be_synced,identifier,sent_at
0,1,#LOUSFP RELANCE P'TITS LOU,1436,2022-02-01 15:22:53.564432+01:00,2022-02-01 15:22:53.564432+01:00,,,0,False,eaa32c96f620053cf442ad32258076b9,2022-01-31 00:00:00+01:00
1,2,#LOUSFP BRASSERIE ACHETEURS,1435,2022-02-01 15:22:53.572592+01:00,2022-02-01 15:22:53.572592+01:00,,,0,False,1f3202d820180a39f736f20fce790de8,2022-01-31 00:00:00+01:00
2,3,PRESSE. LOU/SF Paris - RDV et protocole,1433,2022-02-01 15:22:53.578426+01:00,2022-02-01 15:22:53.578426+01:00,,,0,False,b069b3415151fa7217e870017374de7c,2022-01-31 00:00:00+01:00
3,4,#LOUSFP ÉTUDIANTS,1432,2022-02-01 15:22:53.584235+01:00,2022-02-01 15:22:53.584235+01:00,,,0,False,56468d5607a5aaf1604ff5e15593b003,2022-01-27 00:00:00+01:00
4,5,#LOUSFP P'TITS LOU,1431,2022-02-01 15:22:53.590187+01:00,2022-02-01 15:22:53.590187+01:00,,,0,False,e11943a6031a0e6114ae69c257617980,2022-01-27 00:00:00+01:00


In [7]:
campaign_stats = display_databases("8campaign_stats.csv")
campaign_stats.head()

File path :  bdc2324-data/8/8campaign_stats.csv
Shape :  (2527083, 8)


Unnamed: 0,id,campaign_id,customer_id,opened_at,sent_at,delivered_at,created_at,updated_at
0,1,5,161410,2022-02-02 18:16:07+01:00,,,2022-02-02 17:16:08.616899+01:00,2022-02-02 17:16:08.623098+01:00
1,2,1,54228,2022-02-02 18:18:11+01:00,,,2022-02-02 17:18:12.030260+01:00,2022-02-02 17:18:12.036606+01:00
2,3,6,120794,2022-02-02 18:18:58+01:00,,,2022-02-02 17:19:00.129697+01:00,2022-02-02 17:19:00.134704+01:00
3,4,3,467025,2022-02-02 18:19:33+01:00,,,2022-02-02 17:19:34.023492+01:00,2022-02-02 17:19:34.027570+01:00
4,5,2,142106,2022-02-02 18:19:35+01:00,,,2022-02-02 17:19:36.553321+01:00,2022-02-02 17:19:36.557473+01:00


#### Look at links files

There is no links file for these company. Only the link_stats file

In [8]:
links_stats = display_databases("8link_stats.csv")
links_stats.head()

File path :  bdc2324-data/8/8link_stats.csv
Shape :  (108461, 6)


Unnamed: 0,id,clicked_at,link_id,customer_id,created_at,updated_at
0,1,2022-02-02 18:33:17+01:00,1,62137,2022-02-02 17:33:19.237759+01:00,2022-02-02 17:33:19.237759+01:00
1,2,2022-02-02 18:33:26+01:00,1,556048,2022-02-02 17:33:28.101943+01:00,2022-02-02 17:33:28.101943+01:00
2,3,2022-02-02 18:33:49+01:00,2,194456,2022-02-02 17:33:50.595125+01:00,2022-02-02 17:33:50.595125+01:00
3,4,2022-02-02 18:34:19+01:00,1,194456,2022-02-02 17:34:20.493986+01:00,2022-02-02 17:34:20.493986+01:00
4,5,2022-02-02 18:34:21+01:00,2,21571,2022-02-02 17:34:22.300427+01:00,2022-02-02 17:34:22.300427+01:00


#### Analyse Customersplus file

In [9]:
file_name = "8customersplus.csv"
file_path = BUCKET + "/" + directory_path + "/" + file_name
print(file_path)
with fs.open(file_path, mode="rb") as file_in:
    customersplus = pd.read_csv(file_in, sep=",")

customersplus.head()

bdc2324-data/8/8customersplus.csv


  customersplus = pd.read_csv(file_in, sep=",")


Unnamed: 0,id,lastname,firstname,birthdate,email,street_id,created_at,updated_at,civility,is_partner,...,preferred_category,preferred_supplier,preferred_formula,purchase_count,first_buying_date,last_visiting_date,zipcode,country,age,tenant_id
0,1411166,,,,email1411166,1,2022-12-19 15:03:39.419371+01:00,2022-12-19 15:03:39.419371+01:00,,False,...,,,,0,,,,fr,,1594
1,478498,lastname478498,firstname478498,,email478498,339167,2021-09-17 18:58:30.259053+02:00,2023-06-28 15:25:24.146689+02:00,,False,...,,,,0,,,,,,1594
2,473678,,,,email473678,339167,2021-09-17 18:44:04.119713+02:00,2021-09-17 18:44:04.124204+02:00,,False,...,,,,0,,,,,,1594
3,475026,,,,email475026,339167,2021-09-17 18:47:28.789618+02:00,2021-09-17 18:47:28.793958+02:00,,False,...,,,,0,,,,,,1594
4,487146,,,,email487146,339167,2021-09-17 19:10:24.070460+02:00,2021-09-17 19:10:24.076033+02:00,,False,...,,,,0,,,,,,1594


#### Analyse Structures files

In [10]:
file_name = "8structures.csv"
file_path = BUCKET + "/" + directory_path + "/" + file_name
print(file_path)
try:
    with fs.open(file_path, mode="rb") as file_in:
        structures = pd.read_csv(file_in, sep=",")
except:
    print("No structures database")

bdc2324-data/8/8structures.csv
No structures database


For Stade Français, there is no structures, tags and structure_tag_mapping databases

#### Analyze Target databases

In [11]:
file_name = "8customer_target_mappings.csv"
file_path = BUCKET + "/" + directory_path + "/" + file_name
print(file_path)
try:
    with fs.open(file_path, mode="rb") as file_in:
        customer_targets = pd.read_csv(file_in, sep=",")
        
except:
    print("No such database in s3")

print("Shape : ", customer_targets.shape)
customer_targets.head()

bdc2324-data/8/8customer_target_mappings.csv
Shape :  (1449147, 7)


Unnamed: 0,id,customer_id,target_id,created_at,updated_at,name,extra_field
0,1,460062,68,2021-09-17 20:20:24.562734+02:00,2021-09-17 20:20:24.562734+02:00,,
1,2,460056,68,2021-09-17 20:20:24.610139+02:00,2021-09-17 20:20:24.610139+02:00,,
2,3,460051,65,2021-09-17 20:20:24.641381+02:00,2021-09-17 20:20:24.641381+02:00,,
3,4,460051,66,2021-09-17 20:20:24.672238+02:00,2021-09-17 20:20:24.672238+02:00,,
4,5,460049,71,2021-09-17 20:20:24.703110+02:00,2021-09-17 20:20:24.703110+02:00,,


In [12]:
file_name = "8targets.csv"
file_path = BUCKET + "/" + directory_path + "/" + file_name
print(file_path)
try:
    with fs.open(file_path, mode="rb") as file_in:
        targets = pd.read_csv(file_in, sep=",")
        
except:
    print("No such database in s3")

print("Shape : ", targets.shape)
targets.head()

bdc2324-data/8/8targets.csv
Shape :  (331, 5)


Unnamed: 0,id,target_type_id,name,created_at,updated_at
0,1,1,ÉTUDIANTS (OPÉ PANIERS) 21-22,2021-09-17 18:10:40.879995+02:00,2021-09-17 18:10:40.879995+02:00
1,2,1,EFFECTIF + STAFF 21-22,2021-09-17 18:10:40.894758+02:00,2021-09-17 18:10:40.894758+02:00
2,3,1,Acheteurs LOU / USAP,2021-09-17 18:10:40.911969+02:00,2021-09-17 18:10:40.911969+02:00
3,4,1,Liste Compensation 21-22,2021-09-17 18:10:40.928796+02:00,2021-09-17 18:10:40.928796+02:00
4,5,1,Partenaires 21-22,2021-09-17 18:10:40.945476+02:00,2021-09-17 18:10:40.945476+02:00


In [13]:
file_name = "8target_types.csv"
file_path = BUCKET + "/" + directory_path + "/" + file_name
print(file_path)
try:
    with fs.open(file_path, mode="rb") as file_in:
        target_types = pd.read_csv(file_in, sep=",")
        
except:
    print("No such database in s3")

print("Shape : ", target_types.shape)
target_types.head()

bdc2324-data/8/8target_types.csv
Shape :  (4, 6)


Unnamed: 0,id,is_import,name,created_at,updated_at,identifier
0,1,,manual_static_filter,2021-09-17 18:10:40.864320+02:00,2021-09-17 18:10:40.864320+02:00,e34e3aa838a6eb4c41df6ed4444b796a
1,2,False,manual_dynamic_filter,2022-03-09 14:41:45.695407+01:00,2022-03-09 14:41:45.695407+01:00,e0f4b8693184850fefd6d2a38f10584e
2,3,False,manual_static_filter,2022-04-01 17:02:49.588910+02:00,2022-04-01 17:02:49.588910+02:00,fb27e81baa4debc6a4e1a8639c20e808
3,4,True,manual_import,2022-05-06 14:26:01.923160+02:00,2022-05-06 14:26:01.923160+02:00,12213df2ce68a624e4c0070521437bac


#### Analyze consumption files

Meaning consumptions.csv, suppliers.csv, tickets.csv and purchases.csv

However, there is no consumptions.csv file

In [14]:
purchases = display_databases("8purchases.csv")
purchases.head()

File path :  bdc2324-data/8/8purchases.csv
Shape :  (975703, 7)


Unnamed: 0,id,purchase_date,customer_id,created_at,updated_at,number,identifier
0,119609,2017-09-09 15:39:45.913000+02:00,1149,2021-06-29 21:52:21.816195+02:00,2021-06-29 21:52:21.816195+02:00,193416,f2956e2d53321317e7c15c1cb992156c
1,119610,2017-09-09 15:39:46.033000+02:00,1149,2021-06-29 21:52:21.817846+02:00,2021-06-29 21:52:21.817846+02:00,193416,faabab441b2668a85bb484490b2166c3
2,5464,2017-07-24 19:44:11.923000+02:00,1251,2021-06-29 21:33:45.604224+02:00,2021-06-29 21:33:45.604224+02:00,184354,f63c69fa585ce4f91681f0d9ebeb770f
3,119613,2017-09-10 11:25:45.820000+02:00,12558,2021-06-29 21:52:21.822033+02:00,2021-06-29 21:52:21.822033+02:00,193462,ffce5fd8d2348eb6885d0ee9c7bd017c
4,1422860,2018-10-08 10:30:42.980000+02:00,17935,2021-07-16 04:20:55.347369+02:00,2021-07-16 04:20:55.347369+02:00,247459,193e41eae8ee078537107a569c0426ef


In [15]:
tickets = display_databases("8tickets.csv")
tickets.head()

File path :  bdc2324-data/8/8tickets.csv
Shape :  (2370152, 11)


Unnamed: 0,id,number,created_at,updated_at,purchase_id,product_id,is_from_subscription,type_of,supplier_id,barcode,identifier
0,254164,193416_763837_650_688_326212,2021-06-29 21:53:14.951871+02:00,2021-06-29 21:53:14.951871+02:00,119609,3334,False,1,2,,9ec3b5617fc54512acf131aa5fa26870
1,254165,193416_763838_650_688_326236,2021-06-29 21:53:14.953717+02:00,2021-06-29 21:53:14.953717+02:00,119610,3334,False,1,2,,b227c664e2574a919672683f5cc4c98e
2,254168,193462_763921_649_687_305676,2021-06-29 21:53:14.958207+02:00,2021-06-29 21:53:14.958207+02:00,119613,3432,False,1,2,,28ac507ad84a30993bdfc0996fd2476b
3,254169,193462_763922_649_687_305653,2021-06-29 21:53:14.959681+02:00,2021-06-29 21:53:14.959681+02:00,119614,3268,False,1,2,,131dbaeef23f5ac2271bf0266ce35476
4,254170,193462_763923_649_687_305630,2021-06-29 21:53:14.961157+02:00,2021-06-29 21:53:14.961157+02:00,119615,3268,False,1,2,,1a6342ad2c213b626aa55e5374cd661a


In [16]:
suppliers = display_databases("8suppliers.csv")
suppliers.head()

File path :  bdc2324-data/8/8suppliers.csv
Shape :  (16, 9)


Unnamed: 0,id,name,manually_added,label,itr,updated_at,created_at,commission,identifier
0,152,plateformeceweb,False,,,2021-07-16 00:02:17.805193+02:00,2021-07-16 00:02:17.805193+02:00,,0fc934f49bfa9f1f4e6ab7e2593b6839
1,6,accreditation annuelle,False,,,2021-06-29 21:33:14.138349+02:00,2021-06-29 21:33:14.138349+02:00,,fe13238540e0ff293ec8aad29aeae6c3
2,68,abonnement parking,False,,,2021-06-29 22:10:31.167367+02:00,2021-06-29 22:10:31.167367+02:00,,0f7defc52a97cdca533af74f4e6e5b1e
3,9,accreditation match,False,,,2021-06-29 21:33:14.142084+02:00,2021-06-29 21:33:14.142084+02:00,,40e19a7c4824eaad298e0107ed7e3691
4,154,web lnr-lou,False,,,2021-07-16 00:02:17.806521+02:00,2021-07-16 00:02:17.806521+02:00,,b144dd617807b02e0d9002fac6c61768


#### Analyse product file

In [17]:
products = display_databases("8products.csv")
products.head()

File path :  bdc2324-data/8/8products.csv
Shape :  (45411, 14)


Unnamed: 0,id,amount,is_full_price,representation_id,pricing_formula_id,created_at,updated_at,category_id,apply_price,products_group_id,product_pack_id,extra_field,amount_consumption,identifier
0,90013,0.0,False,1961,912,2021-07-16 04:56:05.797551+02:00,2021-07-16 04:56:05.797551+02:00,34,0.0,87917,1,,,476e111175b1660688b7c13dade2b57e
1,662,0.0,False,11,29,2021-06-29 21:33:17.389201+02:00,2021-06-29 21:33:17.389201+02:00,16,0.0,640,1,,,2c765698e9bedd48e8a3fd27dc8dbc97
2,646,0.0,False,46,10,2021-06-29 21:33:17.366742+02:00,2021-06-29 21:33:17.366742+02:00,15,0.0,624,1,,,4e719148651fd7f175e3fb51bdb5d31b
3,5703,5.0,False,7,188,2021-06-29 21:52:09.374365+02:00,2021-06-29 21:52:09.374365+02:00,4,0.0,5540,1,,,e4d7beeb0a631e2e51e61951623ba9b1
4,648,0.0,False,49,10,2021-06-29 21:33:17.369471+02:00,2021-06-29 21:33:17.369471+02:00,15,0.0,626,1,,,07a5dd9e125345b9458651ab73605255


#### Analyze pricing files

Meaning pricing_formulas.csv and type_of_pricing_formulas

In [18]:
pricing_formulas = display_databases("8pricing_formulas.csv")
pricing_formulas.head()

File path :  bdc2324-data/8/8pricing_formulas.csv
Shape :  (516, 6)


Unnamed: 0,id,name,created_at,updated_at,extra_field,identifier
0,7,visite stade enfant,2021-06-29 21:33:14.160728+02:00,2021-06-29 21:33:14.160728+02:00,,bbc80e5761a0ea325f6f6a5411752659
1,3229,tarif bloc etudiants,2021-07-16 04:20:46.684601+02:00,2021-09-03 16:44:46.096785+02:00,,205122cc7e96d559330972b0ec0cf35a
2,42,invitation eiffage,2021-06-29 21:33:14.204483+02:00,2021-06-29 21:33:14.204483+02:00,,e4e6365c02e2a7b01ebe2ce8ace624f2
3,4379,invitation offre speciale,2021-07-16 05:21:44.984893+02:00,2021-07-16 05:21:44.984893+02:00,,307817b6205535a35915a64027ee161e
4,2641,prevente reabo enfant,2021-07-16 03:47:40.896805+02:00,2021-09-03 16:08:35.304298+02:00,,478eb63c71ba35d8d3d64c8637dafdee


In [19]:
type_pricing_formulas = display_databases("8type_of_pricing_formulas.csv")
type_pricing_formulas.head()

File path :  bdc2324-data/8/8type_of_pricing_formulas.csv
Shape :  (103, 6)


Unnamed: 0,id,type_of_id,pricing_formula_id,created_at,updated_at,identifier
0,1,7,1021,2021-09-03 14:17:19.816110+02:00,2021-09-03 14:17:19.816110+02:00,41047fbeb7cd3e1cb2713c608d2f786d
1,2,7,4305,2021-09-03 14:17:19.848088+02:00,2021-09-03 14:17:19.848088+02:00,a62a4dad7d62738129244bbb5ede0747
2,3,7,4306,2021-09-03 14:17:19.864067+02:00,2021-09-03 14:17:19.864067+02:00,c3770373e09f55412068c447736d9da3
3,4,7,29,2021-09-03 14:17:19.880078+02:00,2021-09-03 14:17:19.880078+02:00,7b7b1242ae7a8c9eb66d35d8a4348ccd
4,5,8,10,2021-09-03 14:18:03.616081+02:00,2021-09-03 14:18:03.616081+02:00,0a2b941c46b31258c03b316aa064e86a


#### Analyze type of products

Meaning categories.csv, type_of_categories.csv

In [20]:
categories = display_databases("8categories.csv")
categories.head()

File path :  bdc2324-data/8/8categories.csv
Shape :  (148, 7)


Unnamed: 0,id,name,created_at,updated_at,extra_field,quota,identifier
0,653,acces village implid,2021-07-16 00:04:37.181331+02:00,2021-07-16 00:04:37.181331+02:00,,,c447d053646a6503d3cd84d4798bf5b7
1,805,parking organisation,2021-07-16 01:54:15.822407+02:00,2021-07-16 01:54:15.822407+02:00,,,02bf9871964345f505ad305080daec36
2,809,rose rouge orange,2021-07-16 01:54:15.825345+02:00,2021-07-16 01:54:15.825345+02:00,,,31fb5b57bc1a2bcd5c155fb0d9e7c0dd
3,2183,2eme catégorie j.b. centrale,2021-07-16 04:37:25.446835+02:00,2021-07-16 04:37:25.446835+02:00,,,c9eb6651caaed42b809b3f4407a847c9
4,621,acces brasserie,2021-07-16 00:02:17.249701+02:00,2021-07-16 00:02:17.249701+02:00,,,349e6a59585d78d80d46acbc6a520c50


In [21]:
type_categories = display_databases("8type_of_categories.csv")
type_categories.head()

File path :  bdc2324-data/8/8type_of_categories.csv
Shape :  (6, 6)


Unnamed: 0,id,type_of_id,category_id,created_at,updated_at,identifier
0,1,1,2,2021-08-20 15:22:05.558209+02:00,2021-08-20 15:22:05.558209+02:00,af8fa6d57f6b19a7600a69e7771c7c3a
1,2,2,1,2021-09-02 17:29:32.582002+02:00,2021-09-02 17:29:32.582002+02:00,63718e7ad306912427758ddf988ad34f
2,3,3,3,2021-09-02 17:32:38.299733+02:00,2021-09-02 17:32:38.299733+02:00,5e147d4d90888df14c4584f5c6887c96
3,4,4,4,2021-09-02 17:35:04.748993+02:00,2021-09-02 17:35:04.748993+02:00,a9dfdc3f40b41e3018933c6167fc38a5
4,5,5,17,2021-09-02 17:35:37.396740+02:00,2021-09-02 17:35:37.396740+02:00,c05b0061d2a875adbc35d3dfa6a50a12


#### Analyze type of representations

Meaning representation_category_capacities.csv, representations.csv, representations_types.csv

however there is no representation_types database

In [22]:
representation_category_capacities = display_databases("8representation_category_capacities.csv")
representation_category_capacities.head()

File path :  bdc2324-data/8/8representation_category_capacities.csv
Shape :  (7378, 7)


Unnamed: 0,id,created_at,updated_at,representation_id,category_id,expected_filling,max_filling
0,561,2021-06-29 21:33:14.096827+02:00,2021-06-29 21:33:14.096827+02:00,17,37,,
1,571,2021-06-29 21:33:14.110047+02:00,2021-06-29 21:33:14.110047+02:00,14,39,,
2,9665,2021-07-16 00:02:17.736387+02:00,2021-07-16 00:02:17.736387+02:00,1887,8,,
3,383906,2023-03-04 02:55:01.585418+01:00,2023-03-04 02:55:01.585418+01:00,52729,476,,
4,393,2021-06-29 21:33:13.876766+02:00,2021-06-29 21:33:13.876766+02:00,9,23,,


In [23]:
representations = display_databases("8representations.csv")
representations.head()

File path :  bdc2324-data/8/8representations.csv
Shape :  (1015, 16)


Unnamed: 0,id,serial,event_id,created_at,updated_at,start_date_time,open,satisfaction,end_date_time,name,is_display,representation_type_id,expected_filling,max_filling,extra_field,identifier
0,5903,,5836,2021-07-16 05:16:57.419565+02:00,2021-07-16 05:16:57.419565+02:00,2019-08-24 18:00:00+02:00,True,,1901-01-01 00:09:21+00:09,,True,,,,,8009c34cae4e79e3781f16f3ceeab244
1,67133,,65652,2023-09-27 02:21:36.573001+02:00,2023-09-27 02:21:36.573001+02:00,2023-10-04 10:30:00+02:00,True,,1901-01-01 00:09:21+00:09,,True,,,,,4e9d3fc8d1f7bf563dc586548fe6390e
2,1874,,1826,2021-07-16 00:02:17.390274+02:00,2021-07-16 00:02:17.390274+02:00,2019-09-14 18:00:00+02:00,True,,1901-01-01 00:09:21+00:09,,True,,,,,19f666370c1fc781dff638c20ae04c8a
3,5904,,5837,2021-07-16 05:16:57.420302+02:00,2021-07-16 05:16:57.420302+02:00,2019-09-01 17:05:00+02:00,True,,1901-01-01 00:09:21+00:09,,True,,,,,4221acd3f49179f5d0b292c15d1ab8e4
4,4165,,4106,2021-07-16 03:53:05.929713+02:00,2021-07-16 03:53:05.929713+02:00,2018-10-14 14:00:00+02:00,True,,1901-01-01 00:09:21+00:09,,True,,,,,733104286519c0614b2d45470eb180a1


In [24]:
#representation_type = display_databases("8representation_types.csv")

#### Analyze type of events

Meaning events.csv, event_types.csv, seasons.csv and facilities.csv

In [25]:
events = display_databases("8events.csv")
events.head()

File path :  bdc2324-data/8/8events.csv
Shape :  (922, 12)


Unnamed: 0,id,created_at,updated_at,season_id,facility_id,name,event_type_id,manual_added,is_display,event_type_key_id,facility_key_id,identifier
0,41542,2022-10-29 02:54:32.756920+02:00,2022-10-29 02:57:35.511792+02:00,52,1,match lou feminin - lons,5588,False,True,5588,1,40cc5a346b1af4ee7108ac28b144fb77
1,21068,2021-12-17 03:43:53.166446+01:00,2021-12-17 03:46:40.346096+01:00,51,1,repas brasserie lou-racing,2310,False,True,2310,1,500b670b79aa592ecb06f4957800a752
2,59812,2023-05-26 01:45:54.321665+02:00,2023-05-26 01:46:01.571397+02:00,1501,2,parking match 2,10185,False,True,10185,2,d5f62ed879867b8b51ed7b85f1fc3ab0
3,3424,2021-07-16 03:13:06.988358+02:00,2021-07-16 05:33:31.321933+02:00,1,1,rugby + hockey sur glace,5,False,True,5,1,822b47176c355a647aa2dbdf8dfbc594
4,21379,2021-12-23 02:37:22.948114+01:00,2021-12-23 02:38:20.726329+01:00,51,1,bloc des etudiants lou-racing,2562,False,True,2562,1,17b91f19c71ff6287ffc1f44af952576


In [26]:
event_types = display_databases("8event_types.csv")
event_types.head()

File path :  bdc2324-data/8/8event_types.csv
Shape :  (73, 6)


Unnamed: 0,id,name,created_at,updated_at,fidelity_delay,identifier
0,1,standard,2021-06-29 13:52:10.434850+02:00,2021-06-29 13:52:10.434850+02:00,36,c00f0c4675b91fb8b918e4079a0b1bac
1,11,ptit lou,2021-06-29 21:33:13.000743+02:00,2021-06-29 21:33:13.000743+02:00,36,dedd3579bc13b3ed7a90277247d9944b
2,274,parking 19-20,2021-07-16 00:02:17.225410+02:00,2021-07-16 00:02:17.225410+02:00,36,0d348caeec0b66f9d4987dfbe30e1e8b
3,129,events 2018-2019,2021-06-30 01:35:18.110429+02:00,2021-06-30 01:35:18.110429+02:00,36,65eb39ddf8f79d28d93c2f2c53118f50
4,10,accreditations 2017-2018,2021-06-29 21:33:12.999510+02:00,2021-06-29 21:33:12.999510+02:00,36,732cfdcf2065fa0005faf42793ddd76c


In [27]:
seasons = display_databases("8seasons.csv")
seasons.head()

File path :  bdc2324-data/8/8seasons.csv
Shape :  (16, 6)


Unnamed: 0,id,name,created_at,updated_at,start_date_time,identifier
0,1501,saison 2023-2024,2022-06-25 03:07:31.209270+02:00,2022-06-25 03:07:31.209270+02:00,,71f5c069ce45c5e933dcc37c22507fbf
1,1194,saison 2049-2050,2022-02-17 03:24:23.942691+01:00,2022-02-17 03:24:23.942691+01:00,,44e20620bbc5926db2e295d38b606afd
2,2,saison 2016-2017,2021-06-29 21:33:00.702563+02:00,2021-06-29 21:33:00.702563+02:00,,f9cf989d4f49300220df67ef93aa2294
3,47,saison 2018-2019,2021-06-30 01:35:15.156097+02:00,2021-06-30 01:35:15.156097+02:00,,eec50c35fbf8593b364ced287335d90c
4,100,saison 2010-2011,2021-07-16 00:23:27.607648+02:00,2021-07-16 00:23:27.607648+02:00,,7ccc51049a85e0df9b80662e45b6ddb8


In [28]:
facilities = display_databases("8facilities.csv")
facilities.head()

File path :  bdc2324-data/8/8facilities.csv
Shape :  (5, 7)


Unnamed: 0,id,name,created_at,updated_at,street_id,fixed_capacity,identifier
0,74,plan pour campagne d'abo 2011/2012,2021-07-16 00:23:30.337698+02:00,2021-07-16 00:23:30.337698+02:00,1,,2e1d25d5f7e46e23c734fe0e4951390e
1,3,accreditation,2021-06-29 21:33:13.018552+02:00,2021-06-29 21:33:13.018552+02:00,1,,da37a04e592cbd344142730ce05a6887
2,4,organisation match exterieur,2021-06-29 21:33:13.019878+02:00,2021-06-29 21:33:13.019878+02:00,1,,8f9ee8c2e954585f7c68096d7f1cf4f1
3,2,parking matmut stadium,2021-06-29 21:33:13.017165+02:00,2021-06-29 21:33:13.017165+02:00,1,,aeab282982ea738674dbf5c3763a0be0
4,1,matmut stadium,2021-06-29 21:33:13.004560+02:00,2021-06-29 21:33:13.004560+02:00,1,,89feffd283ebdabdc3b81fb62ea4f6f0


#### Analyze annexe databases

Meaning contributions.csv, contribution_sites.csv, currencies.csv, countries.csv and type_ofs.csc

## II. Identify Commons Datasets

From the analyze of the 8th company, we notice that some databases does not exist. Therefore, in order to construct a uniform database for all companies, we should first identify the common databases between all companies

In [29]:
## We first construct a dictionary reporting all the datasets for each companies

companies = fs.ls(BUCKET)
companies_database = {}

for company in companies:
    companies_database[company.split('/')[-1]] = [file.split('/')[-1].replace(company.split('/')[-1], '') for file in fs.ls(company)] 


In [30]:
# Then we create a list of all database

all_database = companies_database[max(companies_database, key=lambda x: len(companies_database[x]))]
print("Number of databases : ",len(all_database))

Number of databases :  30


In [31]:
## We then create a set of database in common for all companies

data_in_common = set(all_database)

print(len(data_in_common))

for key in companies_database:
    diff_database = data_in_common.symmetric_difference(companies_database[key])
    data_in_common = data_in_common - diff_database

print(len(data_in_common))
    

30
23


## Create Universal database

We will first create a procedure to clean the datasets of a company and then merge them. Hence, we will be able to replicate this procedure for all companies and create a universal database.

Let's first create our procedure for the company 1 and the datasets belongings to the theme producst

In [32]:
directory_path = '1'

In [33]:
theme_products = ["products.csv" ,"categories.csv", "type_of_categories.csv"]

In [34]:
def remove_horodates(df):
    """
    this function remove horodate columns like created_at and updated_at
    """
    df = df.drop(columns = ["created_at", "updated_at"])
    return df

In [35]:
def order_columns_id(df):
    """
    this function puts all id columns at the beginning in order to read the dataset easier
    """
    substring = 'id'
    id_columns = [col for col in df.columns if substring in col]
    remaining_col = [col for col in df.columns if substring not in col]
    new_order = id_columns + remaining_col
    return df[new_order]

In [36]:
def percent_na(df):
    """
    this function returns the percentage of na for each column
    """
    percent_missing = df.isna().sum() * 100 / len(df)
    return percent_missing

In [37]:
def process_df(df):
    df = remove_horodates(df)
    print("Number of columns : ", len(df.columns))
    df = order_columns_id(df)
    print("Columns : ", df.columns)
    print("Percent of NA for each column : ", percent_na(df))
    return df

#### Deep analysis of products.csv

In [38]:
products = display_databases("1products.csv")
print("Number of columns : ", len(products.columns))
products.head()

File path :  bdc2324-data/1/1products.csv
Shape :  (94803, 14)
Number of columns :  14


Unnamed: 0,id,amount,is_full_price,representation_id,pricing_formula_id,created_at,updated_at,category_id,apply_price,products_group_id,product_pack_id,extra_field,amount_consumption,identifier
0,10682,9.0,False,914,114,2020-09-03 14:09:43.119798+02:00,2020-09-03 14:09:43.119798+02:00,41,0.0,10655,1,,,35c88f2db8a63d7474e46eb8ca9260e7
1,478,9.5,False,273,131,2020-09-03 13:21:22.711773+02:00,2020-09-03 13:21:22.711773+02:00,1,0.0,471,1,,,8a179671ab198e570e6a104c4451379f
2,20873,11.5,False,275,137,2020-09-03 14:46:33.589030+02:00,2020-09-03 14:46:33.589030+02:00,1,0.0,20825,1,,,ee83779ce29e67ad251e40234b426d6a
3,157142,8.0,False,82519,9,2022-01-28 19:29:23.525722+01:00,2022-01-28 19:29:23.525722+01:00,5,0.0,156773,1,,,d865383579314b791aa4bcf3fb418f17
4,1341,8.5,False,9,93,2020-09-03 13:29:30.773089+02:00,2020-09-03 13:29:30.773089+02:00,1,0.0,1175,1,,,f1c4689bc47dee6f60b56d74b593dd46


In [39]:
products = remove_horodates(products)
print("Number of columns : ", len(products.columns))
products = order_columns_id(products)
print("Columns : ", products.columns)
products.head()

Number of columns :  12
Columns :  Index(['id', 'representation_id', 'pricing_formula_id', 'category_id',
       'products_group_id', 'product_pack_id', 'identifier', 'amount',
       'is_full_price', 'apply_price', 'extra_field', 'amount_consumption'],
      dtype='object')


Unnamed: 0,id,representation_id,pricing_formula_id,category_id,products_group_id,product_pack_id,identifier,amount,is_full_price,apply_price,extra_field,amount_consumption
0,10682,914,114,41,10655,1,35c88f2db8a63d7474e46eb8ca9260e7,9.0,False,0.0,,
1,478,273,131,1,471,1,8a179671ab198e570e6a104c4451379f,9.5,False,0.0,,
2,20873,275,137,1,20825,1,ee83779ce29e67ad251e40234b426d6a,11.5,False,0.0,,
3,157142,82519,9,5,156773,1,d865383579314b791aa4bcf3fb418f17,8.0,False,0.0,,
4,1341,9,93,1,1175,1,f1c4689bc47dee6f60b56d74b593dd46,8.5,False,0.0,,


In [40]:
print(products.dtypes)

id                      int64
representation_id       int64
pricing_formula_id      int64
category_id             int64
products_group_id       int64
product_pack_id         int64
identifier             object
amount                float64
is_full_price            bool
apply_price           float64
extra_field           float64
amount_consumption    float64
dtype: object


In [41]:
percent_missing = products.isna().sum() * 100 / len(products)
print(percent_missing)

id                      0.0
representation_id       0.0
pricing_formula_id      0.0
category_id             0.0
products_group_id       0.0
product_pack_id         0.0
identifier              0.0
amount                  0.0
is_full_price           0.0
apply_price             0.0
extra_field           100.0
amount_consumption    100.0
dtype: float64


#### Deep analysis of categories.csv

In [42]:
name_dataset = '1categories.csv'

In [43]:
df = display_databases(name_dataset)
print("Number of columns : ", len(df.columns))
df.head()

File path :  bdc2324-data/1/1categories.csv
Shape :  (27, 7)
Number of columns :  7


Unnamed: 0,id,name,created_at,updated_at,extra_field,quota,identifier
0,30,en nb entrées gr,2020-09-03 13:21:20.019202+02:00,2020-09-03 13:21:20.019202+02:00,,,849ab2791a14f5fc2bb4d87ab2b78bf6
1,16,indiv activité enfant,2020-09-03 13:11:23.306968+02:00,2020-09-03 13:11:23.306968+02:00,,,425fd2f01984cc4ba030c1be98f42c33
2,39,indiv activité gr,2020-09-03 13:21:20.029901+02:00,2020-09-03 13:21:20.029901+02:00,,,9244dd3738788db0d22a5d0afe687b69
3,1108,groupe forfait adulte,2020-09-19 02:06:43.145697+02:00,2020-09-19 02:06:43.145697+02:00,,,3edda20c877a93b5ff883827238eb711
4,6,groupe forfait entrées tr,2020-09-03 13:11:23.264997+02:00,2020-09-03 13:11:23.264997+02:00,,,ff48df4b2dd5a14116bf4d280b31621e


In [44]:
df = process_df(df)
df.head()

Number of columns :  5
Columns :  Index(['id', 'identifier', 'name', 'extra_field', 'quota'], dtype='object')
Percent of NA for each column :  id               0.000000
identifier       0.000000
name             3.703704
extra_field    100.000000
quota          100.000000
dtype: float64


Unnamed: 0,id,identifier,name,extra_field,quota
0,30,849ab2791a14f5fc2bb4d87ab2b78bf6,en nb entrées gr,,
1,16,425fd2f01984cc4ba030c1be98f42c33,indiv activité enfant,,
2,39,9244dd3738788db0d22a5d0afe687b69,indiv activité gr,,
3,1108,3edda20c877a93b5ff883827238eb711,groupe forfait adulte,,
4,6,ff48df4b2dd5a14116bf4d280b31621e,groupe forfait entrées tr,,


In [45]:
df.dtypes

id               int64
identifier      object
name            object
extra_field    float64
quota          float64
dtype: object

#### Deep analysis of type_of_categories.csv

#### Deep analysis of representation_category_capacities.csv

#### Deep analysis of representations.csv

#### Deep analysis of events.csv

In [46]:
name_dataset = '1events.csv'

In [47]:
df = display_databases(name_dataset)
print("Number of columns : ", len(df.columns))
df.head()

File path :  bdc2324-data/1/1events.csv
Shape :  (1232, 12)
Number of columns :  12


Unnamed: 0,id,created_at,updated_at,season_id,facility_id,name,event_type_id,manual_added,is_display,event_type_key_id,facility_key_id,identifier
0,192,2020-09-03 13:36:42.216991+02:00,2021-11-02 15:06:40.663219+01:00,16,1,frontières,4,False,True,4,1,c1cecd093146068fd57896e254e98170
1,30329,2023-11-04 02:50:34.602462+01:00,2023-11-04 02:52:26.138154+01:00,2767,1,visite guidée une autre histoire du monde (1h00),5,False,True,5,1,f510a6710878d7aca36e71c54abab525
2,161,2020-09-03 13:29:27.944002+02:00,2021-11-02 15:06:40.652026+01:00,16,1,visite contée les chercheurs d'or indiv,2,False,True,2,1,21177fa9acad1ae2b1f595690fb853d3
3,5957,2021-07-31 11:16:42.575583+02:00,2021-11-02 15:06:40.663219+01:00,582,1,we dreamt of utopia and we woke up screaming.,4,False,True,4,1,962601f1eb153d45d49437f8fe839f7f
4,8337,2021-08-17 13:40:34.111923+02:00,2021-11-02 15:06:40.663219+01:00,582,1,jeff koons épisodes 4,4,False,True,4,1,bfa22f5a2364a2dacfc45cca1c8d3215


In [48]:
df = process_df(df)
df.head()

Number of columns :  10
Columns :  Index(['id', 'season_id', 'facility_id', 'event_type_id', 'event_type_key_id',
       'facility_key_id', 'identifier', 'name', 'manual_added', 'is_display'],
      dtype='object')
Percent of NA for each column :  id                   0.000000
season_id            0.000000
facility_id          0.000000
event_type_id        0.000000
event_type_key_id    0.000000
facility_key_id      0.000000
identifier           0.000000
name                 0.974026
manual_added         0.000000
is_display           0.000000
dtype: float64


Unnamed: 0,id,season_id,facility_id,event_type_id,event_type_key_id,facility_key_id,identifier,name,manual_added,is_display
0,192,16,1,4,4,1,c1cecd093146068fd57896e254e98170,frontières,False,True
1,30329,2767,1,5,5,1,f510a6710878d7aca36e71c54abab525,visite guidée une autre histoire du monde (1h00),False,True
2,161,16,1,2,2,1,21177fa9acad1ae2b1f595690fb853d3,visite contée les chercheurs d'or indiv,False,True
3,5957,582,1,4,4,1,962601f1eb153d45d49437f8fe839f7f,we dreamt of utopia and we woke up screaming.,False,True
4,8337,582,1,4,4,1,bfa22f5a2364a2dacfc45cca1c8d3215,jeff koons épisodes 4,False,True


In [49]:
df.dtypes

id                    int64
season_id             int64
facility_id           int64
event_type_id         int64
event_type_key_id     int64
facility_key_id       int64
identifier           object
name                 object
manual_added           bool
is_display             bool
dtype: object

#### Deep analysis of event_types.csv

In [50]:
name_dataset = '1event_types.csv'

In [51]:
df = display_databases(name_dataset)
print("Number of columns : ", len(df.columns))
df.head()

File path :  bdc2324-data/1/1event_types.csv
Shape :  (9, 6)
Number of columns :  6


Unnamed: 0,id,name,created_at,updated_at,fidelity_delay,identifier
0,1,standard,2020-09-03 12:24:22.574262+02:00,2020-09-03 12:24:22.574262+02:00,36,c00f0c4675b91fb8b918e4079a0b1bac
1,66,package,2020-09-03 14:05:04.648137+02:00,2020-09-03 14:05:04.648137+02:00,36,efe90a8e604a7c840e88d03a67f6b7d8
2,83,guide multimédias,2020-09-03 14:15:17.252539+02:00,2020-09-03 14:15:17.252539+02:00,36,ee14c62b3b9f6c7dd5401685a18e4460
3,3,non défini,2020-09-03 13:11:23.117024+02:00,2020-09-03 13:11:23.117024+02:00,36,52ff3466787b4d538407372e5f7afe0f
4,2723,,2021-12-22 09:45:47.715105+01:00,2021-12-22 09:45:47.715105+01:00,36,d41d8cd98f00b204e9800998ecf8427e


In [52]:
df = process_df(df)
df.head()

Number of columns :  4
Columns :  Index(['id', 'fidelity_delay', 'identifier', 'name'], dtype='object')
Percent of NA for each column :  id                 0.000000
fidelity_delay     0.000000
identifier         0.000000
name              11.111111
dtype: float64


Unnamed: 0,id,fidelity_delay,identifier,name
0,1,36,c00f0c4675b91fb8b918e4079a0b1bac,standard
1,66,36,efe90a8e604a7c840e88d03a67f6b7d8,package
2,83,36,ee14c62b3b9f6c7dd5401685a18e4460,guide multimédias
3,3,36,52ff3466787b4d538407372e5f7afe0f,non défini
4,2723,36,d41d8cd98f00b204e9800998ecf8427e,


In [53]:
df.dtypes

id                 int64
fidelity_delay     int64
identifier        object
name              object
dtype: object

#### Deep analysis of seasons.csv

In [54]:
name_dataset = '1seasons.csv'

In [55]:
df = display_databases(name_dataset)
print("Number of columns : ", len(df.columns))
df.head()

File path :  bdc2324-data/1/1seasons.csv
Shape :  (13, 6)
Number of columns :  6


Unnamed: 0,id,name,created_at,updated_at,start_date_time,identifier
0,943,2013,2021-07-29 08:55:33.282607+02:00,2021-07-29 08:55:33.282607+02:00,,8038da89e49ac5eabb489cfc6cea9fc1
1,129,2014,2020-09-03 15:13:08.105567+02:00,2020-09-03 15:13:08.105567+02:00,,cee8d6b7ce52554fd70354e37bbf44a2
2,3,2015,2020-09-03 13:11:19.405037+02:00,2020-09-03 13:11:19.405037+02:00,,65d2ea03425887a717c435081cfc5dbb
3,2,2016,2020-09-03 13:11:19.401001+02:00,2020-09-03 13:11:19.401001+02:00,,95192c98732387165bf8e396c0f2dad2
4,4,2017,2020-09-03 13:11:19.409005+02:00,2020-09-03 13:11:19.409005+02:00,,8d8818c8e140c64c743113f563cf750f


In [56]:
df = process_df(df)
df.head()

Number of columns :  4
Columns :  Index(['id', 'identifier', 'name', 'start_date_time'], dtype='object')
Percent of NA for each column :  id                   0.000000
identifier           0.000000
name                 7.692308
start_date_time    100.000000
dtype: float64


Unnamed: 0,id,identifier,name,start_date_time
0,943,8038da89e49ac5eabb489cfc6cea9fc1,2013,
1,129,cee8d6b7ce52554fd70354e37bbf44a2,2014,
2,3,65d2ea03425887a717c435081cfc5dbb,2015,
3,2,95192c98732387165bf8e396c0f2dad2,2016,
4,4,8d8818c8e140c64c743113f563cf750f,2017,


In [57]:
df.dtypes

id                   int64
identifier          object
name                object
start_date_time    float64
dtype: object

#### Deep Analysis of facilities.csv

In [58]:
name_dataset = '1facilities.csv'

In [59]:
df = display_databases(name_dataset)
print("Number of columns : ", len(df.columns))
df.head()

File path :  bdc2324-data/1/1facilities.csv
Shape :  (2, 7)
Number of columns :  7


Unnamed: 0,id,name,created_at,updated_at,street_id,fixed_capacity,identifier
0,2,non défini,2020-09-03 13:16:35.293111+02:00,2020-09-03 13:16:35.293111+02:00,2,,52ff3466787b4d538407372e5f7afe0f
1,1,mucem,2020-09-03 13:11:23.133059+02:00,2020-09-03 13:11:23.133059+02:00,1,,702bd76fe3dd5dbcf118a6965a946f54


In [60]:
df = process_df(df)
df.head()

Number of columns :  5
Columns :  Index(['id', 'street_id', 'identifier', 'name', 'fixed_capacity'], dtype='object')
Percent of NA for each column :  id                  0.0
street_id           0.0
identifier          0.0
name                0.0
fixed_capacity    100.0
dtype: float64


Unnamed: 0,id,street_id,identifier,name,fixed_capacity
0,2,2,52ff3466787b4d538407372e5f7afe0f,non défini,
1,1,1,702bd76fe3dd5dbcf118a6965a946f54,mucem,


In [61]:
df.dtypes

id                  int64
street_id           int64
identifier         object
name               object
fixed_capacity    float64
dtype: object

## Merge

In [82]:
def process_df_2(df):
    df = remove_horodates(df)
    print("Number of columns : ", len(df.columns))
    df = order_columns_id(df)
    print("Columns : ", df.columns)
    return df

In [83]:
def load_dataset(name):
    df = display_databases(name)
    df = process_df_2(df)
    # drop na :
    #df = df.dropna(axis=1, thresh=len(df))
    # if identifier in table : delete it
    if 'identifier' in df.columns:
        df = df.drop(columns = 'identifier')
    return df

### Products Table

In [84]:
def create_products_table():
    # first merge products and categories
    print("first merge products and categories")
    products = load_dataset("1products.csv")
    categories = load_dataset("1categories.csv")
    # Drop useless columns
    products = products.drop(columns = ['apply_price', 'extra_field', 'amount_consumption'])
    categories = categories.drop(columns = ['extra_field', 'quota'])

    #Merge
    products_theme = products.merge(categories, how = 'left', left_on = 'category_id',
                                    right_on = 'id', suffixes=('_products', '_categories'))
    products_theme = products_theme.rename(columns = {"name" : "name_categories"})
    
    # Second merge products_theme and type of categories
    print("Second merge products_theme and type of categories")
    type_of_categories = load_dataset("1type_of_categories.csv")
    type_of_categories = type_of_categories.drop(columns = 'id')
    products_theme = products_theme.merge(type_of_categories, how = 'left', left_on = 'category_id',
                                          right_on = 'category_id' )

    # Index cleaning
    products_theme = products_theme.drop(columns = ['id_categories'])
    products_theme  = order_columns_id(products_theme)

    

    return products_theme

In [85]:
products_theme = create_products_table()
products_theme.head()

first merge products and categories
File path :  bdc2324-data/1/1products.csv
Shape :  (94803, 14)
Number of columns :  12
Columns :  Index(['id', 'representation_id', 'pricing_formula_id', 'category_id',
       'products_group_id', 'product_pack_id', 'identifier', 'amount',
       'is_full_price', 'apply_price', 'extra_field', 'amount_consumption'],
      dtype='object')
File path :  bdc2324-data/1/1categories.csv
Shape :  (27, 7)
Number of columns :  5
Columns :  Index(['id', 'identifier', 'name', 'extra_field', 'quota'], dtype='object')
Second merge products_theme and type of categories
File path :  bdc2324-data/1/1type_of_categories.csv
Shape :  (5, 6)
Number of columns :  4
Columns :  Index(['id', 'type_of_id', 'category_id', 'identifier'], dtype='object')


Unnamed: 0,id_products,representation_id,pricing_formula_id,category_id,products_group_id,product_pack_id,type_of_id,amount,is_full_price,name_categories
0,10682,914,114,41,10655,1,,9.0,False,indiv activité tr
1,478,273,131,1,471,1,12.0,9.5,False,indiv entrées tp
2,20873,275,137,1,20825,1,12.0,11.5,False,indiv entrées tp
3,157142,82519,9,5,156773,1,,8.0,False,indiv entrées tr
4,1341,9,93,1,1175,1,12.0,8.5,False,indiv entrées tp


### Events Table

In [86]:
def create_events_table():
    # first merge events and seasons : 
    print("first merge events and seasons : ")
    events = load_dataset("1events.csv")
    seasons = load_dataset("1seasons.csv")

    # Drop useless columns
    events = events.drop(columns = ['manual_added', 'is_display'])
    seasons = seasons.drop(columns = ['start_date_time'])
        
    events_theme = events.merge(seasons, how = 'left', left_on = 'season_id', right_on = 'id', suffixes=('_events', '_seasons'))

    # Secondly merge events_theme and event_types
    print("Secondly merge events_theme and event_types : ")
    event_types = load_dataset("1event_types.csv")
    event_types = event_types.drop(columns = ['fidelity_delay'])
    
    events_theme = events_theme.merge(event_types, how = 'left', left_on = 'event_type_id', right_on = 'id', suffixes=('_events', '_event_type'))
    events_theme = events_theme.rename(columns = {"name" : "name_event_types"})
    events_theme = events_theme.drop(columns = 'id')

    # thirdly merge events_theme and facilities
    print("thirdly merge events_theme and facilities : ")
    facilities = load_dataset("1facilities.csv")
    facilities = facilities.drop(columns = ['fixed_capacity'])
    
    events_theme = events_theme.merge(facilities, how = 'left', left_on = 'facility_id', right_on = 'id', suffixes=('_events', '_facility'))
    events_theme = events_theme.rename(columns = {"name" : "name_facilities", "id_events" : "event_id"})
    events_theme = events_theme.drop(columns = 'id')

    # Index cleaning
    events_theme = events_theme.drop(columns = ['id_seasons'])
    events_theme  = order_columns_id(events_theme)
    return events_theme

In [87]:
events_theme= create_events_table()
events_theme.head()

first merge events and seasons : 
File path :  bdc2324-data/1/1events.csv
Shape :  (1232, 12)
Number of columns :  10
Columns :  Index(['id', 'season_id', 'facility_id', 'event_type_id', 'event_type_key_id',
       'facility_key_id', 'identifier', 'name', 'manual_added', 'is_display'],
      dtype='object')
File path :  bdc2324-data/1/1seasons.csv
Shape :  (13, 6)
Number of columns :  4
Columns :  Index(['id', 'identifier', 'name', 'start_date_time'], dtype='object')
Secondly merge events_theme and event_types : 
File path :  bdc2324-data/1/1event_types.csv
Shape :  (9, 6)
Number of columns :  4
Columns :  Index(['id', 'fidelity_delay', 'identifier', 'name'], dtype='object')
thirdly merge events_theme and facilities : 
File path :  bdc2324-data/1/1facilities.csv
Shape :  (2, 7)
Number of columns :  5
Columns :  Index(['id', 'street_id', 'identifier', 'name', 'fixed_capacity'], dtype='object')


Unnamed: 0,event_id,season_id,facility_id,event_type_id,event_type_key_id,facility_key_id,street_id,name_events,name_seasons,name_event_types,name_facilities
0,192,16,1,4,4,1,1,frontières,2018,spectacle vivant,mucem
1,30329,2767,1,5,5,1,1,visite guidée une autre histoire du monde (1h00),2023,offre muséale groupe,mucem
2,161,16,1,2,2,1,1,visite contée les chercheurs d'or indiv,2018,offre muséale individuel,mucem
3,5957,582,1,4,4,1,1,we dreamt of utopia and we woke up screaming.,2021,spectacle vivant,mucem
4,8337,582,1,4,4,1,1,jeff koons épisodes 4,2021,spectacle vivant,mucem


## Representations_Table

In [96]:
def create_representations_table():
    representations = load_dataset("1representations.csv")
    representations = representations.drop(columns = ['serial', 'open', 'satisfaction', 'is_display', 'expected_filling',
                                                     'max_filling', 'extra_field', 'start_date_time', 'end_date_time', 'name',
                                                     'representation_type_id'])
    
    representations_capacity = load_dataset("1representation_category_capacities.csv")
    representations_capacity = representations_capacity.drop(columns = ['expected_filling', 'max_filling'])

    representations_theme = representations.merge(representations_capacity, how='left',
                                                  left_on='id', right_on='representation_id',
                                                  suffixes=('_representation', '_representation_cap'))
    # index cleaning
    representations_theme = representations_theme.drop(columns = ["id_representation"])
    representations_theme = order_columns_id(representations_theme)
    return representations_theme

In [97]:
representation_theme = create_representations_table()
representation_theme.head()

File path :  bdc2324-data/1/1representations.csv
Shape :  (36095, 16)
Number of columns :  14
Columns :  Index(['id', 'event_id', 'representation_type_id', 'identifier', 'serial',
       'start_date_time', 'open', 'satisfaction', 'end_date_time', 'name',
       'is_display', 'expected_filling', 'max_filling', 'extra_field'],
      dtype='object')
File path :  bdc2324-data/1/1representation_category_capacities.csv
Shape :  (65241, 7)
Number of columns :  5
Columns :  Index(['id', 'representation_id', 'category_id', 'expected_filling',
       'max_filling'],
      dtype='object')


Unnamed: 0,event_id,id_representation_cap,representation_id,category_id
0,12384,123058,84820,2
1,37,2514,269,2
2,37,384,269,5
3,37,2515,269,10
4,37,383,269,1


## Price Table

In [112]:
product_packs = load_dataset("1product_packs.csv")
product_packs.head()

File path :  bdc2324-data/1/1product_packs.csv
Shape :  (1, 6)
Number of columns :  4
Columns :  Index(['id', 'identifier', 'name', 'type_of'], dtype='object')


Unnamed: 0,id,name,type_of
0,1,,0


In [114]:
pricing_formula = load_dataset("1pricing_formulas.csv")
pricing_formula.head()

File path :  bdc2324-data/1/1pricing_formulas.csv
Shape :  (556, 6)
Number of columns :  4
Columns :  Index(['id', 'identifier', 'name', 'extra_field'], dtype='object')


Unnamed: 0,id,name,extra_field
0,41909,visite mécènes 1h30,
1,502,entree mucem tp( expo picasso),
2,504,nombre de personnes cinema,
3,117,spectacle tarif e famille tr,
4,1496,billet nb famille mecene 1a,


In [115]:
type_pricing_formula = load_dataset("1type_of_pricing_formulas.csv")
type_pricing_formula.head()

File path :  bdc2324-data/1/1type_of_pricing_formulas.csv
Shape :  (568, 6)
Number of columns :  4
Columns :  Index(['id', 'type_of_id', 'pricing_formula_id', 'identifier'], dtype='object')


Unnamed: 0,id,type_of_id,pricing_formula_id
0,1,1,127
1,2,1,2425
2,3,1,2937
3,4,1,48
4,5,1,7


In [117]:
product_groups = load_dataset("1products_groups.csv")
product_groups.head()

File path :  bdc2324-data/1/1products_groups.csv
Shape :  (92973, 9)
Number of columns :  7
Columns :  Index(['id', 'category_id', 'pricing_formula_id', 'representation_id',
       'percent_price', 'max_price', 'min_price'],
      dtype='object')


Unnamed: 0,id,category_id,pricing_formula_id,representation_id,percent_price,max_price,min_price
0,2735,8,97,1534,100.0,0.0,0.0
1,156773,5,9,82519,100.0,0.0,0.0
2,14387,16,79,8046,100.0,0.0,0.0
3,2770,2,37,1563,100.0,0.0,0.0
4,27179,13,119,14192,100.0,0.0,0.0


## Uniform Products theme database

In [98]:
print("Products theme columns : ", products_theme.columns)
print("\n Representation theme columns : ", representation_theme.columns)
print("\n Events theme columns : ", events_theme.columns)

Products theme columns :  Index(['id_products', 'representation_id', 'pricing_formula_id', 'category_id',
       'products_group_id', 'product_pack_id', 'type_of_id', 'amount',
       'is_full_price', 'name_categories'],
      dtype='object')

 Representation theme columns :  Index(['event_id', 'id_representation_cap', 'representation_id',
       'category_id'],
      dtype='object')

 Events theme columns :  Index(['event_id', 'season_id', 'facility_id', 'event_type_id',
       'event_type_key_id', 'facility_key_id', 'street_id', 'name_events',
       'name_seasons', 'name_event_types', 'name_facilities'],
      dtype='object')


In [99]:
products_global = products_theme.merge(representation_theme, how='left',
                                       on= ["representation_id", "category_id"])


products_global.head()

Unnamed: 0,id_products,representation_id,pricing_formula_id,category_id,products_group_id,product_pack_id,type_of_id,amount,is_full_price,name_categories,event_id,id_representation_cap
0,10682,914,114,41,10655,1,,9.0,False,indiv activité tr,132,8789
1,478,273,131,1,471,1,12.0,9.5,False,indiv entrées tp,37,390
2,20873,275,137,1,20825,1,12.0,11.5,False,indiv entrées tp,37,395
3,157142,82519,9,5,156773,1,,8.0,False,indiv entrées tr,12365,120199
4,1341,9,93,1,1175,1,12.0,8.5,False,indiv entrées tp,8,21


In [100]:
products_global = products_global.merge(events_theme, how='left', on='event_id',
                                        suffixes = ("_representation", "_event"))
products_global = order_columns_id(products_global)
products_global.head()

Unnamed: 0,id_products,representation_id,pricing_formula_id,category_id,products_group_id,product_pack_id,type_of_id,event_id,id_representation_cap,season_id,...,event_type_key_id,facility_key_id,street_id,amount,is_full_price,name_categories,name_events,name_seasons,name_event_types,name_facilities
0,10682,914,114,41,10655,1,,132,8789,4,...,5,1,1,9.0,False,indiv activité tr,"visite-jeu ""le classico des minots"" (1h30)",2017.0,offre muséale individuel,mucem
1,478,273,131,1,471,1,12.0,37,390,2,...,2,1,1,9.5,False,indiv entrées tp,billet mucem picasso,2016.0,offre muséale individuel,mucem
2,20873,275,137,1,20825,1,12.0,37,395,2,...,2,1,1,11.5,False,indiv entrées tp,billet mucem picasso,2016.0,offre muséale individuel,mucem
3,157142,82519,9,5,156773,1,,12365,120199,1754,...,4,1,1,8.0,False,indiv entrées tr,,,offre muséale individuel,mucem
4,1341,9,93,1,1175,1,12.0,8,21,4,...,6,1,1,8.5,False,indiv entrées tp,non défini,2017.0,non défini,mucem


In [101]:
products_global.columns

Index(['id_products', 'representation_id', 'pricing_formula_id', 'category_id',
       'products_group_id', 'product_pack_id', 'type_of_id', 'event_id',
       'id_representation_cap', 'season_id', 'facility_id', 'event_type_id',
       'event_type_key_id', 'facility_key_id', 'street_id', 'amount',
       'is_full_price', 'name_categories', 'name_events', 'name_seasons',
       'name_event_types', 'name_facilities'],
      dtype='object')

In [102]:
products_global.shape

(94803, 22)

## Analysis of Products_global

In [103]:
products_global.isna().sum()

id_products                  0
representation_id            0
pricing_formula_id           0
category_id                  0
products_group_id            0
product_pack_id              0
type_of_id               67589
event_id                     0
id_representation_cap        0
season_id                    0
facility_id                  0
event_type_id                0
event_type_key_id            0
facility_key_id              0
street_id                    0
amount                       0
is_full_price                0
name_categories           3991
name_events              46657
name_seasons             30663
name_event_types             0
name_facilities              0
dtype: int64

In [105]:
# how many event types ?

products_global['name_event_types'].unique()

array(['offre muséale individuel', 'non défini', 'spectacle vivant',
       'offre muséale groupe', 'formule adhésion'], dtype=object)

In [107]:
# how many events ?

len(products_global['name_events'].unique())

644

In [108]:
# how many categories ?
products_global['name_categories'].unique()


array(['indiv activité tr', 'indiv entrées tp', 'indiv entrées tr',
       'indiv prog enfant', 'indiv activité gr', 'indiv prog gr',
       'indiv activité tp', 'indiv activité enfant', 'indiv entrées gr',
       'groupe forfait entrées tr', 'groupe autonome adulte',
       'indiv prog tp', 'indiv prog tr', 'indiv entrées fa',
       'groupe forfait scolaire', 'en nb entrées tr', 'non défini', nan,
       'en nb entrées gr', 'groupe autonome entrées gr',
       'groupe forfait entrées gr', 'groupe autonome entrées tr',
       'en nb entrées tp', 'groupe autonome gr',
       'groupe autonome entrées tp', 'groupe forfait adulte',
       'groupe forfait etudiant'], dtype=object)

In [109]:
len(products_global['category_id'].unique())

27

In [None]:
def uniform_product_df():
    """
    This function returns the uniform product dataset
    """
    print("Products theme columns : ", products_theme.columns)
    print("\n Representation theme columns : ", representation_theme.columns)
    print("\n Events theme columns : ", events_theme.columns)

    products_global = products_theme.merge(representation_theme, how='left',
                                           on= ["representation_id", "category_id"])
    
    products_global = products_global.merge(events_theme, how='left', on='event_id',
                                            suffixes = ("_representation", "_event"))
    
    products_global = order_columns_id(products_global)

    # remove useless columns 
    products_global = products_global.drop(columns = ['type_of_id', 'name_events', 'name_seasons', 'name_categories'])
    return products_global
    