From 5a13fc3c316a36ee00b844a14b65b44a42cd3c0e Mon Sep 17 00:00:00 2001
From: ajoubrel-ensae
Date: Mon, 19 Feb 2024 22:11:28 +0000
Subject: [PATCH] New structure

---
 0_Cleaning_and_merge.py           |  71 ---
 0_Cleaning_and_merge_functions.py | 213 ++++-----
 1_Descriptive_Statistics.ipynb    |  50 +--
 Brouillon_AJ.ipynb                | 695 ------------------------------
 Exploration_billet_AJ.ipynb       |   6 +-
 5 files changed, 138 insertions(+), 897 deletions(-)
 delete mode 100644 Brouillon_AJ.ipynb

diff --git a/0_Cleaning_and_merge.py b/0_Cleaning_and_merge.py
index 9c9aac0..860cec1 100644
--- a/0_Cleaning_and_merge.py
+++ b/0_Cleaning_and_merge.py
@@ -8,78 +8,7 @@ import re
 import warnings
 
 # Import cleaning and merge functions
-exec(open('BDC-team-1/0_Cleaning_and_merge_functions.py').read())
 exec(open('BDC-team-1/0_KPI_functions.py').read())
-
-# Create filesystem object
-S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
-fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
-
-# Ignore warnings
-warnings.filterwarnings('ignore')
-
-# Data loading
-BUCKET = "bdc2324-data/1"
-liste_database = fs.ls(BUCKET)
-
-# Loop creating one dataframe per file in the list
-client_number = liste_database[0].split("/")[1]
-df_prefix = "df" + str(client_number) + "_"
-
-for i in range(len(liste_database)):
-    current_path = liste_database[i]
-    with fs.open(current_path, mode="rb") as file_in:
-        df = pd.read_csv(file_in)
-    # the pattern of the name is df1xxx
-    nom_dataframe = df_prefix + re.search(r'\/(\d+)\/(\d+)([a-zA-Z_]+)\.csv$', current_path).group(3)
-    globals()[nom_dataframe] = df
-
-## 1 - Cleaning of the datasets
-
-# Cleaning customerplus
-df1_customerplus_clean = preprocessing_customerplus(df1_customersplus)
-
-# Cleaning target area
-df1_target_information = preprocessing_target_area(targets = df1_targets, target_types = df1_target_types, customer_target_mappings = df1_customer_target_mappings)
-
-# Cleaning campaign area
-df1_campaigns_information = preprocessing_campaigns_area(campaign_stats = df1_campaign_stats, campaigns = df1_campaigns)
-
-# Export
-BUCKET_OUT = "projet-bdc2324-team1"
-FILE_KEY_OUT_S3 = "0_Temp/Company 1 - Campaigns dataset clean.csv"
-FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3
-
-with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
-    df1_campaigns_information.to_csv(file_out, index = False)
-
-## Cleaning product area
-
-# Cleaning ticket area
-df1_ticket_information = preprocessing_tickets_area(tickets = df1_tickets, purchases = df1_purchases, suppliers = df1_suppliers, type_ofs = df1_type_ofs)
-
-
-BUCKET = "bdc2324-data"
-directory_path = '1'
-
-products_theme = create_products_table()
-events_theme = create_events_table()
-representation_theme = create_representations_table()
-products_global = uniform_product_df()
-
-# Product-related merge
-df1_products_purchased = pd.merge(df1_ticket_information, products_global, left_on = 'product_id', right_on = 'id_products', how = 'inner')
-
-# Select the variables of interest
-df1_products_purchased_reduced = df1_products_purchased[['ticket_id', 'customer_id', 'purchase_id', 'event_type_id', 'supplier_name', 'purchase_date', 'type_of_ticket_name', 'amount', 'children', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons']]
-
-# Export
-BUCKET_OUT = "projet-bdc2324-team1"
-FILE_KEY_OUT_S3 = "0_Temp/Company 1 - Purchases.csv"
-FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3
-
-with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
-    df1_products_purchased_reduced.to_csv(file_out, index = False)
-
 ## 2 - Construction of KPIs on a given period
 
 def explanatory_variables(min_date, max_date, df_campaigns_information = df1_campaigns_information, df_products_purchased_reduced = df1_products_purchased_reduced, df_customerplus_clean = df1_customerplus_clean):
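Note on the deletions above: the old script materialised one module-level df1_* dataframe per CSV by regex-matching each S3 path and writing into globals(). A minimal sketch of the same loading pattern with a plain dict instead of globals() (the function name and dict layout are illustrative, not part of the repository; fs is the s3fs filesystem the script creates earlier):

    import re
    import pandas as pd

    def load_company_tables(fs, bucket = "bdc2324-data", client = "1"):
        """Read every CSV under bucket/client into a dict keyed by table name."""
        tables = {}
        for path in fs.ls(bucket + "/" + client):
            # files follow the "<client>/<client><table>.csv" layout, e.g. 1/1tickets.csv
            match = re.search(r'\/(\d+)\/(\d+)([a-zA-Z_]+)\.csv$', path)
            if match is None:
                continue  # skip anything that does not match the naming convention
            with fs.open(path, mode = "rb") as file_in:
                tables[match.group(3)] = pd.read_csv(file_in)
        return tables

    # tables = load_company_tables(fs)
    # tables["customersplus"], tables["tickets"], ... play the roles of
    # df1_customersplus, df1_tickets, ... from the deleted code above.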
diff --git a/0_Cleaning_and_merge_functions.py b/0_Cleaning_and_merge_functions.py
index 042c60e..cab8115 100644
--- a/0_Cleaning_and_merge_functions.py
+++ b/0_Cleaning_and_merge_functions.py
@@ -1,38 +1,92 @@
-# Cleaning and merge functions
+#### Cleaning and merge functions ####
 
-# Cleaning function
+BUCKET = "bdc2324-data"
+
+# 1. Basic cleaning functions
 def cleaning_date(df, column_name):
     """
-    Cleans the given DataFrame column by converting its values to datetime in ISO 8601 format.
-
-    Parameters:
-    - df: DataFrame - the DataFrame containing the column to clean.
-    - column_name: str - the name of the column to clean.
-
-    Returns:
-    - DataFrame - the DataFrame with the cleaned column.
+    Cleans a datetime column using the ISO 8601 format
    """
     df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')
     return df
 
-def preprocessing_customerplus(customerplus = None):
+def display_databases(directory_path, file_name):
+    """
+    Returns the requested file from S3 storage
+    """
+    file_path = BUCKET + "/" + directory_path + "/" + directory_path + file_name + ".csv"
+    print("File path : ", file_path)
+    with fs.open(file_path, mode="rb") as file_in:
+        df = pd.read_csv(file_in, sep=",")
+
+    print("Shape : ", df.shape)
+    return df
 
-    customerplus_copy = customerplus.copy()
+def remove_horodates(df):
+    """
+    Removes timestamp columns such as created_at and updated_at
+    """
+    df = df.drop(columns = ["created_at", "updated_at"])
+    return df
+
+def order_columns_id(df):
+    """
+    Puts all id columns first so the dataset is easier to read
+    """
+    substring = 'id'
+    id_columns = [col for col in df.columns if substring in col]
+    remaining_col = [col for col in df.columns if substring not in col]
+    new_order = id_columns + remaining_col
+    return df[new_order]
+
+def process_df_2(df):
+    """
+    Organizes a dataframe: drops timestamp columns and reorders the rest
+    """
+    df = remove_horodates(df)
+    print("Number of columns : ", len(df.columns))
+    df = order_columns_id(df)
+    print("Columns : ", df.columns)
+    return df
+
+def load_dataset(directory_path, name):
+    """
+    Loads and organizes a csv file
+    """
+    df = display_databases(directory_path, file_name = name)
+    df = process_df_2(df)
+    # drop na :
+    #df = df.dropna(axis=1, thresh=len(df))
+    # if an identifier column is in the table, delete it
+    if 'identifier' in df.columns:
+        df = df.drop(columns = 'identifier')
+    return df
+
+
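Taken together, the helpers above form the loading pipeline used by every function in section 2: load_dataset fetches a CSV through display_databases, then process_df_2 drops the created_at/updated_at timestamps and fronts the id columns. display_databases concatenates directory_path twice because files follow the "<n>/<n><table>.csv" layout, e.g. "1/1customersplus.csv". A minimal usage sketch (the company folder "1" is illustrative, and fs is assumed to be the s3fs filesystem set up by the caller):

    # reads bdc2324-data/1/1customersplus.csv and organizes it
    customersplus = load_dataset("1", name = "customersplus")

    # which expands to roughly:
    df = display_databases("1", file_name = "customersplus")  # read from S3, print path and shape
    df = remove_horodates(df)                                 # drop created_at / updated_at
    df = order_columns_id(df)                                 # id columns first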
+# 2. Creation of cleaned and merged datasets
+
+def preprocessing_customerplus(directory_path):
+
+    customerplus_copy = load_dataset(directory_path, name = "customersplus")
 
     # Convert to date format
     cleaning_date(customerplus_copy, 'first_buying_date')
     cleaning_date(customerplus_copy, 'last_visiting_date')
 
     # Select the variables
-    customerplus_copy.drop(['lastname', 'firstname', 'birthdate', 'profession', 'language', 'age', 'email', 'civility', 'note', 'created_at', 'updated_at', 'deleted_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode', 'last_visiting_date'], axis = 1, inplace=True)
+    customerplus_copy.drop(['lastname', 'firstname', 'birthdate', 'profession', 'language', 'age', 'email', 'civility', 'note', 'extra', 'reference', 'extra_field', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode', 'last_visiting_date'], axis = 1, inplace=True)
 
     customerplus_copy.rename(columns = {'id' : 'customer_id'}, inplace = True)
 
     return customerplus_copy
 
-def preprocessing_tickets_area(tickets = None, purchases = None, suppliers = None, type_ofs = None):
+def preprocessing_tickets_area(directory_path):
+
+    # Datasets loading
+    tickets = load_dataset(directory_path, name = "tickets")
+    purchases = load_dataset(directory_path, name = "purchases")
+    suppliers = load_dataset(directory_path, name = "suppliers")
+    type_ofs = load_dataset(directory_path, name = "type_ofs")
+
     # Tickets table
     tickets = tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]
     tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)
@@ -48,7 +102,7 @@ def preprocessing_tickets_area(tickets = None, purchases = None, suppliers = Non
 
     # Purchases table
     # Clean the purchase date
-    cleaning_date(purchases, 'purchase_date')
+    # cleaning_date(purchases, 'purchase_date')
 
     # Select the variables
     purchases = purchases[['id', 'purchase_date', 'customer_id']]
@@ -67,8 +121,13 @@ def preprocessing_tickets_area(tickets = None, purchases = None, suppliers = Non
 
     return ticket_information
 
-def preprocessing_target_area(targets = None, target_types = None, customer_target_mappings = None):
-    # Target.csv cleaning
+def preprocessing_target_area(directory_path):
+
+    # Datasets loading
+    targets = load_dataset(directory_path, name = "targets")
+    target_types = load_dataset(directory_path, name = "target_types")
+    customer_target_mappings = load_dataset(directory_path, name = "customer_target_mappings")
+    # target cleaning
     targets = targets[["id", "target_type_id", "name"]]
     targets.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)
@@ -88,16 +147,21 @@ def preprocessing_target_area(targets = None, target_types = None, customer_targ
 
     return targets_full
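The hunks above and below all make the same interface change: the preprocessing_* functions stop taking pre-loaded dataframes and instead receive only directory_path, loading their own inputs through load_dataset. A sketch of the call-site difference, reusing the variable names from the old 0_Cleaning_and_merge.py (company "1" is illustrative):

    # before this patch: the caller loaded the raw tables, then passed them in
    # df1_target_information = preprocessing_target_area(
    #     targets = df1_targets,
    #     target_types = df1_target_types,
    #     customer_target_mappings = df1_customer_target_mappings)

    # after this patch: the function pulls what it needs from S3 itself
    df1_target_information = preprocessing_target_area(directory_path = "1")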
-def preprocessing_campaigns_area(campaign_stats = None, campaigns = None):
+def preprocessing_campaigns_area(directory_path):
+
+    # Datasets loading
+    campaign_stats = load_dataset(directory_path, name = "campaign_stats")
+    campaigns = load_dataset(directory_path, name = "campaigns")
+
     # campaign_stats cleaning
     campaign_stats = campaign_stats[["id", "campaign_id", "customer_id", "opened_at", "sent_at", "delivered_at"]]
-    cleaning_date(campaign_stats, 'opened_at')
-    cleaning_date(campaign_stats, 'sent_at')
-    cleaning_date(campaign_stats, 'delivered_at')
+    # cleaning_date(campaign_stats, 'opened_at')
+    # cleaning_date(campaign_stats, 'sent_at')
+    # cleaning_date(campaign_stats, 'delivered_at')
 
     # campaigns cleaning
     campaigns = campaigns[["id", "name", "service_id", "sent_at"]].add_prefix("campaign_")
-    cleaning_date(campaigns, 'campaign_sent_at')
+    # cleaning_date(campaigns, 'campaign_sent_at')
 
     # Merge
     campaigns_full = pd.merge(campaign_stats, campaigns, on = "campaign_id", how = "left")
@@ -105,66 +169,11 @@
 
     return campaigns_full
 
-def display_databases(file_name):
-    """
-    This function returns the file from s3 storage
-    """
-    file_path = BUCKET + "/" + directory_path + "/" + file_name
-    print("File path : ", file_path)
-    with fs.open(file_path, mode="rb") as file_in:
-        df = pd.read_csv(file_in, sep=",")
-
-    print("Shape : ", df.shape)
-    return df
-
-
-def remove_horodates(df):
-    """
-    this function remove horodate columns like created_at and updated_at
-    """
-    df = df.drop(columns = ["created_at", "updated_at"])
-    return df
-
-
-def order_columns_id(df):
-    """
-    this function puts all id columns at the beginning in order to read the dataset easier
-    """
-    substring = 'id'
-    id_columns = [col for col in df.columns if substring in col]
-    remaining_col = [col for col in df.columns if substring not in col]
-    new_order = id_columns + remaining_col
-    return df[new_order]
-
-
-def process_df_2(df):
-    """
-    This function organizes dataframe
-    """
-    df = remove_horodates(df)
-    print("Number of columns : ", len(df.columns))
-    df = order_columns_id(df)
-    print("Columns : ", df.columns)
-    return df
-
-def load_dataset(name):
-    """
-    This function loads csv file
-    """
-    df = display_databases(name)
-    df = process_df_2(df)
-    # drop na :
-    #df = df.dropna(axis=1, thresh=len(df))
-    # if identifier in table : delete it
-    if 'identifier' in df.columns:
-        df = df.drop(columns = 'identifier')
-    return df
-
-def create_products_table():
+def create_products_table(directory_path):
     # first merge products and categories
     print("first merge products and categories")
-    products = load_dataset("1products.csv")
-    categories = load_dataset("1categories.csv")
+    products = load_dataset(directory_path, name = "products")
+    categories = load_dataset(directory_path, name = "categories")
     # Drop useless columns
     products = products.drop(columns = ['apply_price', 'extra_field', 'amount_consumption'])
     categories = categories.drop(columns = ['extra_field', 'quota'])
@@ -176,7 +185,7 @@
 
     # Second merge products_theme and type of categories
     print("Second merge products_theme and type of categories")
-    type_of_categories = load_dataset("1type_of_categories.csv")
+    type_of_categories = load_dataset(directory_path, name = "type_of_categories")
     type_of_categories = type_of_categories.drop(columns = 'id')
     products_theme = products_theme.merge(type_of_categories, how = 'left', left_on = 'category_id', right_on = 'category_id' )
@@ -187,11 +196,11 @@
 
     return products_theme
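create_products_table and the two builders that follow it repeat one join recipe: left-merge a lookup table on its id, let suffixes disambiguate the clashing column names, then drop the now-redundant join key. A self-contained sketch of that recipe on toy data (all values invented for illustration):

    import pandas as pd

    products = pd.DataFrame({"id": [10, 11], "category_id": [1, 2], "amount": [25.0, 40.0]})
    categories = pd.DataFrame({"id": [1, 2], "name": ["opera", "concert"]})

    # "id" exists in both frames, so suffixes renames the two copies
    # to id_products / id_categories after the merge
    products_theme = products.merge(categories, how = 'left',
                                    left_on = 'category_id', right_on = 'id',
                                    suffixes = ('_products', '_categories'))
    products_theme = products_theme.drop(columns = ['id_categories'])  # redundant join key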
-def create_events_table():
+def create_events_table(directory_path):
     # first merge events and seasons :
     print("first merge events and seasons : ")
-    events = load_dataset("1events.csv")
-    seasons = load_dataset("1seasons.csv")
+    events = load_dataset(directory_path, name = "events")
+    seasons = load_dataset(directory_path, name = "seasons")
 
     # Drop useless columns
     events = events.drop(columns = ['manual_added', 'is_display'])
@@ -201,7 +210,7 @@
 
     # Secondly merge events_theme and event_types
     print("Secondly merge events_theme and event_types : ")
-    event_types = load_dataset("1event_types.csv")
+    event_types = load_dataset(directory_path, name = "event_types")
     event_types = event_types.drop(columns = ['fidelity_delay'])
     events_theme = events_theme.merge(event_types, how = 'left', left_on = 'event_type_id', right_on = 'id', suffixes=('_events', '_event_type'))
@@ -210,7 +219,7 @@
 
     # thirdly merge events_theme and facilities
     print("thirdly merge events_theme and facilities : ")
-    facilities = load_dataset("1facilities.csv")
+    facilities = load_dataset(directory_path, name = "facilities")
     facilities = facilities.drop(columns = ['fixed_capacity'])
     events_theme = events_theme.merge(facilities, how = 'left', left_on = 'facility_id', right_on = 'id', suffixes=('_events', '_facility'))
@@ -222,14 +231,13 @@
 
     events_theme = order_columns_id(events_theme)
     return events_theme
 
-
-def create_representations_table():
-    representations = load_dataset("1representations.csv")
+def create_representations_table(directory_path):
+    representations = load_dataset(directory_path, name = "representations")
     representations = representations.drop(columns = ['serial', 'open', 'satisfaction', 'is_display', 'expected_filling', 'max_filling', 'extra_field', 'start_date_time', 'end_date_time', 'name', 'representation_type_id'])
 
-    representations_capacity = load_dataset("1representation_category_capacities.csv")
+    representations_capacity = load_dataset(directory_path, name = "representation_category_capacities")
     representations_capacity = representations_capacity.drop(columns = ['expected_filling', 'max_filling'])
 
     representations_theme = representations.merge(representations_capacity, how='left',
@@ -240,22 +248,27 @@
 
     representations_theme = order_columns_id(representations_theme)
     return representations_theme
 
-def uniform_product_df():
+def uniform_product_df(directory_path):
     """
     This function returns the uniform product dataset
     """
+    products_theme = create_products_table(directory_path)
+    representation_theme = create_representations_table(directory_path)
+    events_theme = create_events_table(directory_path)
+    ticket_information = preprocessing_tickets_area(directory_path)
+
     print("Products theme columns : ", products_theme.columns)
     print("\n Representation theme columns : ", representation_theme.columns)
     print("\n Events theme columns : ", events_theme.columns)
 
-    products_global = products_theme.merge(representation_theme, how='left',
+    products_global = pd.merge(products_theme, representation_theme, how='left',
                                            on= ["representation_id", "category_id"])
 
-    products_global = products_global.merge(events_theme, how='left', on='event_id',
+    products_global = pd.merge(products_global, events_theme, how='left', on='event_id',
                                             suffixes = ("_representation", "_event"))
 
-    products_global = order_columns_id(products_global)
+    products_purchased = pd.merge(ticket_information, products_global, left_on = 'product_id', right_on = 'id_products', how = 'inner')
+
+    products_purchased_reduced = products_purchased[['ticket_id', 'customer_id', 'purchase_id', 'event_type_id', 'supplier_name', 'purchase_date', 'type_of_ticket_name', 'amount', 'children', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons']]
 
-    # remove useless columns
-    products_global = products_global.drop(columns = ['type_of_id']) # 'name_events', 'name_seasons', 'name_categories'
-    return products_global
\ No newline at end of file
+    return products_purchased_reduced
\ No newline at end of file
diff --git 
a/1_Descriptive_Statistics.ipynb b/1_Descriptive_Statistics.ipynb index e223399..920cdc2 100644 --- a/1_Descriptive_Statistics.ipynb +++ b/1_Descriptive_Statistics.ipynb @@ -615,19 +615,15 @@ "FILE_PATH_S3 = BUCKET + \"/\" + FILE_KEY_S3\n", "\n", "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " purchases = pd.read_csv(file_in, sep=\",\")\n", + " purchases = pd.read_csv(file_in, sep=\",\", parse_dates = ['purchase_date'])\n", " \n", - "purchases['purchase_date'] = pd.to_datetime(purchases['purchase_date'], utc = True, format = 'ISO8601')\n", - "\n", "# Emails\n", "BUCKET = \"projet-bdc2324-team1\"\n", "FILE_KEY_S3 = \"0_Temp/Company 1 - Campaigns dataset clean.csv\"\n", "FILE_PATH_S3 = BUCKET + \"/\" + FILE_KEY_S3\n", "\n", "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " campaigns = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "campaigns['sent_at'] = pd.to_datetime(campaigns['sent_at'], utc = True, format = 'ISO8601')\n" + " campaigns = pd.read_csv(file_in, sep=\",\", parse_dates = ['sent_at'])\n" ] }, { @@ -818,7 +814,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 16, "id": "f663d68b-8a5c-4804-b31a-4477a03ca1e4", "metadata": { "scrolled": true @@ -906,7 +902,7 @@ "max 641981.000000 1.256574e+06" ] }, - "execution_count": 33, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -917,7 +913,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "id": "d1212b10-3933-450a-b001-9e2cbf308f79", "metadata": {}, "outputs": [ @@ -1219,7 +1215,7 @@ "[1826672 rows x 15 columns]" ] }, - "execution_count": 16, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -1238,7 +1234,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "id": "dc45c1cd-2a78-48a6-aa2b-6a501254b6f2", "metadata": {}, "outputs": [ @@ -1458,7 +1454,7 @@ "[5 rows x 40 columns]" ] }, - "execution_count": 17, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1478,7 +1474,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "id": "89fcb455-efb4-4ad4-ab88-efd6c8a76287", "metadata": {}, "outputs": [ @@ -1499,7 +1495,7 @@ " dtype='object')" ] }, - "execution_count": 18, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -1510,7 +1506,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "id": "d7b2356a-d5fc-4547-b3ff-fded0e304fb6", "metadata": {}, "outputs": [ @@ -1634,7 +1630,7 @@ "9 0.0 " ] }, - "execution_count": 19, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -1653,7 +1649,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "id": "5559748f-1745-4651-a9f6-94702c7ee66f", "metadata": {}, "outputs": [ @@ -1813,7 +1809,7 @@ "max 434.000000 " ] }, - "execution_count": 20, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -1835,7 +1831,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "id": "4971e35d-a762-4e18-9443-fd9571bd3f1e", "metadata": {}, "outputs": [ @@ -1864,7 +1860,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "id": "bc65a711-d172-4839-b487-3047280fc3a6", "metadata": {}, "outputs": [ @@ -1894,7 +1890,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 24, "id": "c95cc35c-abfc-47c7-9b8a-ac69bfd60dd8", "metadata": {}, "outputs": [ @@ -1922,7 +1918,7 @@ }, { "cell_type": "code", - 
"execution_count": 24, + "execution_count": 25, "id": "49d5fd2d-9bc1-43ac-9270-1efd73759854", "metadata": {}, "outputs": [ @@ -1967,7 +1963,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 26, "id": "e50e2583-4b8f-478e-87ac-591dde200af8", "metadata": {}, "outputs": [ @@ -1988,7 +1984,7 @@ " dtype='object')" ] }, - "execution_count": 25, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -1999,7 +1995,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 27, "id": "c724a315-9fe8-4874-be8f-a8115b17b5e2", "metadata": {}, "outputs": [], @@ -2021,7 +2017,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 28, "id": "58af5dcb-673e-4f4d-ad5c-f66ce1e8a22c", "metadata": {}, "outputs": [ @@ -2042,7 +2038,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 29, "id": "cc3437f7-8b36-4398-9da6-ff15e8e4c8d7", "metadata": {}, "outputs": [ diff --git a/Brouillon_AJ.ipynb b/Brouillon_AJ.ipynb deleted file mode 100644 index 8f5529a..0000000 --- a/Brouillon_AJ.ipynb +++ /dev/null @@ -1,695 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "8c8e008c-9b92-41f1-88c1-8ec462e4ecab", - "metadata": {}, - "source": [ - "# Business Data Challenge - Team 1" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "88af2795-8bf9-4df0-a059-be7c28fb4289", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np" - ] - }, - { - "cell_type": "markdown", - "id": "e05cd2c9-3f76-48e3-b4a6-5055445af2e4", - "metadata": {}, - "source": [ - "Configuration de l'accès aux données" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3ba1f385-2a2f-4b0c-be79-66f618469a9f", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import s3fs\n", - "# Create filesystem object\n", - "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", - "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n", - "\n", - "BUCKET = \"bdc2324-data\"\n", - "fs.ls(BUCKET)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ba9d04ad-6cc1-4bac-b1a0-44bedfb09763", - "metadata": {}, - "outputs": [], - "source": [ - "# Chargement des fichiers campaign_stats.csv\n", - "FILE_PATH_S3 = 'bdc2324-data/1/1campaign_stats.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " campaign_stats_1 = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "FILE_PATH_S3 = 'bdc2324-data/2/2campaign_stats.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " campaign_stats_2 = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "FILE_PATH_S3 = 'bdc2324-data/3/3campaign_stats.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " campaign_stats_3 = pd.read_csv(file_in, sep=\",\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cacaecc1-4d8a-4e20-8cd3-b452cf17db56", - "metadata": {}, - "outputs": [], - "source": [ - "# Conversion des dates 'sent_at'\n", - "campaign_stats_1['sent_at'] = pd.to_datetime(campaign_stats_1['sent_at'], format = 'ISO8601', utc = True)\n", - "campaign_stats_2['sent_at'] = pd.to_datetime(campaign_stats_2['sent_at'], format = 'ISO8601', utc = True)\n", - "campaign_stats_3['sent_at'] = pd.to_datetime(campaign_stats_3['sent_at'], format = 'ISO8601', utc = True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2ec4b583-dc64-43e9-b3ae-6bbaee0bc135", - "metadata": {}, - 
"outputs": [], - "source": [ - "# Chaque unites correspond à une période ? --> Non, les dossiers ont juste pour but de réduire la taille des fichiers\n", - "print(campaign_stats_1['sent_at'].max())\n", - "print(campaign_stats_1['sent_at'].min())\n", - "\n", - "print(campaign_stats_2['sent_at'].max())\n", - "print(campaign_stats_2['sent_at'].min())\n", - "\n", - "print(campaign_stats_3['sent_at'].max())\n", - "print(campaign_stats_3['sent_at'].min())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "77894273-b3e5-4f29-bd63-9f4df8082b9b", - "metadata": {}, - "outputs": [], - "source": [ - "campaign_stats_1['sent_at']" - ] - }, - { - "cell_type": "markdown", - "id": "31f2edbf-5661-4516-9835-06d4da615c13", - "metadata": {}, - "source": [ - "### Customersplus.csv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4223c873-cbd3-46d1-ac96-c9a3b9e97092", - "metadata": {}, - "outputs": [], - "source": [ - "FILE_PATH_S3 = 'bdc2324-data/1/1customersplus.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " customers_plus_1 = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "FILE_PATH_S3 = 'bdc2324-data/2/2customersplus.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " customers_plus_2 = pd.read_csv(file_in, sep=\",\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "460f853a-68c0-42a7-9877-b83d3aaec813", - "metadata": {}, - "outputs": [], - "source": [ - "customers_plus_1.columns" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d5a9398f-72fc-4548-9f53-b20b372144b2", - "metadata": {}, - "outputs": [], - "source": [ - "customers_plus_1.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7467ddbe-0bd4-44cc-8a16-84aa41853638", - "metadata": {}, - "outputs": [], - "source": [ - "customers_plus_1['id'].nunique()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e15f05f8-3a89-4fc3-84a9-dae70e168440", - "metadata": {}, - "outputs": [], - "source": [ - "customers_plus_2['id'].nunique()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b40a653e-013f-48d0-8b57-0284587b36c5", - "metadata": {}, - "outputs": [], - "source": [ - "common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "32fa2215-3c79-40b5-8643-755865959fc7", - "metadata": {}, - "outputs": [], - "source": [ - "common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])\n", - "# Exemple id commun = caractéristiques communes\n", - "print(customers_plus_2[customers_plus_2['id'] == list(common_id)[0]])\n", - "\n", - "print(customers_plus_1[customers_plus_1['id'] == list(common_id)[0]])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0eb345e4-69f5-4e16-ac57-e33674c6c43d", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "customers_plus_1.isna().mean()*100" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6f6ce60d-0912-497d-9108-330acccef394", - "metadata": {}, - "outputs": [], - "source": [ - "# Chargement de toutes les données\n", - "liste_base = ['customer_target_mappings', 'customersplus', 'target_types', 'tags', 'events', 'tickets', 'representations', 'purchases', 'products']\n", - "\n", - "for nom_base in liste_base:\n", - " FILE_PATH_S3 = 'bdc2324-data/11/11' + nom_base + '.csv'\n", - " with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " 
globals()[nom_base] = pd.read_csv(file_in, sep=\",\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fa8ee17d-5092-40ac-8a0a-3790b016dd4e", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "# Jointure\n", - "merge_1 = pd.merge(purchases, tickets, left_on='id', right_on='purchase_id', how='inner')[['id_x', 'customer_id','product_id', 'purchase_date', 'type_of', 'is_from_subscription']]\n", - "merge_2 = pd.merge(products, merge_1, left_on='id', right_on='product_id', how='inner')[['id_x', 'customer_id', 'representation_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price']]\n", - "merge_3 = pd.merge(representations, merge_2, left_on='id', right_on='representation_id', how='inner')[['id_x', 'customer_id', 'event_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time']]\n", - "merge_4 = pd.merge(events, merge_3, left_on='id', right_on='event_id', how='inner')[['id_x', 'customer_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'name']]\n", - "merge_4 = merge_4.rename(columns={'name': 'event_name'})\n", - "df_customer_event = pd.merge(customersplus, merge_4, left_on = 'id', right_on = 'customer_id', how = 'inner')[['id_x', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'event_name']]\n", - "df_customer_event" - ] - }, - { - "cell_type": "markdown", - "id": "f1d4aeb8-ec74-4d49-989a-9116e01afe2f", - "metadata": {}, - "source": [ - "# Fusion et exploration" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "22bfad2b-d52a-4077-9b39-bee35004e01c", - "metadata": {}, - "outputs": [], - "source": [ - "# Jointure\n", - "var_choosed = ['id_x', 'customer_id','product_id', 'purchase_date', 'type_of', 'is_from_subscription']\n", - "merge_1 = pd.merge(purchases, tickets, left_on='id', right_on='purchase_id', how='inner')[var_choosed]\n", - "\n", - "var_choosed.extend(['amount', 'is_full_price', 'representation_id'])\n", - "merge_2 = pd.merge(products, merge_1, left_on='id', right_on='product_id', how='inner')[var_choosed]\n", - "\n", - "var_choosed.remove('representation_id')\n", - "var_choosed.extend(['start_date_time', 'event_id'])\n", - "merge_3 = pd.merge(representations, merge_2, left_on='id', right_on='representation_id', how='inner')[var_choosed]\n", - "\n", - "var_choosed.remove('event_id')\n", - "var_choosed.extend(['name', 'customer_id'])\n", - "merge_4 = pd.merge(events, merge_3, left_on='id', right_on='event_id', how='inner')[var_choosed]\n", - "\n", - "# Changement de nom\n", - "merge_4 = merge_4.rename(columns={'name': 'event_name'})\n", - "var_choosed[var_choosed.index('name')] = \"event_name\"\n", - "\n", - "# Base finale\n", - "var_choosed.extend(['age', 'gender', 'country', 'fidelity', 'profession'])\n", - "df_customer_event = pd.merge(customersplus, merge_4, left_on = 'id', right_on = 'customer_id', how = 'inner')[var_choosed]\n", - "df_customer_event" - ] - }, - { - "cell_type": "markdown", - "id": "4cb08d7a-ff04-4951-863d-20aaf33f0b31", - "metadata": {}, - "source": [ - "## Type de client au globale" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f47ba14a-8601-4b91-9712-223a5ed8a1d1", - "metadata": {}, - "outputs": [], - "source": [ - "# Client\n", - "print(customer_target_mappings.columns)\n", - "print(customer_target_mappings.shape)\n", - "customer_target_mappings.info()" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "id": "f11f829e-66b1-4fd0-a46f-5ae7cb78073f", - "metadata": {}, - "outputs": [], - "source": [ - "customer_target_mappings['extra_field'].unique()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c240ab80-c746-4a64-ac6a-be8382c4f0ec", - "metadata": {}, - "outputs": [], - "source": [ - "customer_target_mappings['name'].unique()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c03c0597-3f21-4673-8a0f-24d7d9bc5ce4", - "metadata": {}, - "outputs": [], - "source": [ - "# Segmentation existante\n", - "print(target_types.columns)\n", - "print(target_types.shape)\n", - "target_types.info()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5adb1773-648d-4683-bc08-d1f2298c1283", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "target_types" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3d65f74e-47fc-4296-b493-a1ebefb91cde", - "metadata": {}, - "outputs": [], - "source": [ - "# Tags = clients\n", - "FILE_PATH_S3 = 'bdc2324-data/11/11tags.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " tags = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(tags.columns)\n", - "print(tags.shape)\n", - "tags.info()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8a689a63-165b-4c4e-bbb0-695b661048d9", - "metadata": {}, - "outputs": [], - "source": [ - "tags" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "69e38c52-0570-4531-aebb-9deb6db8c40b", - "metadata": {}, - "outputs": [], - "source": [ - "# Structure = clients\n", - "FILE_PATH_S3 = 'bdc2324-data/11/11structure_tag_mappings.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " structure_tag_mappings = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(structure_tag_mappings.columns)\n", - "print(structure_tag_mappings.shape)\n", - "structure_tag_mappings.info()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "74dc34ad-375b-48df-a900-40d92c5fff13", - "metadata": {}, - "outputs": [], - "source": [ - "structure_tag_mappings" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a479ceeb-0135-4899-9cbc-90ed7bf941fe", - "metadata": {}, - "outputs": [], - "source": [ - "# Tags = clients\n", - "FILE_PATH_S3 = 'bdc2324-data/11/11customersplus.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " customersplus = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(customersplus.columns)\n", - "print(customersplus.shape)\n", - "customersplus.info()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "383e892c-606a-45ce-bdd6-b503b3e0be33", - "metadata": {}, - "outputs": [], - "source": [ - "customersplus" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "70324d06-b855-4386-a7de-eef1eb13dfdf", - "metadata": {}, - "outputs": [], - "source": [ - "# But : lier les caractéristiques socio-demo et les comportements d'achat\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4bbd743d-51fe-4786-8ad3-5a4a4d09439c", - "metadata": {}, - "outputs": [], - "source": [ - "# tickets\n", - "FILE_PATH_S3 = 'bdc2324-data/11/11tickets.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " tickets = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(tickets.columns)\n", - "print(tickets.shape)\n", - "tickets.info()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - 
"id": "ea83ea5c-3d47-4a66-a523-04b69b149a20", - "metadata": {}, - "outputs": [], - "source": [ - "tickets" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ba15708e-eb84-4b5d-a86c-05ebed188cf6", - "metadata": {}, - "outputs": [], - "source": [ - "tickets['type_of'].unique()" - ] - }, - { - "cell_type": "markdown", - "id": "bc192b08-30a5-486a-8bea-93e765dbfce6", - "metadata": {}, - "source": [ - "## Types d'évenement et client" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e14dcf62-2def-4ed5-834b-cf21abbc2894", - "metadata": {}, - "outputs": [], - "source": [ - "# Evenement = events.csv\n", - "FILE_PATH_S3 = 'bdc2324-data/11/11events.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " events = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(events.columns)\n", - "print(events.shape)\n", - "events.info()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d1a1d63c-d7de-4b63-93a8-1c734eb5b316", - "metadata": {}, - "outputs": [], - "source": [ - "events" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "af80eee8-f717-4159-a0fd-09d47ec96621", - "metadata": {}, - "outputs": [], - "source": [ - "events['name'].nunique()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6afc6f3d-4292-4a92-a4d6-14f1edc25df2", - "metadata": {}, - "outputs": [], - "source": [ - "# Représentation des évenements = representations.csv\n", - "FILE_PATH_S3 = 'bdc2324-data/11/11representations.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " representations = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(representations.columns)\n", - "print(representations.shape)\n", - "representations.info()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1487402a-a49b-4737-b7d7-40c764d2f0b4", - "metadata": {}, - "outputs": [], - "source": [ - "representations" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "99b27418-2c15-4a6e-bcf5-d329ca492085", - "metadata": {}, - "outputs": [], - "source": [ - "# Produits vendues = products.csv\n", - "FILE_PATH_S3 = 'bdc2324-data/11/11products.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " products = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(products.columns)\n", - "print(products.shape)\n", - "products.info()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c49bcd47-672f-4e0f-aee9-a7475151b97f", - "metadata": {}, - "outputs": [], - "source": [ - "products" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a4aec5ce-d0c9-4625-bb29-9ac154818621", - "metadata": {}, - "outputs": [], - "source": [ - "# Lieu = facilities.csv\n", - "FILE_PATH_S3 = 'bdc2324-data/11/11facilities.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " facilities = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(facilities.columns)\n", - "print(facilities.shape)\n", - "facilities.info()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b3642483-2879-442a-ad69-efcd2331a200", - "metadata": {}, - "outputs": [], - "source": [ - "facilities" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "da1e9807-2a8d-4be7-a785-55cffd734f36", - "metadata": {}, - "outputs": [], - "source": [ - "# Saisons = seasons.csv période sur deux années consécutives\n", - "FILE_PATH_S3 = 'bdc2324-data/11/11seasons.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, 
mode=\"rb\") as file_in:\n", - " seasons = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(seasons.columns)\n", - "print(seasons.shape)\n", - "seasons.info()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ec8a37b5-2d78-4b1c-aa47-bd923fdc2ba9", - "metadata": {}, - "outputs": [], - "source": [ - "seasons['name'].unique()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "abb3aa20-774b-4761-983a-df5eb2bc51c6", - "metadata": {}, - "outputs": [], - "source": [ - "# Achats = purchases.csv \n", - "FILE_PATH_S3 = 'bdc2324-data/11/11purchases.csv'\n", - "\n", - "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " purchases = pd.read_csv(file_in, sep=\",\")\n", - "\n", - "print(purchases.columns)\n", - "print(purchases.shape)\n", - "purchases.info()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "30e204ab-4f63-430c-a818-5c8035b6e17b", - "metadata": {}, - "outputs": [], - "source": [ - "purchases" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/Exploration_billet_AJ.ipynb b/Exploration_billet_AJ.ipynb index bec456e..d697ff5 100644 --- a/Exploration_billet_AJ.ipynb +++ b/Exploration_billet_AJ.ipynb @@ -124,9 +124,7 @@ { "cell_type": "markdown", "id": "e855f403", - "metadata": { - "jp-MarkdownHeadingCollapsed": true - }, + "metadata": {}, "source": [ "## customersplus.csv" ] @@ -1289,7 +1287,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.11.6" } }, "nbformat": 4,