Deletion of exploratory and draft notebooks

Antoine JOUBREL 2024-04-09 20:20:57 +00:00
parent 9ca22fb9e7
commit 4ed6bd809d
30 changed files with 0 additions and 88609 deletions

@@ -1,68 +0,0 @@
import pandas as pd
import numpy as np
import os
import io
import s3fs
import re
import warnings

# Ignore warnings
warnings.filterwarnings('ignore')

exec(open('../0_KPI_functions.py').read())
exec(open('plot.py').read())

# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

companies = {'musee': ['1', '2', '3', '4'],  # , '101'
             'sport': ['5'],
             'musique': ['10', '11', '12', '13', '14']}

type_of_activity = input('Choose the type of company: sport ? musique ? musee ?')
list_of_comp = companies[type_of_activity]

# Load files
customer, campaigns_kpi, campaigns_brut, tickets, products = load_files(list_of_comp)

# Identify the anonymous customer of each company and remove them from our datasets
outlier_list = outlier_detection(tickets, list_of_comp)

# Identify valid customers (customers who bought tickets or received mails after the starting date)
customer_valid_list = valid_customer_detection(products, campaigns_brut)

databases = [customer, campaigns_kpi, campaigns_brut, tickets, products]

for i, dataset in enumerate(databases):
    dataset['customer_id'] = dataset['customer_id'].apply(lambda x: remove_elements(x, outlier_list))  # remove outliers
    databases[i] = dataset[dataset['customer_id'].isin(customer_valid_list)]  # keep only valid customers
    # print(f'shape of {dataset} : ', dataset.shape)

# Rebind the named datasets to their filtered versions
customer, campaigns_kpi, campaigns_brut, tickets, products = databases

# Identify customers who bought during the target period
customer_target_period = identify_purchase_during_target_periode(products)
customer['has_purchased_target_period'] = np.where(customer['customer_id'].isin(customer_target_period), 1, 0)

# Generate the graphs and automatically save them in the bucket
compute_nb_clients(customer, type_of_activity)
maximum_price_paid(customer, type_of_activity)
mailing_consent(customer, type_of_activity)
mailing_consent_by_target(customer, type_of_activity)
gender_bar(customer, type_of_activity)
country_bar(customer, type_of_activity)
lazy_customer_plot(campaigns_kpi, type_of_activity)
# campaigns_effectiveness(customer, type_of_activity)
sale_dynamics(products, campaigns_brut, type_of_activity)
tickets_internet(tickets, type_of_activity)
box_plot_price_tickets(tickets, type_of_activity)

@@ -1,328 +0,0 @@
import pandas as pd
import os
import s3fs
import io
import warnings
from datetime import date, timedelta, datetime
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns

def load_files(nb_compagnie):
    customer = pd.DataFrame()
    campaigns_brut = pd.DataFrame()
    campaigns_kpi = pd.DataFrame()
    products = pd.DataFrame()
    tickets = pd.DataFrame()

    # Loop building the aggregated datasets for the companies of the chosen activity
    for directory_path in nb_compagnie:
        df_customerplus_clean_0 = display_databases(directory_path, file_name = "customerplus_cleaned")
        df_campaigns_brut = display_databases(directory_path, file_name = "campaigns_information", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])
        df_products_purchased_reduced = display_databases(directory_path, file_name = "products_purchased_reduced", datetime_col = ['purchase_date'])
        df_target_information = display_databases(directory_path, file_name = "target_information")

        df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_brut)
        df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced)
        df_customerplus_clean = customerplus_kpi_function(customerplus_clean = df_customerplus_clean_0)

        # Create the number_company column, used to aggregate the results across companies
        df_tickets_kpi["number_company"] = int(directory_path)
        df_campaigns_brut["number_company"] = int(directory_path)
        df_campaigns_kpi["number_company"] = int(directory_path)
        df_customerplus_clean["number_company"] = int(directory_path)
        df_target_information["number_company"] = int(directory_path)

        # Index handling: prefix customer ids with the company number to keep them unique
        df_tickets_kpi["customer_id"] = directory_path + '_' + df_tickets_kpi['customer_id'].astype('str')
        df_campaigns_brut["customer_id"] = directory_path + '_' + df_campaigns_brut['customer_id'].astype('str')
        df_campaigns_kpi["customer_id"] = directory_path + '_' + df_campaigns_kpi['customer_id'].astype('str')
        df_customerplus_clean["customer_id"] = directory_path + '_' + df_customerplus_clean['customer_id'].astype('str')
        df_products_purchased_reduced["customer_id"] = directory_path + '_' + df_products_purchased_reduced['customer_id'].astype('str')

        # Concatenation
        customer = pd.concat([customer, df_customerplus_clean], ignore_index=True)
        campaigns_kpi = pd.concat([campaigns_kpi, df_campaigns_kpi], ignore_index=True)
        campaigns_brut = pd.concat([campaigns_brut, df_campaigns_brut], ignore_index=True)
        tickets = pd.concat([tickets, df_tickets_kpi], ignore_index=True)
        products = pd.concat([products, df_products_purchased_reduced], ignore_index=True)

    return customer, campaigns_kpi, campaigns_brut, tickets, products


def save_file_s3(File_name, type_of_activity):
    # Serialize the current matplotlib figure and upload it to the team bucket
    image_buffer = io.BytesIO()
    plt.savefig(image_buffer, format='png')
    image_buffer.seek(0)
    FILE_PATH = f"projet-bdc2324-team1/stat_desc/{type_of_activity}/"
    FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '.png'
    with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
        s3_file.write(image_buffer.read())
    plt.close()


def outlier_detection(tickets, company_list, show_diagram=False):
    outlier_list = list()

    for company in company_list:
        total_amount_share = tickets[tickets['number_company'] == int(company)].groupby('customer_id')['total_amount'].sum().reset_index()
        total_amount_share['CA'] = total_amount_share['total_amount'].sum()
        total_amount_share['share_total_amount'] = total_amount_share['total_amount'] / total_amount_share['CA']

        total_amount_share_index = total_amount_share.set_index('customer_id')
        df_circulaire = total_amount_share_index['total_amount'].sort_values(axis = 0, ascending = False)
        # print('df circulaire : ', df_circulaire.head())

        # The single biggest spender of each company is treated as its anonymous customer
        top = df_circulaire[:1]
        # print('top : ', top)
        outlier_list.append(top.index[0])

        rest = df_circulaire[1:]
        rest_sum = rest.sum()
        new_series = pd.concat([top, pd.Series([rest_sum], index=['Other'])])

        if show_diagram:
            plt.figure(figsize=(3, 3))
            plt.pie(new_series, labels=new_series.index, autopct='%1.1f%%', startangle=140, pctdistance=0.5)
            plt.axis('equal')
            plt.title(f'Breakdown of total amounts for company {company}')
            plt.show()

    return outlier_list


def valid_customer_detection(products, campaigns_brut):
    products_valid = products[products['purchase_date'] >= "2021-05-01"]
    consumer_valid_product = products_valid['customer_id'].to_list()

    campaigns_valid = campaigns_brut[campaigns_brut["sent_at"] >= "2021-05-01"]
    consumer_valid_campaigns = campaigns_valid['customer_id'].to_list()

    consumer_valid = consumer_valid_product + consumer_valid_campaigns
    return consumer_valid


def identify_purchase_during_target_periode(products):
    products_target_period = products[(products['purchase_date'] >= "2022-11-01")
                                      & (products['purchase_date'] <= "2023-11-01")]
    customer_target_period = products_target_period['customer_id'].to_list()
    return customer_target_period


def remove_elements(lst, elements_to_remove):
    # Blank out a customer_id that belongs to the outlier list, keep it unchanged otherwise
    return '' if lst in elements_to_remove else lst


def compute_nb_clients(customer, type_of_activity):
    company_nb_clients = customer[customer["purchase_count"] > 0].groupby("number_company")["customer_id"].count().reset_index()
    plt.bar(company_nb_clients["number_company"], company_nb_clients["customer_id"]/1000)

    plt.xlabel('Company')
    plt.ylabel("Number of clients (thousands)")
    plt.title(f"Number of clients for {type_of_activity}")
    plt.xticks(company_nb_clients["number_company"], ["{}".format(i) for i in company_nb_clients["number_company"]])
    plt.show()
    save_file_s3("nb_clients_", type_of_activity)


def maximum_price_paid(customer, type_of_activity):
    company_max_price = customer.groupby("number_company")["max_price"].max().reset_index()
    plt.bar(company_max_price["number_company"], company_max_price["max_price"])

    plt.xlabel('Company')
    plt.ylabel("Maximal price of a ticket")
    plt.title(f"Maximal price of a ticket for {type_of_activity}")
    plt.xticks(company_max_price["number_company"], ["{}".format(i) for i in company_max_price["number_company"]])
    plt.show()
    save_file_s3("Maximal_price_", type_of_activity)


def mailing_consent(customer, type_of_activity):
    mailing_consent = customer.groupby("number_company")["opt_in"].mean().reset_index()
    plt.bar(mailing_consent["number_company"], mailing_consent["opt_in"])

    plt.xlabel('Company')
    plt.ylabel('Consent')
    plt.title(f'Mailing consent for {type_of_activity}')
    plt.xticks(mailing_consent["number_company"], ["{}".format(i) for i in mailing_consent["number_company"]])
    plt.show()
    save_file_s3("mailing_consent_", type_of_activity)


def mailing_consent_by_target(customer, type_of_activity):
    df_graph = customer.groupby(["number_company", "has_purchased_target_period"])["opt_in"].mean().reset_index()

    # Build the grouped barplot
    fig, ax = plt.subplots(figsize=(10, 6))

    categories = df_graph["number_company"].unique()
    bar_width = 0.35
    bar_positions = np.arange(len(categories))

    # Group the data by label and draw the grouped bars
    for label in df_graph["has_purchased_target_period"].unique():
        label_data = df_graph[df_graph['has_purchased_target_period'] == label]
        values = [label_data[label_data['number_company'] == category]['opt_in'].values[0]*100 for category in categories]

        label_printed = "purchased" if label else "no purchase"
        ax.bar(bar_positions, values, bar_width, label=label_printed)

        # Shift the bar positions for the next group
        bar_positions = [pos + bar_width for pos in bar_positions]

    # Add the labels, the legend, etc.
    ax.set_xlabel('Company')
    ax.set_ylabel('Consent')
    ax.set_title(f'Mailing consent by target for {type_of_activity}')
    ax.set_xticks([pos + bar_width / 2 for pos in np.arange(len(categories))])
    ax.set_xticklabels(categories)
    ax.legend()

    plt.show()
    save_file_s3("mailing_consent_target_", type_of_activity)


def gender_bar(customer, type_of_activity):
    company_genders = customer.groupby("number_company")[["gender_male", "gender_female", "gender_other"]].mean().reset_index()

    # Stacked barplot of the gender shares
    plt.bar(company_genders["number_company"], company_genders["gender_male"], label = "Male")
    plt.bar(company_genders["number_company"], company_genders["gender_female"],
            bottom = company_genders["gender_male"], label = "Female")
    plt.bar(company_genders["number_company"], company_genders["gender_other"],
            bottom = company_genders["gender_male"] + company_genders["gender_female"], label = "Unknown")

    plt.xlabel('Company')
    plt.ylabel("Gender")
    plt.title(f"Gender of customers for {type_of_activity}")
    plt.legend()
    plt.xticks(company_genders["number_company"], ["{}".format(i) for i in company_genders["number_company"]])
    plt.show()
    save_file_s3("gender_bar_", type_of_activity)


def country_bar(customer, type_of_activity):
    company_country_fr = customer.groupby("number_company")["country_fr"].mean().reset_index()
    plt.bar(company_country_fr["number_company"], company_country_fr["country_fr"])

    plt.xlabel('Company')
    plt.ylabel("Share of French customers")
    plt.title(f"Share of French customers for {type_of_activity}")
    plt.xticks(company_country_fr["number_company"], ["{}".format(i) for i in company_country_fr["number_company"]])
    plt.show()
    save_file_s3("country_bar_", type_of_activity)


def lazy_customer_plot(campaigns_kpi, type_of_activity):
    # Mean number of campaigns opened per customer, by company
    company_lazy_customers = campaigns_kpi.groupby("number_company")["nb_campaigns_opened"].mean().reset_index()
    plt.bar(company_lazy_customers["number_company"], company_lazy_customers["nb_campaigns_opened"])

    plt.xlabel('Company')
    plt.ylabel("Mean number of campaigns opened")
    plt.title(f"Mean number of campaigns opened for {type_of_activity}")
    plt.xticks(company_lazy_customers["number_company"], ["{}".format(i) for i in company_lazy_customers["number_company"]])
    plt.show()
    save_file_s3("lazy_customer_", type_of_activity)


def campaigns_effectiveness(customer, type_of_activity):
    # Opt-in share by company (draft: commented out at the call site)
    campaigns_effectiveness = customer.groupby("number_company")["opt_in"].mean().reset_index()
    plt.bar(campaigns_effectiveness["number_company"], campaigns_effectiveness["opt_in"])

    plt.xlabel('Company')
    plt.ylabel("Opt-in share")
    plt.title(f"Opt-in share of customers who bought or received mails for {type_of_activity}")
    plt.xticks(campaigns_effectiveness["number_company"], ["{}".format(i) for i in campaigns_effectiveness["number_company"]])
    plt.show()
    save_file_s3("campaigns_effectiveness_", type_of_activity)


def sale_dynamics(products, campaigns_brut, type_of_activity):
    # Month of the first purchase
    purchase_min = products.groupby(['customer_id'])['purchase_date'].min().reset_index()
    purchase_min.rename(columns = {'purchase_date' : 'first_purchase_event'}, inplace = True)
    purchase_min['first_purchase_event'] = pd.to_datetime(purchase_min['first_purchase_event'])
    purchase_min['first_purchase_month'] = pd.to_datetime(purchase_min['first_purchase_event'].dt.strftime('%Y-%m'))

    # Month of the first mail received
    first_mail_received = campaigns_brut.groupby('customer_id')['sent_at'].min().reset_index()
    first_mail_received.rename(columns = {'sent_at' : 'first_email_reception'}, inplace = True)
    first_mail_received['first_email_reception'] = pd.to_datetime(first_mail_received['first_email_reception'])
    first_mail_received['first_email_month'] = pd.to_datetime(first_mail_received['first_email_reception'].dt.strftime('%Y-%m'))

    # Merge
    known_customer = pd.merge(purchase_min[['customer_id', 'first_purchase_month']],
                              first_mail_received[['customer_id', 'first_email_month']], on = 'customer_id', how = 'outer')

    # Month from which the customer is considered known
    known_customer['known_date'] = pd.to_datetime(known_customer[['first_email_month', 'first_purchase_month']].min(axis = 1), utc = True, format = 'ISO8601')

    # Number of orders per month
    purchases_count = pd.merge(products[['customer_id', 'purchase_id', 'purchase_date']].drop_duplicates(), known_customer[['customer_id', 'known_date']], on = ['customer_id'], how = 'inner')
    purchases_count['is_customer_known'] = purchases_count['purchase_date'] > purchases_count['known_date'] + pd.DateOffset(months=1)
    purchases_count['purchase_date_month'] = pd.to_datetime(purchases_count['purchase_date'].dt.strftime('%Y-%m'))
    purchases_count = purchases_count[purchases_count['customer_id'] != 1]

    # Number of orders per month by customer type
    nb_purchases_graph = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['purchase_id'].count().reset_index()
    nb_purchases_graph.rename(columns = {'purchase_id' : 'nb_purchases'}, inplace = True)

    nb_purchases_graph_2 = purchases_count.groupby(['purchase_date_month', 'is_customer_known'])['customer_id'].nunique().reset_index()
    nb_purchases_graph_2.rename(columns = {'customer_id' : 'nb_new_customer'}, inplace = True)

    # Plot of the number of orders
    purchases_graph = nb_purchases_graph
    purchases_graph_used = purchases_graph[purchases_graph["purchase_date_month"] >= datetime(2021, 3, 1)]
    purchases_graph_used_0 = purchases_graph_used[purchases_graph_used["is_customer_known"] == False]
    purchases_graph_used_1 = purchases_graph_used[purchases_graph_used["is_customer_known"] == True]

    merged_data = pd.merge(purchases_graph_used_0, purchases_graph_used_1, on = "purchase_date_month", suffixes = ("_new", "_old"))

    plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_new"], width = 12, label = "New customer")
    plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_old"],
            bottom = merged_data["nb_purchases_new"], width = 12, label = "Returning customer")

    # Show only month-year ticks on the x axis
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b%y'))

    plt.xlabel('Month')
    plt.ylabel("Number of Sales")
    plt.title(f"Number of Sales for {type_of_activity}")
    plt.legend()
    plt.show()
    save_file_s3("sale_dynamics_", type_of_activity)


def tickets_internet(tickets, type_of_activity):
    nb_tickets_internet = tickets.groupby("number_company")[["nb_tickets", "nb_tickets_internet"]].sum().reset_index()
    nb_tickets_internet["Share_ticket_internet"] = nb_tickets_internet["nb_tickets_internet"]*100 / nb_tickets_internet["nb_tickets"]

    plt.bar(nb_tickets_internet["number_company"], nb_tickets_internet["Share_ticket_internet"])

    plt.xlabel('Company')
    plt.ylabel("Share of Tickets Bought Online")
    plt.title(f"Share of Tickets Bought Online for {type_of_activity}")
    plt.xticks(nb_tickets_internet["number_company"], ["{}".format(i) for i in nb_tickets_internet["number_company"]])
    plt.show()
    save_file_s3("tickets_internet_", type_of_activity)


def box_plot_price_tickets(tickets, type_of_activity):
    price_tickets = tickets[tickets['total_amount'] > 0]
    sns.boxplot(data=price_tickets, y="total_amount", x="number_company", showfliers=False, showmeans=True)
    plt.title(f"Box plot of ticket prices for {type_of_activity}")
    # seaborn already labels the categorical x axis with the company numbers
    plt.show()
    save_file_s3("box_plot_price_tickets_", type_of_activity)

@@ -1,436 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "135a67de-cff8-4345-bacc-d9f9fa68a41f",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import os\n",
"import s3fs\n",
"import re\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score\n",
"from sklearn.utils import class_weight\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler\n",
"from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score\n",
"from sklearn.exceptions import ConvergenceWarning, DataConversionWarning\n",
"\n",
"import statsmodels.api as sm\n",
"\n",
"import pickle\n",
"import warnings"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "9a6254df-d496-4957-89ea-9ed2b74049dd",
"metadata": {},
"outputs": [],
"source": [
"# Create filesystem object\n",
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "922cf05f-8343-4ed0-ad62-3ef1f17c0730",
"metadata": {},
"outputs": [],
"source": [
"def load_train_test():\n",
" BUCKET = \"projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/musee\"\n",
" File_path_train = BUCKET + \"/Train_set.csv\"\n",
" File_path_test = BUCKET + \"/Test_set.csv\"\n",
" \n",
" with fs.open( File_path_train, mode=\"rb\") as file_in:\n",
" dataset_train = pd.read_csv(file_in, sep=\",\")\n",
" # dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)\n",
"\n",
" with fs.open(File_path_test, mode=\"rb\") as file_in:\n",
" dataset_test = pd.read_csv(file_in, sep=\",\")\n",
" # dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)\n",
" \n",
" return dataset_train, dataset_test\n",
"\n",
"\n",
"def features_target_split(dataset_train, dataset_test):\n",
" features_l = ['nb_campaigns', 'taux_ouverture_mail', 'prop_purchases_internet', 'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'time_to_open',\n",
" 'purchases_10_2021','purchases_10_2022', 'purchases_11_2021', 'purchases_12_2021','purchases_1_2022', 'purchases_2_2022', 'purchases_3_2022',\n",
" 'purchases_4_2022', 'purchases_5_2021', 'purchases_5_2022', 'purchases_6_2021', 'purchases_6_2022', 'purchases_7_2021', 'purchases_7_2022', 'purchases_8_2021',\n",
" 'purchases_8_2022','purchases_9_2021', 'purchases_9_2022', 'purchase_date_min', 'purchase_date_max', 'nb_targets', 'gender_female', 'gender_male',\n",
" 'achat_internet', 'categorie_age_0_10', 'categorie_age_10_20', 'categorie_age_20_30','categorie_age_30_40',\n",
" 'categorie_age_40_50', 'categorie_age_50_60', 'categorie_age_60_70', 'categorie_age_70_80', 'categorie_age_plus_80','categorie_age_inconnue',\n",
" 'country_fr', 'is_profession_known', 'is_zipcode_known', 'opt_in', 'target_optin', 'target_newsletter', 'target_scolaire', 'target_entreprise', 'target_famille',\n",
" 'target_jeune', 'target_abonne']\n",
" X_train = dataset_train[features_l]\n",
" y_train = dataset_train[['y_has_purchased']]\n",
"\n",
" X_test = dataset_test[features_l]\n",
" y_test = dataset_test[['y_has_purchased']]\n",
" return X_train, X_test, y_train, y_test"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "2584e454-111b-4c39-881b-676841cb5aa1",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_498/3950829189.py:7: DtypeWarning: Columns (10,24,25) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" dataset_train = pd.read_csv(file_in, sep=\",\")\n",
"/tmp/ipykernel_498/3950829189.py:11: DtypeWarning: Columns (10,24,25) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" dataset_test = pd.read_csv(file_in, sep=\",\")\n"
]
}
],
"source": [
"dataset_train, dataset_test = load_train_test()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "a32ea7f8-e2d3-44db-8937-5afda9447b58",
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "3bdc8840-7f45-416f-8ee0-307db201c496",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"const 0\n",
"nb_campaigns 0\n",
"taux_ouverture_mail 0\n",
"prop_purchases_internet 0\n",
"nb_tickets 0\n",
"nb_purchases 0\n",
"total_amount 0\n",
"nb_suppliers 0\n",
"time_to_open 0\n",
"purchases_10_2021 0\n",
"purchases_10_2022 0\n",
"purchases_11_2021 0\n",
"purchases_12_2021 0\n",
"purchases_1_2022 0\n",
"purchases_2_2022 0\n",
"purchases_3_2022 0\n",
"purchases_4_2022 0\n",
"purchases_5_2021 0\n",
"purchases_5_2022 0\n",
"purchases_6_2021 0\n",
"purchases_6_2022 0\n",
"purchases_7_2021 0\n",
"purchases_7_2022 0\n",
"purchases_8_2021 0\n",
"purchases_8_2022 0\n",
"purchases_9_2021 0\n",
"purchases_9_2022 0\n",
"purchase_date_min 0\n",
"purchase_date_max 0\n",
"nb_targets 0\n",
"gender_female 0\n",
"gender_male 0\n",
"achat_internet 0\n",
"categorie_age_0_10 0\n",
"categorie_age_10_20 0\n",
"categorie_age_20_30 0\n",
"categorie_age_30_40 0\n",
"categorie_age_40_50 0\n",
"categorie_age_50_60 0\n",
"categorie_age_60_70 0\n",
"categorie_age_70_80 0\n",
"categorie_age_plus_80 0\n",
"categorie_age_inconnue 0\n",
"country_fr 0\n",
"is_profession_known 0\n",
"is_zipcode_known 0\n",
"opt_in 0\n",
"target_optin 0\n",
"target_newsletter 0\n",
"target_scolaire 0\n",
"target_entreprise 0\n",
"target_famille 0\n",
"target_jeune 0\n",
"target_abonne 0\n",
"dtype: int64"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_train.isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "3c3ac545-52e0-4d0c-afdc-fff70f468a94",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1.0"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"most_frequent_value = X_train['country_fr'].mode()[0]\n",
"most_frequent_value"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "0fcdc5ee-bcea-4436-be9b-92b79d27a230",
"metadata": {},
"outputs": [],
"source": [
"X_train['country_fr'] = X_train['country_fr'].fillna(most_frequent_value)\n",
"X_train['time_to_open'] = X_train['time_to_open'].fillna(0)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "7ecdaf1a-b5e4-4880-871e-363eae6fe4e1",
"metadata": {},
"outputs": [],
"source": [
"weights = class_weight.compute_class_weight(class_weight = 'balanced', classes = np.unique(y_train['y_has_purchased']),\n",
" y = y_train['y_has_purchased'])\n",
"\n",
"weight_dict = {np.unique(y_train['y_has_purchased'])[i]: weights[i] for i in range(len(np.unique(y_train['y_has_purchased'])))}"
]
},
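{
"cell_type": "code",
"execution_count": null,
"id": "b1c2d3e4-sketch-class-weight-usage",
"metadata": {},
"outputs": [],
"source": [
"# Sketch (assumption, not in the original notebook): weight_dict is computed above but\n",
"# never used below. With the sklearn imports from the first cell it would typically\n",
"# feed a classifier, e.g.:\n",
"# clf = LogisticRegression(class_weight=weight_dict, max_iter=5000)\n",
"# clf.fit(X_train, y_train['y_has_purchased'])"
]
},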
{
"cell_type": "code",
"execution_count": 9,
"id": "a6b56090-cfe9-4772-810c-d36bf12aceca",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0.52239696, 0.52239696, 0.52239696, ..., 0.52239696, 0.52239696,\n",
" 0.52239696])"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"class_counts = np.bincount(y_train['y_has_purchased'])\n",
"class_weights = len(y_train['y_has_purchased']) / (2 * class_counts)\n",
"\n",
"weights = class_weights[y_train['y_has_purchased'].values.astype(int)]\n",
"weights"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "bfaea23e-7d7a-4c0d-96f6-4ab4c7c2ff51",
"metadata": {},
"outputs": [],
"source": [
"X_train = sm.add_constant(X_train)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "4cf97ae5-9dcf-4f4c-91b3-3b1f339a6213",
"metadata": {},
"outputs": [],
"source": [
"numeric_features = ['nb_campaigns', 'taux_ouverture_mail', 'prop_purchases_internet', 'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',\n",
" 'purchases_10_2021','purchases_10_2022', 'purchases_11_2021', 'purchases_12_2021','purchases_1_2022', 'purchases_2_2022', 'purchases_3_2022',\n",
" 'purchases_4_2022', 'purchases_5_2021', 'purchases_5_2022', 'purchases_6_2021', 'purchases_6_2022', 'purchases_7_2021', 'purchases_7_2022', 'purchases_8_2021',\n",
" 'purchases_8_2022','purchases_9_2021', 'purchases_9_2022', 'purchase_date_min', 'purchase_date_max', 'nb_targets', 'time_to_open']"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "debb36df-3c2f-4cf7-83a9-ad6e4f6b0470",
"metadata": {},
"outputs": [],
"source": [
"scaler = StandardScaler()\n",
"\n",
"X_train_scaled_columns = scaler.fit_transform(X_train[numeric_features])\n",
"\n",
"X_train_scaled = X_train.copy() #\n",
"X_train_scaled[numeric_features] = X_train_scaled_columns"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "7eaa6160-20a0-4a78-ac38-0411e19707ed",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/mamba/lib/python3.11/site-packages/statsmodels/base/optimizer.py:18: FutureWarning: Keyword arguments have been passed to the optimizer that have no effect. The list of allowed keyword arguments for method newton is: tol, ridge_factor. The list of unsupported keyword arguments passed include: weights. After release 0.14, this will raise.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Optimization terminated successfully.\n",
" Current function value: 0.136180\n",
" Iterations 9\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: y_has_purchased No. Observations: 434278\n",
"Model: Logit Df Residuals: 434226\n",
"Method: MLE Df Model: 51\n",
"Date: Thu, 04 Apr 2024 Pseudo R-squ.: 0.2305\n",
"Time: 06:09:09 Log-Likelihood: -59140.\n",
"converged: True LL-Null: -76855.\n",
"Covariance Type: nonrobust LLR p-value: 0.000\n",
"===========================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"-------------------------------------------------------------------------------------------\n",
"const -4.0679 1.65e+06 -2.46e-06 1.000 -3.24e+06 3.24e+06\n",
"nb_campaigns 0.0916 0.012 7.352 0.000 0.067 0.116\n",
"taux_ouverture_mail 0.0012 0.011 0.106 0.916 -0.021 0.023\n",
"prop_purchases_internet -0.1995 0.067 -2.972 0.003 -0.331 -0.068\n",
"nb_tickets 0.5956 0.193 3.091 0.002 0.218 0.973\n",
"nb_purchases 0.1598 1.71e+06 9.37e-08 1.000 -3.34e+06 3.34e+06\n",
"total_amount -0.1938 0.071 -2.724 0.006 -0.333 -0.054\n",
"nb_suppliers 0.0282 0.021 1.348 0.178 -0.013 0.069\n",
"time_to_open 0.2785 0.018 15.534 0.000 0.243 0.314\n",
"purchases_10_2021 0.0417 4.76e+04 8.76e-07 1.000 -9.34e+04 9.34e+04\n",
"purchases_10_2022 0.4578 2.72e+05 1.68e-06 1.000 -5.33e+05 5.33e+05\n",
"purchases_11_2021 0.0252 4.92e+04 5.12e-07 1.000 -9.65e+04 9.65e+04\n",
"purchases_12_2021 0.0221 6.3e+04 3.5e-07 1.000 -1.24e+05 1.24e+05\n",
"purchases_1_2022 0.0083 5.49e+04 1.52e-07 1.000 -1.08e+05 1.08e+05\n",
"purchases_2_2022 0.0462 7.59e+04 6.09e-07 1.000 -1.49e+05 1.49e+05\n",
"purchases_3_2022 0.0928 1.07e+05 8.67e-07 1.000 -2.1e+05 2.1e+05\n",
"purchases_4_2022 0.1446 1.65e+05 8.75e-07 1.000 -3.24e+05 3.24e+05\n",
"purchases_5_2021 -0.0427 4.84e+04 -8.83e-07 1.000 -9.48e+04 9.48e+04\n",
"purchases_5_2022 0.1412 1.67e+05 8.46e-07 1.000 -3.27e+05 3.27e+05\n",
"purchases_6_2021 -0.0252 5.55e+04 -4.54e-07 1.000 -1.09e+05 1.09e+05\n",
"purchases_6_2022 0.1246 1.84e+05 6.77e-07 1.000 -3.6e+05 3.6e+05\n",
"purchases_7_2021 -0.0252 5.55e+04 -4.55e-07 1.000 -1.09e+05 1.09e+05\n",
"purchases_7_2022 -0.0074 2.1e+05 -3.54e-08 1.000 -4.12e+05 4.12e+05\n",
"purchases_8_2021 0.0116 5.26e+04 2.21e-07 1.000 -1.03e+05 1.03e+05\n",
"purchases_8_2022 0.0554 2.4e+05 2.31e-07 1.000 -4.7e+05 4.7e+05\n",
"purchases_9_2021 -0.0320 5.47e+04 -5.85e-07 1.000 -1.07e+05 1.07e+05\n",
"purchases_9_2022 0.2349 2.2e+05 1.07e-06 1.000 -4.32e+05 4.32e+05\n",
"purchase_date_min 0.0781 0.025 3.092 0.002 0.029 0.128\n",
"purchase_date_max -0.5228 0.026 -20.021 0.000 -0.574 -0.472\n",
"nb_targets 0.7083 0.010 74.555 0.000 0.690 0.727\n",
"gender_female 0.2961 0.038 7.701 0.000 0.221 0.371\n",
"gender_male 0.0450 0.040 1.137 0.256 -0.033 0.123\n",
"achat_internet 0.1869 0.158 1.186 0.236 -0.122 0.496\n",
"categorie_age_0_10 -0.2713 1.65e+06 -1.64e-07 1.000 -3.24e+06 3.24e+06\n",
"categorie_age_10_20 -0.1238 1.65e+06 -7.48e-08 1.000 -3.24e+06 3.24e+06\n",
"categorie_age_20_30 -0.6322 1.65e+06 -3.82e-07 1.000 -3.24e+06 3.24e+06\n",
"categorie_age_30_40 -0.5004 1.65e+06 -3.02e-07 1.000 -3.24e+06 3.24e+06\n",
"categorie_age_40_50 -0.4020 1.65e+06 -2.43e-07 1.000 -3.24e+06 3.24e+06\n",
"categorie_age_50_60 -0.4101 1.65e+06 -2.48e-07 1.000 -3.24e+06 3.24e+06\n",
"categorie_age_60_70 -0.3232 1.65e+06 -1.95e-07 1.000 -3.24e+06 3.24e+06\n",
"categorie_age_70_80 -0.1635 1.65e+06 -9.88e-08 1.000 -3.24e+06 3.24e+06\n",
"categorie_age_plus_80 -0.4677 1.65e+06 -2.83e-07 1.000 -3.24e+06 3.24e+06\n",
"categorie_age_inconnue -0.7737 1.65e+06 -4.68e-07 1.000 -3.24e+06 3.24e+06\n",
"country_fr 0.7419 0.065 11.422 0.000 0.615 0.869\n",
"is_profession_known -0.5947 0.066 -9.074 0.000 -0.723 -0.466\n",
"is_zipcode_known 1.1374 0.027 41.609 0.000 1.084 1.191\n",
"opt_in -1.0658 0.030 -35.485 0.000 -1.125 -1.007\n",
"target_optin 0.5946 0.034 17.361 0.000 0.527 0.662\n",
"target_newsletter -1.0237 0.035 -29.411 0.000 -1.092 -0.955\n",
"target_scolaire 0.0428 0.036 1.188 0.235 -0.028 0.113\n",
"target_entreprise -0.2645 0.058 -4.589 0.000 -0.377 -0.152\n",
"target_famille 0.5035 0.035 14.548 0.000 0.436 0.571\n",
"target_jeune -0.6795 0.029 -23.590 0.000 -0.736 -0.623\n",
"target_abonne 0.0677 0.037 1.833 0.067 -0.005 0.140\n",
"===========================================================================================\n"
]
}
],
"source": [
"model_logit = sm.Logit(y_train, X_train_scaled)\n",
"\n",
"result = model_logit.fit(weights=weights)\n",
"\n",
"print(result.summary())"
]
},
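{
"cell_type": "code",
"execution_count": null,
"id": "c2d3e4f5-sketch-weighted-glm",
"metadata": {},
"outputs": [],
"source": [
"# Sketch (assumption, not in the original notebook): the FutureWarning above shows that\n",
"# sm.Logit.fit() ignores the `weights` keyword, so the summary is effectively unweighted.\n",
"# A weighted fit could go through a binomial GLM with frequency weights instead:\n",
"# result_w = sm.GLM(y_train, X_train_scaled, family=sm.families.Binomial(), freq_weights=weights).fit()\n",
"# print(result_w.summary())"
]
},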
{
"cell_type": "code",
"execution_count": null,
"id": "75dc92c7-cc1e-40f1-bc74-0b04043b7e44",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

@@ -1,825 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "aa74dbe0-f974-4b5c-94f4-4dba9fbc64fa",
"metadata": {},
"source": [
"# Business Data Challenge - Team 1"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "94c498e7-7c50-45f9-b3f4-a1ab19b7ccc4",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"\n"
]
},
{
"cell_type": "markdown",
"id": "7a3b50ac-b1ff-4f3d-9938-e048fdc8e027",
"metadata": {},
"source": [
"Configuration de l'accès aux données"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "0b029d42-fb02-481e-a407-7e41886198a6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['bdc2324-data/1',\n",
" 'bdc2324-data/10',\n",
" 'bdc2324-data/101',\n",
" 'bdc2324-data/11',\n",
" 'bdc2324-data/12',\n",
" 'bdc2324-data/13',\n",
" 'bdc2324-data/14',\n",
" 'bdc2324-data/2',\n",
" 'bdc2324-data/3',\n",
" 'bdc2324-data/4',\n",
" 'bdc2324-data/5',\n",
" 'bdc2324-data/6',\n",
" 'bdc2324-data/7',\n",
" 'bdc2324-data/8',\n",
" 'bdc2324-data/9']"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import os\n",
"import s3fs\n",
"# Create filesystem object\n",
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n",
"\n",
"BUCKET = \"bdc2324-data\"\n",
"fs.ls(BUCKET)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "fbaf9aa7-ff70-4dbe-a969-b801c593510b",
"metadata": {},
"outputs": [],
"source": [
"# Chargement des fichiers campaign_stats.csv\n",
"FILE_PATH_S3 = 'bdc2324-data/1/1campaign_stats.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" campaign_stats_1 = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"FILE_PATH_S3 = 'bdc2324-data/2/2campaign_stats.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" campaign_stats_2 = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"FILE_PATH_S3 = 'bdc2324-data/3/3campaign_stats.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" campaign_stats_3 = pd.read_csv(file_in, sep=\",\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "1e0418bc-8e97-4a04-b7f3-bda3bef7d36e",
"metadata": {},
"outputs": [],
"source": [
"# Conversion des dates 'sent_at'\n",
"campaign_stats_1['sent_at'] = pd.to_datetime(campaign_stats_1['sent_at'], format = 'ISO8601', utc = True)\n",
"campaign_stats_2['sent_at'] = pd.to_datetime(campaign_stats_2['sent_at'], format = 'ISO8601', utc = True)\n",
"campaign_stats_3['sent_at'] = pd.to_datetime(campaign_stats_3['sent_at'], format = 'ISO8601', utc = True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "cc5c20ba-e827-4e5a-97a5-7f3947e0621c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2023-11-09 18:10:45+00:00\n",
"2020-06-02 08:24:08+00:00\n",
"2023-10-12 01:39:48+00:00\n",
"2023-10-10 17:06:29+00:00\n",
"2023-11-01 09:20:48+00:00\n",
"2021-03-31 14:59:02+00:00\n"
]
}
],
"source": [
"# Chaque unites correspond à une période ? --> Non, les dossiers ont juste pour but de réduire la taille des fichiers\n",
"print(campaign_stats_1['sent_at'].max())\n",
"print(campaign_stats_1['sent_at'].min())\n",
"\n",
"print(campaign_stats_2['sent_at'].max())\n",
"print(campaign_stats_2['sent_at'].min())\n",
"\n",
"print(campaign_stats_3['sent_at'].max())\n",
"print(campaign_stats_3['sent_at'].min())"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "c75632df-b018-4bb8-a99d-83f15af94369",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 2021-03-28 16:01:09+00:00\n",
"1 2021-03-28 16:01:09+00:00\n",
"2 2021-03-28 16:00:59+00:00\n",
"3 2021-03-28 16:00:59+00:00\n",
"4 2021-03-28 16:01:06+00:00\n",
" ... \n",
"6214803 2023-10-23 09:32:33+00:00\n",
"6214804 2023-10-23 09:32:49+00:00\n",
"6214805 2023-10-23 09:33:28+00:00\n",
"6214806 2023-10-23 09:31:53+00:00\n",
"6214807 2023-10-23 09:33:54+00:00\n",
"Name: sent_at, Length: 6214808, dtype: datetime64[ns, UTC]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"campaign_stats_1['sent_at']"
]
},
{
"cell_type": "markdown",
"id": "f4c0c63e-0418-4cfe-a57d-7af57bca0c22",
"metadata": {},
"source": [
"### Customersplus.csv"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "d3bf880d-1065-4d5b-9954-1830aa5081af",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_1362/4118060109.py:9: DtypeWarning: Columns (20) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" customers_plus_2 = pd.read_csv(file_in, sep=\",\")\n"
]
}
],
"source": [
"FILE_PATH_S3 = 'bdc2324-data/1/1customersplus.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" customers_plus_1 = pd.read_csv(file_in, sep=\",\")\n",
"\n",
"FILE_PATH_S3 = 'bdc2324-data/2/2customersplus.csv'\n",
"\n",
"with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" customers_plus_2 = pd.read_csv(file_in, sep=\",\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "7368f381-db8e-4a4d-9fe2-5947eb55be58",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['id', 'lastname', 'firstname', 'birthdate', 'email', 'street_id',\n",
" 'created_at', 'updated_at', 'civility', 'is_partner', 'extra',\n",
" 'deleted_at', 'reference', 'gender', 'is_email_true', 'extra_field',\n",
" 'identifier', 'opt_in', 'structure_id', 'note', 'profession',\n",
" 'language', 'mcp_contact_id', 'need_reload', 'last_buying_date',\n",
" 'max_price', 'ticket_sum', 'average_price', 'fidelity',\n",
" 'average_purchase_delay', 'average_price_basket',\n",
" 'average_ticket_basket', 'total_price', 'preferred_category',\n",
" 'preferred_supplier', 'preferred_formula', 'purchase_count',\n",
" 'first_buying_date', 'last_visiting_date', 'zipcode', 'country', 'age',\n",
" 'tenant_id'],\n",
" dtype='object')"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"customers_plus_1.columns"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "08091935-b159-47fa-806c-e1444f3b227e",
"metadata": {},
"outputs": [],
"source": [
"customers_plus_1.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9f8c8868-c1ac-4cee-af08-533d928f6764",
"metadata": {},
"outputs": [],
"source": [
"customers_plus_1['id'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bf95daf2-4852-4718-b474-207a1ebd8ac4",
"metadata": {},
"outputs": [],
"source": [
"customers_plus_2['id'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1425c385-3216-4e4f-ae8f-a121624721ba",
"metadata": {},
"outputs": [],
"source": [
"common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])"
]
},
{
"cell_type": "code",
"execution_count": 61,
"id": "92533026-e27c-4f1f-81ca-64eda32a34c0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])\n",
"# Exemple id commun = caractéristiques communes\n",
"print(customers_plus_2[customers_plus_2['id'] == list(common_id)[0]])\n",
"\n",
"print(customers_plus_1[customers_plus_1['id'] == list(common_id)[0]])"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "bf9ebc94-0ba6-443d-8e53-22477a6e79a7",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"id 0.000000\n",
"lastname 43.461341\n",
"firstname 44.995588\n",
"birthdate 96.419870\n",
"email 8.622075\n",
"street_id 0.000000\n",
"created_at 0.000000\n",
"updated_at 0.000000\n",
"civility 100.000000\n",
"is_partner 0.000000\n",
"extra 100.000000\n",
"deleted_at 100.000000\n",
"reference 100.000000\n",
"gender 0.000000\n",
"is_email_true 0.000000\n",
"extra_field 100.000000\n",
"identifier 0.000000\n",
"opt_in 0.000000\n",
"structure_id 88.072380\n",
"note 99.403421\n",
"profession 95.913503\n",
"language 99.280945\n",
"mcp_contact_id 34.876141\n",
"need_reload 0.000000\n",
"last_buying_date 51.653431\n",
"max_price 51.653431\n",
"ticket_sum 0.000000\n",
"average_price 8.639195\n",
"fidelity 0.000000\n",
"average_purchase_delay 51.653431\n",
"average_price_basket 51.653431\n",
"average_ticket_basket 51.653431\n",
"total_price 43.014236\n",
"preferred_category 100.000000\n",
"preferred_supplier 100.000000\n",
"preferred_formula 100.000000\n",
"purchase_count 0.000000\n",
"first_buying_date 51.653431\n",
"last_visiting_date 100.000000\n",
"zipcode 71.176564\n",
"country 5.459418\n",
"age 96.419870\n",
"tenant_id 0.000000\n",
"dtype: float64\n"
]
}
],
"source": [
"pd.DataFrame(customers_plus_1.isna().mean()*100)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "6d62e73f-3925-490f-9fd4-d0e838903cb2",
"metadata": {},
"outputs": [],
"source": [
"# Chargement de toutes les données\n",
"liste_base = ['customer_target_mappings', 'customersplus', 'target_types', 'tags', 'events', 'tickets', 'representations', 'purchases', 'products']\n",
"\n",
"for nom_base in liste_base:\n",
" FILE_PATH_S3 = 'bdc2324-data/11/11' + nom_base + '.csv'\n",
" with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" globals()[nom_base] = pd.read_csv(file_in, sep=\",\")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "12b24f1c-eb3e-45be-aaf3-b9273180caa3",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>lastname</th>\n",
" <th>firstname</th>\n",
" <th>birthdate</th>\n",
" <th>email</th>\n",
" <th>street_id</th>\n",
" <th>created_at</th>\n",
" <th>updated_at</th>\n",
" <th>civility</th>\n",
" <th>is_partner</th>\n",
" <th>...</th>\n",
" <th>tenant_id</th>\n",
" <th>id_x</th>\n",
" <th>customer_id</th>\n",
" <th>purchase_date</th>\n",
" <th>type_of</th>\n",
" <th>is_from_subscription</th>\n",
" <th>amount</th>\n",
" <th>is_full_price</th>\n",
" <th>start_date_time</th>\n",
" <th>event_name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>405082</td>\n",
" <td>lastname405082</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>6</td>\n",
" <td>2023-01-12 06:30:31.197484+01:00</td>\n",
" <td>2023-01-12 06:30:31.197484+01:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>992423</td>\n",
" <td>405082</td>\n",
" <td>2023-01-11 17:08:41+01:00</td>\n",
" <td>3</td>\n",
" <td>False</td>\n",
" <td>13.0</td>\n",
" <td>False</td>\n",
" <td>2023-02-06 20:00:00+01:00</td>\n",
" <td>zaide</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>405082</td>\n",
" <td>lastname405082</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>6</td>\n",
" <td>2023-01-12 06:30:31.197484+01:00</td>\n",
" <td>2023-01-12 06:30:31.197484+01:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>992423</td>\n",
" <td>405082</td>\n",
" <td>2023-01-11 17:08:41+01:00</td>\n",
" <td>3</td>\n",
" <td>False</td>\n",
" <td>13.0</td>\n",
" <td>False</td>\n",
" <td>2023-02-06 20:00:00+01:00</td>\n",
" <td>zaide</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>411168</td>\n",
" <td>lastname411168</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>6</td>\n",
" <td>2023-03-17 06:30:35.431967+01:00</td>\n",
" <td>2023-03-17 06:30:35.431967+01:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>1053934</td>\n",
" <td>411168</td>\n",
" <td>2023-03-16 16:23:10+01:00</td>\n",
" <td>3</td>\n",
" <td>False</td>\n",
" <td>62.0</td>\n",
" <td>False</td>\n",
" <td>2023-03-19 16:00:00+01:00</td>\n",
" <td>luisa miller</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>411168</td>\n",
" <td>lastname411168</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>6</td>\n",
" <td>2023-03-17 06:30:35.431967+01:00</td>\n",
" <td>2023-03-17 06:30:35.431967+01:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>1053934</td>\n",
" <td>411168</td>\n",
" <td>2023-03-16 16:23:10+01:00</td>\n",
" <td>3</td>\n",
" <td>False</td>\n",
" <td>62.0</td>\n",
" <td>False</td>\n",
" <td>2023-03-19 16:00:00+01:00</td>\n",
" <td>luisa miller</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4380</td>\n",
" <td>lastname4380</td>\n",
" <td>firstname4380</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1</td>\n",
" <td>2021-04-22 14:51:55.432952+02:00</td>\n",
" <td>2022-04-14 11:41:33.738500+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>1189141</td>\n",
" <td>4380</td>\n",
" <td>2020-11-26 13:12:53+01:00</td>\n",
" <td>3</td>\n",
" <td>False</td>\n",
" <td>51.3</td>\n",
" <td>False</td>\n",
" <td>2020-12-01 20:00:00+01:00</td>\n",
" <td>iphigenie en tauride</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318964</th>\n",
" <td>19095</td>\n",
" <td>lastname19095</td>\n",
" <td>firstname19095</td>\n",
" <td>1979-07-16</td>\n",
" <td>email19095</td>\n",
" <td>6</td>\n",
" <td>2021-04-22 15:06:30.120537+02:00</td>\n",
" <td>2023-09-12 18:27:36.904104+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>1090839</td>\n",
" <td>19095</td>\n",
" <td>2019-05-19 21:18:36+02:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>4.5</td>\n",
" <td>False</td>\n",
" <td>2019-05-27 20:00:00+02:00</td>\n",
" <td>entre femmes</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318965</th>\n",
" <td>19095</td>\n",
" <td>lastname19095</td>\n",
" <td>firstname19095</td>\n",
" <td>1979-07-16</td>\n",
" <td>email19095</td>\n",
" <td>6</td>\n",
" <td>2021-04-22 15:06:30.120537+02:00</td>\n",
" <td>2023-09-12 18:27:36.904104+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>1090839</td>\n",
" <td>19095</td>\n",
" <td>2019-05-19 21:18:36+02:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>4.5</td>\n",
" <td>False</td>\n",
" <td>2019-05-27 20:00:00+02:00</td>\n",
" <td>entre femmes</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318966</th>\n",
" <td>19095</td>\n",
" <td>lastname19095</td>\n",
" <td>firstname19095</td>\n",
" <td>1979-07-16</td>\n",
" <td>email19095</td>\n",
" <td>6</td>\n",
" <td>2021-04-22 15:06:30.120537+02:00</td>\n",
" <td>2023-09-12 18:27:36.904104+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>1090839</td>\n",
" <td>19095</td>\n",
" <td>2019-05-19 21:18:36+02:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>4.5</td>\n",
" <td>False</td>\n",
" <td>2019-05-27 20:00:00+02:00</td>\n",
" <td>entre femmes</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318967</th>\n",
" <td>19095</td>\n",
" <td>lastname19095</td>\n",
" <td>firstname19095</td>\n",
" <td>1979-07-16</td>\n",
" <td>email19095</td>\n",
" <td>6</td>\n",
" <td>2021-04-22 15:06:30.120537+02:00</td>\n",
" <td>2023-09-12 18:27:36.904104+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>1244277</td>\n",
" <td>19095</td>\n",
" <td>2019-12-31 11:04:07+01:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>5.5</td>\n",
" <td>False</td>\n",
" <td>2020-02-03 20:00:00+01:00</td>\n",
" <td>a boire et a manger</td>\n",
" </tr>\n",
" <tr>\n",
" <th>318968</th>\n",
" <td>19095</td>\n",
" <td>lastname19095</td>\n",
" <td>firstname19095</td>\n",
" <td>1979-07-16</td>\n",
" <td>email19095</td>\n",
" <td>6</td>\n",
" <td>2021-04-22 15:06:30.120537+02:00</td>\n",
" <td>2023-09-12 18:27:36.904104+02:00</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>...</td>\n",
" <td>1556</td>\n",
" <td>1244277</td>\n",
" <td>19095</td>\n",
" <td>2019-12-31 11:04:07+01:00</td>\n",
" <td>1</td>\n",
" <td>False</td>\n",
" <td>5.5</td>\n",
" <td>False</td>\n",
" <td>2020-02-03 20:00:00+01:00</td>\n",
" <td>a boire et a manger</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>318969 rows × 52 columns</p>\n",
"</div>"
],
"text/plain": [
" id lastname firstname birthdate email \\\n",
"0 405082 lastname405082 NaN NaN NaN \n",
"1 405082 lastname405082 NaN NaN NaN \n",
"2 411168 lastname411168 NaN NaN NaN \n",
"3 411168 lastname411168 NaN NaN NaN \n",
"4 4380 lastname4380 firstname4380 NaN NaN \n",
"... ... ... ... ... ... \n",
"318964 19095 lastname19095 firstname19095 1979-07-16 email19095 \n",
"318965 19095 lastname19095 firstname19095 1979-07-16 email19095 \n",
"318966 19095 lastname19095 firstname19095 1979-07-16 email19095 \n",
"318967 19095 lastname19095 firstname19095 1979-07-16 email19095 \n",
"318968 19095 lastname19095 firstname19095 1979-07-16 email19095 \n",
"\n",
" street_id created_at \\\n",
"0 6 2023-01-12 06:30:31.197484+01:00 \n",
"1 6 2023-01-12 06:30:31.197484+01:00 \n",
"2 6 2023-03-17 06:30:35.431967+01:00 \n",
"3 6 2023-03-17 06:30:35.431967+01:00 \n",
"4 1 2021-04-22 14:51:55.432952+02:00 \n",
"... ... ... \n",
"318964 6 2021-04-22 15:06:30.120537+02:00 \n",
"318965 6 2021-04-22 15:06:30.120537+02:00 \n",
"318966 6 2021-04-22 15:06:30.120537+02:00 \n",
"318967 6 2021-04-22 15:06:30.120537+02:00 \n",
"318968 6 2021-04-22 15:06:30.120537+02:00 \n",
"\n",
" updated_at civility is_partner ... \\\n",
"0 2023-01-12 06:30:31.197484+01:00 NaN False ... \n",
"1 2023-01-12 06:30:31.197484+01:00 NaN False ... \n",
"2 2023-03-17 06:30:35.431967+01:00 NaN False ... \n",
"3 2023-03-17 06:30:35.431967+01:00 NaN False ... \n",
"4 2022-04-14 11:41:33.738500+02:00 NaN False ... \n",
"... ... ... ... ... \n",
"318964 2023-09-12 18:27:36.904104+02:00 NaN False ... \n",
"318965 2023-09-12 18:27:36.904104+02:00 NaN False ... \n",
"318966 2023-09-12 18:27:36.904104+02:00 NaN False ... \n",
"318967 2023-09-12 18:27:36.904104+02:00 NaN False ... \n",
"318968 2023-09-12 18:27:36.904104+02:00 NaN False ... \n",
"\n",
" tenant_id id_x customer_id purchase_date type_of \\\n",
"0 1556 992423 405082 2023-01-11 17:08:41+01:00 3 \n",
"1 1556 992423 405082 2023-01-11 17:08:41+01:00 3 \n",
"2 1556 1053934 411168 2023-03-16 16:23:10+01:00 3 \n",
"3 1556 1053934 411168 2023-03-16 16:23:10+01:00 3 \n",
"4 1556 1189141 4380 2020-11-26 13:12:53+01:00 3 \n",
"... ... ... ... ... ... \n",
"318964 1556 1090839 19095 2019-05-19 21:18:36+02:00 1 \n",
"318965 1556 1090839 19095 2019-05-19 21:18:36+02:00 1 \n",
"318966 1556 1090839 19095 2019-05-19 21:18:36+02:00 1 \n",
"318967 1556 1244277 19095 2019-12-31 11:04:07+01:00 1 \n",
"318968 1556 1244277 19095 2019-12-31 11:04:07+01:00 1 \n",
"\n",
" is_from_subscription amount is_full_price start_date_time \\\n",
"0 False 13.0 False 2023-02-06 20:00:00+01:00 \n",
"1 False 13.0 False 2023-02-06 20:00:00+01:00 \n",
"2 False 62.0 False 2023-03-19 16:00:00+01:00 \n",
"3 False 62.0 False 2023-03-19 16:00:00+01:00 \n",
"4 False 51.3 False 2020-12-01 20:00:00+01:00 \n",
"... ... ... ... ... \n",
"318964 False 4.5 False 2019-05-27 20:00:00+02:00 \n",
"318965 False 4.5 False 2019-05-27 20:00:00+02:00 \n",
"318966 False 4.5 False 2019-05-27 20:00:00+02:00 \n",
"318967 False 5.5 False 2020-02-03 20:00:00+01:00 \n",
"318968 False 5.5 False 2020-02-03 20:00:00+01:00 \n",
"\n",
" event_name \n",
"0 zaide \n",
"1 zaide \n",
"2 luisa miller \n",
"3 luisa miller \n",
"4 iphigenie en tauride \n",
"... ... \n",
"318964 entre femmes \n",
"318965 entre femmes \n",
"318966 entre femmes \n",
"318967 a boire et a manger \n",
"318968 a boire et a manger \n",
"\n",
"[318969 rows x 52 columns]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Jointure\n",
"merge_1 = pd.merge(purchases, tickets, left_on='id', right_on='purchase_id', how='inner')[['id_x', 'customer_id','product_id', 'purchase_date', 'type_of', 'is_from_subscription']]\n",
"merge_2 = pd.merge(products, merge_1, left_on='id', right_on='product_id', how='inner')[['id_x', 'customer_id', 'representation_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price']]\n",
"merge_3 = pd.merge(representations, merge_2, left_on='id', right_on='representation_id', how='inner')[['id_x', 'customer_id', 'event_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time']]\n",
"merge_4 = pd.merge(events, merge_3, left_on='id', right_on='event_id', how='inner')[['id_x', 'customer_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'name']]\n",
"merge_4 = merge_4.rename(columns={'name': 'event_name'})\n",
"df_customer_event = pd.merge(customersplus, merge_4, left_on = 'id', right_on = 'customer_id', how = 'inner')[['id_x', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'event_name']]\n",
"df_customer_event"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

@@ -1,460 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "bf34b03c-536f-4f93-93a5-e452552653aa",
"metadata": {},
"outputs": [
{
"name": "stdin",
"output_type": "stream",
"text": [
"Choisissez le type de compagnie : sport ? musique ? musee ? musique\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : projet-bdc2324-team1/0_Input/Company_10/products_purchased_reduced.csv\n",
"Couverture Company 10 : 2016-03-07 - 2023-09-25\n",
"File path : projet-bdc2324-team1/0_Input/Company_11/products_purchased_reduced.csv\n",
"Couverture Company 11 : 2015-06-26 - 2023-11-08\n",
"File path : projet-bdc2324-team1/0_Input/Company_12/products_purchased_reduced.csv\n",
"Couverture Company 12 : 2016-06-14 - 2023-11-08\n",
"File path : projet-bdc2324-team1/0_Input/Company_13/products_purchased_reduced.csv\n",
"Couverture Company 13 : 2010-07-31 - 2023-11-08\n",
"File path : projet-bdc2324-team1/0_Input/Company_14/products_purchased_reduced.csv\n",
"Couverture Company 14 : 1901-01-01 - 2023-11-08\n",
"File path : projet-bdc2324-team1/0_Input/Company_10/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_10/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_10/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset test : SUCCESS\n",
"File path : projet-bdc2324-team1/0_Input/Company_10/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_10/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_10/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset train : SUCCESS\n",
"File path : projet-bdc2324-team1/0_Input/Company_11/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_11/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_11/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset test : SUCCESS\n",
"File path : projet-bdc2324-team1/0_Input/Company_11/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_11/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_11/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset train : SUCCESS\n",
"File path : projet-bdc2324-team1/0_Input/Company_12/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_12/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_12/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset test : SUCCESS\n",
"File path : projet-bdc2324-team1/0_Input/Company_12/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_12/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_12/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset train : SUCCESS\n",
"File path : projet-bdc2324-team1/0_Input/Company_13/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_13/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_13/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset test : SUCCESS\n",
"File path : projet-bdc2324-team1/0_Input/Company_13/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_13/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_13/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset train : SUCCESS\n",
"File path : projet-bdc2324-team1/0_Input/Company_14/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_14/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_14/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset test : SUCCESS\n",
"File path : projet-bdc2324-team1/0_Input/Company_14/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_14/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_14/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset train : SUCCESS\n",
"FIN DE LA GENERATION DES DATASETS : SUCCESS\n"
]
}
],
"source": [
"# Business Data Challenge - Team 1\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"import os\n",
"import s3fs\n",
"import re\n",
"import warnings\n",
"from datetime import date, timedelta, datetime\n",
"\n",
"# Create filesystem object\n",
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n",
"\n",
"\n",
"# Import KPI construction functions\n",
"exec(open('0_KPI_functions.py').read())\n",
"\n",
"# Ignore warning\n",
"warnings.filterwarnings('ignore')\n",
"\n",
"\n",
"def display_covering_time(df, company, datecover):\n",
" \"\"\"\n",
" This function draws the time coverage of each company\n",
" \"\"\"\n",
" min_date = df['purchase_date'].min().strftime(\"%Y-%m-%d\")\n",
" max_date = df['purchase_date'].max().strftime(\"%Y-%m-%d\")\n",
" datecover[company] = [datetime.strptime(min_date, \"%Y-%m-%d\") + timedelta(days=x) for x in range((datetime.strptime(max_date, \"%Y-%m-%d\") - datetime.strptime(min_date, \"%Y-%m-%d\")).days)]\n",
" print(f'Couverture Company {company} : {min_date} - {max_date}')\n",
" return datecover\n",
"\n",
"\n",
"def compute_time_intersection(datecover):\n",
" \"\"\"\n",
" This function returns the time coverage for all companies\n",
" \"\"\"\n",
" timestamps_sets = [set(timestamps) for timestamps in datecover.values()]\n",
" intersection = set.intersection(*timestamps_sets)\n",
" intersection_list = list(intersection)\n",
" formated_dates = [dt.strftime(\"%Y-%m-%d\") for dt in intersection_list]\n",
" return sorted(formated_dates)\n",
"\n",
"\n",
"def df_coverage_modelization(sport, coverage_train = 0.7):\n",
" \"\"\"\n",
" This function returns start_date, end_of_features and final dates\n",
" that help to construct train and test datasets\n",
" \"\"\"\n",
" datecover = {}\n",
" for company in sport:\n",
" df_products_purchased_reduced = display_databases(company, file_name = \"products_purchased_reduced\",\n",
" datetime_col = ['purchase_date'])\n",
" datecover = display_covering_time(df_products_purchased_reduced, company, datecover)\n",
" #print(datecover.keys())\n",
" dt_coverage = compute_time_intersection(datecover)\n",
" start_date = dt_coverage[0]\n",
" end_of_features = dt_coverage[int(0.7 * len(dt_coverage))]\n",
" final_date = dt_coverage[-1]\n",
" return start_date, end_of_features, final_date\n",
" \n",
"\n",
"def dataset_construction(min_date, end_features_date, max_date, directory_path):\n",
" \n",
" # Import customerplus\n",
" df_customerplus_clean_0 = display_databases(directory_path, file_name = \"customerplus_cleaned\")\n",
" df_campaigns_information = display_databases(directory_path, file_name = \"campaigns_information\", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])\n",
" df_products_purchased_reduced = display_databases(directory_path, file_name = \"products_purchased_reduced\", datetime_col = ['purchase_date'])\n",
" \n",
" # Filtre de cohérence pour la mise en pratique de notre méthode\n",
" max_date = pd.to_datetime(max_date, utc = True, format = 'ISO8601') \n",
" end_features_date = pd.to_datetime(end_features_date, utc = True, format = 'ISO8601')\n",
" min_date = pd.to_datetime(min_date, utc = True, format = 'ISO8601')\n",
"\n",
" #Filtre de la base df_campaigns_information\n",
" df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] <= end_features_date) & (df_campaigns_information['sent_at'] >= min_date)]\n",
" df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')\n",
" \n",
" #Filtre de la base df_products_purchased_reduced\n",
" df_products_purchased_reduced = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]\n",
"\n",
" print(\"Data filtering : SUCCESS\")\n",
" \n",
" # Fusion de l'ensemble et creation des KPI\n",
"\n",
" # KPI sur les campagnes publicitaires\n",
" df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information) \n",
"\n",
" # KPI sur le comportement d'achat\n",
" df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced)\n",
"\n",
" # KPI sur les données socio-démographiques\n",
" df_customerplus_clean = customerplus_kpi_function(customerplus_clean = df_customerplus_clean_0)\n",
" \n",
" print(\"KPIs construction : SUCCESS\")\n",
" \n",
" # Fusion avec KPI liés au customer\n",
" df_customer = pd.merge(df_customerplus_clean, df_campaigns_kpi, on = 'customer_id', how = 'left')\n",
" \n",
" # Fill NaN values\n",
" df_customer[['nb_campaigns', 'nb_campaigns_opened']] = df_customer[['nb_campaigns', 'nb_campaigns_opened']].fillna(0)\n",
" \n",
" # Fusion avec KPI liés au comportement d'achat\n",
" df_customer_product = pd.merge(df_tickets_kpi, df_customer, on = 'customer_id', how = 'outer')\n",
" \n",
" # Fill NaN values\n",
" df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0)\n",
"\n",
" print(\"Explanatory variable construction : SUCCESS\")\n",
"\n",
" # 2. Construction of the explained variable \n",
" df_products_purchased_to_predict = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= max_date) & (df_products_purchased_reduced['purchase_date'] > end_features_date)]\n",
"\n",
" # Indicatrice d'achat\n",
" df_products_purchased_to_predict['y_has_purchased'] = 1\n",
"\n",
" y = df_products_purchased_to_predict[['customer_id', 'y_has_purchased']].drop_duplicates()\n",
"\n",
" print(\"Explained variable construction : SUCCESS\")\n",
" \n",
" # 3. Merge between explained and explanatory variables\n",
" dataset = pd.merge(df_customer_product, y, on = ['customer_id'], how = 'left')\n",
"\n",
" # 0 if there is no purchase\n",
" dataset[['y_has_purchased']].fillna(0)\n",
"\n",
" # add id_company prefix to customer_id\n",
" dataset['customer_id'] = directory_path + '_' + dataset['customer_id'].astype('str')\n",
" \n",
" return dataset\n",
"\n",
"## Exportation\n",
"\n",
"companies = {'musee' : ['1', '2', '3', '4', '101'],\n",
" 'sport': ['5', '6', '7', '8', '9'],\n",
" 'musique' : ['10', '11', '12', '13', '14']}\n",
"\n",
"type_of_comp = input('Choisissez le type de compagnie : sport ? musique ? musee ?')\n",
"list_of_comp = companies[type_of_comp] \n",
"# Dossier d'exportation\n",
"BUCKET_OUT = f'projet-bdc2324-team1/Generalization/{type_of_comp}'\n",
"\n",
"# Create test dataset and train dataset for sport companies\n",
"\n",
"start_date, end_of_features, final_date = df_coverage_modelization(list_of_comp, coverage_train = 0.7)\n",
"\n",
"for company in list_of_comp:\n",
" dataset_test = dataset_construction(min_date = start_date, end_features_date = end_of_features,\n",
" max_date = final_date, directory_path = company) \n",
"\n",
" # Exportation\n",
" FILE_KEY_OUT_S3 = \"dataset_test\" + company + \".csv\"\n",
" FILE_PATH_OUT_S3 = BUCKET_OUT + \"/Test_set/\" + FILE_KEY_OUT_S3\n",
" \n",
" with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n",
" dataset_test.to_csv(file_out, index = False)\n",
" \n",
" print(\"Exportation dataset test : SUCCESS\")\n",
"\n",
"# Dataset train\n",
" dataset_train = dataset_construction(min_date = start_date, end_features_date = end_of_features,\n",
" max_date = final_date, directory_path = company)\n",
" # Export\n",
" FILE_KEY_OUT_S3 = \"dataset_train\" + company + \".csv\" \n",
" FILE_PATH_OUT_S3 = BUCKET_OUT + \"/Train_test/\" + FILE_KEY_OUT_S3\n",
" \n",
" with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n",
" dataset_train.to_csv(file_out, index = False)\n",
" \n",
" print(\"Exportation dataset train : SUCCESS\")\n",
"\n",
"\n",
"print(\"FIN DE LA GENERATION DES DATASETS : SUCCESS\")\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "3721427e-5957-4556-b278-2e7ffca892f4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'projet-bdc2324-team1/Generalization/musique/Train_test/dataset_train14.csv'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"FILE_PATH_OUT_S3"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "f8546992-f425-4d1e-ad75-ad26a8052a18",
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'projet' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[10], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mprojet\u001b[49m\u001b[38;5;241m-\u001b[39mbdc2324\u001b[38;5;241m-\u001b[39mteam1\u001b[38;5;241m/\u001b[39mGeneralization\u001b[38;5;241m/\u001b[39mmusique\u001b[38;5;241m/\u001b[39mTrain_test\n",
"\u001b[0;31mNameError\u001b[0m: name 'projet' is not defined"
]
}
],
"source": [
"projet-bdc2324-team1/Generalization/musique/Train_test"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "0dd34710-6da2-4438-9e1d-0ac092c1d28c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(343126, 41)"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset_train.shape"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "a3bfeeb6-2db0-4f1d-866c-8721343e97c5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"customer_id 0.000000\n",
"nb_tickets 0.000000\n",
"nb_purchases 0.000000\n",
"total_amount 0.000000\n",
"nb_suppliers 0.000000\n",
"vente_internet_max 0.000000\n",
"purchase_date_min 0.858950\n",
"purchase_date_max 0.858950\n",
"time_between_purchase 0.858950\n",
"nb_tickets_internet 0.000000\n",
"street_id 0.000000\n",
"structure_id 0.869838\n",
"mcp_contact_id 0.276677\n",
"fidelity 0.000000\n",
"tenant_id 0.000000\n",
"is_partner 0.000000\n",
"deleted_at 1.000000\n",
"gender 0.000000\n",
"is_email_true 0.000000\n",
"opt_in 0.000000\n",
"last_buying_date 0.709626\n",
"max_price 0.709626\n",
"ticket_sum 0.000000\n",
"average_price 0.709626\n",
"average_purchase_delay 0.709731\n",
"average_price_basket 0.709731\n",
"average_ticket_basket 0.709731\n",
"total_price 0.000000\n",
"purchase_count 0.000000\n",
"first_buying_date 0.709626\n",
"country 0.152090\n",
"gender_label 0.000000\n",
"gender_female 0.000000\n",
"gender_male 0.000000\n",
"gender_other 0.000000\n",
"country_fr 0.152090\n",
"has_tags 0.000000\n",
"nb_campaigns 0.000000\n",
"nb_campaigns_opened 0.000000\n",
"time_to_open 0.848079\n",
"y_has_purchased 1.000000\n",
"dtype: float64"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
" dataset_train.isna().sum()/dataset_train.shape[0]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "75f9a672-641f-49a2-a8d6-7673845506f5",
"metadata": {},
"outputs": [],
"source": [
"#Creation de la variable dependante fictive: 1 si l'individu a effectué un achat au cours de la periode de train et 0 sinon\n",
"\n",
"dataset_train_modif=dataset_train\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c121c1e2-d8e4-4b93-a882-9385581b63c9",
"metadata": {},
"outputs": [],
"source": [
"dataset_train_modif[\""
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because it is too large