base_test_train

This commit is contained in:
Fanta RODRIGUE 2024-03-04 23:36:48 +00:00
parent 71a5cb2a3e
commit 66754f957e
2 changed files with 1563 additions and 199 deletions

File diff suppressed because it is too large Load Diff

814
code_base_train_test.ipynb Normal file
View File

@ -0,0 +1,814 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "bf34b03c-536f-4f93-93a5-e452552653aa",
"metadata": {},
"outputs": [
{
"name": "stdin",
"output_type": "stream",
"text": [
"Choisissez le type de compagnie : sport ? musique ? musee ? musique\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : projet-bdc2324-team1/0_Input/Company_10/products_purchased_reduced.csv\n",
"Couverture Company 10 : 2016-03-07 - 2023-09-25\n",
"File path : projet-bdc2324-team1/0_Input/Company_11/products_purchased_reduced.csv\n",
"Couverture Company 11 : 2015-06-26 - 2023-11-08\n",
"File path : projet-bdc2324-team1/0_Input/Company_12/products_purchased_reduced.csv\n",
"Couverture Company 12 : 2016-06-14 - 2023-11-08\n",
"File path : projet-bdc2324-team1/0_Input/Company_13/products_purchased_reduced.csv\n",
"Couverture Company 13 : 2010-07-31 - 2023-11-08\n",
"File path : projet-bdc2324-team1/0_Input/Company_14/products_purchased_reduced.csv\n",
"Couverture Company 14 : 1901-01-01 - 2023-11-08\n",
"File path : projet-bdc2324-team1/0_Input/Company_10/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_10/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_10/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset test : SUCCESS\n",
"File path : projet-bdc2324-team1/0_Input/Company_10/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_10/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_10/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset train : SUCCESS\n",
"File path : projet-bdc2324-team1/0_Input/Company_11/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_11/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_11/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset test : SUCCESS\n",
"File path : projet-bdc2324-team1/0_Input/Company_11/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_11/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_11/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset train : SUCCESS\n",
"File path : projet-bdc2324-team1/0_Input/Company_12/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_12/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_12/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset test : SUCCESS\n",
"File path : projet-bdc2324-team1/0_Input/Company_12/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_12/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_12/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset train : SUCCESS\n",
"File path : projet-bdc2324-team1/0_Input/Company_13/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_13/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_13/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset test : SUCCESS\n",
"File path : projet-bdc2324-team1/0_Input/Company_13/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_13/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_13/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset train : SUCCESS\n",
"File path : projet-bdc2324-team1/0_Input/Company_14/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_14/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_14/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset test : SUCCESS\n",
"File path : projet-bdc2324-team1/0_Input/Company_14/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_14/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_14/products_purchased_reduced.csv\n",
"Data filtering : SUCCESS\n",
"KPIs construction : SUCCESS\n",
"Explanatory variable construction : SUCCESS\n",
"Explained variable construction : SUCCESS\n",
"Exportation dataset train : SUCCESS\n",
"FIN DE LA GENERATION DES DATASETS : SUCCESS\n"
]
}
],
"source": [
"# Business Data Challenge - Team 1\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"import os\n",
"import s3fs\n",
"import re\n",
"import warnings\n",
"from datetime import date, timedelta, datetime\n",
"\n",
"# Create filesystem object\n",
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n",
"\n",
"\n",
"# Import KPI construction functions\n",
"# NOTE(review): exec() pulls display_databases and the *_kpi_function helpers\n",
"# into the global namespace; a regular module import would make these\n",
"# dependencies explicit and traceable.\n",
"exec(open('0_KPI_functions.py').read())\n",
"\n",
"# Ignore warnings\n",
"# NOTE(review): this also silences pandas SettingWithCopyWarning, which hides\n",
"# the chained-assignment issues further down in this cell.\n",
"warnings.filterwarnings('ignore')\n",
"\n",
"def display_covering_time(df, company, datecover):\n",
"    \"\"\"\n",
"    Record the daily purchase-date coverage of one company.\n",
"\n",
"    Adds to `datecover` (dict: company -> list of datetime, one per covered\n",
"    day) the days from the company's first purchase date up to its last\n",
"    purchase date, prints the covered range, and returns the updated dict.\n",
"    \"\"\"\n",
"    min_date = df['purchase_date'].min().strftime(\"%Y-%m-%d\")\n",
"    max_date = df['purchase_date'].max().strftime(\"%Y-%m-%d\")\n",
"    # NOTE(review): range(...days) stops one day short, so max_date itself is\n",
"    # never part of the coverage list — confirm this exclusive end is intended.\n",
"    datecover[company] = [datetime.strptime(min_date, \"%Y-%m-%d\") + timedelta(days=x) for x in range((datetime.strptime(max_date, \"%Y-%m-%d\") - datetime.strptime(min_date, \"%Y-%m-%d\")).days)]\n",
"    print(f'Couverture Company {company} : {min_date} - {max_date}')\n",
"    return datecover\n",
"\n",
"\n",
"def compute_time_intersection(datecover):\n",
"    \"\"\"\n",
"    Return the sorted 'YYYY-MM-DD' strings common to every company's daily\n",
"    coverage in `datecover` (dict: company -> list of datetime).\n",
"    \"\"\"\n",
"    # Intersect the per-company day sets, then format and sort in one pass.\n",
"    common_days = set.intersection(*(set(days) for days in datecover.values()))\n",
"    return sorted(day.strftime(\"%Y-%m-%d\") for day in common_days)\n",
"\n",
"\n",
"def df_coverage_modelization(sport, coverage_train = 0.7):\n",
"    \"\"\"\n",
"    Compute the common coverage dates used to build train/test datasets.\n",
"\n",
"    Parameters: `sport` is the list of company ids; `coverage_train` is the\n",
"    fraction of the common coverage assigned to the feature window.\n",
"    Returns (start_date, end_of_features, final_date) as 'YYYY-MM-DD' strings.\n",
"    \"\"\"\n",
"    datecover = {}\n",
"    for company in sport:\n",
"        df_products_purchased_reduced = display_databases(company, file_name = \"products_purchased_reduced\",\n",
"                                                          datetime_col = ['purchase_date'])\n",
"        datecover = display_covering_time(df_products_purchased_reduced, company, datecover)\n",
"    dt_coverage = compute_time_intersection(datecover)\n",
"    start_date = dt_coverage[0]\n",
"    # FIX: use the coverage_train parameter — it was previously ignored in\n",
"    # favour of a hard-coded 0.7, so callers could not change the split.\n",
"    end_of_features = dt_coverage[int(coverage_train * len(dt_coverage))]\n",
"    final_date = dt_coverage[-1]\n",
"    return start_date, end_of_features, final_date\n",
"    \n",
"\n",
"def dataset_construction(min_date, end_features_date, max_date, directory_path):\n",
"    \"\"\"\n",
"    Build the modelling dataset of one company.\n",
"\n",
"    Explanatory variables (socio-demographic, campaign and purchase KPIs) are\n",
"    computed on [min_date, end_features_date]; the explained variable\n",
"    y_has_purchased is 1 when the customer purchased in\n",
"    (end_features_date, max_date], else 0. customer_id is prefixed with the\n",
"    company id so ids stay unique across companies.\n",
"    \"\"\"\n",
"    # Import raw company databases\n",
"    df_customerplus_clean_0 = display_databases(directory_path, file_name = \"customerplus_cleaned\")\n",
"    df_campaigns_information = display_databases(directory_path, file_name = \"campaigns_information\", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])\n",
"    df_products_purchased_reduced = display_databases(directory_path, file_name = \"products_purchased_reduced\", datetime_col = ['purchase_date'])\n",
"\n",
"    # Normalise the window bounds to tz-aware timestamps\n",
"    max_date = pd.to_datetime(max_date, utc = True, format = 'ISO8601')\n",
"    end_features_date = pd.to_datetime(end_features_date, utc = True, format = 'ISO8601')\n",
"    min_date = pd.to_datetime(min_date, utc = True, format = 'ISO8601')\n",
"\n",
"    # Campaigns sent inside the feature window\n",
"    df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] <= end_features_date) & (df_campaigns_information['sent_at'] >= min_date)].copy()\n",
"    # FIX: .loc assignment instead of chained indexing, whose write is\n",
"    # unreliable (SettingWithCopyWarning, hidden by the warnings filter)\n",
"    df_campaigns_information.loc[df_campaigns_information['opened_at'] >= end_features_date, 'opened_at'] = np.datetime64('NaT')\n",
"\n",
"    # FIX: filter feature-window purchases into a NEW frame. The old code\n",
"    # overwrote df_products_purchased_reduced with the <= end_features_date\n",
"    # subset, so the target selection '> end_features_date' below was always\n",
"    # empty and y_has_purchased came out 100% NaN (visible in this notebook's\n",
"    # own .isna() check).\n",
"    df_products_features = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]\n",
"\n",
"    print(\"Data filtering : SUCCESS\")\n",
"\n",
"    # KPI construction: ad campaigns, purchase behaviour, socio-demographics\n",
"    df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information)\n",
"    df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_features)\n",
"    df_customerplus_clean = customerplus_kpi_function(customerplus_clean = df_customerplus_clean_0)\n",
"\n",
"    print(\"KPIs construction : SUCCESS\")\n",
"\n",
"    # Merge customer-level KPIs and fill campaign counts for customers\n",
"    # who received no campaign\n",
"    df_customer = pd.merge(df_customerplus_clean, df_campaigns_kpi, on = 'customer_id', how = 'left')\n",
"    df_customer[['nb_campaigns', 'nb_campaigns_opened']] = df_customer[['nb_campaigns', 'nb_campaigns_opened']].fillna(0)\n",
"\n",
"    # Merge purchase-behaviour KPIs and zero-fill for non-buyers\n",
"    df_customer_product = pd.merge(df_tickets_kpi, df_customer, on = 'customer_id', how = 'outer')\n",
"    df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0)\n",
"\n",
"    print(\"Explanatory variable construction : SUCCESS\")\n",
"\n",
"    # 2. Explained variable: purchases inside the prediction window\n",
"    # (.copy() so the indicator assignment below writes to an owned frame)\n",
"    df_products_purchased_to_predict = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= max_date) & (df_products_purchased_reduced['purchase_date'] > end_features_date)].copy()\n",
"    df_products_purchased_to_predict['y_has_purchased'] = 1\n",
"    y = df_products_purchased_to_predict[['customer_id', 'y_has_purchased']].drop_duplicates()\n",
"\n",
"    print(\"Explained variable construction : SUCCESS\")\n",
"\n",
"    # 3. Merge explained and explanatory variables\n",
"    dataset = pd.merge(df_customer_product, y, on = ['customer_id'], how = 'left')\n",
"\n",
"    # FIX: assign the result — the previous bare dataset[['y_has_purchased']].fillna(0)\n",
"    # discarded its return value, leaving NaN instead of 0 for non-buyers\n",
"    dataset['y_has_purchased'] = dataset['y_has_purchased'].fillna(0)\n",
"\n",
"    # Prefix customer_id with the company id\n",
"    dataset['customer_id'] = directory_path + '_' + dataset['customer_id'].astype('str')\n",
"\n",
"    return dataset\n",
"\n",
"## Exportation\n",
"\n",
"# Company ids grouped by business type\n",
"companies = {'musee' : ['1', '2', '3', '4', '101'],\n",
"             'sport': ['5', '6', '7', '8', '9'],\n",
"             'musique' : ['10', '11', '12', '13', '14']}\n",
"\n",
"# FIX: strip surrounding whitespace so a stray trailing space in the answer\n",
"# does not raise KeyError on the dict lookup\n",
"type_of_comp = input('Choisissez le type de compagnie : sport ? musique ? musee ?').strip()\n",
"list_of_comp = companies[type_of_comp]\n",
"\n",
"# Export folder\n",
"BUCKET_OUT = f'projet-bdc2324-team1/Generalization/{type_of_comp}'\n",
"\n",
"# Common modelling dates shared by every company of the chosen type\n",
"start_date, end_of_features, final_date = df_coverage_modelization(list_of_comp, coverage_train = 0.7)\n",
"\n",
"for company in list_of_comp:\n",
"    # Test dataset\n",
"    dataset_test = dataset_construction(min_date = start_date, end_features_date = end_of_features,\n",
"                                        max_date = final_date, directory_path = company)\n",
"\n",
"    FILE_KEY_OUT_S3 = \"dataset_test\" + company + \".csv\"\n",
"    FILE_PATH_OUT_S3 = BUCKET_OUT + \"/Test_set/\" + FILE_KEY_OUT_S3\n",
"    with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n",
"        dataset_test.to_csv(file_out, index = False)\n",
"    print(\"Exportation dataset test : SUCCESS\")\n",
"\n",
"    # Train dataset\n",
"    # NOTE(review): this call is identical to the test one (same dates, same\n",
"    # company), so the exported train and test files currently contain the\n",
"    # same rows — confirm the intended train/test date split before modelling.\n",
"    dataset_train = dataset_construction(min_date = start_date, end_features_date = end_of_features,\n",
"                                         max_date = final_date, directory_path = company)\n",
"    FILE_KEY_OUT_S3 = \"dataset_train\" + company + \".csv\"\n",
"    # NOTE(review): folder name 'Train_test' looks like a typo for 'Train_set';\n",
"    # kept as-is because downstream readers already use this path.\n",
"    FILE_PATH_OUT_S3 = BUCKET_OUT + \"/Train_test/\" + FILE_KEY_OUT_S3\n",
"    with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n",
"        dataset_train.to_csv(file_out, index = False)\n",
"    print(\"Exportation dataset train : SUCCESS\")\n",
"\n",
"\n",
"print(\"FIN DE LA GENERATION DES DATASETS : SUCCESS\")\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "3721427e-5957-4556-b278-2e7ffca892f4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'projet-bdc2324-team1/Generalization/musique/Train_test/dataset_train14.csv'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"FILE_PATH_OUT_S3"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "a3bfeeb6-2db0-4f1d-866c-8721343e97c5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"customer_id 0.000000\n",
"nb_tickets 0.000000\n",
"nb_purchases 0.000000\n",
"total_amount 0.000000\n",
"nb_suppliers 0.000000\n",
"vente_internet_max 0.000000\n",
"purchase_date_min 0.858950\n",
"purchase_date_max 0.858950\n",
"time_between_purchase 0.858950\n",
"nb_tickets_internet 0.000000\n",
"street_id 0.000000\n",
"structure_id 0.869838\n",
"mcp_contact_id 0.276677\n",
"fidelity 0.000000\n",
"tenant_id 0.000000\n",
"is_partner 0.000000\n",
"deleted_at 1.000000\n",
"gender 0.000000\n",
"is_email_true 0.000000\n",
"opt_in 0.000000\n",
"last_buying_date 0.709626\n",
"max_price 0.709626\n",
"ticket_sum 0.000000\n",
"average_price 0.709626\n",
"average_purchase_delay 0.709731\n",
"average_price_basket 0.709731\n",
"average_ticket_basket 0.709731\n",
"total_price 0.000000\n",
"purchase_count 0.000000\n",
"first_buying_date 0.709626\n",
"country 0.152090\n",
"gender_label 0.000000\n",
"gender_female 0.000000\n",
"gender_male 0.000000\n",
"gender_other 0.000000\n",
"country_fr 0.152090\n",
"has_tags 0.000000\n",
"nb_campaigns 0.000000\n",
"nb_campaigns_opened 0.000000\n",
"time_to_open 0.848079\n",
"y_has_purchased 1.000000\n",
"dtype: float64"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
" dataset_train.isna().sum()/dataset_train.shape[0]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "a4c4e994-231b-4467-aa1b-0a5283c59dd5",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>customer_id</th>\n",
" <th>nb_tickets</th>\n",
" <th>nb_purchases</th>\n",
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>time_between_purchase</th>\n",
" <th>nb_tickets_internet</th>\n",
" <th>...</th>\n",
" <th>gender_label</th>\n",
" <th>gender_female</th>\n",
" <th>gender_male</th>\n",
" <th>gender_other</th>\n",
" <th>country_fr</th>\n",
" <th>has_tags</th>\n",
" <th>nb_campaigns</th>\n",
" <th>nb_campaigns_opened</th>\n",
" <th>time_to_open</th>\n",
" <th>y_has_purchased</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>14_1</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>70.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1464.938449</td>\n",
" <td>1464.938021</td>\n",
" <td>0.000428</td>\n",
" <td>2.0</td>\n",
" <td>...</td>\n",
" <td>other</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>9.0</td>\n",
" <td>1.0</td>\n",
" <td>0 days 00:36:13</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>14_2</td>\n",
" <td>7.0</td>\n",
" <td>3.0</td>\n",
" <td>145.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>1466.030116</td>\n",
" <td>365.335000</td>\n",
" <td>1100.695116</td>\n",
" <td>7.0</td>\n",
" <td>...</td>\n",
" <td>male</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>9.0</td>\n",
" <td>4.0</td>\n",
" <td>0 days 02:30:09.250000</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>14_3</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>70.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1476.907894</td>\n",
" <td>1476.907662</td>\n",
" <td>0.000231</td>\n",
" <td>2.0</td>\n",
" <td>...</td>\n",
" <td>female</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>6.0</td>\n",
" <td>1.0</td>\n",
" <td>0 days 20:58:45</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>14_4</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>32.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1465.907894</td>\n",
" <td>1465.907465</td>\n",
" <td>0.000428</td>\n",
" <td>2.0</td>\n",
" <td>...</td>\n",
" <td>male</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>6.0</td>\n",
" <td>0.0</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>14_5</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>70.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1465.373866</td>\n",
" <td>1465.373819</td>\n",
" <td>0.000046</td>\n",
" <td>2.0</td>\n",
" <td>...</td>\n",
" <td>female</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>7.0</td>\n",
" <td>0.0</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>343121</th>\n",
" <td>14_6884748</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>male</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>343122</th>\n",
" <td>14_6884749</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>male</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>343123</th>\n",
" <td>14_6884750</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>male</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>343124</th>\n",
" <td>14_6884751</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>female</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>343125</th>\n",
" <td>14_6884753</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>male</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>343126 rows × 41 columns</p>\n",
"</div>"
],
"text/plain": [
" customer_id nb_tickets nb_purchases total_amount nb_suppliers \\\n",
"0 14_1 2.0 2.0 70.0 1.0 \n",
"1 14_2 7.0 3.0 145.0 2.0 \n",
"2 14_3 2.0 2.0 70.0 1.0 \n",
"3 14_4 2.0 2.0 32.0 1.0 \n",
"4 14_5 2.0 2.0 70.0 1.0 \n",
"... ... ... ... ... ... \n",
"343121 14_6884748 0.0 0.0 0.0 0.0 \n",
"343122 14_6884749 0.0 0.0 0.0 0.0 \n",
"343123 14_6884750 0.0 0.0 0.0 0.0 \n",
"343124 14_6884751 0.0 0.0 0.0 0.0 \n",
"343125 14_6884753 0.0 0.0 0.0 0.0 \n",
"\n",
" vente_internet_max purchase_date_min purchase_date_max \\\n",
"0 1.0 1464.938449 1464.938021 \n",
"1 1.0 1466.030116 365.335000 \n",
"2 1.0 1476.907894 1476.907662 \n",
"3 1.0 1465.907894 1465.907465 \n",
"4 1.0 1465.373866 1465.373819 \n",
"... ... ... ... \n",
"343121 0.0 NaN NaN \n",
"343122 0.0 NaN NaN \n",
"343123 0.0 NaN NaN \n",
"343124 0.0 NaN NaN \n",
"343125 0.0 NaN NaN \n",
"\n",
" time_between_purchase nb_tickets_internet ... gender_label \\\n",
"0 0.000428 2.0 ... other \n",
"1 1100.695116 7.0 ... male \n",
"2 0.000231 2.0 ... female \n",
"3 0.000428 2.0 ... male \n",
"4 0.000046 2.0 ... female \n",
"... ... ... ... ... \n",
"343121 NaN 0.0 ... male \n",
"343122 NaN 0.0 ... male \n",
"343123 NaN 0.0 ... male \n",
"343124 NaN 0.0 ... female \n",
"343125 NaN 0.0 ... male \n",
"\n",
" gender_female gender_male gender_other country_fr has_tags \\\n",
"0 0 0 1 1.0 0 \n",
"1 0 1 0 1.0 1 \n",
"2 1 0 0 1.0 0 \n",
"3 0 1 0 1.0 0 \n",
"4 1 0 0 1.0 0 \n",
"... ... ... ... ... ... \n",
"343121 0 1 0 1.0 0 \n",
"343122 0 1 0 1.0 0 \n",
"343123 0 1 0 1.0 0 \n",
"343124 1 0 0 1.0 0 \n",
"343125 0 1 0 1.0 0 \n",
"\n",
" nb_campaigns nb_campaigns_opened time_to_open \\\n",
"0 9.0 1.0 0 days 00:36:13 \n",
"1 9.0 4.0 0 days 02:30:09.250000 \n",
"2 6.0 1.0 0 days 20:58:45 \n",
"3 6.0 0.0 NaT \n",
"4 7.0 0.0 NaT \n",
"... ... ... ... \n",
"343121 0.0 0.0 NaT \n",
"343122 0.0 0.0 NaT \n",
"343123 0.0 0.0 NaT \n",
"343124 0.0 0.0 NaT \n",
"343125 0.0 0.0 NaT \n",
"\n",
" y_has_purchased \n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
"... ... \n",
"343121 NaN \n",
"343122 NaN \n",
"343123 NaN \n",
"343124 NaN \n",
"343125 NaN \n",
"\n",
"[343126 rows x 41 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset_test"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "75f9a672-641f-49a2-a8d6-7673845506f5",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}