BDC-team-1/Sport/exploration_sport.ipynb
2024-03-06 12:42:55 +00:00

2297 lines
98 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "314bf34b-1f6d-4a99-8f82-aa71ebacdabc",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import os\n",
"import s3fs\n",
"import warnings\n",
"from datetime import date, timedelta, datetime\n",
"import numpy as np\n",
"\n",
"# Run the shared KPI script inside the notebook namespace.\n",
"# The context manager closes the script's file handle once it has been read.\n",
"with open('../0_KPI_functions.py') as kpi_script:\n",
"    exec(kpi_script.read())"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "a276822a-c389-429e-b249-8a9e47758bfc",
"metadata": {},
"outputs": [],
"source": [
"# Silence every warning for the rest of the session\n",
"warnings.filterwarnings(action='ignore')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "f62b996c-4e17-40ea-83ba-f0cb60be7671",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['bdc2324-data/1',\n",
" 'bdc2324-data/10',\n",
" 'bdc2324-data/101',\n",
" 'bdc2324-data/11',\n",
" 'bdc2324-data/12',\n",
" 'bdc2324-data/13',\n",
" 'bdc2324-data/14',\n",
" 'bdc2324-data/2',\n",
" 'bdc2324-data/3',\n",
" 'bdc2324-data/4',\n",
" 'bdc2324-data/5',\n",
" 'bdc2324-data/6',\n",
" 'bdc2324-data/7',\n",
" 'bdc2324-data/8',\n",
" 'bdc2324-data/9']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Build the S3 filesystem client from the endpoint given in the environment\n",
"S3_ENDPOINT_URL = f\"https://{os.environ['AWS_S3_ENDPOINT']}\"\n",
"fs = s3fs.S3FileSystem(client_kwargs=dict(endpoint_url=S3_ENDPOINT_URL))\n",
"\n",
"# List the content of the data bucket\n",
"BUCKET = \"bdc2324-data\"\n",
"fs.ls(BUCKET)"
]
},
{
"cell_type": "markdown",
"id": "2c829aa8-2006-4e72-889b-7096dd55718b",
"metadata": {},
"source": [
"## Look at the time coverage of each company and compute the period common to all of them"
]
},
{
"cell_type": "code",
"execution_count": 73,
"id": "e86864b7-4852-449a-8680-638559d56080",
"metadata": {},
"outputs": [],
"source": [
"# Identifiers of the sport companies (5 to 9), kept as strings\n",
"sport = [str(company_id) for company_id in range(5, 10)]"
]
},
{
"cell_type": "code",
"execution_count": 90,
"id": "7634ec57-4891-4684-8638-1e1643baca28",
"metadata": {},
"outputs": [],
"source": [
"def display_covering_time(df, company, datecover):\n",
"    \"\"\"\n",
"    Print the purchase-date coverage of a company and record it day by day.\n",
"\n",
"    df : DataFrame whose 'purchase_date' column holds datetime values.\n",
"    company : key under which the coverage is stored in datecover.\n",
"    datecover : dict company -> list of covered days, updated in place.\n",
"\n",
"    Returns the same datecover dict. datecover[company] holds one naive\n",
"    datetime (midnight) per calendar day, from the first to the last\n",
"    purchase date, both bounds included.\n",
"    \"\"\"\n",
"    min_date = df['purchase_date'].min().strftime(\"%Y-%m-%d\")\n",
"    max_date = df['purchase_date'].max().strftime(\"%Y-%m-%d\")\n",
"    # Parsing the formatted strings back truncates both bounds to calendar days\n",
"    first_day = datetime.strptime(min_date, \"%Y-%m-%d\")\n",
"    last_day = datetime.strptime(max_date, \"%Y-%m-%d\")\n",
"    # + 1 so that the last purchase day itself belongs to the coverage\n",
"    nb_days = (last_day - first_day).days + 1\n",
"    datecover[company] = [first_day + timedelta(days=offset) for offset in range(nb_days)]\n",
"    print(f'Couverture Company {company} : {min_date} - {max_date}')\n",
"    return datecover"
]
},
{
"cell_type": "code",
"execution_count": 91,
"id": "53c83f51-822c-4e05-8c7c-89aa327603c6",
"metadata": {},
"outputs": [],
"source": [
"def compute_time_intersection(datecover):\n",
"    \"\"\"\n",
"    Return the days present for every company, as sorted 'YYYY-MM-DD' strings.\n",
"\n",
"    datecover : dict company -> iterable of hashable objects exposing strftime\n",
"    (datetime values in this notebook).\n",
"    An empty dict yields an empty list.\n",
"    \"\"\"\n",
"    if not datecover:\n",
"        # set.intersection() needs at least one set to work with\n",
"        return []\n",
"    timestamps_sets = [set(timestamps) for timestamps in datecover.values()]\n",
"    common_days = set.intersection(*timestamps_sets)\n",
"    return sorted(dt.strftime(\"%Y-%m-%d\") for dt in common_days)"
]
},
{
"cell_type": "code",
"execution_count": 93,
"id": "eec152de-078e-44c4-ad6e-74ae6ba5c65a",
"metadata": {},
"outputs": [],
"source": [
"def df_coverage_modelization(sport, coverage_train = 0.7):\n",
"    \"\"\"\n",
"    This function returns start_date, end_of_features and final dates\n",
"    that help to construct train and test datasets.\n",
"\n",
"    sport : iterable of company identifiers whose purchases are loaded.\n",
"    coverage_train : share (between 0 and 1) of the common period located\n",
"    before end_of_features.\n",
"\n",
"    The three dates are 'YYYY-MM-DD' strings taken from the period covered\n",
"    by every company. Raises IndexError when the companies share no day.\n",
"    \"\"\"\n",
"    datecover = {}\n",
"    for company in sport:\n",
"        df_products_purchased_reduced = display_databases(company, file_name = \"products_purchased_reduced\",\n",
"                                                          datetime_col = ['purchase_date'])\n",
"        datecover = display_covering_time(df_products_purchased_reduced, company, datecover)\n",
"    dt_coverage = compute_time_intersection(datecover)\n",
"    if not dt_coverage:\n",
"        raise IndexError(\"No common purchase period between the companies\")\n",
"    # Use the coverage_train argument (the ratio was previously hard-coded to 0.7)\n",
"    # and keep the index inside the list, e.g. when coverage_train is 1\n",
"    split_index = int(coverage_train * len(dt_coverage))\n",
"    split_index = max(0, min(split_index, len(dt_coverage) - 1))\n",
"    start_date = dt_coverage[0]\n",
"    end_of_features = dt_coverage[split_index]\n",
"    final_date = dt_coverage[-1]\n",
"    return start_date, end_of_features, final_date"
]
},
{
"cell_type": "code",
"execution_count": 94,
"id": "348f246a-bc2d-4bbc-ba05-aa825da15a69",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : projet-bdc2324-team1/0_Input/Company_5/products_purchased_reduced.csv\n",
"Couverture Company 5 : 2019-04-15 - 2023-11-09\n",
"File path : projet-bdc2324-team1/0_Input/Company_6/products_purchased_reduced.csv\n",
"Couverture Company 6 : 2018-06-28 - 2023-11-08\n",
"File path : projet-bdc2324-team1/0_Input/Company_7/products_purchased_reduced.csv\n",
"Couverture Company 7 : 2015-02-10 - 2023-11-08\n",
"File path : projet-bdc2324-team1/0_Input/Company_8/products_purchased_reduced.csv\n",
"Couverture Company 8 : 2010-09-28 - 2023-11-08\n",
"File path : projet-bdc2324-team1/0_Input/Company_9/products_purchased_reduced.csv\n",
"Couverture Company 9 : 2014-09-22 - 2023-10-24\n",
"dict_keys(['5', '6', '7', '8', '9'])\n",
"2019-04-15 2022-06-15 2023-10-23\n"
]
}
],
"source": [
"start_date, end_of_features, final_date = df_coverage_modelization(sport, coverage_train = 0.7)\n",
"print(start_date, end_of_features, final_date )"
]
},
{
"cell_type": "markdown",
"id": "34ddc267-4daa-4926-9d54-5b13d4212eaa",
"metadata": {},
"source": [
"## Look at the databases common to all sport companies"
]
},
{
"cell_type": "code",
"execution_count": 101,
"id": "389387fa-2046-4811-b8dd-6d524e91fe2e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['bdc2324-data/5',\n",
" 'bdc2324-data/6',\n",
" 'bdc2324-data/7',\n",
" 'bdc2324-data/8',\n",
" 'bdc2324-data/9']"
]
},
"execution_count": 101,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Keep the bucket folders whose name is exactly one of the sport company ids\n",
"# (a suffix test would also select e.g. a folder '15' for the id '5')\n",
"companies = [company for company in fs.ls(BUCKET) if company.split('/')[-1] in sport]\n",
"companies"
]
},
{
"cell_type": "code",
"execution_count": 107,
"id": "895fc2b3-c768-454d-bedb-54994e4d211a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of databases : 30\n",
"Number of common databases : 23\n"
]
}
],
"source": [
"# For each company: names of its files, with the company id removed from them\n",
"companies_database = {\n",
"    company.split('/')[-1]: [\n",
"        file.split('/')[-1].replace(company.split('/')[-1], '') for file in fs.ls(company)\n",
"    ]\n",
"    for company in companies\n",
"}\n",
"\n",
"# Reference list: the company with the largest number of files\n",
"largest_company = max(companies_database, key=lambda x: len(companies_database[x]))\n",
"all_database = companies_database[largest_company]\n",
"print(\"Number of databases : \",len(all_database))\n",
"\n",
"# Keep only the names found for every company\n",
"data_in_common = set(all_database)\n",
"for database_names in companies_database.values():\n",
"    data_in_common &= set(database_names)\n",
"\n",
"print(\"Number of common databases : \",len(data_in_common))"
]
},
{
"cell_type": "code",
"execution_count": 121,
"id": "0c06517d-f5b7-4104-94fa-0e3f843c5881",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'campaign_stats.csv',\n",
" 'campaigns.csv',\n",
" 'categories.csv',\n",
" 'countries.csv',\n",
" 'currencies.csv',\n",
" 'customer_target_mappings.csv',\n",
" 'customersplus.csv',\n",
" 'event_types.csv',\n",
" 'events.csv',\n",
" 'facilities.csv',\n",
" 'link_stats.csv',\n",
" 'pricing_formulas.csv',\n",
" 'product_packs.csv',\n",
" 'products.csv',\n",
" 'products_groups.csv',\n",
" 'purchases.csv',\n",
" 'representation_category_capacities.csv',\n",
" 'representations.csv',\n",
" 'seasons.csv',\n",
" 'suppliers.csv',\n",
" 'target_types.csv',\n",
" 'targets.csv',\n",
" 'tickets.csv'}"
]
},
"execution_count": 121,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_in_common"
]
},
{
"cell_type": "markdown",
"id": "1af245aa-44a7-453b-90f9-0c4bcc415cd0",
"metadata": {},
"source": [
"## Investigate errors from data construction for company 6"
]
},
{
"cell_type": "code",
"execution_count": 108,
"id": "538a5ca2-a50d-4726-93eb-c2b0d0ab8400",
"metadata": {},
"outputs": [],
"source": [
"directory_path = '6'"
]
},
{
"cell_type": "code",
"execution_count": 143,
"id": "1ca3fb71-930a-441c-b35b-b98bca780606",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : projet-bdc2324-team1/0_Input/Company_6/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_6/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_6/products_purchased_reduced.csv\n"
]
}
],
"source": [
"df_customerplus_clean = display_databases(directory_path, file_name = \"customerplus_cleaned\")\n",
"df_campaigns_information = display_databases(directory_path, file_name = \"campaigns_information\", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])\n",
"df_products_purchased_reduced = display_databases(directory_path, file_name = \"products_purchased_reduced\", datetime_col = ['purchase_date'])"
]
},
{
"cell_type": "code",
"execution_count": 144,
"id": "2ad3052c-e9e6-4ef9-abe2-4b8b2306a2b9",
"metadata": {},
"outputs": [],
"source": [
"# Turn the three period bounds into UTC timestamps\n",
"min_date, end_features_date, max_date = (\n",
"    pd.to_datetime(day, utc = True, format = 'ISO8601')\n",
"    for day in (start_date, end_of_features, final_date)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 128,
"id": "146999f2-ab92-4b7c-8c57-2e3ac8c4dd88",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : projet-bdc2324-team1/0_Input/Company_6/campaigns_information.csv\n"
]
}
],
"source": [
"df_campaigns_information = display_databases(directory_path, file_name = \"campaigns_information\", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])"
]
},
{
"cell_type": "code",
"execution_count": 133,
"id": "7448a7b9-3edf-4177-9df2-a260ebbee45e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Timestamp('2022-06-15 00:00:00+0000', tz='UTC')"
]
},
"execution_count": 133,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"end_features_date"
]
},
{
"cell_type": "code",
"execution_count": 136,
"id": "d8e954ab-65d4-4f36-8410-69bf664773a7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Shape campaigns_information : (1333010, 8)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>customer_id</th>\n",
" <th>opened_at</th>\n",
" <th>sent_at</th>\n",
" <th>delivered_at</th>\n",
" <th>campaign_name</th>\n",
" <th>campaign_service_id</th>\n",
" <th>campaign_sent_at</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>38</td>\n",
" <td>NaT</td>\n",
" <td>2022-08-02 18:31:33+00:00</td>\n",
" <td>NaN</td>\n",
" <td>Adhérents non ré-engagés</td>\n",
" <td>15</td>\n",
" <td>2022-08-02 18:31:36+00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>26135</td>\n",
" <td>NaT</td>\n",
" <td>2022-08-02 18:31:34+00:00</td>\n",
" <td>NaN</td>\n",
" <td>Adhérents non ré-engagés</td>\n",
" <td>15</td>\n",
" <td>2022-08-02 18:31:36+00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>3876</td>\n",
" <td>NaT</td>\n",
" <td>2022-08-02 18:31:35+00:00</td>\n",
" <td>NaN</td>\n",
" <td>Adhérents non ré-engagés</td>\n",
" <td>15</td>\n",
" <td>2022-08-02 18:31:36+00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>26226</td>\n",
" <td>NaT</td>\n",
" <td>2022-08-02 18:31:35+00:00</td>\n",
" <td>NaN</td>\n",
" <td>Adhérents non ré-engagés</td>\n",
" <td>15</td>\n",
" <td>2022-08-02 18:31:36+00:00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>25349</td>\n",
" <td>NaT</td>\n",
" <td>2022-08-02 18:31:34+00:00</td>\n",
" <td>NaN</td>\n",
" <td>Adhérents non ré-engagés</td>\n",
" <td>15</td>\n",
" <td>2022-08-02 18:31:36+00:00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id customer_id opened_at sent_at delivered_at \\\n",
"0 1 38 NaT 2022-08-02 18:31:33+00:00 NaN \n",
"1 2 26135 NaT 2022-08-02 18:31:34+00:00 NaN \n",
"2 3 3876 NaT 2022-08-02 18:31:35+00:00 NaN \n",
"3 4 26226 NaT 2022-08-02 18:31:35+00:00 NaN \n",
"4 5 25349 NaT 2022-08-02 18:31:34+00:00 NaN \n",
"\n",
" campaign_name campaign_service_id campaign_sent_at \n",
"0 Adhérents non ré-engagés 15 2022-08-02 18:31:36+00:00 \n",
"1 Adhérents non ré-engagés 15 2022-08-02 18:31:36+00:00 \n",
"2 Adhérents non ré-engagés 15 2022-08-02 18:31:36+00:00 \n",
"3 Adhérents non ré-engagés 15 2022-08-02 18:31:36+00:00 \n",
"4 Adhérents non ré-engagés 15 2022-08-02 18:31:36+00:00 "
]
},
"execution_count": 136,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print(\"Shape campaigns_information : \", df_campaigns_information.shape)\n",
"df_campaigns_information.head()"
]
},
{
"cell_type": "code",
"execution_count": 134,
"id": "93eceaf1-ce4c-4dfa-9c51-4fd016d09fc5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Timestamp('2022-08-02 18:31:33+0000', tz='UTC')"
]
},
"execution_count": 134,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_campaigns_information['sent_at'].min()"
]
},
{
"cell_type": "code",
"execution_count": 137,
"id": "ea50cab4-1dae-4efe-ae3c-22b6f9ad1d26",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Timestamp('2023-11-07 10:08:16+0000', tz='UTC')"
]
},
"execution_count": 137,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_campaigns_information['sent_at'].max()"
]
},
{
"cell_type": "code",
"execution_count": 127,
"id": "dcb87bc9-caf5-4655-9cfa-4a3dad504bac",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>customer_id</th>\n",
" <th>opened_at</th>\n",
" <th>sent_at</th>\n",
" <th>delivered_at</th>\n",
" <th>campaign_name</th>\n",
" <th>campaign_service_id</th>\n",
" <th>campaign_sent_at</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [id, customer_id, opened_at, sent_at, delivered_at, campaign_name, campaign_service_id, campaign_sent_at]\n",
"Index: []"
]
},
"execution_count": 127,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Keep the campaigns sent between min_date and end_features_date (bounds included)\n",
"df_campaigns_information = df_campaigns_information[\n",
"    df_campaigns_information['sent_at'].between(min_date, end_features_date)\n",
"]\n",
"df_campaigns_information"
]
},
{
"cell_type": "code",
"execution_count": 145,
"id": "abe22e09-a041-4349-be8f-b0784f2f0a98",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ticket_id</th>\n",
" <th>customer_id</th>\n",
" <th>purchase_id</th>\n",
" <th>event_type_id</th>\n",
" <th>supplier_name</th>\n",
" <th>purchase_date</th>\n",
" <th>amount</th>\n",
" <th>is_full_price</th>\n",
" <th>name_event_types</th>\n",
" <th>name_facilities</th>\n",
" <th>name_categories</th>\n",
" <th>name_events</th>\n",
" <th>name_seasons</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>49</th>\n",
" <td>91401</td>\n",
" <td>108392</td>\n",
" <td>1259025.0</td>\n",
" <td>4</td>\n",
" <td>caisse</td>\n",
" <td>2022-02-27 13:44:10.690000+00:00</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" <td>ligue 1 uber eats</td>\n",
" <td>stade de l'aube</td>\n",
" <td>honneur basse</td>\n",
" <td>olympique de marseille</td>\n",
" <td>saison 2021-2022</td>\n",
" </tr>\n",
" <tr>\n",
" <th>117</th>\n",
" <td>535527</td>\n",
" <td>31304</td>\n",
" <td>136629.0</td>\n",
" <td>4</td>\n",
" <td>adhésion</td>\n",
" <td>2022-04-28 15:47:52.790000+00:00</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" <td>ligue 1 uber eats</td>\n",
" <td>stade de l'aube</td>\n",
" <td>honneur basse</td>\n",
" <td>ac ajaccio</td>\n",
" <td>saison 2022-2023</td>\n",
" </tr>\n",
" <tr>\n",
" <th>274</th>\n",
" <td>547400</td>\n",
" <td>192</td>\n",
" <td>140477.0</td>\n",
" <td>4</td>\n",
" <td>adhésion</td>\n",
" <td>2022-04-28 15:47:54.053000+00:00</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" <td>ligue 1 uber eats</td>\n",
" <td>stade de l'aube</td>\n",
" <td>honneur basse</td>\n",
" <td>rc strasbourg</td>\n",
" <td>saison 2022-2023</td>\n",
" </tr>\n",
" <tr>\n",
" <th>304</th>\n",
" <td>84413</td>\n",
" <td>31388</td>\n",
" <td>20259.0</td>\n",
" <td>4</td>\n",
" <td>adhésion</td>\n",
" <td>2021-08-03 13:45:01.603000+00:00</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" <td>ligue 1 uber eats</td>\n",
" <td>stade de l'aube</td>\n",
" <td>vitoux haute</td>\n",
" <td>olympique de marseille</td>\n",
" <td>saison 2021-2022</td>\n",
" </tr>\n",
" <tr>\n",
" <th>311</th>\n",
" <td>407271</td>\n",
" <td>3265</td>\n",
" <td>90527.0</td>\n",
" <td>4</td>\n",
" <td>web [adhésion]</td>\n",
" <td>2022-05-26 09:15:40.993000+00:00</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" <td>ligue 1 uber eats</td>\n",
" <td>stade de l'aube</td>\n",
" <td>champagne basse</td>\n",
" <td>stade brestois 29</td>\n",
" <td>saison 2022-2023</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ticket_id customer_id purchase_id event_type_id supplier_name \\\n",
"49 91401 108392 1259025.0 4 caisse \n",
"117 535527 31304 136629.0 4 adhésion \n",
"274 547400 192 140477.0 4 adhésion \n",
"304 84413 31388 20259.0 4 adhésion \n",
"311 407271 3265 90527.0 4 web [adhésion] \n",
"\n",
" purchase_date amount is_full_price \\\n",
"49 2022-02-27 13:44:10.690000+00:00 0.0 False \n",
"117 2022-04-28 15:47:52.790000+00:00 0.0 False \n",
"274 2022-04-28 15:47:54.053000+00:00 0.0 False \n",
"304 2021-08-03 13:45:01.603000+00:00 0.0 False \n",
"311 2022-05-26 09:15:40.993000+00:00 0.0 False \n",
"\n",
" name_event_types name_facilities name_categories \\\n",
"49 ligue 1 uber eats stade de l'aube honneur basse \n",
"117 ligue 1 uber eats stade de l'aube honneur basse \n",
"274 ligue 1 uber eats stade de l'aube honneur basse \n",
"304 ligue 1 uber eats stade de l'aube vitoux haute \n",
"311 ligue 1 uber eats stade de l'aube champagne basse \n",
"\n",
" name_events name_seasons \n",
"49 olympique de marseille saison 2021-2022 \n",
"117 ac ajaccio saison 2022-2023 \n",
"274 rc strasbourg saison 2022-2023 \n",
"304 olympique de marseille saison 2021-2022 \n",
"311 stade brestois 29 saison 2022-2023 "
]
},
"execution_count": 145,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Keep the purchases made between min_date and end_features_date (bounds included)\n",
"df_products_purchased_reduced = df_products_purchased_reduced[\n",
"    df_products_purchased_reduced['purchase_date'].between(min_date, end_features_date)\n",
"]\n",
"df_products_purchased_reduced.head()"
]
},
{
"cell_type": "code",
"execution_count": 150,
"id": "ae7ef3a6-5b42-4a3c-a108-fec9f2ec4d32",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['caisse', 'adhésion', 'web [adhésion]', 'web [grand public]',\n",
" 'itr ticketmaster', 'itr fnac', nan, 'decathlon', 'boutique web',\n",
" 'boutique officielle'], dtype=object)"
]
},
"execution_count": 150,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_products_purchased_reduced[\"supplier_name\"].unique()"
]
},
{
"cell_type": "code",
"execution_count": 151,
"id": "942f58a5-8ed4-4b18-a7a2-bd296447fa6a",
"metadata": {},
"outputs": [],
"source": [
"# KPI on purchasing behaviour\n",
"# Dummy: online sales channel\n",
"liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online'] # vad = vente à distance (distance selling)\n",
"# assign() works on a copy, so df_products_purchased_reduced is left untouched;\n",
"# missing supplier names are replaced by '' before the keyword search\n",
"tickets_information_copy = df_products_purchased_reduced.assign(\n",
"    vente_internet=df_products_purchased_reduced['supplier_name']\n",
"    .fillna('')\n",
"    .str.contains('|'.join(liste_mots), case=False)\n",
"    .astype(int)\n",
")"
]
},
{
"cell_type": "markdown",
"id": "658b57cd-4fb8-4552-a582-972144b2af1c",
"metadata": {},
"source": [
"`tickets_information_copy['vente_internet']` is now computed without error: missing `supplier_name` values are replaced by an empty string before the keyword search."
]
},
{
"cell_type": "markdown",
"id": "99a75c34-f393-433a-b3c2-dc3f6f2f3e7e",
"metadata": {},
"source": [
"## Investigate train and test"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "970302f5-4de2-46b4-a1ce-a5396f5330ab",
"metadata": {},
"outputs": [],
"source": [
"def display_databases(directory_path, file_name):\n",
"    \"\"\"\n",
"    Read <file_name>.csv from the Generalization/<directory_path> folder of the\n",
"    project storage (through the global fs object) and return it as a DataFrame.\n",
"    The path of the file is printed before reading.\n",
"    \"\"\"\n",
"    path_parts = [\"projet-bdc2324-team1\", \"Generalization\", directory_path, file_name + \".csv\"]\n",
"    file_path = \"/\".join(path_parts)\n",
"    print(\"File path : \", file_path)\n",
"    with fs.open(file_path, mode=\"rb\") as file_in:\n",
"        return pd.read_csv(file_in, sep=\",\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "f5bfae82-04aa-44e1-9869-3f4fd5736b41",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : projet-bdc2324-team1/Generalization/sport/Train_set.csv\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>customer_id</th>\n",
" <th>nb_tickets</th>\n",
" <th>nb_purchases</th>\n",
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>time_between_purchase</th>\n",
" <th>nb_tickets_internet</th>\n",
" <th>...</th>\n",
" <th>country</th>\n",
" <th>gender_label</th>\n",
" <th>gender_female</th>\n",
" <th>gender_male</th>\n",
" <th>gender_other</th>\n",
" <th>country_fr</th>\n",
" <th>nb_campaigns</th>\n",
" <th>nb_campaigns_opened</th>\n",
" <th>time_to_open</th>\n",
" <th>y_has_purchased</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>5_6046652</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>af</td>\n",
" <td>other</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>5_3789159</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>fr</td>\n",
" <td>male</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>5_5991148</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>af</td>\n",
" <td>other</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>5_3848065</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>fr</td>\n",
" <td>male</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5_6154495</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>af</td>\n",
" <td>other</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 40 columns</p>\n",
"</div>"
],
"text/plain": [
" customer_id nb_tickets nb_purchases total_amount nb_suppliers \\\n",
"0 5_6046652 0.0 0.0 0.0 0.0 \n",
"1 5_3789159 0.0 0.0 0.0 0.0 \n",
"2 5_5991148 0.0 0.0 0.0 0.0 \n",
"3 5_3848065 0.0 0.0 0.0 0.0 \n",
"4 5_6154495 0.0 0.0 0.0 0.0 \n",
"\n",
" vente_internet_max purchase_date_min purchase_date_max \\\n",
"0 0.0 0.0 0.0 \n",
"1 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 \n",
"4 0.0 0.0 0.0 \n",
"\n",
" time_between_purchase nb_tickets_internet ... country gender_label \\\n",
"0 0.0 0.0 ... af other \n",
"1 0.0 0.0 ... fr male \n",
"2 0.0 0.0 ... af other \n",
"3 0.0 0.0 ... fr male \n",
"4 0.0 0.0 ... af other \n",
"\n",
" gender_female gender_male gender_other country_fr nb_campaigns \\\n",
"0 0 0 1 0.0 0.0 \n",
"1 0 1 0 1.0 0.0 \n",
"2 0 0 1 0.0 0.0 \n",
"3 0 1 0 1.0 0.0 \n",
"4 0 0 1 0.0 0.0 \n",
"\n",
" nb_campaigns_opened time_to_open y_has_purchased \n",
"0 0.0 0 0.0 \n",
"1 0.0 0 0.0 \n",
"2 0.0 0 0.0 \n",
"3 0.0 0 0.0 \n",
"4 0.0 0 0.0 \n",
"\n",
"[5 rows x 40 columns]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_sport = display_databases('sport', 'Train_set').fillna(0)\n",
"train_sport.head()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "56d5b12e-45e8-4312-869d-bde4d24900b6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"shape : (426449, 40)\n",
"number of na explained variable : 369102\n"
]
}
],
"source": [
"print('shape : ', train_sport.shape) \n",
"print('number of na explained variable : ', train_sport['y_has_purchased'].isna().sum())"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "13bff83a-e931-4286-a3f2-1382462703f4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<Axes: xlabel='y_has_purchased', ylabel='count'>"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import seaborn as sns\n",
"\n",
"sns.countplot(train_sport, x='y_has_purchased')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "d056c7b3-0e8c-485c-b2f3-4681077f1c2e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['projet-bdc2324-team1/Generalization/sport/Test_set',\n",
" 'projet-bdc2324-team1/Generalization/sport/Test_set.csv',\n",
" 'projet-bdc2324-team1/Generalization/sport/Train_set',\n",
" 'projet-bdc2324-team1/Generalization/sport/Train_set.csv']"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fs.ls('projet-bdc2324-team1/Generalization/sport')"
]
},
{
"cell_type": "markdown",
"id": "6a9963be-e17b-4cb3-a795-35cece44ce97",
"metadata": {},
"source": [
"## Look at y_has_purchased"
]
},
{
"cell_type": "code",
"execution_count": 75,
"id": "907bb25a-b555-4cfa-bfc9-785120ae4292",
"metadata": {},
"outputs": [],
"source": [
"def display_databases(directory_path, file_name, datetime_col = None):\n",
"    \"\"\"\n",
"    Read <file_name>.csv from the 0_Input/Company_<directory_path> folder of the\n",
"    project storage (through the global fs object) and return it as a DataFrame.\n",
"\n",
"    datetime_col : optional list of columns parsed as dates with custom_date_parser.\n",
"    The path of the file is printed before reading.\n",
"    \"\"\"\n",
"    path_parts = [\"projet-bdc2324-team1\", \"0_Input\", \"Company_\" + directory_path, file_name + \".csv\"]\n",
"    file_path = \"/\".join(path_parts)\n",
"    print(\"File path : \", file_path)\n",
"    with fs.open(file_path, mode=\"rb\") as file_in:\n",
"        # NOTE(review): custom_date_parser is not defined in this notebook;\n",
"        # presumably it comes from 0_KPI_functions.py - confirm\n",
"        return pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)"
]
},
{
"cell_type": "code",
"execution_count": 76,
"id": "d3164f81-0ef2-4f12-bc56-b7a999c4a9cd",
"metadata": {},
"outputs": [],
"source": [
"directory_path = '5'\n",
"# start_date, end_of_features, final_date = df_coverage_modelization(list_of_comp, coverage_train = 0.7)\n",
"min_date = \"2021-05-01\"\n",
"end_features_date = \"2022-11-01\"\n",
"max_date = \"2023-11-01\""
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "7cb31d80-41ca-4c2b-89b6-ee50486e7298",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : projet-bdc2324-team1/0_Input/Company_5/customerplus_cleaned.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_5/campaigns_information.csv\n",
"File path : projet-bdc2324-team1/0_Input/Company_5/products_purchased_reduced.csv\n"
]
}
],
"source": [
"df_customerplus_clean_0 = display_databases(directory_path, file_name = \"customerplus_cleaned\")\n",
"df_campaigns_information = display_databases(directory_path, file_name = \"campaigns_information\",\n",
"                                             datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])\n",
"df_products_purchased_reduced = display_databases(directory_path, file_name = \"products_purchased_reduced\",\n",
"                                                  datetime_col = ['purchase_date'])\n",
"\n",
"# Consistency filter used to apply our method: period bounds as UTC timestamps\n",
"max_date = pd.to_datetime(max_date, utc = True, format = 'ISO8601')\n",
"end_features_date = pd.to_datetime(end_features_date, utc = True, format = 'ISO8601')\n",
"min_date = pd.to_datetime(min_date, utc = True, format = 'ISO8601')\n",
"\n",
"# Keep the campaigns sent between min_date and end_features_date; .copy() makes the\n",
"# selection an independent frame so that the assignment below is applied to it\n",
"df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] <= end_features_date) & (df_campaigns_information['sent_at'] >= min_date)].copy()\n",
"# Openings recorded at or after end_features_date are blanked out.\n",
"# .loc replaces the chained indexing, which pandas may apply to a temporary copy\n",
"df_campaigns_information.loc[df_campaigns_information['opened_at'] >= end_features_date, 'opened_at'] = pd.NaT\n",
"\n",
"# Filter df_products_purchased_reduced on the same period\n",
"df_products_purchased_reduced = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]"
]
},
{
"cell_type": "code",
"execution_count": 61,
"id": "1d63a61e-22b4-4224-89d4-18444276cfaa",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>customer_id</th>\n",
" <th>opened_at</th>\n",
" <th>sent_at</th>\n",
" <th>delivered_at</th>\n",
" <th>campaign_name</th>\n",
" <th>campaign_service_id</th>\n",
" <th>campaign_sent_at</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [id, customer_id, opened_at, sent_at, delivered_at, campaign_name, campaign_service_id, campaign_sent_at]\n",
"Index: []"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first rows of the campaigns table\n",
"df_campaigns_information.head(n = 5)"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "a27a80c1-0be2-4199-96e7-566d568b1f51",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ticket_id</th>\n",
" <th>customer_id</th>\n",
" <th>purchase_id</th>\n",
" <th>event_type_id</th>\n",
" <th>supplier_name</th>\n",
" <th>purchase_date</th>\n",
" <th>amount</th>\n",
" <th>is_full_price</th>\n",
" <th>name_event_types</th>\n",
" <th>name_facilities</th>\n",
" <th>name_categories</th>\n",
" <th>name_events</th>\n",
" <th>name_seasons</th>\n",
" <th>start_date_time</th>\n",
" <th>end_date_time</th>\n",
" <th>open</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>6287839</td>\n",
" <td>204007</td>\n",
" <td>545836.0</td>\n",
" <td>824</td>\n",
" <td>fov</td>\n",
" <td>2022-03-31 03:42:59+00:00</td>\n",
" <td>55.0</td>\n",
" <td>False</td>\n",
" <td>match rugby</td>\n",
" <td>jean bouin</td>\n",
" <td>centrale</td>\n",
" <td>sf paris / racing 92 (ercc)</td>\n",
" <td>saison 2021 - 2022</td>\n",
" <td>2022-04-08 22:00:00+02:00</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>6287840</td>\n",
" <td>204007</td>\n",
" <td>545836.0</td>\n",
" <td>824</td>\n",
" <td>fov</td>\n",
" <td>2022-03-31 03:42:59+00:00</td>\n",
" <td>30.0</td>\n",
" <td>False</td>\n",
" <td>match rugby</td>\n",
" <td>jean bouin</td>\n",
" <td>centrale</td>\n",
" <td>sf paris / racing 92 (ercc)</td>\n",
" <td>saison 2021 - 2022</td>\n",
" <td>2022-04-08 22:00:00+02:00</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>6154548</td>\n",
" <td>227006</td>\n",
" <td>535225.0</td>\n",
" <td>824</td>\n",
" <td>fov</td>\n",
" <td>2022-02-28 16:31:29+00:00</td>\n",
" <td>55.0</td>\n",
" <td>False</td>\n",
" <td>match rugby</td>\n",
" <td>jean bouin</td>\n",
" <td>centrale</td>\n",
" <td>sf paris / racing 92 (ercc)</td>\n",
" <td>saison 2021 - 2022</td>\n",
" <td>2022-04-08 22:00:00+02:00</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>6154549</td>\n",
" <td>227006</td>\n",
" <td>535225.0</td>\n",
" <td>824</td>\n",
" <td>fov</td>\n",
" <td>2022-02-28 16:31:29+00:00</td>\n",
" <td>55.0</td>\n",
" <td>False</td>\n",
" <td>match rugby</td>\n",
" <td>jean bouin</td>\n",
" <td>centrale</td>\n",
" <td>sf paris / racing 92 (ercc)</td>\n",
" <td>saison 2021 - 2022</td>\n",
" <td>2022-04-08 22:00:00+02:00</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>6287843</td>\n",
" <td>407930</td>\n",
" <td>545838.0</td>\n",
" <td>824</td>\n",
" <td>fov</td>\n",
" <td>2022-03-31 04:00:22+00:00</td>\n",
" <td>55.0</td>\n",
" <td>False</td>\n",
" <td>match rugby</td>\n",
" <td>jean bouin</td>\n",
" <td>centrale</td>\n",
" <td>sf paris / racing 92 (ercc)</td>\n",
" <td>saison 2021 - 2022</td>\n",
" <td>2022-04-08 22:00:00+02:00</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>True</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ticket_id customer_id purchase_id event_type_id supplier_name \\\n",
"0 6287839 204007 545836.0 824 fov \n",
"1 6287840 204007 545836.0 824 fov \n",
"2 6154548 227006 535225.0 824 fov \n",
"3 6154549 227006 535225.0 824 fov \n",
"4 6287843 407930 545838.0 824 fov \n",
"\n",
" purchase_date amount is_full_price name_event_types \\\n",
"0 2022-03-31 03:42:59+00:00 55.0 False match rugby \n",
"1 2022-03-31 03:42:59+00:00 30.0 False match rugby \n",
"2 2022-02-28 16:31:29+00:00 55.0 False match rugby \n",
"3 2022-02-28 16:31:29+00:00 55.0 False match rugby \n",
"4 2022-03-31 04:00:22+00:00 55.0 False match rugby \n",
"\n",
" name_facilities name_categories name_events \\\n",
"0 jean bouin centrale sf paris / racing 92 (ercc) \n",
"1 jean bouin centrale sf paris / racing 92 (ercc) \n",
"2 jean bouin centrale sf paris / racing 92 (ercc) \n",
"3 jean bouin centrale sf paris / racing 92 (ercc) \n",
"4 jean bouin centrale sf paris / racing 92 (ercc) \n",
"\n",
" name_seasons start_date_time end_date_time \\\n",
"0 saison 2021 - 2022 2022-04-08 22:00:00+02:00 1901-01-01 00:09:21+00:09 \n",
"1 saison 2021 - 2022 2022-04-08 22:00:00+02:00 1901-01-01 00:09:21+00:09 \n",
"2 saison 2021 - 2022 2022-04-08 22:00:00+02:00 1901-01-01 00:09:21+00:09 \n",
"3 saison 2021 - 2022 2022-04-08 22:00:00+02:00 1901-01-01 00:09:21+00:09 \n",
"4 saison 2021 - 2022 2022-04-08 22:00:00+02:00 1901-01-01 00:09:21+00:09 \n",
"\n",
" open \n",
"0 True \n",
"1 True \n",
"2 True \n",
"3 True \n",
"4 True "
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first rows of the purchases table\n",
"df_products_purchased_reduced.head(n = 5)"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "f47357ab-0216-4f70-ab8f-6767819e1cdb",
"metadata": {},
"outputs": [],
"source": [
"# Build one KPI table per data source; they are merged in the following cells\n",
"\n",
"# KPIs on the advertising campaigns\n",
"df_campaigns_kpi = campaigns_kpi_function(\n",
"    campaigns_information = df_campaigns_information\n",
")\n",
"\n",
"# KPIs on purchasing behaviour\n",
"df_tickets_kpi = tickets_kpi_function(\n",
"    tickets_information = df_products_purchased_reduced\n",
")\n",
"\n",
"# KPIs on socio-demographic data\n",
"df_customerplus_clean = customerplus_kpi_function(\n",
"    customerplus_clean = df_customerplus_clean_0\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 65,
"id": "3d08a2f8-3c83-41c7-98f8-4be268ffa0da",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>customer_id</th>\n",
" <th>street_id</th>\n",
" <th>structure_id</th>\n",
" <th>mcp_contact_id</th>\n",
" <th>fidelity</th>\n",
" <th>tenant_id</th>\n",
" <th>is_partner</th>\n",
" <th>deleted_at</th>\n",
" <th>gender</th>\n",
" <th>is_email_true</th>\n",
" <th>...</th>\n",
" <th>first_buying_date</th>\n",
" <th>country</th>\n",
" <th>gender_label</th>\n",
" <th>gender_female</th>\n",
" <th>gender_male</th>\n",
" <th>gender_other</th>\n",
" <th>country_fr</th>\n",
" <th>nb_campaigns</th>\n",
" <th>nb_campaigns_opened</th>\n",
" <th>time_to_open</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>6009745</td>\n",
" <td>1372685</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>1771</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>af</td>\n",
" <td>other</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>6011228</td>\n",
" <td>1372685</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>1771</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>af</td>\n",
" <td>other</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>6058950</td>\n",
" <td>1372685</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>1771</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>af</td>\n",
" <td>other</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>6062404</td>\n",
" <td>1372685</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>1771</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>af</td>\n",
" <td>other</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>250217</td>\n",
" <td>78785</td>\n",
" <td>NaN</td>\n",
" <td>11035.0</td>\n",
" <td>0</td>\n",
" <td>1771</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>fr</td>\n",
" <td>female</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 30 columns</p>\n",
"</div>"
],
"text/plain": [
" customer_id street_id structure_id mcp_contact_id fidelity tenant_id \\\n",
"0 6009745 1372685 NaN NaN 0 1771 \n",
"1 6011228 1372685 NaN NaN 0 1771 \n",
"2 6058950 1372685 NaN NaN 0 1771 \n",
"3 6062404 1372685 NaN NaN 0 1771 \n",
"4 250217 78785 NaN 11035.0 0 1771 \n",
"\n",
" is_partner deleted_at gender is_email_true ... first_buying_date \\\n",
"0 False NaN 2 True ... NaN \n",
"1 False NaN 2 True ... NaN \n",
"2 False NaN 2 True ... NaN \n",
"3 False NaN 2 True ... NaN \n",
"4 False NaN 0 True ... NaN \n",
"\n",
" country gender_label gender_female gender_male gender_other country_fr \\\n",
"0 af other 0 0 1 0.0 \n",
"1 af other 0 0 1 0.0 \n",
"2 af other 0 0 1 0.0 \n",
"3 af other 0 0 1 0.0 \n",
"4 fr female 1 0 0 1.0 \n",
"\n",
" nb_campaigns nb_campaigns_opened time_to_open \n",
"0 NaN NaN NaT \n",
"1 NaN NaN NaT \n",
"2 NaN NaN NaT \n",
"3 NaN NaN NaT \n",
"4 NaN NaN NaT \n",
"\n",
"[5 rows x 30 columns]"
]
},
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Attach the campaign KPIs to the customer table (left join: every customer row is kept)\n",
"df_customer = df_customerplus_clean.merge(\n",
"    df_campaigns_kpi,\n",
"    on = 'customer_id',\n",
"    how = 'left',\n",
")\n",
"df_customer.head(n = 5)"
]
},
{
"cell_type": "code",
"execution_count": 66,
"id": "bc3d1aed-b2af-48e5-a920-626f2abc3358",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>customer_id</th>\n",
" <th>nb_tickets</th>\n",
" <th>nb_purchases</th>\n",
" <th>total_amount</th>\n",
" <th>nb_suppliers</th>\n",
" <th>vente_internet_max</th>\n",
" <th>purchase_date_min</th>\n",
" <th>purchase_date_max</th>\n",
" <th>time_between_purchase</th>\n",
" <th>nb_tickets_internet</th>\n",
" <th>...</th>\n",
" <th>first_buying_date</th>\n",
" <th>country</th>\n",
" <th>gender_label</th>\n",
" <th>gender_female</th>\n",
" <th>gender_male</th>\n",
" <th>gender_other</th>\n",
" <th>country_fr</th>\n",
" <th>nb_campaigns</th>\n",
" <th>nb_campaigns_opened</th>\n",
" <th>time_to_open</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>160516</td>\n",
" <td>149.0</td>\n",
" <td>3.0</td>\n",
" <td>4470.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>409.693137</td>\n",
" <td>66.356979</td>\n",
" <td>343.336157</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>2021-09-17 06:39:19+00:00</td>\n",
" <td>fr</td>\n",
" <td>male</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>160517</td>\n",
" <td>1977.0</td>\n",
" <td>27.0</td>\n",
" <td>1473.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>431.558519</td>\n",
" <td>27.733472</td>\n",
" <td>403.825046</td>\n",
" <td>15.0</td>\n",
" <td>...</td>\n",
" <td>2021-08-26 09:53:10+00:00</td>\n",
" <td>fr</td>\n",
" <td>female</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>160518</td>\n",
" <td>116.0</td>\n",
" <td>8.0</td>\n",
" <td>439.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>427.177720</td>\n",
" <td>23.689340</td>\n",
" <td>403.488380</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>2021-08-30 19:01:31+00:00</td>\n",
" <td>fr</td>\n",
" <td>male</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>160519</td>\n",
" <td>34.0</td>\n",
" <td>2.0</td>\n",
" <td>608.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>483.642940</td>\n",
" <td>108.777870</td>\n",
" <td>374.865069</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>2019-05-21 08:03:52+00:00</td>\n",
" <td>fr</td>\n",
" <td>female</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>160520</td>\n",
" <td>207.0</td>\n",
" <td>5.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>431.550012</td>\n",
" <td>69.310266</td>\n",
" <td>362.239745</td>\n",
" <td>0.0</td>\n",
" <td>...</td>\n",
" <td>2019-08-20 15:10:07+00:00</td>\n",
" <td>fr</td>\n",
" <td>male</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>NaT</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 39 columns</p>\n",
"</div>"
],
"text/plain": [
" customer_id nb_tickets nb_purchases total_amount nb_suppliers \\\n",
"0 160516 149.0 3.0 4470.0 1.0 \n",
"1 160517 1977.0 27.0 1473.0 2.0 \n",
"2 160518 116.0 8.0 439.0 2.0 \n",
"3 160519 34.0 2.0 608.0 1.0 \n",
"4 160520 207.0 5.0 0.0 1.0 \n",
"\n",
" vente_internet_max purchase_date_min purchase_date_max \\\n",
"0 0.0 409.693137 66.356979 \n",
"1 1.0 431.558519 27.733472 \n",
"2 0.0 427.177720 23.689340 \n",
"3 0.0 483.642940 108.777870 \n",
"4 0.0 431.550012 69.310266 \n",
"\n",
" time_between_purchase nb_tickets_internet ... first_buying_date \\\n",
"0 343.336157 0.0 ... 2021-09-17 06:39:19+00:00 \n",
"1 403.825046 15.0 ... 2021-08-26 09:53:10+00:00 \n",
"2 403.488380 0.0 ... 2021-08-30 19:01:31+00:00 \n",
"3 374.865069 0.0 ... 2019-05-21 08:03:52+00:00 \n",
"4 362.239745 0.0 ... 2019-08-20 15:10:07+00:00 \n",
"\n",
" country gender_label gender_female gender_male gender_other \\\n",
"0 fr male 0 1 0 \n",
"1 fr female 1 0 0 \n",
"2 fr male 0 1 0 \n",
"3 fr female 1 0 0 \n",
"4 fr male 0 1 0 \n",
"\n",
" country_fr nb_campaigns nb_campaigns_opened time_to_open \n",
"0 1.0 0.0 0.0 NaT \n",
"1 1.0 0.0 0.0 NaT \n",
"2 1.0 0.0 0.0 NaT \n",
"3 1.0 0.0 0.0 NaT \n",
"4 1.0 0.0 0.0 NaT \n",
"\n",
"[5 rows x 39 columns]"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Campaign counters left as NaN by the previous left merge are replaced by 0\n",
"campaign_count_columns = ['nb_campaigns', 'nb_campaigns_opened']\n",
"df_customer[campaign_count_columns] = df_customer[campaign_count_columns].fillna(0)\n",
"\n",
"# Merge with the purchase-behaviour KPIs (outer join on customer_id)\n",
"df_customer_product = df_tickets_kpi.merge(\n",
"    df_customer,\n",
"    on = 'customer_id',\n",
"    how = 'outer',\n",
")\n",
"df_customer_product.head(n = 5)"
]
},
{
"cell_type": "code",
"execution_count": 67,
"id": "5549e265-3904-464b-964b-518a84a42503",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ticket_id</th>\n",
" <th>customer_id</th>\n",
" <th>purchase_id</th>\n",
" <th>event_type_id</th>\n",
" <th>supplier_name</th>\n",
" <th>purchase_date</th>\n",
" <th>amount</th>\n",
" <th>is_full_price</th>\n",
" <th>name_event_types</th>\n",
" <th>name_facilities</th>\n",
" <th>name_categories</th>\n",
" <th>name_events</th>\n",
" <th>name_seasons</th>\n",
" <th>start_date_time</th>\n",
" <th>end_date_time</th>\n",
" <th>open</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [ticket_id, customer_id, purchase_id, event_type_id, supplier_name, purchase_date, amount, is_full_price, name_event_types, name_facilities, name_categories, name_events, name_seasons, start_date_time, end_date_time, open]\n",
"Index: []"
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fill NaN values: ticket KPIs missing after the outer merge are replaced by 0\n",
"ticket_kpi_columns = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']\n",
"df_customer_product[ticket_kpi_columns] = df_customer_product[ticket_kpi_columns].fillna(0)\n",
"\n",
"# 2. Construction of the explained variable: purchases made in (end_features_date, max_date]\n",
"# NOTE(review): df_products_purchased_reduced was already restricted to\n",
"# purchase_date <= end_features_date in an earlier cell, so this selection is empty\n",
"# unless the unfiltered purchases are reloaded first - confirm the intended source frame\n",
"purchase_dates = df_products_purchased_reduced['purchase_date']\n",
"in_target_window = (purchase_dates <= max_date) & (purchase_dates > end_features_date)\n",
"df_products_purchased_to_predict = df_products_purchased_reduced[in_target_window]\n",
"df_products_purchased_to_predict.head(n = 5)\n"
]
},
{
"cell_type": "code",
"execution_count": 68,
"id": "be182c6c-012f-447d-a57f-03da65da53f7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<DatetimeArray>\n",
"['2022-03-31 03:42:59+00:00', '2022-02-28 16:31:29+00:00',\n",
" '2022-03-31 04:00:22+00:00', '2022-03-31 04:09:18+00:00',\n",
" '2022-03-25 15:50:52+00:00', '2022-08-01 10:05:49+00:00',\n",
" '2021-08-26 12:17:40+00:00', '2022-08-02 06:32:37+00:00',\n",
" '2022-06-30 09:16:59+00:00', '2022-07-03 13:53:30+00:00',\n",
" ...\n",
" '2022-01-26 11:34:05+00:00', '2022-01-21 17:07:25+00:00',\n",
" '2022-01-26 13:43:23+00:00', '2022-01-26 14:38:05+00:00',\n",
" '2022-01-26 14:39:19+00:00', '2022-01-26 14:40:12+00:00',\n",
" '2022-01-26 14:41:17+00:00', '2022-01-27 08:16:02+00:00',\n",
" '2022-01-27 08:45:25+00:00', '2022-01-27 11:57:11+00:00']\n",
"Length: 49543, dtype: datetime64[ns, UTC]"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Distinct purchase timestamps currently held in the purchases table\n",
"df_products_purchased_reduced.loc[:, 'purchase_date'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 72,
"id": "aab1cc7e-79be-403c-b9c1-4f4f333b13ff",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ticket_id</th>\n",
" <th>customer_id</th>\n",
" <th>purchase_id</th>\n",
" <th>event_type_id</th>\n",
" <th>supplier_name</th>\n",
" <th>purchase_date</th>\n",
" <th>amount</th>\n",
" <th>is_full_price</th>\n",
" <th>name_event_types</th>\n",
" <th>name_facilities</th>\n",
" <th>name_categories</th>\n",
" <th>name_events</th>\n",
" <th>name_seasons</th>\n",
" <th>start_date_time</th>\n",
" <th>end_date_time</th>\n",
" <th>open</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>6287839</td>\n",
" <td>204007</td>\n",
" <td>545836.0</td>\n",
" <td>824</td>\n",
" <td>fov</td>\n",
" <td>2022-03-31 03:42:59+00:00</td>\n",
" <td>55.0</td>\n",
" <td>False</td>\n",
" <td>match rugby</td>\n",
" <td>jean bouin</td>\n",
" <td>centrale</td>\n",
" <td>sf paris / racing 92 (ercc)</td>\n",
" <td>saison 2021 - 2022</td>\n",
" <td>2022-04-08 22:00:00+02:00</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>6287840</td>\n",
" <td>204007</td>\n",
" <td>545836.0</td>\n",
" <td>824</td>\n",
" <td>fov</td>\n",
" <td>2022-03-31 03:42:59+00:00</td>\n",
" <td>30.0</td>\n",
" <td>False</td>\n",
" <td>match rugby</td>\n",
" <td>jean bouin</td>\n",
" <td>centrale</td>\n",
" <td>sf paris / racing 92 (ercc)</td>\n",
" <td>saison 2021 - 2022</td>\n",
" <td>2022-04-08 22:00:00+02:00</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>6154548</td>\n",
" <td>227006</td>\n",
" <td>535225.0</td>\n",
" <td>824</td>\n",
" <td>fov</td>\n",
" <td>2022-02-28 16:31:29+00:00</td>\n",
" <td>55.0</td>\n",
" <td>False</td>\n",
" <td>match rugby</td>\n",
" <td>jean bouin</td>\n",
" <td>centrale</td>\n",
" <td>sf paris / racing 92 (ercc)</td>\n",
" <td>saison 2021 - 2022</td>\n",
" <td>2022-04-08 22:00:00+02:00</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>6154549</td>\n",
" <td>227006</td>\n",
" <td>535225.0</td>\n",
" <td>824</td>\n",
" <td>fov</td>\n",
" <td>2022-02-28 16:31:29+00:00</td>\n",
" <td>55.0</td>\n",
" <td>False</td>\n",
" <td>match rugby</td>\n",
" <td>jean bouin</td>\n",
" <td>centrale</td>\n",
" <td>sf paris / racing 92 (ercc)</td>\n",
" <td>saison 2021 - 2022</td>\n",
" <td>2022-04-08 22:00:00+02:00</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>6287843</td>\n",
" <td>407930</td>\n",
" <td>545838.0</td>\n",
" <td>824</td>\n",
" <td>fov</td>\n",
" <td>2022-03-31 04:00:22+00:00</td>\n",
" <td>55.0</td>\n",
" <td>False</td>\n",
" <td>match rugby</td>\n",
" <td>jean bouin</td>\n",
" <td>centrale</td>\n",
" <td>sf paris / racing 92 (ercc)</td>\n",
" <td>saison 2021 - 2022</td>\n",
" <td>2022-04-08 22:00:00+02:00</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>True</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ticket_id customer_id purchase_id event_type_id supplier_name \\\n",
"0 6287839 204007 545836.0 824 fov \n",
"1 6287840 204007 545836.0 824 fov \n",
"2 6154548 227006 535225.0 824 fov \n",
"3 6154549 227006 535225.0 824 fov \n",
"4 6287843 407930 545838.0 824 fov \n",
"\n",
" purchase_date amount is_full_price name_event_types \\\n",
"0 2022-03-31 03:42:59+00:00 55.0 False match rugby \n",
"1 2022-03-31 03:42:59+00:00 30.0 False match rugby \n",
"2 2022-02-28 16:31:29+00:00 55.0 False match rugby \n",
"3 2022-02-28 16:31:29+00:00 55.0 False match rugby \n",
"4 2022-03-31 04:00:22+00:00 55.0 False match rugby \n",
"\n",
" name_facilities name_categories name_events \\\n",
"0 jean bouin centrale sf paris / racing 92 (ercc) \n",
"1 jean bouin centrale sf paris / racing 92 (ercc) \n",
"2 jean bouin centrale sf paris / racing 92 (ercc) \n",
"3 jean bouin centrale sf paris / racing 92 (ercc) \n",
"4 jean bouin centrale sf paris / racing 92 (ercc) \n",
"\n",
" name_seasons start_date_time end_date_time \\\n",
"0 saison 2021 - 2022 2022-04-08 22:00:00+02:00 1901-01-01 00:09:21+00:09 \n",
"1 saison 2021 - 2022 2022-04-08 22:00:00+02:00 1901-01-01 00:09:21+00:09 \n",
"2 saison 2021 - 2022 2022-04-08 22:00:00+02:00 1901-01-01 00:09:21+00:09 \n",
"3 saison 2021 - 2022 2022-04-08 22:00:00+02:00 1901-01-01 00:09:21+00:09 \n",
"4 saison 2021 - 2022 2022-04-08 22:00:00+02:00 1901-01-01 00:09:21+00:09 \n",
"\n",
" open \n",
"0 True \n",
"1 True \n",
"2 True \n",
"3 True \n",
"4 True "
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# First rows of the purchases dated on or before max_date\n",
"df_products_purchased_reduced.loc[df_products_purchased_reduced['purchase_date'] <= max_date].head(n = 5)"
]
},
{
"cell_type": "code",
"execution_count": 74,
"id": "ce59de67-127e-4b0a-b96c-9684d87792dd",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Timestamp('2022-10-31 23:17:26+0000', tz='UTC')"
]
},
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Most recent purchase timestamp in the purchases table\n",
"df_products_purchased_reduced.loc[:, 'purchase_date'].max()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "184463d1-b0dd-44b9-a9a3-4ab32c8c13c1",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}