Merge pull request 'generalization' (#5) from generalization into main
Reviewed-on: #5
This commit is contained in:
		
						commit
						282d6cd8a5
					
				|  | @ -6,6 +6,7 @@ import os | ||||||
| import s3fs | import s3fs | ||||||
| import re | import re | ||||||
| import warnings | import warnings | ||||||
|  | from datetime import date, timedelta, datetime | ||||||
| 
 | 
 | ||||||
| # Create filesystem object | # Create filesystem object | ||||||
| S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"] | S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"] | ||||||
|  | @ -18,6 +19,47 @@ exec(open('0_KPI_functions.py').read()) | ||||||
| # Ignore warning | # Ignore warning | ||||||
| warnings.filterwarnings('ignore') | warnings.filterwarnings('ignore') | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
def display_covering_time(df, company, datecover):
    """
    Record and print the daily time coverage of one company.

    The coverage is the list of every calendar day between the earliest and
    the latest value of ``df['purchase_date']``, both bounds included.

    Args:
        df: DataFrame with a datetime-like ``purchase_date`` column.
        company: key under which the coverage is stored in ``datecover``.
        datecover: dict mapping a company to its list of covered days;
            it is updated in place.

    Returns:
        The same ``datecover`` dict, with ``datecover[company]`` set to a
        list of naive ``datetime`` objects (midnight), one per covered day.
    """
    min_date = df['purchase_date'].min().strftime("%Y-%m-%d")
    max_date = df['purchase_date'].max().strftime("%Y-%m-%d")

    # Going through the "%Y-%m-%d" string truncates both bounds to whole days.
    first_day = datetime.strptime(min_date, "%Y-%m-%d")
    last_day = datetime.strptime(max_date, "%Y-%m-%d")

    # "+ 1" keeps the last purchase day inside the coverage; without it the
    # final day is dropped and a single-day span produces an empty list.
    number_of_days = (last_day - first_day).days + 1
    datecover[company] = [first_day + timedelta(days=offset) for offset in range(number_of_days)]

    print(f'Couverture Company {company} : {min_date} - {max_date}')
    return datecover
|  | 
 | ||||||
|  | 
 | ||||||
def compute_time_intersection(datecover):
    """
    Return the days that are covered by every company.

    Args:
        datecover: dict mapping a company to an iterable of datetime-like
            objects (anything hashable that exposes ``strftime``).

    Returns:
        A chronologically sorted list of "YYYY-MM-DD" strings present in all
        coverages. The list is empty when ``datecover`` is empty or when the
        companies share no common day.
    """
    timestamps_sets = [set(timestamps) for timestamps in datecover.values()]

    # set.intersection() needs at least one set to operate on; with no
    # company at all there is simply no common coverage.
    if not timestamps_sets:
        return []

    common_days = set.intersection(*timestamps_sets)
    # ISO formatted dates sort lexicographically in chronological order.
    return sorted(day.strftime("%Y-%m-%d") for day in common_days)
|  | 
 | ||||||
|  | 
 | ||||||
def df_coverage_modelization(sport, coverage_train = 0.7):
    """
    Compute the dates used to build the train and test datasets.

    For every company the purchase history is loaded, its daily coverage is
    computed, and the days common to all companies are kept.

    Args:
        sport: iterable of company identifiers to load.
        coverage_train: fraction of the common coverage (between 0 and 1)
            located before ``end_of_features``.

    Returns:
        A tuple ``(start_date, end_of_features, final_date)`` of
        "YYYY-MM-DD" strings: first common day, day at the
        ``coverage_train`` position of the common coverage, last common day.

    Raises:
        ValueError: if the companies share no common day.
    """
    datecover = {}
    for company in sport:
        df_products_purchased_reduced = display_databases(company, file_name = "products_purchased_reduced",
                                                          datetime_col = ['purchase_date'])
        datecover = display_covering_time(df_products_purchased_reduced, company, datecover)

    dt_coverage = compute_time_intersection(datecover)
    if not dt_coverage:
        raise ValueError("No common time coverage between the selected companies")

    # Honour the coverage_train argument (it used to be ignored in favour of a
    # hard-coded 0.7) and clamp the index so that a fraction of 1.0 or a
    # negative value still points at a valid day.
    last_index = len(dt_coverage) - 1
    split_index = max(0, min(int(coverage_train * len(dt_coverage)), last_index))

    start_date = dt_coverage[0]
    end_of_features = dt_coverage[split_index]
    final_date = dt_coverage[-1]
    return start_date, end_of_features, final_date
|  |      | ||||||
|  | 
 | ||||||
| def dataset_construction(min_date, end_features_date, max_date, directory_path): | def dataset_construction(min_date, end_features_date, max_date, directory_path): | ||||||
|      |      | ||||||
|     # Import customerplus |     # Import customerplus | ||||||
|  | @ -97,14 +139,25 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path): | ||||||
| 
 | 
 | ||||||
| ## Exportation | ## Exportation | ||||||
| 
 | 
 | ||||||
| # Dossier d'exportation | companies = {'musee' : ['1', '2', '3', '4', '101'], | ||||||
| BUCKET_OUT = "projet-bdc2324-team1/2_Output/Logistique Regression databases - First approach" |             'sport': ['5', '6', '7', '8', '9'], | ||||||
|  |             'musique' : ['10', '11', '12', '13', '14']} | ||||||
| 
 | 
 | ||||||
| # Dataset test | type_of_comp = input('Choisissez le type de compagnie : sport ? musique ? musee ?') | ||||||
| dataset_test = dataset_construction(min_date = "2021-08-01", end_features_date = "2023-08-01", max_date = "2023-11-01", directory_path = "1") | list_of_comp = companies[type_of_comp]  | ||||||
|  | # Dossier d'exportation | ||||||
|  | BUCKET_OUT = f'projet-bdc2324-team1/Generalization/{type_of_comp}' | ||||||
|  | 
 | ||||||
|  | # Create test and train datasets for each company of the selected type | ||||||
|  | 
 | ||||||
|  | start_date, end_of_features, final_date = df_coverage_modelization(list_of_comp, coverage_train = 0.7) | ||||||
|  | 
 | ||||||
|  | for company in list_of_comp: | ||||||
|  |     dataset_test = dataset_construction(min_date = start_date, end_features_date = end_of_features, | ||||||
|  |                                         max_date = final_date, directory_path = company)     | ||||||
| 
 | 
 | ||||||
|     # Exportation |     # Exportation | ||||||
| FILE_KEY_OUT_S3 = "dataset_test.csv" |     FILE_KEY_OUT_S3 = "dataset_test" + company +  ".csv" | ||||||
|     FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3 |     FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3 | ||||||
|      |      | ||||||
|     with fs.open(FILE_PATH_OUT_S3, 'w') as file_out: |     with fs.open(FILE_PATH_OUT_S3, 'w') as file_out: | ||||||
|  | @ -113,10 +166,10 @@ with fs.open(FILE_PATH_OUT_S3, 'w') as file_out: | ||||||
|     print("Exportation dataset test : SUCCESS") |     print("Exportation dataset test : SUCCESS") | ||||||
| 
 | 
 | ||||||
| # Dataset train | # Dataset train | ||||||
| dataset_train = dataset_construction(min_date = "2021-05-01", end_features_date = "2023-05-01", max_date = "2023-08-01", directory_path = "1") |     dataset_train = dataset_construction(min_date = start_date, end_features_date = end_of_features, | ||||||
| 
 |                                         max_date = final_date, directory_path = company) | ||||||
|     # Export |     # Export | ||||||
| FILE_KEY_OUT_S3 = "dataset_train.csv" |     FILE_KEY_OUT_S3 = "dataset_train" + company + ".csv"  | ||||||
|     FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3 |     FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3 | ||||||
|      |      | ||||||
|     with fs.open(FILE_PATH_OUT_S3, 'w') as file_out: |     with fs.open(FILE_PATH_OUT_S3, 'w') as file_out: | ||||||
|  |  | ||||||
|  | @ -46,7 +46,7 @@ def tickets_kpi_function(tickets_information = None): | ||||||
|      |      | ||||||
|     # Dummy : Canal de vente en ligne |     # Dummy : Canal de vente en ligne | ||||||
|     liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online'] # vad = vente à distance |     liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online'] # vad = vente à distance | ||||||
|     tickets_information_copy['vente_internet'] = tickets_information_copy['supplier_name'].str.contains('|'.join(liste_mots), case=False).astype(int) |     tickets_information_copy['vente_internet'] = tickets_information_copy['supplier_name'].fillna('').str.contains('|'.join(liste_mots), case=False).astype(int) | ||||||
| 
 | 
 | ||||||
|     # Proportion de vente en ligne |     # Proportion de vente en ligne | ||||||
|     prop_vente_internet = tickets_information_copy[tickets_information_copy['vente_internet'] == 1].groupby(['customer_id'])['ticket_id'].count().reset_index() |     prop_vente_internet = tickets_information_copy[tickets_information_copy['vente_internet'] == 1].groupby(['customer_id'])['ticket_id'].count().reset_index() | ||||||
|  |  | ||||||
							
								
								
									
										854
									
								
								Sport/exploration_sport.ipynb
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										854
									
								
								Sport/exploration_sport.ipynb
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,854 @@ | ||||||
|  | { | ||||||
|  |  "cells": [ | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 114, | ||||||
|  |    "id": "314bf34b-1f6d-4a99-8f82-aa71ebacdabc", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [], | ||||||
|  |    "source": [ | ||||||
|  |     "import pandas as pd\n", | ||||||
|  |     "import os\n", | ||||||
|  |     "import s3fs\n", | ||||||
|  |     "import warnings\n", | ||||||
|  |     "from datetime import date, timedelta, datetime\n", | ||||||
|  |     "import numpy as np\n", | ||||||
|  |     "\n", | ||||||
|  |     "exec(open('../0_KPI_functions.py').read())" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 33, | ||||||
|  |    "id": "a276822a-c389-429e-b249-8a9e47758bfc", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [], | ||||||
|  |    "source": [ | ||||||
|  |     "# Ignore warning\n", | ||||||
|  |     "warnings.filterwarnings('ignore')" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 34, | ||||||
|  |    "id": "f62b996c-4e17-40ea-83ba-f0cb60be7671", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [ | ||||||
|  |     { | ||||||
|  |      "data": { | ||||||
|  |       "text/plain": [ | ||||||
|  |        "['bdc2324-data/1',\n", | ||||||
|  |        " 'bdc2324-data/10',\n", | ||||||
|  |        " 'bdc2324-data/101',\n", | ||||||
|  |        " 'bdc2324-data/11',\n", | ||||||
|  |        " 'bdc2324-data/12',\n", | ||||||
|  |        " 'bdc2324-data/13',\n", | ||||||
|  |        " 'bdc2324-data/14',\n", | ||||||
|  |        " 'bdc2324-data/2',\n", | ||||||
|  |        " 'bdc2324-data/3',\n", | ||||||
|  |        " 'bdc2324-data/4',\n", | ||||||
|  |        " 'bdc2324-data/5',\n", | ||||||
|  |        " 'bdc2324-data/6',\n", | ||||||
|  |        " 'bdc2324-data/7',\n", | ||||||
|  |        " 'bdc2324-data/8',\n", | ||||||
|  |        " 'bdc2324-data/9']" | ||||||
|  |       ] | ||||||
|  |      }, | ||||||
|  |      "execution_count": 34, | ||||||
|  |      "metadata": {}, | ||||||
|  |      "output_type": "execute_result" | ||||||
|  |     } | ||||||
|  |    ], | ||||||
|  |    "source": [ | ||||||
|  |     "# Create filesystem object\n", | ||||||
|  |     "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", | ||||||
|  |     "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n", | ||||||
|  |     "\n", | ||||||
|  |     "BUCKET = \"bdc2324-data\"\n", | ||||||
|  |     "fs.ls(BUCKET)" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "markdown", | ||||||
|  |    "id": "2c829aa8-2006-4e72-889b-7096dd55718b", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "source": [ | ||||||
|  |     "## Look at the time sequence of each company and compute inter time coverage" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 73, | ||||||
|  |    "id": "e86864b7-4852-449a-8680-638559d56080", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [], | ||||||
|  |    "source": [ | ||||||
|  |     "sport = ['5', '6', '7', '8', '9']" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 90, | ||||||
|  |    "id": "7634ec57-4891-4684-8638-1e1643baca28", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [], | ||||||
|  |    "source": [ | ||||||
|  |     "def display_covering_time(df, company, datecover):\n", | ||||||
|  |     "    \"\"\"\n", | ||||||
|  |     "    This function draws the time coverage of each company\n", | ||||||
|  |     "    \"\"\"\n", | ||||||
|  |     "    min_date = df['purchase_date'].min().strftime(\"%Y-%m-%d\")\n", | ||||||
|  |     "    max_date = df['purchase_date'].max().strftime(\"%Y-%m-%d\")\n", | ||||||
|  |     "    datecover[company] = [datetime.strptime(min_date, \"%Y-%m-%d\") + timedelta(days=x) for x in range((datetime.strptime(max_date, \"%Y-%m-%d\") - datetime.strptime(min_date, \"%Y-%m-%d\")).days)]\n", | ||||||
|  |     "    print(f'Couverture Company {company} : {min_date} - {max_date}')\n", | ||||||
|  |     "    return datecover" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 91, | ||||||
|  |    "id": "53c83f51-822c-4e05-8c7c-89aa327603c6", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [], | ||||||
|  |    "source": [ | ||||||
|  |     "def compute_time_intersection(datecover):\n", | ||||||
|  |     "    timestamps_sets = [set(timestamps) for timestamps in datecover.values()]\n", | ||||||
|  |     "    intersection = set.intersection(*timestamps_sets)\n", | ||||||
|  |     "    intersection_list = list(intersection)\n", | ||||||
|  |     "    formated_dates = [dt.strftime(\"%Y-%m-%d\") for dt in intersection_list]\n", | ||||||
|  |     "    return sorted(formated_dates)" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 93, | ||||||
|  |    "id": "eec152de-078e-44c4-ad6e-74ae6ba5c65a", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [], | ||||||
|  |    "source": [ | ||||||
|  |     "def df_coverage_modelization(sport, coverage_train = 0.7):\n", | ||||||
|  |     "    \"\"\"\n", | ||||||
|  |     "    This function returns start_date, end_of_features and final dates\n", | ||||||
|  |     "    that help to construct train and test datasets\n", | ||||||
|  |     "    \"\"\"\n", | ||||||
|  |     "    datecover = {}\n", | ||||||
|  |     "    for company in sport:\n", | ||||||
|  |     "        df_products_purchased_reduced = display_databases(company, file_name = \"products_purchased_reduced\",\n", | ||||||
|  |     "                                                          datetime_col = ['purchase_date'])\n", | ||||||
|  |     "        datecover = display_covering_time(df_products_purchased_reduced, company, datecover)\n", | ||||||
|  |     "    #print(datecover.keys())\n", | ||||||
|  |     "    dt_coverage = compute_time_intersection(datecover)\n", | ||||||
|  |     "    start_date = dt_coverage[0]\n", | ||||||
|  |     "    end_of_features = dt_coverage[int(0.7 * len(dt_coverage))]\n", | ||||||
|  |     "    final_date = dt_coverage[-1]\n", | ||||||
|  |     "    return start_date, end_of_features, final_date\n", | ||||||
|  |     "    " | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 94, | ||||||
|  |    "id": "348f246a-bc2d-4bbc-ba05-aa825da15a69", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [ | ||||||
|  |     { | ||||||
|  |      "name": "stdout", | ||||||
|  |      "output_type": "stream", | ||||||
|  |      "text": [ | ||||||
|  |       "File path :  projet-bdc2324-team1/0_Input/Company_5/products_purchased_reduced.csv\n", | ||||||
|  |       "Couverture Company 5 : 2019-04-15 - 2023-11-09\n", | ||||||
|  |       "File path :  projet-bdc2324-team1/0_Input/Company_6/products_purchased_reduced.csv\n", | ||||||
|  |       "Couverture Company 6 : 2018-06-28 - 2023-11-08\n", | ||||||
|  |       "File path :  projet-bdc2324-team1/0_Input/Company_7/products_purchased_reduced.csv\n", | ||||||
|  |       "Couverture Company 7 : 2015-02-10 - 2023-11-08\n", | ||||||
|  |       "File path :  projet-bdc2324-team1/0_Input/Company_8/products_purchased_reduced.csv\n", | ||||||
|  |       "Couverture Company 8 : 2010-09-28 - 2023-11-08\n", | ||||||
|  |       "File path :  projet-bdc2324-team1/0_Input/Company_9/products_purchased_reduced.csv\n", | ||||||
|  |       "Couverture Company 9 : 2014-09-22 - 2023-10-24\n", | ||||||
|  |       "dict_keys(['5', '6', '7', '8', '9'])\n", | ||||||
|  |       "2019-04-15 2022-06-15 2023-10-23\n" | ||||||
|  |      ] | ||||||
|  |     } | ||||||
|  |    ], | ||||||
|  |    "source": [ | ||||||
|  |     "start_date, end_of_features, final_date = df_coverage_modelization(sport, coverage_train = 0.7)\n", | ||||||
|  |     "print(start_date, end_of_features, final_date )" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "markdown", | ||||||
|  |    "id": "34ddc267-4daa-4926-9d54-5b13d4212eaa", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "source": [ | ||||||
|  |     "## Look at common database between Sport companies" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 101, | ||||||
|  |    "id": "389387fa-2046-4811-b8dd-6d524e91fe2e", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [ | ||||||
|  |     { | ||||||
|  |      "data": { | ||||||
|  |       "text/plain": [ | ||||||
|  |        "['bdc2324-data/5',\n", | ||||||
|  |        " 'bdc2324-data/6',\n", | ||||||
|  |        " 'bdc2324-data/7',\n", | ||||||
|  |        " 'bdc2324-data/8',\n", | ||||||
|  |        " 'bdc2324-data/9']" | ||||||
|  |       ] | ||||||
|  |      }, | ||||||
|  |      "execution_count": 101, | ||||||
|  |      "metadata": {}, | ||||||
|  |      "output_type": "execute_result" | ||||||
|  |     } | ||||||
|  |    ], | ||||||
|  |    "source": [ | ||||||
|  |     "companies = fs.ls(BUCKET)\n", | ||||||
|  |     "companies = [company for company in companies if any(company.endswith(end) for end in sport)]\n", | ||||||
|  |     "companies" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 107, | ||||||
|  |    "id": "895fc2b3-c768-454d-bedb-54994e4d211a", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [ | ||||||
|  |     { | ||||||
|  |      "name": "stdout", | ||||||
|  |      "output_type": "stream", | ||||||
|  |      "text": [ | ||||||
|  |       "Number of databases :  30\n", | ||||||
|  |       "Number of common databases :  23\n" | ||||||
|  |      ] | ||||||
|  |     } | ||||||
|  |    ], | ||||||
|  |    "source": [ | ||||||
|  |     "companies_database = {}\n", | ||||||
|  |     "\n", | ||||||
|  |     "for company in companies:\n", | ||||||
|  |     "    companies_database[company.split('/')[-1]] = [file.split('/')[-1].replace(company.split('/')[-1], '') for file in fs.ls(company)] \n", | ||||||
|  |     "\n", | ||||||
|  |     "all_database = companies_database[max(companies_database, key=lambda x: len(companies_database[x]))]\n", | ||||||
|  |     "print(\"Number of databases : \",len(all_database))\n", | ||||||
|  |     "\n", | ||||||
|  |     "data_in_common = set(all_database)\n", | ||||||
|  |     "\n", | ||||||
|  |     "for key in companies_database:\n", | ||||||
|  |     "    diff_database = data_in_common.symmetric_difference(companies_database[key])\n", | ||||||
|  |     "    data_in_common = data_in_common - diff_database\n", | ||||||
|  |     "\n", | ||||||
|  |     "print(\"Number of common databases : \",len(data_in_common))" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 121, | ||||||
|  |    "id": "0c06517d-f5b7-4104-94fa-0e3f843c5881", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [ | ||||||
|  |     { | ||||||
|  |      "data": { | ||||||
|  |       "text/plain": [ | ||||||
|  |        "{'campaign_stats.csv',\n", | ||||||
|  |        " 'campaigns.csv',\n", | ||||||
|  |        " 'categories.csv',\n", | ||||||
|  |        " 'countries.csv',\n", | ||||||
|  |        " 'currencies.csv',\n", | ||||||
|  |        " 'customer_target_mappings.csv',\n", | ||||||
|  |        " 'customersplus.csv',\n", | ||||||
|  |        " 'event_types.csv',\n", | ||||||
|  |        " 'events.csv',\n", | ||||||
|  |        " 'facilities.csv',\n", | ||||||
|  |        " 'link_stats.csv',\n", | ||||||
|  |        " 'pricing_formulas.csv',\n", | ||||||
|  |        " 'product_packs.csv',\n", | ||||||
|  |        " 'products.csv',\n", | ||||||
|  |        " 'products_groups.csv',\n", | ||||||
|  |        " 'purchases.csv',\n", | ||||||
|  |        " 'representation_category_capacities.csv',\n", | ||||||
|  |        " 'representations.csv',\n", | ||||||
|  |        " 'seasons.csv',\n", | ||||||
|  |        " 'suppliers.csv',\n", | ||||||
|  |        " 'target_types.csv',\n", | ||||||
|  |        " 'targets.csv',\n", | ||||||
|  |        " 'tickets.csv'}" | ||||||
|  |       ] | ||||||
|  |      }, | ||||||
|  |      "execution_count": 121, | ||||||
|  |      "metadata": {}, | ||||||
|  |      "output_type": "execute_result" | ||||||
|  |     } | ||||||
|  |    ], | ||||||
|  |    "source": [ | ||||||
|  |     "data_in_common" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "markdown", | ||||||
|  |    "id": "1af245aa-44a7-453b-90f9-0c4bcc415cd0", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "source": [ | ||||||
|  |     "## Investigate errors from data construction for company 6" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 108, | ||||||
|  |    "id": "538a5ca2-a50d-4726-93eb-c2b0d0ab8400", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [], | ||||||
|  |    "source": [ | ||||||
|  |     "directory_path = '6'" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 143, | ||||||
|  |    "id": "1ca3fb71-930a-441c-b35b-b98bca780606", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [ | ||||||
|  |     { | ||||||
|  |      "name": "stdout", | ||||||
|  |      "output_type": "stream", | ||||||
|  |      "text": [ | ||||||
|  |       "File path :  projet-bdc2324-team1/0_Input/Company_6/customerplus_cleaned.csv\n", | ||||||
|  |       "File path :  projet-bdc2324-team1/0_Input/Company_6/campaigns_information.csv\n", | ||||||
|  |       "File path :  projet-bdc2324-team1/0_Input/Company_6/products_purchased_reduced.csv\n" | ||||||
|  |      ] | ||||||
|  |     } | ||||||
|  |    ], | ||||||
|  |    "source": [ | ||||||
|  |     "df_customerplus_clean = display_databases(directory_path, file_name = \"customerplus_cleaned\")\n", | ||||||
|  |     "df_campaigns_information = display_databases(directory_path, file_name = \"campaigns_information\", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])\n", | ||||||
|  |     "df_products_purchased_reduced = display_databases(directory_path, file_name = \"products_purchased_reduced\", datetime_col = ['purchase_date'])" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 144, | ||||||
|  |    "id": "2ad3052c-e9e6-4ef9-abe2-4b8b2306a2b9", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [], | ||||||
|  |    "source": [ | ||||||
|  |     "max_date =  pd.to_datetime(final_date, utc = True, format = 'ISO8601') \n", | ||||||
|  |     "end_features_date = pd.to_datetime(end_of_features, utc = True, format = 'ISO8601')\n", | ||||||
|  |     "min_date = pd.to_datetime(start_date, utc = True, format = 'ISO8601')" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 128, | ||||||
|  |    "id": "146999f2-ab92-4b7c-8c57-2e3ac8c4dd88", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [ | ||||||
|  |     { | ||||||
|  |      "name": "stdout", | ||||||
|  |      "output_type": "stream", | ||||||
|  |      "text": [ | ||||||
|  |       "File path :  projet-bdc2324-team1/0_Input/Company_6/campaigns_information.csv\n" | ||||||
|  |      ] | ||||||
|  |     } | ||||||
|  |    ], | ||||||
|  |    "source": [ | ||||||
|  |     "df_campaigns_information = display_databases(directory_path, file_name = \"campaigns_information\", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 133, | ||||||
|  |    "id": "7448a7b9-3edf-4177-9df2-a260ebbee45e", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [ | ||||||
|  |     { | ||||||
|  |      "data": { | ||||||
|  |       "text/plain": [ | ||||||
|  |        "Timestamp('2022-06-15 00:00:00+0000', tz='UTC')" | ||||||
|  |       ] | ||||||
|  |      }, | ||||||
|  |      "execution_count": 133, | ||||||
|  |      "metadata": {}, | ||||||
|  |      "output_type": "execute_result" | ||||||
|  |     } | ||||||
|  |    ], | ||||||
|  |    "source": [ | ||||||
|  |     "end_features_date" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 136, | ||||||
|  |    "id": "d8e954ab-65d4-4f36-8410-69bf664773a7", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [ | ||||||
|  |     { | ||||||
|  |      "name": "stdout", | ||||||
|  |      "output_type": "stream", | ||||||
|  |      "text": [ | ||||||
|  |       "Shape campaigns_information :  (1333010, 8)\n" | ||||||
|  |      ] | ||||||
|  |     }, | ||||||
|  |     { | ||||||
|  |      "data": { | ||||||
|  |       "text/html": [ | ||||||
|  |        "<div>\n", | ||||||
|  |        "<style scoped>\n", | ||||||
|  |        "    .dataframe tbody tr th:only-of-type {\n", | ||||||
|  |        "        vertical-align: middle;\n", | ||||||
|  |        "    }\n", | ||||||
|  |        "\n", | ||||||
|  |        "    .dataframe tbody tr th {\n", | ||||||
|  |        "        vertical-align: top;\n", | ||||||
|  |        "    }\n", | ||||||
|  |        "\n", | ||||||
|  |        "    .dataframe thead th {\n", | ||||||
|  |        "        text-align: right;\n", | ||||||
|  |        "    }\n", | ||||||
|  |        "</style>\n", | ||||||
|  |        "<table border=\"1\" class=\"dataframe\">\n", | ||||||
|  |        "  <thead>\n", | ||||||
|  |        "    <tr style=\"text-align: right;\">\n", | ||||||
|  |        "      <th></th>\n", | ||||||
|  |        "      <th>id</th>\n", | ||||||
|  |        "      <th>customer_id</th>\n", | ||||||
|  |        "      <th>opened_at</th>\n", | ||||||
|  |        "      <th>sent_at</th>\n", | ||||||
|  |        "      <th>delivered_at</th>\n", | ||||||
|  |        "      <th>campaign_name</th>\n", | ||||||
|  |        "      <th>campaign_service_id</th>\n", | ||||||
|  |        "      <th>campaign_sent_at</th>\n", | ||||||
|  |        "    </tr>\n", | ||||||
|  |        "  </thead>\n", | ||||||
|  |        "  <tbody>\n", | ||||||
|  |        "    <tr>\n", | ||||||
|  |        "      <th>0</th>\n", | ||||||
|  |        "      <td>1</td>\n", | ||||||
|  |        "      <td>38</td>\n", | ||||||
|  |        "      <td>NaT</td>\n", | ||||||
|  |        "      <td>2022-08-02 18:31:33+00:00</td>\n", | ||||||
|  |        "      <td>NaN</td>\n", | ||||||
|  |        "      <td>Adhérents non ré-engagés</td>\n", | ||||||
|  |        "      <td>15</td>\n", | ||||||
|  |        "      <td>2022-08-02 18:31:36+00:00</td>\n", | ||||||
|  |        "    </tr>\n", | ||||||
|  |        "    <tr>\n", | ||||||
|  |        "      <th>1</th>\n", | ||||||
|  |        "      <td>2</td>\n", | ||||||
|  |        "      <td>26135</td>\n", | ||||||
|  |        "      <td>NaT</td>\n", | ||||||
|  |        "      <td>2022-08-02 18:31:34+00:00</td>\n", | ||||||
|  |        "      <td>NaN</td>\n", | ||||||
|  |        "      <td>Adhérents non ré-engagés</td>\n", | ||||||
|  |        "      <td>15</td>\n", | ||||||
|  |        "      <td>2022-08-02 18:31:36+00:00</td>\n", | ||||||
|  |        "    </tr>\n", | ||||||
|  |        "    <tr>\n", | ||||||
|  |        "      <th>2</th>\n", | ||||||
|  |        "      <td>3</td>\n", | ||||||
|  |        "      <td>3876</td>\n", | ||||||
|  |        "      <td>NaT</td>\n", | ||||||
|  |        "      <td>2022-08-02 18:31:35+00:00</td>\n", | ||||||
|  |        "      <td>NaN</td>\n", | ||||||
|  |        "      <td>Adhérents non ré-engagés</td>\n", | ||||||
|  |        "      <td>15</td>\n", | ||||||
|  |        "      <td>2022-08-02 18:31:36+00:00</td>\n", | ||||||
|  |        "    </tr>\n", | ||||||
|  |        "    <tr>\n", | ||||||
|  |        "      <th>3</th>\n", | ||||||
|  |        "      <td>4</td>\n", | ||||||
|  |        "      <td>26226</td>\n", | ||||||
|  |        "      <td>NaT</td>\n", | ||||||
|  |        "      <td>2022-08-02 18:31:35+00:00</td>\n", | ||||||
|  |        "      <td>NaN</td>\n", | ||||||
|  |        "      <td>Adhérents non ré-engagés</td>\n", | ||||||
|  |        "      <td>15</td>\n", | ||||||
|  |        "      <td>2022-08-02 18:31:36+00:00</td>\n", | ||||||
|  |        "    </tr>\n", | ||||||
|  |        "    <tr>\n", | ||||||
|  |        "      <th>4</th>\n", | ||||||
|  |        "      <td>5</td>\n", | ||||||
|  |        "      <td>25349</td>\n", | ||||||
|  |        "      <td>NaT</td>\n", | ||||||
|  |        "      <td>2022-08-02 18:31:34+00:00</td>\n", | ||||||
|  |        "      <td>NaN</td>\n", | ||||||
|  |        "      <td>Adhérents non ré-engagés</td>\n", | ||||||
|  |        "      <td>15</td>\n", | ||||||
|  |        "      <td>2022-08-02 18:31:36+00:00</td>\n", | ||||||
|  |        "    </tr>\n", | ||||||
|  |        "  </tbody>\n", | ||||||
|  |        "</table>\n", | ||||||
|  |        "</div>" | ||||||
|  |       ], | ||||||
|  |       "text/plain": [ | ||||||
|  |        "   id  customer_id opened_at                   sent_at delivered_at  \\\n", | ||||||
|  |        "0   1           38       NaT 2022-08-02 18:31:33+00:00          NaN   \n", | ||||||
|  |        "1   2        26135       NaT 2022-08-02 18:31:34+00:00          NaN   \n", | ||||||
|  |        "2   3         3876       NaT 2022-08-02 18:31:35+00:00          NaN   \n", | ||||||
|  |        "3   4        26226       NaT 2022-08-02 18:31:35+00:00          NaN   \n", | ||||||
|  |        "4   5        25349       NaT 2022-08-02 18:31:34+00:00          NaN   \n", | ||||||
|  |        "\n", | ||||||
|  |        "              campaign_name  campaign_service_id          campaign_sent_at  \n", | ||||||
|  |        "0  Adhérents non ré-engagés                   15 2022-08-02 18:31:36+00:00  \n", | ||||||
|  |        "1  Adhérents non ré-engagés                   15 2022-08-02 18:31:36+00:00  \n", | ||||||
|  |        "2  Adhérents non ré-engagés                   15 2022-08-02 18:31:36+00:00  \n", | ||||||
|  |        "3  Adhérents non ré-engagés                   15 2022-08-02 18:31:36+00:00  \n", | ||||||
|  |        "4  Adhérents non ré-engagés                   15 2022-08-02 18:31:36+00:00  " | ||||||
|  |       ] | ||||||
|  |      }, | ||||||
|  |      "execution_count": 136, | ||||||
|  |      "metadata": {}, | ||||||
|  |      "output_type": "execute_result" | ||||||
|  |     } | ||||||
|  |    ], | ||||||
|  |    "source": [ | ||||||
|  |     "print(\"Shape campaigns_information : \", df_campaigns_information.shape)\n", | ||||||
|  |     "df_campaigns_information.head()" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 134, | ||||||
|  |    "id": "93eceaf1-ce4c-4dfa-9c51-4fd016d09fc5", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [ | ||||||
|  |     { | ||||||
|  |      "data": { | ||||||
|  |       "text/plain": [ | ||||||
|  |        "Timestamp('2022-08-02 18:31:33+0000', tz='UTC')" | ||||||
|  |       ] | ||||||
|  |      }, | ||||||
|  |      "execution_count": 134, | ||||||
|  |      "metadata": {}, | ||||||
|  |      "output_type": "execute_result" | ||||||
|  |     } | ||||||
|  |    ], | ||||||
|  |    "source": [ | ||||||
|  |     "df_campaigns_information['sent_at'].min()" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 137, | ||||||
|  |    "id": "ea50cab4-1dae-4efe-ae3c-22b6f9ad1d26", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [ | ||||||
|  |     { | ||||||
|  |      "data": { | ||||||
|  |       "text/plain": [ | ||||||
|  |        "Timestamp('2023-11-07 10:08:16+0000', tz='UTC')" | ||||||
|  |       ] | ||||||
|  |      }, | ||||||
|  |      "execution_count": 137, | ||||||
|  |      "metadata": {}, | ||||||
|  |      "output_type": "execute_result" | ||||||
|  |     } | ||||||
|  |    ], | ||||||
|  |    "source": [ | ||||||
|  |     "df_campaigns_information['sent_at'].max()" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 127, | ||||||
|  |    "id": "dcb87bc9-caf5-4655-9cfa-4a3dad504bac", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [ | ||||||
|  |     { | ||||||
|  |      "data": { | ||||||
|  |       "text/html": [ | ||||||
|  |        "<div>\n", | ||||||
|  |        "<style scoped>\n", | ||||||
|  |        "    .dataframe tbody tr th:only-of-type {\n", | ||||||
|  |        "        vertical-align: middle;\n", | ||||||
|  |        "    }\n", | ||||||
|  |        "\n", | ||||||
|  |        "    .dataframe tbody tr th {\n", | ||||||
|  |        "        vertical-align: top;\n", | ||||||
|  |        "    }\n", | ||||||
|  |        "\n", | ||||||
|  |        "    .dataframe thead th {\n", | ||||||
|  |        "        text-align: right;\n", | ||||||
|  |        "    }\n", | ||||||
|  |        "</style>\n", | ||||||
|  |        "<table border=\"1\" class=\"dataframe\">\n", | ||||||
|  |        "  <thead>\n", | ||||||
|  |        "    <tr style=\"text-align: right;\">\n", | ||||||
|  |        "      <th></th>\n", | ||||||
|  |        "      <th>id</th>\n", | ||||||
|  |        "      <th>customer_id</th>\n", | ||||||
|  |        "      <th>opened_at</th>\n", | ||||||
|  |        "      <th>sent_at</th>\n", | ||||||
|  |        "      <th>delivered_at</th>\n", | ||||||
|  |        "      <th>campaign_name</th>\n", | ||||||
|  |        "      <th>campaign_service_id</th>\n", | ||||||
|  |        "      <th>campaign_sent_at</th>\n", | ||||||
|  |        "    </tr>\n", | ||||||
|  |        "  </thead>\n", | ||||||
|  |        "  <tbody>\n", | ||||||
|  |        "  </tbody>\n", | ||||||
|  |        "</table>\n", | ||||||
|  |        "</div>" | ||||||
|  |       ], | ||||||
|  |       "text/plain": [ | ||||||
|  |        "Empty DataFrame\n", | ||||||
|  |        "Columns: [id, customer_id, opened_at, sent_at, delivered_at, campaign_name, campaign_service_id, campaign_sent_at]\n", | ||||||
|  |        "Index: []" | ||||||
|  |       ] | ||||||
|  |      }, | ||||||
|  |      "execution_count": 127, | ||||||
|  |      "metadata": {}, | ||||||
|  |      "output_type": "execute_result" | ||||||
|  |     } | ||||||
|  |    ], | ||||||
|  |    "source": [ | ||||||
|  |     "#Filtre de la base df_campaigns_information\n", | ||||||
|  |     "df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] <= end_features_date) & (df_campaigns_information['sent_at'] >= min_date)]\n", | ||||||
|  |     "df_campaigns_information" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 145, | ||||||
|  |    "id": "abe22e09-a041-4349-be8f-b0784f2f0a98", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [ | ||||||
|  |     { | ||||||
|  |      "data": { | ||||||
|  |       "text/html": [ | ||||||
|  |        "<div>\n", | ||||||
|  |        "<style scoped>\n", | ||||||
|  |        "    .dataframe tbody tr th:only-of-type {\n", | ||||||
|  |        "        vertical-align: middle;\n", | ||||||
|  |        "    }\n", | ||||||
|  |        "\n", | ||||||
|  |        "    .dataframe tbody tr th {\n", | ||||||
|  |        "        vertical-align: top;\n", | ||||||
|  |        "    }\n", | ||||||
|  |        "\n", | ||||||
|  |        "    .dataframe thead th {\n", | ||||||
|  |        "        text-align: right;\n", | ||||||
|  |        "    }\n", | ||||||
|  |        "</style>\n", | ||||||
|  |        "<table border=\"1\" class=\"dataframe\">\n", | ||||||
|  |        "  <thead>\n", | ||||||
|  |        "    <tr style=\"text-align: right;\">\n", | ||||||
|  |        "      <th></th>\n", | ||||||
|  |        "      <th>ticket_id</th>\n", | ||||||
|  |        "      <th>customer_id</th>\n", | ||||||
|  |        "      <th>purchase_id</th>\n", | ||||||
|  |        "      <th>event_type_id</th>\n", | ||||||
|  |        "      <th>supplier_name</th>\n", | ||||||
|  |        "      <th>purchase_date</th>\n", | ||||||
|  |        "      <th>amount</th>\n", | ||||||
|  |        "      <th>is_full_price</th>\n", | ||||||
|  |        "      <th>name_event_types</th>\n", | ||||||
|  |        "      <th>name_facilities</th>\n", | ||||||
|  |        "      <th>name_categories</th>\n", | ||||||
|  |        "      <th>name_events</th>\n", | ||||||
|  |        "      <th>name_seasons</th>\n", | ||||||
|  |        "    </tr>\n", | ||||||
|  |        "  </thead>\n", | ||||||
|  |        "  <tbody>\n", | ||||||
|  |        "    <tr>\n", | ||||||
|  |        "      <th>49</th>\n", | ||||||
|  |        "      <td>91401</td>\n", | ||||||
|  |        "      <td>108392</td>\n", | ||||||
|  |        "      <td>1259025.0</td>\n", | ||||||
|  |        "      <td>4</td>\n", | ||||||
|  |        "      <td>caisse</td>\n", | ||||||
|  |        "      <td>2022-02-27 13:44:10.690000+00:00</td>\n", | ||||||
|  |        "      <td>0.0</td>\n", | ||||||
|  |        "      <td>False</td>\n", | ||||||
|  |        "      <td>ligue 1 uber eats</td>\n", | ||||||
|  |        "      <td>stade de l'aube</td>\n", | ||||||
|  |        "      <td>honneur basse</td>\n", | ||||||
|  |        "      <td>olympique de marseille</td>\n", | ||||||
|  |        "      <td>saison 2021-2022</td>\n", | ||||||
|  |        "    </tr>\n", | ||||||
|  |        "    <tr>\n", | ||||||
|  |        "      <th>117</th>\n", | ||||||
|  |        "      <td>535527</td>\n", | ||||||
|  |        "      <td>31304</td>\n", | ||||||
|  |        "      <td>136629.0</td>\n", | ||||||
|  |        "      <td>4</td>\n", | ||||||
|  |        "      <td>adhésion</td>\n", | ||||||
|  |        "      <td>2022-04-28 15:47:52.790000+00:00</td>\n", | ||||||
|  |        "      <td>0.0</td>\n", | ||||||
|  |        "      <td>False</td>\n", | ||||||
|  |        "      <td>ligue 1 uber eats</td>\n", | ||||||
|  |        "      <td>stade de l'aube</td>\n", | ||||||
|  |        "      <td>honneur basse</td>\n", | ||||||
|  |        "      <td>ac ajaccio</td>\n", | ||||||
|  |        "      <td>saison 2022-2023</td>\n", | ||||||
|  |        "    </tr>\n", | ||||||
|  |        "    <tr>\n", | ||||||
|  |        "      <th>274</th>\n", | ||||||
|  |        "      <td>547400</td>\n", | ||||||
|  |        "      <td>192</td>\n", | ||||||
|  |        "      <td>140477.0</td>\n", | ||||||
|  |        "      <td>4</td>\n", | ||||||
|  |        "      <td>adhésion</td>\n", | ||||||
|  |        "      <td>2022-04-28 15:47:54.053000+00:00</td>\n", | ||||||
|  |        "      <td>0.0</td>\n", | ||||||
|  |        "      <td>False</td>\n", | ||||||
|  |        "      <td>ligue 1 uber eats</td>\n", | ||||||
|  |        "      <td>stade de l'aube</td>\n", | ||||||
|  |        "      <td>honneur basse</td>\n", | ||||||
|  |        "      <td>rc strasbourg</td>\n", | ||||||
|  |        "      <td>saison 2022-2023</td>\n", | ||||||
|  |        "    </tr>\n", | ||||||
|  |        "    <tr>\n", | ||||||
|  |        "      <th>304</th>\n", | ||||||
|  |        "      <td>84413</td>\n", | ||||||
|  |        "      <td>31388</td>\n", | ||||||
|  |        "      <td>20259.0</td>\n", | ||||||
|  |        "      <td>4</td>\n", | ||||||
|  |        "      <td>adhésion</td>\n", | ||||||
|  |        "      <td>2021-08-03 13:45:01.603000+00:00</td>\n", | ||||||
|  |        "      <td>0.0</td>\n", | ||||||
|  |        "      <td>False</td>\n", | ||||||
|  |        "      <td>ligue 1 uber eats</td>\n", | ||||||
|  |        "      <td>stade de l'aube</td>\n", | ||||||
|  |        "      <td>vitoux haute</td>\n", | ||||||
|  |        "      <td>olympique de marseille</td>\n", | ||||||
|  |        "      <td>saison 2021-2022</td>\n", | ||||||
|  |        "    </tr>\n", | ||||||
|  |        "    <tr>\n", | ||||||
|  |        "      <th>311</th>\n", | ||||||
|  |        "      <td>407271</td>\n", | ||||||
|  |        "      <td>3265</td>\n", | ||||||
|  |        "      <td>90527.0</td>\n", | ||||||
|  |        "      <td>4</td>\n", | ||||||
|  |        "      <td>web [adhésion]</td>\n", | ||||||
|  |        "      <td>2022-05-26 09:15:40.993000+00:00</td>\n", | ||||||
|  |        "      <td>0.0</td>\n", | ||||||
|  |        "      <td>False</td>\n", | ||||||
|  |        "      <td>ligue 1 uber eats</td>\n", | ||||||
|  |        "      <td>stade de l'aube</td>\n", | ||||||
|  |        "      <td>champagne basse</td>\n", | ||||||
|  |        "      <td>stade brestois 29</td>\n", | ||||||
|  |        "      <td>saison 2022-2023</td>\n", | ||||||
|  |        "    </tr>\n", | ||||||
|  |        "  </tbody>\n", | ||||||
|  |        "</table>\n", | ||||||
|  |        "</div>" | ||||||
|  |       ], | ||||||
|  |       "text/plain": [ | ||||||
|  |        "     ticket_id  customer_id  purchase_id  event_type_id   supplier_name  \\\n", | ||||||
|  |        "49       91401       108392    1259025.0              4          caisse   \n", | ||||||
|  |        "117     535527        31304     136629.0              4        adhésion   \n", | ||||||
|  |        "274     547400          192     140477.0              4        adhésion   \n", | ||||||
|  |        "304      84413        31388      20259.0              4        adhésion   \n", | ||||||
|  |        "311     407271         3265      90527.0              4  web [adhésion]   \n", | ||||||
|  |        "\n", | ||||||
|  |        "                       purchase_date  amount  is_full_price  \\\n", | ||||||
|  |        "49  2022-02-27 13:44:10.690000+00:00     0.0          False   \n", | ||||||
|  |        "117 2022-04-28 15:47:52.790000+00:00     0.0          False   \n", | ||||||
|  |        "274 2022-04-28 15:47:54.053000+00:00     0.0          False   \n", | ||||||
|  |        "304 2021-08-03 13:45:01.603000+00:00     0.0          False   \n", | ||||||
|  |        "311 2022-05-26 09:15:40.993000+00:00     0.0          False   \n", | ||||||
|  |        "\n", | ||||||
|  |        "      name_event_types  name_facilities  name_categories  \\\n", | ||||||
|  |        "49   ligue 1 uber eats  stade de l'aube    honneur basse   \n", | ||||||
|  |        "117  ligue 1 uber eats  stade de l'aube    honneur basse   \n", | ||||||
|  |        "274  ligue 1 uber eats  stade de l'aube    honneur basse   \n", | ||||||
|  |        "304  ligue 1 uber eats  stade de l'aube     vitoux haute   \n", | ||||||
|  |        "311  ligue 1 uber eats  stade de l'aube  champagne basse   \n", | ||||||
|  |        "\n", | ||||||
|  |        "                name_events      name_seasons  \n", | ||||||
|  |        "49   olympique de marseille  saison 2021-2022  \n", | ||||||
|  |        "117              ac ajaccio  saison 2022-2023  \n", | ||||||
|  |        "274           rc strasbourg  saison 2022-2023  \n", | ||||||
|  |        "304  olympique de marseille  saison 2021-2022  \n", | ||||||
|  |        "311       stade brestois 29  saison 2022-2023  " | ||||||
|  |       ] | ||||||
|  |      }, | ||||||
|  |      "execution_count": 145, | ||||||
|  |      "metadata": {}, | ||||||
|  |      "output_type": "execute_result" | ||||||
|  |     } | ||||||
|  |    ], | ||||||
|  |    "source": [ | ||||||
|  |     "#Filtre de la base df_products_purchased_reduced\n", | ||||||
|  |     "df_products_purchased_reduced = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]\n", | ||||||
|  |     "df_products_purchased_reduced.head()" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 150, | ||||||
|  |    "id": "ae7ef3a6-5b42-4a3c-a108-fec9f2ec4d32", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [ | ||||||
|  |     { | ||||||
|  |      "data": { | ||||||
|  |       "text/plain": [ | ||||||
|  |        "array(['caisse', 'adhésion', 'web [adhésion]', 'web [grand public]',\n", | ||||||
|  |        "       'itr ticketmaster', 'itr fnac', nan, 'decathlon', 'boutique web',\n", | ||||||
|  |        "       'boutique officielle'], dtype=object)" | ||||||
|  |       ] | ||||||
|  |      }, | ||||||
|  |      "execution_count": 150, | ||||||
|  |      "metadata": {}, | ||||||
|  |      "output_type": "execute_result" | ||||||
|  |     } | ||||||
|  |    ], | ||||||
|  |    "source": [ | ||||||
|  |     "df_products_purchased_reduced[\"supplier_name\"].unique()" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": 151, | ||||||
|  |    "id": "942f58a5-8ed4-4b18-a7a2-bd296447fa6a", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [], | ||||||
|  |    "source": [ | ||||||
|  |     "# KPI sur le comportement d'achat\n", | ||||||
|  |     "tickets_information_copy = df_products_purchased_reduced.copy()\n", | ||||||
|  |     "# Dummy : Canal de vente en ligne\n", | ||||||
|  |     "liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online'] # vad = vente à distance\n", | ||||||
|  |     "tickets_information_copy['vente_internet'] = tickets_information_copy['supplier_name'].fillna('').str.contains('|'.join(liste_mots), case=False).astype(int)" | ||||||
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "markdown", | ||||||
|  |    "id": "658b57cd-4fb8-4552-a582-972144b2af1c", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "source": [ | ||||||
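|  |     "Fix: `tickets_information_copy['vente_internet']` is now computed after filling missing `supplier_name` values (NaN) with an empty string, so the keyword match handles rows without a supplier." | ||||||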
|  |    ] | ||||||
|  |   }, | ||||||
|  |   { | ||||||
|  |    "cell_type": "code", | ||||||
|  |    "execution_count": null, | ||||||
|  |    "id": "f086a8dc-69ab-4cf3-b25e-379d7da02f43", | ||||||
|  |    "metadata": {}, | ||||||
|  |    "outputs": [], | ||||||
|  |    "source": [] | ||||||
|  |   } | ||||||
|  |  ], | ||||||
|  |  "metadata": { | ||||||
|  |   "kernelspec": { | ||||||
|  |    "display_name": "Python 3 (ipykernel)", | ||||||
|  |    "language": "python", | ||||||
|  |    "name": "python3" | ||||||
|  |   }, | ||||||
|  |   "language_info": { | ||||||
|  |    "codemirror_mode": { | ||||||
|  |     "name": "ipython", | ||||||
|  |     "version": 3 | ||||||
|  |    }, | ||||||
|  |    "file_extension": ".py", | ||||||
|  |    "mimetype": "text/x-python", | ||||||
|  |    "name": "python", | ||||||
|  |    "nbconvert_exporter": "python", | ||||||
|  |    "pygments_lexer": "ipython3", | ||||||
|  |    "version": "3.11.6" | ||||||
|  |   } | ||||||
|  |  }, | ||||||
|  |  "nbformat": 4, | ||||||
|  |  "nbformat_minor": 5 | ||||||
|  | } | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user