From 71c5d86679fcd2991af92243151031ce5a878524 Mon Sep 17 00:00:00 2001 From: arevelle-ensae Date: Thu, 22 Feb 2024 14:56:00 +0000 Subject: [PATCH 1/3] handle na for supplier --- 0_KPI_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/0_KPI_functions.py b/0_KPI_functions.py index 5ba11b9..6d0fcbb 100644 --- a/0_KPI_functions.py +++ b/0_KPI_functions.py @@ -46,7 +46,7 @@ def tickets_kpi_function(tickets_information = None): # Dummy : Canal de vente en ligne liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online'] # vad = vente à distance - tickets_information_copy['vente_internet'] = tickets_information_copy['supplier_name'].str.contains('|'.join(liste_mots), case=False).astype(int) + tickets_information_copy['vente_internet'] = tickets_information_copy['supplier_name'].fillna('').str.contains('|'.join(liste_mots), case=False).astype(int) # Proportion de vente en ligne prop_vente_internet = tickets_information_copy[tickets_information_copy['vente_internet'] == 1].groupby(['customer_id'])['ticket_id'].count().reset_index() From 44fef6d6189856fb75f1ab394c5f48f454d16984 Mon Sep 17 00:00:00 2001 From: arevelle-ensae Date: Thu, 22 Feb 2024 14:56:54 +0000 Subject: [PATCH 2/3] investigate sport companies --- Sport/exploration_sport.ipynb | 854 ++++++++++++++++++++++++++++++++++ 1 file changed, 854 insertions(+) create mode 100644 Sport/exploration_sport.ipynb diff --git a/Sport/exploration_sport.ipynb b/Sport/exploration_sport.ipynb new file mode 100644 index 0000000..e28c5f2 --- /dev/null +++ b/Sport/exploration_sport.ipynb @@ -0,0 +1,854 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 114, + "id": "314bf34b-1f6d-4a99-8f82-aa71ebacdabc", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import os\n", + "import s3fs\n", + "import warnings\n", + "from datetime import date, timedelta, datetime\n", + "import numpy as np\n", + "\n", + "exec(open('../0_KPI_functions.py').read())" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "a276822a-c389-429e-b249-8a9e47758bfc", + "metadata": {}, + "outputs": [], + "source": [ + "# Ignore warning\n", + "warnings.filterwarnings('ignore')" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "f62b996c-4e17-40ea-83ba-f0cb60be7671", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['bdc2324-data/1',\n", + " 'bdc2324-data/10',\n", + " 'bdc2324-data/101',\n", + " 'bdc2324-data/11',\n", + " 'bdc2324-data/12',\n", + " 'bdc2324-data/13',\n", + " 'bdc2324-data/14',\n", + " 'bdc2324-data/2',\n", + " 'bdc2324-data/3',\n", + " 'bdc2324-data/4',\n", + " 'bdc2324-data/5',\n", + " 'bdc2324-data/6',\n", + " 'bdc2324-data/7',\n", + " 'bdc2324-data/8',\n", + " 'bdc2324-data/9']" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Create filesystem object\n", + "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", + "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n", + "\n", + "BUCKET = \"bdc2324-data\"\n", + "fs.ls(BUCKET)" + ] + }, + { + "cell_type": "markdown", + "id": "2c829aa8-2006-4e72-889b-7096dd55718b", + "metadata": {}, + "source": [ + "## Look at the time sequence of each company and compute inter time coverage" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "e86864b7-4852-449a-8680-638559d56080", + "metadata": {}, + "outputs": [], + "source": [ + "sport = ['5', '6', '7', '8', '9']" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "id": "7634ec57-4891-4684-8638-1e1643baca28", + "metadata": {}, + "outputs": [], + "source": [ + "def display_covering_time(df, company, datecover):\n", + " \"\"\"\n", + " This function draws the time coverage of each company\n", + " \"\"\"\n", + " min_date = df['purchase_date'].min().strftime(\"%Y-%m-%d\")\n", + " max_date = df['purchase_date'].max().strftime(\"%Y-%m-%d\")\n", + " datecover[company] = [datetime.strptime(min_date, \"%Y-%m-%d\") + timedelta(days=x) for x in range((datetime.strptime(max_date, \"%Y-%m-%d\") - datetime.strptime(min_date, \"%Y-%m-%d\")).days)]\n", + " print(f'Couverture Company {company} : {min_date} - {max_date}')\n", + " return datecover" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "id": "53c83f51-822c-4e05-8c7c-89aa327603c6", + "metadata": {}, + "outputs": [], + "source": [ + "def compute_time_intersection(datecover):\n", + " timestamps_sets = [set(timestamps) for timestamps in datecover.values()]\n", + " intersection = set.intersection(*timestamps_sets)\n", + " intersection_list = list(intersection)\n", + " formated_dates = [dt.strftime(\"%Y-%m-%d\") for dt in intersection_list]\n", + " return sorted(formated_dates)" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "id": "eec152de-078e-44c4-ad6e-74ae6ba5c65a", + "metadata": {}, + "outputs": [], + "source": [ + "def df_coverage_modelization(sport, coverage_train = 0.7):\n", + " \"\"\"\n", + " This function returns start_date, end_of_features and final dates\n", + " that help to construct train and test datasets\n", + " \"\"\"\n", + " datecover = {}\n", + " for company in sport:\n", + " df_products_purchased_reduced = display_databases(company, file_name = \"products_purchased_reduced\",\n", + " datetime_col = ['purchase_date'])\n", + " datecover = display_covering_time(df_products_purchased_reduced, company, datecover)\n", + " #print(datecover.keys())\n", + " dt_coverage = compute_time_intersection(datecover)\n", + " start_date = dt_coverage[0]\n", + " end_of_features = dt_coverage[int(0.7 * len(dt_coverage))]\n", + " final_date = dt_coverage[-1]\n", + " return start_date, end_of_features, final_date\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "id": "348f246a-bc2d-4bbc-ba05-aa825da15a69", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File path : projet-bdc2324-team1/0_Input/Company_5/products_purchased_reduced.csv\n", + "Couverture Company 5 : 2019-04-15 - 2023-11-09\n", + "File path : projet-bdc2324-team1/0_Input/Company_6/products_purchased_reduced.csv\n", + "Couverture Company 6 : 2018-06-28 - 2023-11-08\n", + "File path : projet-bdc2324-team1/0_Input/Company_7/products_purchased_reduced.csv\n", + "Couverture Company 7 : 2015-02-10 - 2023-11-08\n", + "File path : projet-bdc2324-team1/0_Input/Company_8/products_purchased_reduced.csv\n", + "Couverture Company 8 : 2010-09-28 - 2023-11-08\n", + "File path : projet-bdc2324-team1/0_Input/Company_9/products_purchased_reduced.csv\n", + "Couverture Company 9 : 2014-09-22 - 2023-10-24\n", + "dict_keys(['5', '6', '7', '8', '9'])\n", + "2019-04-15 2022-06-15 2023-10-23\n" + ] + } + ], + "source": [ + "start_date, end_of_features, final_date = df_coverage_modelization(sport, coverage_train = 0.7)\n", + "print(start_date, end_of_features, final_date )" + ] + }, + { + "cell_type": "markdown", + "id": "34ddc267-4daa-4926-9d54-5b13d4212eaa", + "metadata": {}, + "source": [ + "## Look at common database between Sport companies" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "id": "389387fa-2046-4811-b8dd-6d524e91fe2e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['bdc2324-data/5',\n", + " 'bdc2324-data/6',\n", + " 'bdc2324-data/7',\n", + " 'bdc2324-data/8',\n", + " 'bdc2324-data/9']" + ] + }, + "execution_count": 101, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "companies = fs.ls(BUCKET)\n", + "companies = [company for company in companies if any(company.endswith(end) for end in sport)]\n", + "companies" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "id": "895fc2b3-c768-454d-bedb-54994e4d211a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of databases : 30\n", + "Number of common databases : 23\n" + ] + } + ], + "source": [ + "companies_database = {}\n", + "\n", + "for company in companies:\n", + " companies_database[company.split('/')[-1]] = [file.split('/')[-1].replace(company.split('/')[-1], '') for file in fs.ls(company)] \n", + "\n", + "all_database = companies_database[max(companies_database, key=lambda x: len(companies_database[x]))]\n", + "print(\"Number of databases : \",len(all_database))\n", + "\n", + "data_in_common = set(all_database)\n", + "\n", + "for key in companies_database:\n", + " diff_database = data_in_common.symmetric_difference(companies_database[key])\n", + " data_in_common = data_in_common - diff_database\n", + "\n", + "print(\"Number of common databases : \",len(data_in_common))" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "id": "0c06517d-f5b7-4104-94fa-0e3f843c5881", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'campaign_stats.csv',\n", + " 'campaigns.csv',\n", + " 'categories.csv',\n", + " 'countries.csv',\n", + " 'currencies.csv',\n", + " 'customer_target_mappings.csv',\n", + " 'customersplus.csv',\n", + " 'event_types.csv',\n", + " 'events.csv',\n", + " 'facilities.csv',\n", + " 'link_stats.csv',\n", + " 'pricing_formulas.csv',\n", + " 'product_packs.csv',\n", + " 'products.csv',\n", + " 'products_groups.csv',\n", + " 'purchases.csv',\n", + " 'representation_category_capacities.csv',\n", + " 'representations.csv',\n", + " 'seasons.csv',\n", + " 'suppliers.csv',\n", + " 'target_types.csv',\n", + " 'targets.csv',\n", + " 'tickets.csv'}" + ] + }, + "execution_count": 121, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_in_common" + ] + }, + { + "cell_type": "markdown", + "id": "1af245aa-44a7-453b-90f9-0c4bcc415cd0", + "metadata": {}, + "source": [ + "## Investigate errors from data construction for company 6" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "id": "538a5ca2-a50d-4726-93eb-c2b0d0ab8400", + "metadata": {}, + "outputs": [], + "source": [ + "directory_path = '6'" + ] + }, + { + "cell_type": "code", + "execution_count": 143, + "id": "1ca3fb71-930a-441c-b35b-b98bca780606", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File path : projet-bdc2324-team1/0_Input/Company_6/customerplus_cleaned.csv\n", + "File path : projet-bdc2324-team1/0_Input/Company_6/campaigns_information.csv\n", + "File path : projet-bdc2324-team1/0_Input/Company_6/products_purchased_reduced.csv\n" + ] + } + ], + "source": [ + "df_customerplus_clean = display_databases(directory_path, file_name = \"customerplus_cleaned\")\n", + "df_campaigns_information = display_databases(directory_path, file_name = \"campaigns_information\", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])\n", + "df_products_purchased_reduced = display_databases(directory_path, file_name = \"products_purchased_reduced\", datetime_col = ['purchase_date'])" + ] + }, + { + "cell_type": "code", + "execution_count": 144, + "id": "2ad3052c-e9e6-4ef9-abe2-4b8b2306a2b9", + "metadata": {}, + "outputs": [], + "source": [ + "max_date = pd.to_datetime(final_date, utc = True, format = 'ISO8601') \n", + "end_features_date = pd.to_datetime(end_of_features, utc = True, format = 'ISO8601')\n", + "min_date = pd.to_datetime(start_date, utc = True, format = 'ISO8601')" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "id": "146999f2-ab92-4b7c-8c57-2e3ac8c4dd88", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File path : projet-bdc2324-team1/0_Input/Company_6/campaigns_information.csv\n" + ] + } + ], + "source": [ + "df_campaigns_information = display_databases(directory_path, file_name = \"campaigns_information\", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])" + ] + }, + { + "cell_type": "code", + "execution_count": 133, + "id": "7448a7b9-3edf-4177-9df2-a260ebbee45e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Timestamp('2022-06-15 00:00:00+0000', tz='UTC')" + ] + }, + "execution_count": 133, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "end_features_date" + ] + }, + { + "cell_type": "code", + "execution_count": 136, + "id": "d8e954ab-65d4-4f36-8410-69bf664773a7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Shape campaigns_information : (1333010, 8)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idcustomer_idopened_atsent_atdelivered_atcampaign_namecampaign_service_idcampaign_sent_at
0138NaT2022-08-02 18:31:33+00:00NaNAdhérents non ré-engagés152022-08-02 18:31:36+00:00
1226135NaT2022-08-02 18:31:34+00:00NaNAdhérents non ré-engagés152022-08-02 18:31:36+00:00
233876NaT2022-08-02 18:31:35+00:00NaNAdhérents non ré-engagés152022-08-02 18:31:36+00:00
3426226NaT2022-08-02 18:31:35+00:00NaNAdhérents non ré-engagés152022-08-02 18:31:36+00:00
4525349NaT2022-08-02 18:31:34+00:00NaNAdhérents non ré-engagés152022-08-02 18:31:36+00:00
\n", + "
" + ], + "text/plain": [ + " id customer_id opened_at sent_at delivered_at \\\n", + "0 1 38 NaT 2022-08-02 18:31:33+00:00 NaN \n", + "1 2 26135 NaT 2022-08-02 18:31:34+00:00 NaN \n", + "2 3 3876 NaT 2022-08-02 18:31:35+00:00 NaN \n", + "3 4 26226 NaT 2022-08-02 18:31:35+00:00 NaN \n", + "4 5 25349 NaT 2022-08-02 18:31:34+00:00 NaN \n", + "\n", + " campaign_name campaign_service_id campaign_sent_at \n", + "0 Adhérents non ré-engagés 15 2022-08-02 18:31:36+00:00 \n", + "1 Adhérents non ré-engagés 15 2022-08-02 18:31:36+00:00 \n", + "2 Adhérents non ré-engagés 15 2022-08-02 18:31:36+00:00 \n", + "3 Adhérents non ré-engagés 15 2022-08-02 18:31:36+00:00 \n", + "4 Adhérents non ré-engagés 15 2022-08-02 18:31:36+00:00 " + ] + }, + "execution_count": 136, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(\"Shape campaigns_information : \", df_campaigns_information.shape)\n", + "df_campaigns_information.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 134, + "id": "93eceaf1-ce4c-4dfa-9c51-4fd016d09fc5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Timestamp('2022-08-02 18:31:33+0000', tz='UTC')" + ] + }, + "execution_count": 134, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_campaigns_information['sent_at'].min()" + ] + }, + { + "cell_type": "code", + "execution_count": 137, + "id": "ea50cab4-1dae-4efe-ae3c-22b6f9ad1d26", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Timestamp('2023-11-07 10:08:16+0000', tz='UTC')" + ] + }, + "execution_count": 137, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_campaigns_information['sent_at'].max()" + ] + }, + { + "cell_type": "code", + "execution_count": 127, + "id": "dcb87bc9-caf5-4655-9cfa-4a3dad504bac", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idcustomer_idopened_atsent_atdelivered_atcampaign_namecampaign_service_idcampaign_sent_at
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [id, customer_id, opened_at, sent_at, delivered_at, campaign_name, campaign_service_id, campaign_sent_at]\n", + "Index: []" + ] + }, + "execution_count": 127, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Filtre de la base df_campaigns_information\n", + "df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] <= end_features_date) & (df_campaigns_information['sent_at'] >= min_date)]\n", + "df_campaigns_information" + ] + }, + { + "cell_type": "code", + "execution_count": 145, + "id": "abe22e09-a041-4349-be8f-b0784f2f0a98", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ticket_idcustomer_idpurchase_idevent_type_idsupplier_namepurchase_dateamountis_full_pricename_event_typesname_facilitiesname_categoriesname_eventsname_seasons
49914011083921259025.04caisse2022-02-27 13:44:10.690000+00:000.0Falseligue 1 uber eatsstade de l'aubehonneur basseolympique de marseillesaison 2021-2022
11753552731304136629.04adhésion2022-04-28 15:47:52.790000+00:000.0Falseligue 1 uber eatsstade de l'aubehonneur basseac ajacciosaison 2022-2023
274547400192140477.04adhésion2022-04-28 15:47:54.053000+00:000.0Falseligue 1 uber eatsstade de l'aubehonneur basserc strasbourgsaison 2022-2023
304844133138820259.04adhésion2021-08-03 13:45:01.603000+00:000.0Falseligue 1 uber eatsstade de l'aubevitoux hauteolympique de marseillesaison 2021-2022
311407271326590527.04web [adhésion]2022-05-26 09:15:40.993000+00:000.0Falseligue 1 uber eatsstade de l'aubechampagne bassestade brestois 29saison 2022-2023
\n", + "
" + ], + "text/plain": [ + " ticket_id customer_id purchase_id event_type_id supplier_name \\\n", + "49 91401 108392 1259025.0 4 caisse \n", + "117 535527 31304 136629.0 4 adhésion \n", + "274 547400 192 140477.0 4 adhésion \n", + "304 84413 31388 20259.0 4 adhésion \n", + "311 407271 3265 90527.0 4 web [adhésion] \n", + "\n", + " purchase_date amount is_full_price \\\n", + "49 2022-02-27 13:44:10.690000+00:00 0.0 False \n", + "117 2022-04-28 15:47:52.790000+00:00 0.0 False \n", + "274 2022-04-28 15:47:54.053000+00:00 0.0 False \n", + "304 2021-08-03 13:45:01.603000+00:00 0.0 False \n", + "311 2022-05-26 09:15:40.993000+00:00 0.0 False \n", + "\n", + " name_event_types name_facilities name_categories \\\n", + "49 ligue 1 uber eats stade de l'aube honneur basse \n", + "117 ligue 1 uber eats stade de l'aube honneur basse \n", + "274 ligue 1 uber eats stade de l'aube honneur basse \n", + "304 ligue 1 uber eats stade de l'aube vitoux haute \n", + "311 ligue 1 uber eats stade de l'aube champagne basse \n", + "\n", + " name_events name_seasons \n", + "49 olympique de marseille saison 2021-2022 \n", + "117 ac ajaccio saison 2022-2023 \n", + "274 rc strasbourg saison 2022-2023 \n", + "304 olympique de marseille saison 2021-2022 \n", + "311 stade brestois 29 saison 2022-2023 " + ] + }, + "execution_count": 145, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Filtre de la base df_products_purchased_reduced\n", + "df_products_purchased_reduced = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]\n", + "df_products_purchased_reduced.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 150, + "id": "ae7ef3a6-5b42-4a3c-a108-fec9f2ec4d32", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['caisse', 'adhésion', 'web [adhésion]', 'web [grand public]',\n", + " 'itr ticketmaster', 'itr fnac', nan, 'decathlon', 'boutique web',\n", + " 'boutique officielle'], dtype=object)" + ] + }, + "execution_count": 150, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_products_purchased_reduced[\"supplier_name\"].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 151, + "id": "942f58a5-8ed4-4b18-a7a2-bd296447fa6a", + "metadata": {}, + "outputs": [], + "source": [ + "# KPI sur le comportement d'achat\n", + "tickets_information_copy = df_products_purchased_reduced.copy()\n", + "# Dummy : Canal de vente en ligne\n", + "liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online'] # vad = vente à distance\n", + "tickets_information_copy['vente_internet'] = tickets_information_copy['supplier_name'].fillna('').str.contains('|'.join(liste_mots), case=False).astype(int)" + ] + }, + { + "cell_type": "markdown", + "id": "658b57cd-4fb8-4552-a582-972144b2af1c", + "metadata": {}, + "source": [ + "tickets_information_copy['vente_internet'] corrected by handling na" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f086a8dc-69ab-4cf3-b25e-379d7da02f43", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 79dc4f13ff2302d825f345afef01daaf1e0a9cdb Mon Sep 17 00:00:00 2001 From: arevelle-ensae Date: Thu, 22 Feb 2024 14:57:34 +0000 Subject: [PATCH 3/3] generate train and test dataset for all companies --- 0_2_Dataset_construction.py | 89 +++++++++++++++++++++++++++++-------- 1 file changed, 71 insertions(+), 18 deletions(-) diff --git a/0_2_Dataset_construction.py b/0_2_Dataset_construction.py index f88952b..a27b08b 100644 --- a/0_2_Dataset_construction.py +++ b/0_2_Dataset_construction.py @@ -6,6 +6,7 @@ import os import s3fs import re import warnings +from datetime import date, timedelta, datetime # Create filesystem object S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"] @@ -18,6 +19,47 @@ exec(open('0_KPI_functions.py').read()) # Ignore warning warnings.filterwarnings('ignore') + +def display_covering_time(df, company, datecover): + """ + This function draws the time coverage of each company + """ + min_date = df['purchase_date'].min().strftime("%Y-%m-%d") + max_date = df['purchase_date'].max().strftime("%Y-%m-%d") + datecover[company] = [datetime.strptime(min_date, "%Y-%m-%d") + timedelta(days=x) for x in range((datetime.strptime(max_date, "%Y-%m-%d") - datetime.strptime(min_date, "%Y-%m-%d")).days)] + print(f'Couverture Company {company} : {min_date} - {max_date}') + return datecover + + +def compute_time_intersection(datecover): + """ + This function returns the time coverage for all companies + """ + timestamps_sets = [set(timestamps) for timestamps in datecover.values()] + intersection = set.intersection(*timestamps_sets) + intersection_list = list(intersection) + formated_dates = [dt.strftime("%Y-%m-%d") for dt in intersection_list] + return sorted(formated_dates) + + +def df_coverage_modelization(sport, coverage_train = 0.7): + """ + This function returns start_date, end_of_features and final dates + that help to construct train and test datasets + """ + datecover = {} + for company in sport: + df_products_purchased_reduced = display_databases(company, file_name = "products_purchased_reduced", + datetime_col = ['purchase_date']) + datecover = display_covering_time(df_products_purchased_reduced, company, datecover) + #print(datecover.keys()) + dt_coverage = compute_time_intersection(datecover) + start_date = dt_coverage[0] + end_of_features = dt_coverage[int(0.7 * len(dt_coverage))] + final_date = dt_coverage[-1] + return start_date, end_of_features, final_date + + def dataset_construction(min_date, end_features_date, max_date, directory_path): # Import customerplus @@ -97,32 +139,43 @@ def dataset_construction(min_date, end_features_date, max_date, directory_path): ## Exportation +companies = {'musee' : ['1', '2', '3', '4', '101'], + 'sport': ['5', '6', '7', '8', '9'], + 'musique' : ['10', '11', '12', '13', '14']} + +type_of_comp = input('Choisissez le type de compagnie : sport ? musique ? musee ?') +list_of_comp = companies[type_of_comp] # Dossier d'exportation -BUCKET_OUT = "projet-bdc2324-team1/2_Output/Logistique Regression databases - First approach" +BUCKET_OUT = f'projet-bdc2324-team1/Generalization/{type_of_comp}' -# Dataset test -dataset_test = dataset_construction(min_date = "2021-08-01", end_features_date = "2023-08-01", max_date = "2023-11-01", directory_path = "1") +# Create test dataset and train dataset for sport companies -# Exportation -FILE_KEY_OUT_S3 = "dataset_test.csv" -FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3 +start_date, end_of_features, final_date = df_coverage_modelization(list_of_comp, coverage_train = 0.7) -with fs.open(FILE_PATH_OUT_S3, 'w') as file_out: - dataset_test.to_csv(file_out, index = False) +for company in list_of_comp: + dataset_test = dataset_construction(min_date = start_date, end_features_date = end_of_features, + max_date = final_date, directory_path = company) -print("Exportation dataset test : SUCCESS") + # Exportation + FILE_KEY_OUT_S3 = "dataset_test" + company + ".csv" + FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3 + + with fs.open(FILE_PATH_OUT_S3, 'w') as file_out: + dataset_test.to_csv(file_out, index = False) + + print("Exportation dataset test : SUCCESS") # Dataset train -dataset_train = dataset_construction(min_date = "2021-05-01", end_features_date = "2023-05-01", max_date = "2023-08-01", directory_path = "1") - -# Export -FILE_KEY_OUT_S3 = "dataset_train.csv" -FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3 - -with fs.open(FILE_PATH_OUT_S3, 'w') as file_out: - dataset_train.to_csv(file_out, index = False) + dataset_train = dataset_construction(min_date = start_date, end_features_date = end_of_features, + max_date = final_date, directory_path = company) + # Export + FILE_KEY_OUT_S3 = "dataset_train" + company + ".csv" + FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3 -print("Exportation dataset train : SUCCESS") + with fs.open(FILE_PATH_OUT_S3, 'w') as file_out: + dataset_train.to_csv(file_out, index = False) + + print("Exportation dataset train : SUCCESS") print("FIN DE LA GENERATION DES DATASETS : SUCCESS")