{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "bf34b03c-536f-4f93-93a5-e452552653aa", "metadata": {}, "outputs": [ { "name": "stdin", "output_type": "stream", "text": [ "Choisissez le type de compagnie : sport ? musique ? musee ? musique\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "File path : projet-bdc2324-team1/0_Input/Company_10/products_purchased_reduced.csv\n", "Couverture Company 10 : 2016-03-07 - 2023-09-25\n", "File path : projet-bdc2324-team1/0_Input/Company_11/products_purchased_reduced.csv\n", "Couverture Company 11 : 2015-06-26 - 2023-11-08\n", "File path : projet-bdc2324-team1/0_Input/Company_12/products_purchased_reduced.csv\n", "Couverture Company 12 : 2016-06-14 - 2023-11-08\n", "File path : projet-bdc2324-team1/0_Input/Company_13/products_purchased_reduced.csv\n", "Couverture Company 13 : 2010-07-31 - 2023-11-08\n", "File path : projet-bdc2324-team1/0_Input/Company_14/products_purchased_reduced.csv\n", "Couverture Company 14 : 1901-01-01 - 2023-11-08\n", "File path : projet-bdc2324-team1/0_Input/Company_10/customerplus_cleaned.csv\n", "File path : projet-bdc2324-team1/0_Input/Company_10/campaigns_information.csv\n", "File path : projet-bdc2324-team1/0_Input/Company_10/products_purchased_reduced.csv\n", "Data filtering : SUCCESS\n", "KPIs construction : SUCCESS\n", "Explanatory variable construction : SUCCESS\n", "Explained variable construction : SUCCESS\n", "Exportation dataset test : SUCCESS\n", "File path : projet-bdc2324-team1/0_Input/Company_10/customerplus_cleaned.csv\n", "File path : projet-bdc2324-team1/0_Input/Company_10/campaigns_information.csv\n", "File path : projet-bdc2324-team1/0_Input/Company_10/products_purchased_reduced.csv\n", "Data filtering : SUCCESS\n", "KPIs construction : SUCCESS\n", "Explanatory variable construction : SUCCESS\n", "Explained variable construction : SUCCESS\n", "Exportation dataset train : SUCCESS\n", "File path : projet-bdc2324-team1/0_Input/Company_11/customerplus_cleaned.csv\n", "File path : projet-bdc2324-team1/0_Input/Company_11/campaigns_information.csv\n", "File path : projet-bdc2324-team1/0_Input/Company_11/products_purchased_reduced.csv\n", "Data filtering : SUCCESS\n", "KPIs construction : SUCCESS\n", "Explanatory variable construction : SUCCESS\n", "Explained variable construction : SUCCESS\n", "Exportation dataset test : SUCCESS\n", "File path : projet-bdc2324-team1/0_Input/Company_11/customerplus_cleaned.csv\n", "File path : projet-bdc2324-team1/0_Input/Company_11/campaigns_information.csv\n", "File path : projet-bdc2324-team1/0_Input/Company_11/products_purchased_reduced.csv\n", "Data filtering : SUCCESS\n", "KPIs construction : SUCCESS\n", "Explanatory variable construction : SUCCESS\n", "Explained variable construction : SUCCESS\n", "Exportation dataset train : SUCCESS\n", "File path : projet-bdc2324-team1/0_Input/Company_12/customerplus_cleaned.csv\n", "File path : projet-bdc2324-team1/0_Input/Company_12/campaigns_information.csv\n", "File path : projet-bdc2324-team1/0_Input/Company_12/products_purchased_reduced.csv\n", "Data filtering : SUCCESS\n", "KPIs construction : SUCCESS\n", "Explanatory variable construction : SUCCESS\n", "Explained variable construction : SUCCESS\n", "Exportation dataset test : SUCCESS\n", "File path : projet-bdc2324-team1/0_Input/Company_12/customerplus_cleaned.csv\n", "File path : projet-bdc2324-team1/0_Input/Company_12/campaigns_information.csv\n", "File path : projet-bdc2324-team1/0_Input/Company_12/products_purchased_reduced.csv\n", "Data filtering 
: SUCCESS\n", "KPIs construction : SUCCESS\n", "Explanatory variable construction : SUCCESS\n", "Explained variable construction : SUCCESS\n", "Exportation dataset train : SUCCESS\n", "File path : projet-bdc2324-team1/0_Input/Company_13/customerplus_cleaned.csv\n", "File path : projet-bdc2324-team1/0_Input/Company_13/campaigns_information.csv\n", "File path : projet-bdc2324-team1/0_Input/Company_13/products_purchased_reduced.csv\n", "Data filtering : SUCCESS\n", "KPIs construction : SUCCESS\n", "Explanatory variable construction : SUCCESS\n", "Explained variable construction : SUCCESS\n", "Exportation dataset test : SUCCESS\n", "File path : projet-bdc2324-team1/0_Input/Company_13/customerplus_cleaned.csv\n", "File path : projet-bdc2324-team1/0_Input/Company_13/campaigns_information.csv\n", "File path : projet-bdc2324-team1/0_Input/Company_13/products_purchased_reduced.csv\n", "Data filtering : SUCCESS\n", "KPIs construction : SUCCESS\n", "Explanatory variable construction : SUCCESS\n", "Explained variable construction : SUCCESS\n", "Exportation dataset train : SUCCESS\n", "File path : projet-bdc2324-team1/0_Input/Company_14/customerplus_cleaned.csv\n", "File path : projet-bdc2324-team1/0_Input/Company_14/campaigns_information.csv\n", "File path : projet-bdc2324-team1/0_Input/Company_14/products_purchased_reduced.csv\n", "Data filtering : SUCCESS\n", "KPIs construction : SUCCESS\n", "Explanatory variable construction : SUCCESS\n", "Explained variable construction : SUCCESS\n", "Exportation dataset test : SUCCESS\n", "File path : projet-bdc2324-team1/0_Input/Company_14/customerplus_cleaned.csv\n", "File path : projet-bdc2324-team1/0_Input/Company_14/campaigns_information.csv\n", "File path : projet-bdc2324-team1/0_Input/Company_14/products_purchased_reduced.csv\n", "Data filtering : SUCCESS\n", "KPIs construction : SUCCESS\n", "Explanatory variable construction : SUCCESS\n", "Explained variable construction : SUCCESS\n", "Exportation dataset train : SUCCESS\n", "FIN DE LA GENERATION DES DATASETS : SUCCESS\n" ] } ], "source": [ "# Business Data Challenge - Team 1\n", "\n", "import pandas as pd\n", "import numpy as np\n", "import os\n", "import s3fs\n", "import re\n", "import warnings\n", "from datetime import date, timedelta, datetime\n", "\n", "# Create filesystem object\n", "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n", "\n", "\n", "# Import KPI construction functions\n", "exec(open('0_KPI_functions.py').read())\n", "\n", "# Ignore warning\n", "warnings.filterwarnings('ignore')\n", "\n", "\n", "def display_covering_time(df, company, datecover):\n", " \"\"\"\n", " This function draws the time coverage of each company\n", " \"\"\"\n", " min_date = df['purchase_date'].min().strftime(\"%Y-%m-%d\")\n", " max_date = df['purchase_date'].max().strftime(\"%Y-%m-%d\")\n", " datecover[company] = [datetime.strptime(min_date, \"%Y-%m-%d\") + timedelta(days=x) for x in range((datetime.strptime(max_date, \"%Y-%m-%d\") - datetime.strptime(min_date, \"%Y-%m-%d\")).days)]\n", " print(f'Couverture Company {company} : {min_date} - {max_date}')\n", " return datecover\n", "\n", "\n", "def compute_time_intersection(datecover):\n", " \"\"\"\n", " This function returns the time coverage for all companies\n", " \"\"\"\n", " timestamps_sets = [set(timestamps) for timestamps in datecover.values()]\n", " intersection = set.intersection(*timestamps_sets)\n", " intersection_list = list(intersection)\n", " formated_dates = 
[dt.strftime(\"%Y-%m-%d\") for dt in intersection_list]\n", " return sorted(formated_dates)\n", "\n", "\n", "def df_coverage_modelization(sport, coverage_train = 0.7):\n", " \"\"\"\n", " This function returns start_date, end_of_features and final dates\n", " that help to construct train and test datasets\n", " \"\"\"\n", " datecover = {}\n", " for company in sport:\n", " df_products_purchased_reduced = display_databases(company, file_name = \"products_purchased_reduced\",\n", " datetime_col = ['purchase_date'])\n", " datecover = display_covering_time(df_products_purchased_reduced, company, datecover)\n", " #print(datecover.keys())\n", " dt_coverage = compute_time_intersection(datecover)\n", " start_date = dt_coverage[0]\n", " end_of_features = dt_coverage[int(0.7 * len(dt_coverage))]\n", " final_date = dt_coverage[-1]\n", " return start_date, end_of_features, final_date\n", " \n", "\n", "def dataset_construction(min_date, end_features_date, max_date, directory_path):\n", " \n", " # Import customerplus\n", " df_customerplus_clean_0 = display_databases(directory_path, file_name = \"customerplus_cleaned\")\n", " df_campaigns_information = display_databases(directory_path, file_name = \"campaigns_information\", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])\n", " df_products_purchased_reduced = display_databases(directory_path, file_name = \"products_purchased_reduced\", datetime_col = ['purchase_date'])\n", " \n", " # Filtre de cohérence pour la mise en pratique de notre méthode\n", " max_date = pd.to_datetime(max_date, utc = True, format = 'ISO8601') \n", " end_features_date = pd.to_datetime(end_features_date, utc = True, format = 'ISO8601')\n", " min_date = pd.to_datetime(min_date, utc = True, format = 'ISO8601')\n", "\n", " #Filtre de la base df_campaigns_information\n", " df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] <= end_features_date) & (df_campaigns_information['sent_at'] >= min_date)]\n", " df_campaigns_information['opened_at'][df_campaigns_information['opened_at'] >= end_features_date] = np.datetime64('NaT')\n", " \n", " #Filtre de la base df_products_purchased_reduced\n", " df_products_purchased_reduced = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]\n", "\n", " print(\"Data filtering : SUCCESS\")\n", " \n", " # Fusion de l'ensemble et creation des KPI\n", "\n", " # KPI sur les campagnes publicitaires\n", " df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information) \n", "\n", " # KPI sur le comportement d'achat\n", " df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced)\n", "\n", " # KPI sur les données socio-démographiques\n", " df_customerplus_clean = customerplus_kpi_function(customerplus_clean = df_customerplus_clean_0)\n", " \n", " print(\"KPIs construction : SUCCESS\")\n", " \n", " # Fusion avec KPI liés au customer\n", " df_customer = pd.merge(df_customerplus_clean, df_campaigns_kpi, on = 'customer_id', how = 'left')\n", " \n", " # Fill NaN values\n", " df_customer[['nb_campaigns', 'nb_campaigns_opened']] = df_customer[['nb_campaigns', 'nb_campaigns_opened']].fillna(0)\n", " \n", " # Fusion avec KPI liés au comportement d'achat\n", " df_customer_product = pd.merge(df_tickets_kpi, df_customer, on = 'customer_id', how = 'outer')\n", " \n", " # Fill NaN values\n", " df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 
'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0)\n", "\n", " print(\"Explanatory variable construction : SUCCESS\")\n", "\n", " # 2. Construction of the explained variable \n", " df_products_purchased_to_predict = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= max_date) & (df_products_purchased_reduced['purchase_date'] > end_features_date)]\n", "\n", " # Indicatrice d'achat\n", " df_products_purchased_to_predict['y_has_purchased'] = 1\n", "\n", " y = df_products_purchased_to_predict[['customer_id', 'y_has_purchased']].drop_duplicates()\n", "\n", " print(\"Explained variable construction : SUCCESS\")\n", " \n", " # 3. Merge between explained and explanatory variables\n", " dataset = pd.merge(df_customer_product, y, on = ['customer_id'], how = 'left')\n", "\n", " # 0 if there is no purchase\n", " dataset[['y_has_purchased']].fillna(0)\n", "\n", " # add id_company prefix to customer_id\n", " dataset['customer_id'] = directory_path + '_' + dataset['customer_id'].astype('str')\n", " \n", " return dataset\n", "\n", "## Exportation\n", "\n", "companies = {'musee' : ['1', '2', '3', '4', '101'],\n", " 'sport': ['5', '6', '7', '8', '9'],\n", " 'musique' : ['10', '11', '12', '13', '14']}\n", "\n", "type_of_comp = input('Choisissez le type de compagnie : sport ? musique ? musee ?')\n", "list_of_comp = companies[type_of_comp] \n", "# Dossier d'exportation\n", "BUCKET_OUT = f'projet-bdc2324-team1/Generalization/{type_of_comp}'\n", "\n", "# Create test dataset and train dataset for sport companies\n", "\n", "start_date, end_of_features, final_date = df_coverage_modelization(list_of_comp, coverage_train = 0.7)\n", "\n", "for company in list_of_comp:\n", " dataset_test = dataset_construction(min_date = start_date, end_features_date = end_of_features,\n", " max_date = final_date, directory_path = company) \n", "\n", " # Exportation\n", " FILE_KEY_OUT_S3 = \"dataset_test\" + company + \".csv\"\n", " FILE_PATH_OUT_S3 = BUCKET_OUT + \"/Test_set/\" + FILE_KEY_OUT_S3\n", " \n", " with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n", " dataset_test.to_csv(file_out, index = False)\n", " \n", " print(\"Exportation dataset test : SUCCESS\")\n", "\n", "# Dataset train\n", " dataset_train = dataset_construction(min_date = start_date, end_features_date = end_of_features,\n", " max_date = final_date, directory_path = company)\n", " # Export\n", " FILE_KEY_OUT_S3 = \"dataset_train\" + company + \".csv\" \n", " FILE_PATH_OUT_S3 = BUCKET_OUT + \"/Train_test/\" + FILE_KEY_OUT_S3\n", " \n", " with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n", " dataset_train.to_csv(file_out, index = False)\n", " \n", " print(\"Exportation dataset train : SUCCESS\")\n", "\n", "\n", "print(\"FIN DE LA GENERATION DES DATASETS : SUCCESS\")\n" ] }, { "cell_type": "code", "execution_count": 2, "id": "3721427e-5957-4556-b278-2e7ffca892f4", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'projet-bdc2324-team1/Generalization/musique/Train_test/dataset_train14.csv'" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "FILE_PATH_OUT_S3" ] }, { "cell_type": "code", "execution_count": 10, "id": "f8546992-f425-4d1e-ad75-ad26a8052a18", "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'projet' is not defined", "output_type": "error", "traceback": [ 
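{ "cell_type": "code", "execution_count": null, "id": "9f2c4d6e-1a3b-4c5d-8e7f-0a1b2c3d4e5f", "metadata": {}, "outputs": [], "source": [
"# Read-back check (sketch, not executed in the run above): reload the last exported\n",
"# train file, whose path is shown in the previous cell, to confirm it can be read\n",
"# back from S3 with the same filesystem object.\n",
"with fs.open(FILE_PATH_OUT_S3, 'r') as file_in:\n",
"    dataset_check = pd.read_csv(file_in)\n",
"dataset_check.shape"
] },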
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[10], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mprojet\u001b[49m\u001b[38;5;241m-\u001b[39mbdc2324\u001b[38;5;241m-\u001b[39mteam1\u001b[38;5;241m/\u001b[39mGeneralization\u001b[38;5;241m/\u001b[39mmusique\u001b[38;5;241m/\u001b[39mTrain_test\n", "\u001b[0;31mNameError\u001b[0m: name 'projet' is not defined" ] } ], "source": [ "projet-bdc2324-team1/Generalization/musique/Train_test" ] }, { "cell_type": "code", "execution_count": 12, "id": "0dd34710-6da2-4438-9e1d-0ac092c1d28c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(343126, 41)" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset_train.shape" ] }, { "cell_type": "code", "execution_count": 9, "id": "a3bfeeb6-2db0-4f1d-866c-8721343e97c5", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "customer_id 0.000000\n", "nb_tickets 0.000000\n", "nb_purchases 0.000000\n", "total_amount 0.000000\n", "nb_suppliers 0.000000\n", "vente_internet_max 0.000000\n", "purchase_date_min 0.858950\n", "purchase_date_max 0.858950\n", "time_between_purchase 0.858950\n", "nb_tickets_internet 0.000000\n", "street_id 0.000000\n", "structure_id 0.869838\n", "mcp_contact_id 0.276677\n", "fidelity 0.000000\n", "tenant_id 0.000000\n", "is_partner 0.000000\n", "deleted_at 1.000000\n", "gender 0.000000\n", "is_email_true 0.000000\n", "opt_in 0.000000\n", "last_buying_date 0.709626\n", "max_price 0.709626\n", "ticket_sum 0.000000\n", "average_price 0.709626\n", "average_purchase_delay 0.709731\n", "average_price_basket 0.709731\n", "average_ticket_basket 0.709731\n", "total_price 0.000000\n", "purchase_count 0.000000\n", "first_buying_date 0.709626\n", "country 0.152090\n", "gender_label 0.000000\n", "gender_female 0.000000\n", "gender_male 0.000000\n", "gender_other 0.000000\n", "country_fr 0.152090\n", "has_tags 0.000000\n", "nb_campaigns 0.000000\n", "nb_campaigns_opened 0.000000\n", "time_to_open 0.848079\n", "y_has_purchased 1.000000\n", "dtype: float64" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ " dataset_train.isna().sum()/dataset_train.shape[0]" ] }, { "cell_type": "code", "execution_count": 8, "id": "75f9a672-641f-49a2-a8d6-7673845506f5", "metadata": {}, "outputs": [], "source": [ "#Creation de la variable dependante fictive: 1 si l'individu a effectué un achat au cours de la periode de train et 0 sinon\n", "\n", "dataset_train_modif=dataset_train\n" ] }, { "cell_type": "code", "execution_count": null, "id": "c121c1e2-d8e4-4b93-a882-9385581b63c9", "metadata": {}, "outputs": [], "source": [ "dataset_train_modif[\"" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.6" } }, "nbformat": 4, "nbformat_minor": 5 }