# Debug notebook (Descriptive_statistics/debug.ipynb) written out as a script.
# Cell 1 loads the helpers and the raw tables for one type of activity;
# cell 2 removes outliers / invalid customers and flags target-period buyers.

# --- Cell 1 -----------------------------------------------------------------
import pandas as pd
import numpy as np
import os
import s3fs
import re
import warnings

# Ignore warning
warnings.filterwarnings('ignore')

# The helper files are executed (not imported) so that the functions they
# define share this notebook's global namespace. `with` closes each file
# handle once its source has been read.
for helper_path in ('../0_KPI_functions.py', 'plot.py'):
    with open(helper_path) as helper_file:
        exec(helper_file.read())

# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

companies = {'musee': ['1', '2', '3', '4'],  # , '101'
             'sport': ['5', '6'],
             'musique': ['10', '11', '12', '13', '14']}

# Strip stray whitespace and fail with an explicit message instead of a bare
# KeyError when the answer is not one of the known activity types.
type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?').strip()
if type_of_activity not in companies:
    raise ValueError(
        f"Unknown type of activity {type_of_activity!r}; "
        f"expected one of {sorted(companies)}"
    )
list_of_comp = companies[type_of_activity]

# Load files (load_files comes from one of the helper files executed above)
customer, campaigns_kpi, campaigns_brut, tickets, products = load_files(list_of_comp)

# --- Cell 2 -----------------------------------------------------------------
# Identify anonymous customer for each company and remove them from our datasets
outlier_list = outlier_detection(tickets, list_of_comp)

# Identify valid customer (customer who bought tickets after starting date or
# received mails after starting date)
customer_valid_list = valid_customer_detection(products, campaigns_brut)

# Identify customer who bought during the period of y
consumer_target_period = identify_purchase_during_target_periode(products)

cleaned = []
for dataset in (customer, campaigns_kpi, campaigns_brut, tickets, products):
    # Remove outliers. remove_elements is defined in a helper file that is not
    # visible from this notebook -- NOTE(review): confirm what it returns.
    dataset['customer_id'] = dataset['customer_id'].apply(
        lambda x: remove_elements(x, outlier_list)
    )
    # Keep only valid customers: filter the ROWS with the boolean mask rather
    # than storing the mask in 'customer_id', which would destroy the ids.
    dataset = dataset[dataset['customer_id'].isin(customer_valid_list)].copy()
    # Flag customers who bought during the target period; this must use
    # consumer_target_period, not the list of valid customers.
    dataset['has_purchased_target_period'] = np.where(
        dataset['customer_id'].isin(consumer_target_period), 1, 0
    )
    cleaned.append(dataset)

# Rebind the names so later cells see the filtered tables.
customer, campaigns_kpi, campaigns_brut, tickets, products = cleaned
def identify_purchase_during_target_periode(products):
    """Return the customer ids of purchases made during the target period.

    Args:
        products: DataFrame with at least the columns ``purchase_date`` and
            ``customer_id``. ``purchase_date`` is compared against the bounds
            ``"2022-11-01"`` and ``"2023-11-01"`` (both inclusive).

    Returns:
        A list with one ``customer_id`` entry per matching row, so a customer
        with several purchases in the period appears several times.
    """
    purchase_date = products['purchase_date']
    # Each comparison needs its own parentheses: `&` binds tighter than the
    # comparison operators.
    in_target_period = (purchase_date >= "2022-11-01") & (purchase_date <= "2023-11-01")
    return products.loc[in_target_period, 'customer_id'].to_list()


def mailing_consent_by_target(customer, type_of_activity=None):
    """Plot the mailing opt-in rate per company, split by target-period purchase.

    Draws a grouped bar chart with one group of bars per ``number_company``
    and one bar per value of ``has_purchased_target_period``. Bar heights are
    the mean of ``opt_in`` expressed in percent. The figure is shown and then
    ``save_file_s3`` is called with the ``"mailing_consent_target_"`` prefix.

    Args:
        customer: DataFrame with the columns ``number_company``,
            ``has_purchased_target_period`` and ``opt_in``.
        type_of_activity: Activity name used in the title and passed to
            ``save_file_s3``. When omitted, the global ``type_of_activity``
            of the namespace this function lives in is used, so the existing
            call ``mailing_consent_by_target(customer)`` keeps working.

    Raises:
        ValueError: if ``type_of_activity`` is neither passed nor defined
            globally.
    """
    if type_of_activity is None:
        type_of_activity = globals().get('type_of_activity')
        if type_of_activity is None:
            raise ValueError(
                "type_of_activity must be passed or defined as a global variable"
            )

    # Rows: companies, columns: target labels, values: opt-in rate in percent.
    rates = (
        customer.groupby(["number_company", "has_purchased_target_period"])["opt_in"]
        .mean()
        .unstack("has_purchased_target_period")
        * 100
    )
    # A company without any customer for one label has no rate; draw it as a
    # zero-height bar instead of failing on the missing combination.
    rates = rates.fillna(0)

    categories = rates.index
    bar_width = 0.35
    base_positions = np.arange(len(categories))

    # Grouped bar plot: every label is shifted by one bar width.
    _, ax = plt.subplots(figsize=(10, 6))
    for offset, label in enumerate(rates.columns):
        label_printed = "purchased" if label else "no purchase"
        ax.bar(
            base_positions + offset * bar_width,
            rates[label].to_numpy(),
            bar_width,
            label=label_printed,
        )

    # Axis labels, title, ticks centred under each group, legend.
    ax.set_xlabel('Company')
    ax.set_ylabel('Consent of mailing (%)')
    ax.set_title(f'Consent of mailing according to target for {type_of_activity}')
    ax.set_xticks(base_positions + (len(rates.columns) - 1) * bar_width / 2)
    ax.set_xticklabels(categories)
    ax.legend()

    # NOTE(review): save_file_s3 is not visible here; if it saves the current
    # figure, confirm that it still exists after plt.show() in every backend.
    plt.show()
    save_file_s3("mailing_consent_target_", type_of_activity)