# %%
# Standard library
import os
import warnings
from datetime import date, timedelta, datetime

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import s3fs

# %%
# Load the shared KPI helper functions into the notebook namespace.
# The file is read inside a context manager so the handle is closed as soon
# as its content has been read (the bare open(...).read() left it open).
# NOTE(review): exec() runs the helper file verbatim in this namespace; a
# regular import would be cleaner, but the file name starts with a digit.
with open('../../0_KPI_functions.py') as _kpi_functions_file:
    exec(_kpi_functions_file.read())

# %%
# Ignore warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

# === Descriptive statistics: sport companies ===
# --- Imports and data loading ---

# %%
# Create the S3 filesystem object; the endpoint host is read from the
# AWS_S3_ENDPOINT environment variable (KeyError if it is not set).
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
# %%
# Build the aggregated KPI tables for the five sport companies.
# (The original comments said "spectacle", but every aggregated frame is
# named *_sport and the notebook title is about sport companies.)

# Company identifiers, kept as strings: they are passed to display_databases
# and reused as the customer_id prefix.
nb_compagnie = ['5', '6', '7', '8', '9']


def _concat_frames(frames):
    """Concatenate a list of DataFrames once; return an empty frame for an empty list."""
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


# Per-company frames are collected in lists and concatenated a single time
# after the loop. Calling pd.concat inside the loop re-copies the accumulated
# data at every iteration and starts from an empty DataFrame.
customer_frames = []
campaigns_frames = []
products_frames = []
tickets_frames = []

for directory_path in nb_compagnie:
    # Raw tables for this company.
    df_customerplus_clean_0 = display_databases(directory_path, file_name="customerplus_cleaned")
    df_campaigns_information = display_databases(
        directory_path,
        file_name="campaigns_information",
        datetime_col=['opened_at', 'sent_at', 'campaign_sent_at'],
    )
    df_products_purchased_reduced = display_databases(
        directory_path,
        file_name="products_purchased_reduced",
        datetime_col=['purchase_date'],
    )
    df_target_information = display_databases(directory_path, file_name="target_information")

    # KPI tables derived from the raw tables.
    df_campaigns_kpi = campaigns_kpi_function(campaigns_information=df_campaigns_information)
    df_tickets_kpi = tickets_kpi_function(tickets_information=df_products_purchased_reduced)
    df_customerplus_clean = customerplus_kpi_function(customerplus_clean=df_customerplus_clean_0)

    # Tag every table with the company number so results can be aggregated.
    # df_target_information is tagged too, although it is not concatenated
    # into any of the aggregated frames built in this cell.
    company_number = int(directory_path)
    df_tickets_kpi["number_company"] = company_number
    df_campaigns_kpi["number_company"] = company_number
    df_customerplus_clean["number_company"] = company_number
    df_target_information["number_company"] = company_number

    # Prefix customer ids with the company number ("<company>_<id>") so they
    # stay distinct once the companies are stacked together.
    company_prefix = directory_path + '_'
    for frame in (df_tickets_kpi, df_campaigns_kpi, df_customerplus_clean, df_products_purchased_reduced):
        frame["customer_id"] = company_prefix + frame['customer_id'].astype('str')

    customer_frames.append(df_customerplus_clean)
    campaigns_frames.append(df_campaigns_kpi)
    tickets_frames.append(df_tickets_kpi)
    products_frames.append(df_products_purchased_reduced)

# Single concatenation per table.
customer_sport = _concat_frames(customer_frames)
campaigns_sport = _concat_frames(campaigns_frames)
tickets_sport = _concat_frames(tickets_frames)
products_sport = _concat_frames(products_frames)

# === Descriptive statistics ===
"d347bca9-3041-4414-b18e-19b626998a3e", "metadata": {}, "source": [ "### 0. Détection du client anonyme (outlier) - utile pour la section 3" ] }, { "cell_type": "code", "execution_count": 6, "id": "c4d4b2ad-8a3c-477b-bc52-dd4860527bfe", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([5, 6, 7, 8, 9])" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sport_comp = tickets_sport['number_company'].unique()\n", "sport_comp" ] }, { "cell_type": "code", "execution_count": 7, "id": "97a9e235-1c04-46bf-9f3c-5496e141cc40", "metadata": {}, "outputs": [], "source": [ "def outlier_detection(company_list, show_diagram=False):\n", "\n", " outlier_list = list()\n", " \n", " for company in company_list:\n", " total_amount_share = tickets_sport[tickets_sport['number_company']==company].groupby('customer_id')['total_amount'].sum().reset_index()\n", " total_amount_share['CA'] = total_amount_share['total_amount'].sum()\n", " total_amount_share['share_total_amount'] = total_amount_share['total_amount']/total_amount_share['CA']\n", " \n", " total_amount_share_index = total_amount_share.set_index('customer_id')\n", " df_circulaire = total_amount_share_index['total_amount'].sort_values(axis = 0, ascending = False)\n", " top = df_circulaire[:1]\n", " outlier_list.append(top.index[0])\n", " rest = df_circulaire[1:]\n", " \n", " # Calculez la somme du reste\n", " rest_sum = rest.sum()\n", " \n", " # Créez une nouvelle série avec les cinq plus grandes parts et 'Autre'\n", " new_series = pd.concat([top, pd.Series([rest_sum], index=['Autre'])])\n", " \n", " # Créez le graphique circulaire\n", " if show_diagram:\n", " plt.figure(figsize=(3, 3))\n", " plt.pie(new_series, labels=new_series.index, autopct='%1.1f%%', startangle=140, pctdistance=0.5)\n", " plt.axis('equal') # Assurez-vous que le graphique est un cercle\n", " plt.title(f'Répartition des montants totaux pour la compagnie {company}')\n", " plt.show()\n", " return 
outlier_list\n", " " ] }, { "cell_type": "code", "execution_count": 8, "id": "770cd3fc-bfe2-4a69-89bc-0eb946311130", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['5_191835', '6_591412', '7_49632', '8_1942', '9_19683']" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "outlier_list = outlier_detection(sport_comp)\n", "outlier_list" ] }, { "cell_type": "code", "execution_count": 9, "id": "70b6e961-c303-465e-93f4-609721d38454", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Suppression Réussie\n" ] } ], "source": [ "# On filtre les outliers\n", "\n", "def remove_elements(lst, elements_to_remove):\n", " return ''.join([x for x in lst if x not in elements_to_remove])\n", " \n", "databases = [customer_sport, campaigns_sport, tickets_sport, products_sport]\n", "\n", "for dataset in databases:\n", " dataset['customer_id'] = dataset['customer_id'].apply(lambda x: remove_elements(x, outlier_list))\n", "\n", "# On test\n", "\n", "bool = '5_191835' in customer_sport['customer_id']\n", "if not bool:\n", " print(\"Suppression Réussie\")" ] }, { "cell_type": "code", "execution_count": 10, "id": "b54b920a-7b46-490f-ba7e-d1859055a4e3", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | customer_id | \n", "street_id | \n", "structure_id | \n", "mcp_contact_id | \n", "fidelity | \n", "tenant_id | \n", "is_partner | \n", "deleted_at | \n", "gender | \n", "is_email_true | \n", "... | \n", "total_price | \n", "purchase_count | \n", "first_buying_date | \n", "country | \n", "gender_label | \n", "gender_female | \n", "gender_male | \n", "gender_other | \n", "country_fr | \n", "number_company | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "5_6009745 | \n", "1372685 | \n", "NaN | \n", "NaN | \n", "0 | \n", "1771 | \n", "False | \n", "NaN | \n", "2 | \n", "True | \n", "... | \n", "0.0 | \n", "0 | \n", "NaN | \n", "af | \n", "other | \n", "0 | \n", "0 | \n", "1 | \n", "0.0 | \n", "5 | \n", "
1 | \n", "5_6011228 | \n", "1372685 | \n", "NaN | \n", "NaN | \n", "0 | \n", "1771 | \n", "False | \n", "NaN | \n", "2 | \n", "True | \n", "... | \n", "0.0 | \n", "0 | \n", "NaN | \n", "af | \n", "other | \n", "0 | \n", "0 | \n", "1 | \n", "0.0 | \n", "5 | \n", "
2 | \n", "5_6058950 | \n", "1372685 | \n", "NaN | \n", "NaN | \n", "0 | \n", "1771 | \n", "False | \n", "NaN | \n", "2 | \n", "True | \n", "... | \n", "0.0 | \n", "0 | \n", "NaN | \n", "af | \n", "other | \n", "0 | \n", "0 | \n", "1 | \n", "0.0 | \n", "5 | \n", "
3 | \n", "5_6062404 | \n", "1372685 | \n", "NaN | \n", "NaN | \n", "0 | \n", "1771 | \n", "False | \n", "NaN | \n", "2 | \n", "True | \n", "... | \n", "0.0 | \n", "0 | \n", "NaN | \n", "af | \n", "other | \n", "0 | \n", "0 | \n", "1 | \n", "0.0 | \n", "5 | \n", "
4 | \n", "5_250217 | \n", "78785 | \n", "NaN | \n", "11035.0 | \n", "0 | \n", "1771 | \n", "False | \n", "NaN | \n", "0 | \n", "True | \n", "... | \n", "NaN | \n", "0 | \n", "NaN | \n", "fr | \n", "female | \n", "1 | \n", "0 | \n", "0 | \n", "1.0 | \n", "5 | \n", "
5 rows × 28 columns
\n", "\n", " | customer_id | \n", "nb_tickets | \n", "nb_purchases | \n", "total_amount | \n", "nb_suppliers | \n", "vente_internet_max | \n", "purchase_date_min | \n", "purchase_date_max | \n", "time_between_purchase | \n", "nb_tickets_internet | \n", "... | \n", "gender_label | \n", "gender_female | \n", "gender_male | \n", "gender_other | \n", "country_fr | \n", "has_tags | \n", "nb_campaigns | \n", "nb_campaigns_opened | \n", "time_to_open | \n", "y_has_purchased | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "10_1 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0.0 | \n", "... | \n", "other | \n", "0 | \n", "0 | \n", "1 | \n", "NaN | \n", "0 | \n", "0.0 | \n", "0.0 | \n", "NaN | \n", "NaN | \n", "
1 | \n", "10_2 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0.0 | \n", "... | \n", "other | \n", "0 | \n", "0 | \n", "1 | \n", "NaN | \n", "0 | \n", "0.0 | \n", "0.0 | \n", "NaN | \n", "NaN | \n", "
2 | \n", "10_3 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0.0 | \n", "... | \n", "other | \n", "0 | \n", "0 | \n", "1 | \n", "NaN | \n", "0 | \n", "0.0 | \n", "0.0 | \n", "NaN | \n", "NaN | \n", "
3 | \n", "10_4 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0.0 | \n", "... | \n", "other | \n", "0 | \n", "0 | \n", "1 | \n", "NaN | \n", "0 | \n", "0.0 | \n", "0.0 | \n", "NaN | \n", "NaN | \n", "
4 | \n", "10_5 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0.0 | \n", "... | \n", "other | \n", "0 | \n", "0 | \n", "1 | \n", "NaN | \n", "0 | \n", "0.0 | \n", "0.0 | \n", "NaN | \n", "NaN | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
1523683 | \n", "14_6884748 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0.0 | \n", "... | \n", "male | \n", "0 | \n", "1 | \n", "0 | \n", "1.0 | \n", "0 | \n", "0.0 | \n", "0.0 | \n", "NaN | \n", "NaN | \n", "
1523684 | \n", "14_6884749 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0.0 | \n", "... | \n", "male | \n", "0 | \n", "1 | \n", "0 | \n", "1.0 | \n", "0 | \n", "0.0 | \n", "0.0 | \n", "NaN | \n", "NaN | \n", "
1523685 | \n", "14_6884750 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0.0 | \n", "... | \n", "male | \n", "0 | \n", "1 | \n", "0 | \n", "1.0 | \n", "0 | \n", "0.0 | \n", "0.0 | \n", "NaN | \n", "NaN | \n", "
1523686 | \n", "14_6884751 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0.0 | \n", "... | \n", "female | \n", "1 | \n", "0 | \n", "0 | \n", "1.0 | \n", "0 | \n", "0.0 | \n", "0.0 | \n", "NaN | \n", "NaN | \n", "
1523687 | \n", "14_6884753 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0.0 | \n", "... | \n", "male | \n", "0 | \n", "1 | \n", "0 | \n", "1.0 | \n", "0 | \n", "0.0 | \n", "0.0 | \n", "NaN | \n", "NaN | \n", "
1523688 rows × 41 columns
\n", "