# Notebook setup: imports, shared KPI helpers, warning filter and S3 filesystem.
import os
import warnings
from datetime import date, timedelta, datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import s3fs

# Run the shared helper script inside the notebook namespace. The file is read
# in a ``with`` block so that its handle is closed as soon as the source has
# been read, instead of being left open until garbage collection.
# NOTE(review): exec() runs whatever the file contains; this is only acceptable
# because the script is a trusted file of this repository. Presumably it
# defines the helpers used later in the notebook (display_databases and the
# *_kpi_function functions) - confirm.
with open('../../0_KPI_functions.py') as kpi_functions_file:
    exec(kpi_functions_file.read())

# Silence every warning for the rest of the notebook.
warnings.filterwarnings('ignore')

# Create the filesystem object; the endpoint host is read from the environment
# (a missing AWS_S3_ENDPOINT variable raises KeyError here).
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
# Build the aggregated KPI tables for the five sport companies.

# Identifiers of the sport companies; each one is passed to display_databases
# to select the company's input files.
nb_compagnie = ['5', '6', '7', '8', '9']

# Per-company frames are collected in lists and concatenated once after the
# loop: concatenating onto an initially empty DataFrame at every iteration
# re-copies all accumulated rows each time and relies on pandas' deprecated
# handling of empty frames in concat.
customer_frames = []
campaigns_frames = []
products_frames = []
tickets_frames = []

for directory_path in nb_compagnie:
    # Raw input tables of the company
    df_customerplus_clean_0 = display_databases(directory_path, file_name="customerplus_cleaned")
    df_campaigns_information = display_databases(
        directory_path,
        file_name="campaigns_information",
        datetime_col=['opened_at', 'sent_at', 'campaign_sent_at'],
    )
    df_products_purchased_reduced = display_databases(
        directory_path,
        file_name="products_purchased_reduced",
        datetime_col=['purchase_date'],
    )
    # NOTE(review): target_information is loaded and tagged with the company
    # number but never aggregated afterwards - confirm whether it is needed.
    df_target_information = display_databases(directory_path, file_name="target_information")

    # KPI tables derived from the raw tables
    df_campaigns_kpi = campaigns_kpi_function(campaigns_information=df_campaigns_information)
    df_tickets_kpi = tickets_kpi_function(tickets_information=df_products_purchased_reduced)
    df_customerplus_clean = customerplus_kpi_function(customerplus_clean=df_customerplus_clean_0)

    # Company number column, used later to aggregate results per company
    df_tickets_kpi["number_company"] = int(directory_path)
    df_campaigns_kpi["number_company"] = int(directory_path)
    df_customerplus_clean["number_company"] = int(directory_path)
    df_target_information["number_company"] = int(directory_path)

    # Prefix customer ids with the company number so that ids coming from
    # different companies cannot collide once the tables are stacked.
    df_tickets_kpi["customer_id"] = directory_path + '_' + df_tickets_kpi['customer_id'].astype('str')
    df_campaigns_kpi["customer_id"] = directory_path + '_' + df_campaigns_kpi['customer_id'].astype('str')
    df_customerplus_clean["customer_id"] = directory_path + '_' + df_customerplus_clean['customer_id'].astype('str')
    df_products_purchased_reduced["customer_id"] = directory_path + '_' + df_products_purchased_reduced['customer_id'].astype('str')

    customer_frames.append(df_customerplus_clean)
    campaigns_frames.append(df_campaigns_kpi)
    tickets_frames.append(df_tickets_kpi)
    products_frames.append(df_products_purchased_reduced)

# Single concatenation per table
customer_sport = pd.concat(customer_frames, ignore_index=True)
campaigns_sport = pd.concat(campaigns_frames, ignore_index=True)
tickets_sport = pd.concat(tickets_frames, ignore_index=True)
products_sport = pd.concat(products_frames, ignore_index=True)
# Companies present in the ticket KPI table
sport_comp = tickets_sport['number_company'].unique()
sport_comp


def outlier_detection(company_list, show_diagram=False, tickets=None):
    """Return, for each company, the customer with the largest total amount.

    Parameters
    ----------
    company_list : iterable
        Values of the ``number_company`` column to inspect.
    show_diagram : bool, default False
        When True, draw for each company a pie chart comparing the top
        customer's amount with the sum of all other customers ('Autre').
    tickets : pandas.DataFrame, optional
        Table with ``number_company``, ``customer_id`` and ``total_amount``
        columns. Defaults to the notebook-level ``tickets_sport``.

    Returns
    -------
    list
        One ``customer_id`` per company that has at least one row, in the
        order of ``company_list``. Companies without any row are skipped
        (they used to raise IndexError).
    """
    if tickets is None:
        tickets = tickets_sport

    outlier_list = []

    for company in company_list:
        company_tickets = tickets[tickets['number_company'] == company]
        amount_by_customer = (
            company_tickets.groupby('customer_id')['total_amount']
            .sum()
            .sort_values(ascending=False)
        )
        if amount_by_customer.empty:
            continue

        # After the descending sort the first entry is the biggest spender.
        top = amount_by_customer.iloc[:1]
        outlier_list.append(top.index[0])

        # The pie data is only needed when a diagram is requested.
        if show_diagram:
            rest_sum = amount_by_customer.iloc[1:].sum()
            new_series = pd.concat([top, pd.Series([rest_sum], index=['Autre'])])

            plt.figure(figsize=(3, 3))
            plt.pie(new_series, labels=new_series.index, autopct='%1.1f%%', startangle=140, pctdistance=0.5)
            plt.axis('equal')  # keep the pie circular
            plt.title(f'Répartition des montants totaux pour la compagnie {company}')
            plt.show()

    return outlier_list


def remove_elements(lst, elements_to_remove):
    """Return the items of ``lst`` that are not in ``elements_to_remove``.

    This works on a list of items. It must not be applied to each value of a
    string column: iterating a string yields its characters, which turns every
    ``customer_id`` into a list of characters and removes no row.
    """
    return [x for x in lst if x not in elements_to_remove]


outlier_list = outlier_detection(sport_comp)
outlier_list

# Filter the outliers out of every aggregated table.
databases = [customer_sport, campaigns_sport, tickets_sport, products_sport]

for dataset in databases:
    # Drop the outlier rows in place so that the notebook-level variables
    # (which reference the same objects as the list) are updated as well.
    outlier_rows = dataset.index[dataset['customer_id'].isin(outlier_list)]
    dataset.drop(index=outlier_rows, inplace=True)
    dataset.reset_index(drop=True, inplace=True)

# Check the result. ``value in series`` tests the index labels, not the
# values, so the column values are tested explicitly with isin().
outlier_still_present = customer_sport['customer_id'].isin(outlier_list).any()
if not outlier_still_present:
    print("Suppression Réussie")
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idstreet_idstructure_idmcp_contact_idfidelitytenant_idis_partnerdeleted_atgenderis_email_true...purchase_countfirst_buying_datecountrygender_labelgender_femalegender_malegender_othercountry_frhas_tagsnumber_company
0[5, _, 6, 0, 0, 9, 7, 4, 5]1372685NaNNaN01771FalseNaN2True...0NaNafother0010.005
1[5, _, 6, 0, 1, 1, 2, 2, 8]1372685NaNNaN01771FalseNaN2True...0NaNafother0010.005
2[5, _, 6, 0, 5, 8, 9, 5, 0]1372685NaNNaN01771FalseNaN2True...0NaNafother0010.005
3[5, _, 6, 0, 6, 2, 4, 0, 4]1372685NaNNaN01771FalseNaN2True...0NaNafother0010.005
4[5, _, 2, 5, 0, 2, 1, 7]78785NaN11035.001771FalseNaN0True...0NaNfrfemale1001.005
\n", + "

5 rows × 29 columns

\n", + "
" + ], + "text/plain": [ + " customer_id street_id structure_id mcp_contact_id \\\n", + "0 [5, _, 6, 0, 0, 9, 7, 4, 5] 1372685 NaN NaN \n", + "1 [5, _, 6, 0, 1, 1, 2, 2, 8] 1372685 NaN NaN \n", + "2 [5, _, 6, 0, 5, 8, 9, 5, 0] 1372685 NaN NaN \n", + "3 [5, _, 6, 0, 6, 2, 4, 0, 4] 1372685 NaN NaN \n", + "4 [5, _, 2, 5, 0, 2, 1, 7] 78785 NaN 11035.0 \n", + "\n", + " fidelity tenant_id is_partner deleted_at gender is_email_true ... \\\n", + "0 0 1771 False NaN 2 True ... \n", + "1 0 1771 False NaN 2 True ... \n", + "2 0 1771 False NaN 2 True ... \n", + "3 0 1771 False NaN 2 True ... \n", + "4 0 1771 False NaN 0 True ... \n", + "\n", + " purchase_count first_buying_date country gender_label gender_female \\\n", + "0 0 NaN af other 0 \n", + "1 0 NaN af other 0 \n", + "2 0 NaN af other 0 \n", + "3 0 NaN af other 0 \n", + "4 0 NaN fr female 1 \n", + "\n", + " gender_male gender_other country_fr has_tags number_company \n", + "0 0 1 0.0 0 5 \n", + "1 0 1 0.0 0 5 \n", + "2 0 1 0.0 0 5 \n", + "3 0 1 0.0 0 5 \n", + "4 0 0 1.0 0 5 \n", + "\n", + "[5 rows x 29 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "customer_sport.head()" + ] + }, + { + "cell_type": "markdown", + "id": "d40fe668-e1d7-4544-9db8-02498afe65fe", + "metadata": {}, + "source": [ + "### 1. 
def compute_nb_clients(customer_sport):
    """Draw a bar chart of the number of buying customers per company.

    Only rows with ``purchase_count`` > 0 are kept; the non-null
    ``customer_id`` values are then counted per ``number_company`` and the
    counts are displayed in thousands.

    NOTE(review): the title mentions "spectacle" although this notebook is
    about sport companies - confirm the intended wording.
    """
    buyers = customer_sport[customer_sport["purchase_count"] > 0]
    buyers_per_company = buyers.groupby("number_company")["customer_id"].count()

    # One bar per company, height expressed in thousands of customers
    plt.bar(buyers_per_company.index, buyers_per_company / 1000)

    # Axis labels and title
    plt.title("Nombre de clients de chaque compagnie de spectacle")
    plt.xlabel('Company')
    plt.ylabel("Nombre de clients (milliers)")

    # Display the bar plot
    plt.show()
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "compute_nb_clients(customer_sport)" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "a12a59a0-edfe-4e52-8037-9b875f823b33", + "metadata": {}, + "outputs": [], + "source": [ + "def maximum_price_paid(customer_sport):\n", + " company_max_price = customer_sport.groupby(\"number_company\")[\"max_price\"].max().reset_index()\n", + " # Création du barplot\n", + " plt.bar(company_max_price[\"number_company\"], company_max_price[\"max_price\"])\n", + " \n", + " # Ajout de titres et d'étiquettes\n", + " plt.xlabel('Company')\n", + " plt.ylabel(\"Prix maximal d'un billet vendu\")\n", + " plt.title(\"Prix maximal de vente observé par compagnie de spectacle\")\n", + " \n", + " # Affichage du barplot\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "2c7c2d26-4e35-4163-b771-fa4d3e8ca83e", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "maximum_price_paid(customer_sport)" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "597d4361-8beb-43f4-9224-8f7dc34b187c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Statistiques Descriptives company 5\n", + " average_price average_price_basket average_ticket_basket \\\n", + "count 145390.000000 68869.000000 68869.000000 \n", + "mean 11.070309 65.969693 3.655202 \n", + "std 16.353610 195.462869 13.119612 \n", + "min 0.000000 0.000000 1.000000 \n", + "25% 0.000000 20.000000 1.000000 \n", + "50% 0.000000 45.000000 2.000000 \n", + "75% 20.000000 79.500000 3.000000 \n", + "max 500.000000 24159.405000 2139.833333 \n", + "\n", + " purchase_count total_price \n", + "count 471598.00000 3.950770e+05 \n", + "mean 0.29900 2.608544e+01 \n", + "std 7.22753 2.089636e+03 \n", + "min 0.00000 0.000000e+00 \n", + "25% 0.00000 0.000000e+00 \n", + "50% 0.00000 0.000000e+00 \n", + "75% 0.00000 0.000000e+00 \n", + "max 3532.00000 1.262516e+06 \n", + "Statistiques Descriptives company 6\n", + " average_price average_price_basket average_ticket_basket \\\n", + "count 33779.000000 33779.000000 33779.000000 \n", + "mean 24.033859 56.711279 2.413530 \n", + "std 21.217031 72.841926 3.763809 \n", + "min -52.740000 -1046.666667 1.000000 \n", + "25% 10.000000 19.000000 1.080000 \n", + "50% 19.333333 39.000000 2.000000 \n", + "75% 30.000000 72.990000 3.000000 \n", + "max 199.990000 3922.845361 309.047619 \n", + "\n", + " purchase_count total_price \n", + "count 79938.000000 79938.000000 \n", + "mean 2.842090 102.251041 \n", + "std 74.949889 4290.159858 \n", + "min 0.000000 -3140.000000 \n", + "25% 0.000000 0.000000 \n", + "50% 0.000000 0.000000 \n", + "75% 1.000000 54.980000 \n", + "max 14750.000000 762695.290000 \n", + "Statistiques Descriptives company 7\n", + " average_price average_price_basket average_ticket_basket \\\n", + 
"count 39524.000000 39524.000000 39524.000000 \n", + "mean 33.110568 155.618778 3.365885 \n", + "std 85.221328 1085.613137 6.283143 \n", + "min 0.000000 0.000000 1.000000 \n", + "25% 17.250000 25.000000 1.800000 \n", + "50% 25.000000 57.676364 2.000000 \n", + "75% 43.054691 115.837500 3.555556 \n", + "max 10770.000000 86160.000000 400.000000 \n", + "\n", + " purchase_count total_price \n", + "count 68800.000000 68800.000000 \n", + "mean 3.290029 944.593729 \n", + "std 88.071870 12118.394731 \n", + "min 0.000000 0.000000 \n", + "25% 0.000000 0.000000 \n", + "50% 1.000000 9.000000 \n", + "75% 2.000000 132.000000 \n", + "max 22934.000000 940874.200000 \n", + "Statistiques Descriptives company 8\n", + " average_price average_price_basket average_ticket_basket \\\n", + "count 129198.000000 129198.000000 129198.000000 \n", + "mean 18.409977 38.492520 2.258036 \n", + "std 19.159059 71.136628 5.270858 \n", + "min -20.000000 -1545.000000 1.000000 \n", + "25% 0.000000 0.000000 1.000000 \n", + "50% 15.000000 20.000000 2.000000 \n", + "75% 28.461538 52.500000 2.000000 \n", + "max 390.000000 7618.227273 750.000000 \n", + "\n", + " purchase_count total_price \n", + "count 197376.000000 197376.000000 \n", + "mean 4.637448 130.336075 \n", + "std 96.228665 2791.899946 \n", + "min 0.000000 -36124.000000 \n", + "25% 0.000000 0.000000 \n", + "50% 1.000000 0.000000 \n", + "75% 2.000000 75.000000 \n", + "max 40272.000000 702080.290000 \n", + "Statistiques Descriptives company 9\n", + " average_price average_price_basket average_ticket_basket \\\n", + "count 102710.000000 102710.000000 102710.000000 \n", + "mean 60.312171 62.384177 1.042402 \n", + "std 50.018101 52.009984 0.268064 \n", + "min -291.670000 -291.670000 1.000000 \n", + "25% 41.500000 42.350000 1.000000 \n", + "50% 59.000000 61.070000 1.000000 \n", + "75% 74.550000 77.710000 1.000000 \n", + "max 1116.500000 1216.950000 23.000000 \n", + "\n", + " purchase_count total_price \n", + "count 181134.000000 181134.000000 \n", + "mean 
# Print the descriptive statistics of the main purchase indicators, one
# company at a time.
summary_columns = [
    'average_price',
    'average_price_basket',
    'average_ticket_basket',
    'purchase_count',
    'total_price',
]

for company in sport_comp:
    print(f'Statistiques Descriptives company {company}')
    is_company = customer_sport['number_company'] == company
    company_data = customer_sport.loc[is_company, summary_columns]
    print(company_data.describe())

# Flag customers who made at least one purchase.
customer_sport["already_purchased"] = customer_sport["purchase_count"] > 0
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idstreet_idstructure_idmcp_contact_idfidelitytenant_idis_partnerdeleted_atgenderis_email_true...purchase_countfirst_buying_datecountrygender_labelgender_femalegender_malegender_othercountry_frhas_tagsnumber_company
0[5, _, 6, 0, 0, 9, 7, 4, 5]1372685NaNNaN01771FalseNaN2True...0NaNafother0010.005
1[5, _, 6, 0, 1, 1, 2, 2, 8]1372685NaNNaN01771FalseNaN2True...0NaNafother0010.005
2[5, _, 6, 0, 5, 8, 9, 5, 0]1372685NaNNaN01771FalseNaN2True...0NaNafother0010.005
3[5, _, 6, 0, 6, 2, 4, 0, 4]1372685NaNNaN01771FalseNaN2True...0NaNafother0010.005
4[5, _, 2, 5, 0, 2, 1, 7]78785NaN11035.001771FalseNaN0True...0NaNfrfemale1001.005
..................................................................
998841[9, _, 9, 9, 5, 1, 4, 6]607676NaNNaN11490FalseNaN1True...12022-05-12 06:20:49+00:00NaNmale010NaN09
998842[9, _, 9, 7, 0, 8, 9, 1]587855NaNNaN11490FalseNaN1True...12022-05-03 04:20:43+00:00frmale0101.009
998843[9, _, 8, 4, 4, 3, 0, 2]484177NaNNaN11490FalseNaN1True...12022-03-27 12:15:02+00:00demale0100.009
998844[9, _, 9, 4, 1, 2, 6, 0]564032NaNNaN11490FalseNaN1True...12022-04-20 15:12:38+00:00chmale0100.009
998845[9, _, 8, 0, 9, 7, 4, 2]453747NaNNaN11490FalseNaN1True...12022-03-07 20:42:07+00:00frmale0101.009
\n", + "

998846 rows × 29 columns

\n", + "
" + ], + "text/plain": [ + " customer_id street_id structure_id mcp_contact_id \\\n", + "0 [5, _, 6, 0, 0, 9, 7, 4, 5] 1372685 NaN NaN \n", + "1 [5, _, 6, 0, 1, 1, 2, 2, 8] 1372685 NaN NaN \n", + "2 [5, _, 6, 0, 5, 8, 9, 5, 0] 1372685 NaN NaN \n", + "3 [5, _, 6, 0, 6, 2, 4, 0, 4] 1372685 NaN NaN \n", + "4 [5, _, 2, 5, 0, 2, 1, 7] 78785 NaN 11035.0 \n", + "... ... ... ... ... \n", + "998841 [9, _, 9, 9, 5, 1, 4, 6] 607676 NaN NaN \n", + "998842 [9, _, 9, 7, 0, 8, 9, 1] 587855 NaN NaN \n", + "998843 [9, _, 8, 4, 4, 3, 0, 2] 484177 NaN NaN \n", + "998844 [9, _, 9, 4, 1, 2, 6, 0] 564032 NaN NaN \n", + "998845 [9, _, 8, 0, 9, 7, 4, 2] 453747 NaN NaN \n", + "\n", + " fidelity tenant_id is_partner deleted_at gender is_email_true \\\n", + "0 0 1771 False NaN 2 True \n", + "1 0 1771 False NaN 2 True \n", + "2 0 1771 False NaN 2 True \n", + "3 0 1771 False NaN 2 True \n", + "4 0 1771 False NaN 0 True \n", + "... ... ... ... ... ... ... \n", + "998841 1 1490 False NaN 1 True \n", + "998842 1 1490 False NaN 1 True \n", + "998843 1 1490 False NaN 1 True \n", + "998844 1 1490 False NaN 1 True \n", + "998845 1 1490 False NaN 1 True \n", + "\n", + " ... purchase_count first_buying_date country gender_label \\\n", + "0 ... 0 NaN af other \n", + "1 ... 0 NaN af other \n", + "2 ... 0 NaN af other \n", + "3 ... 0 NaN af other \n", + "4 ... 0 NaN fr female \n", + "... ... ... ... ... ... \n", + "998841 ... 1 2022-05-12 06:20:49+00:00 NaN male \n", + "998842 ... 1 2022-05-03 04:20:43+00:00 fr male \n", + "998843 ... 1 2022-03-27 12:15:02+00:00 de male \n", + "998844 ... 1 2022-04-20 15:12:38+00:00 ch male \n", + "998845 ... 1 2022-03-07 20:42:07+00:00 fr male \n", + "\n", + " gender_female gender_male gender_other country_fr has_tags \\\n", + "0 0 0 1 0.0 0 \n", + "1 0 0 1 0.0 0 \n", + "2 0 0 1 0.0 0 \n", + "3 0 0 1 0.0 0 \n", + "4 1 0 0 1.0 0 \n", + "... ... ... ... ... ... 
# Display the aggregated customer table
customer_sport


def mailing_consent(customer_sport):
    """Draw grouped bars of the mean ``opt_in`` (in %) per company.

    Two bars are drawn for each ``number_company``: one for customers whose
    ``already_purchased`` flag is False ("no purchase") and one for those
    where it is True ("purchased"). The table must therefore contain the
    ``already_purchased`` column.

    A company that lacks one of the two groups gets a NaN value for it
    instead of raising IndexError.
    """
    # Rows: companies; columns: already_purchased values (False, True)
    consent_rate = (
        customer_sport.groupby(["number_company", "already_purchased"])["opt_in"]
        .mean()
        .unstack("already_purchased")
    )

    categories = consent_rate.index
    bar_width = 0.35
    base_positions = np.arange(len(categories))

    fig, ax = plt.subplots(figsize=(10, 6))

    # One series of bars per already_purchased value, shifted by one bar width
    for offset, (label, rates) in enumerate(consent_rate.items()):
        label_printed = "purchased" if label else "no purchase"
        ax.bar(base_positions + offset * bar_width, rates.values * 100, bar_width, label=label_printed)

    # Labels, ticks centred between the two bars, legend
    ax.set_xlabel('Numero de compagnie')
    ax.set_ylabel('Part de consentement (%)')
    ax.set_title('Part de consentement au mailing selon les compagnies')
    ax.set_xticks(base_positions + bar_width / 2)
    ax.set_xticklabels(categories)
    ax.legend()

    # Display the plot
    plt.show()


def gender_bar(customer_sport):
    """Draw stacked bars of the male and female customer shares per company.

    The shares are the per-company means of the ``gender_male`` and
    ``gender_female`` columns. ``gender_other`` is averaged as well but is not
    drawn.

    NOTE(review): the title mentions "spectacle" although this notebook is
    about sport companies - confirm the intended wording.
    """
    company_genders = customer_sport.groupby("number_company")[["gender_male", "gender_female", "gender_other"]].mean().reset_index()

    # Male share at the bottom, female share stacked on top of it
    plt.bar(company_genders["number_company"], company_genders["gender_male"], label="Homme")
    plt.bar(company_genders["number_company"], company_genders["gender_female"],
            bottom=company_genders["gender_male"], label="Femme")

    # Axis labels, title and legend
    plt.xlabel('Company')
    plt.ylabel("Part de clients de chaque sexe")
    plt.title("Sexe des clients de chaque compagnie de spectacle")
    plt.legend()

    # Display the bar plot
    plt.show()


# The function requires the customer table as argument.
gender_bar(customer_sport)


def country_bar(customer_sport):
    """Draw a bar chart of the mean ``country_fr`` value per company.

    NOTE(review): the title mentions "spectacle" although this notebook is
    about sport companies - confirm the intended wording.
    """
    # Group on "number_company" (the former "number_compagny" spelling is not
    # a column of the table and raised KeyError).
    company_country_fr = customer_sport.groupby("number_company")["country_fr"].mean().reset_index()

    # One bar per company
    plt.bar(company_country_fr["number_company"], company_country_fr["country_fr"])

    # Axis labels and title
    plt.xlabel('Company')
    plt.ylabel("Part de clients français")
    plt.title("Nationalité des clients de chaque compagnie de spectacle")

    # Display the bar plot
    plt.show()


# The function requires the customer table as argument.
country_bar(customer_sport)