{ "cells": [ { "cell_type": "markdown", "id": "5bf5c226", "metadata": {}, "source": [ "# Business Data Challenge - Team 1" ] }, { "cell_type": "code", "execution_count": 22, "id": "b1a5b9d3", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import os\n", "import s3fs\n", "import re\n", "import warnings\n", "import io\n", "import matplotlib.pyplot as plt\n" ] }, { "cell_type": "markdown", "id": "ecfa2219", "metadata": {}, "source": [ "Configuration de l'accès aux données" ] }, { "cell_type": "code", "execution_count": 2, "id": "1a094277", "metadata": {}, "outputs": [], "source": [ "# Create filesystem object\n", "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})" ] }, { "cell_type": "code", "execution_count": 3, "id": "30d77451-2df6-4c07-8b15-66e0e990ff03", "metadata": {}, "outputs": [], "source": [ "# Import cleaning and merge functions\n", "\n", "exec(open('0_Cleaning_and_merge_functions.py').read())\n", "\n", "exec(open('0_KPI_functions.py').read())\n", "\n", "# Ignore warning\n", "warnings.filterwarnings('ignore')\n" ] }, { "cell_type": "code", "execution_count": 4, "id": "f1b44d3e-76bb-4860-b9db-a2840db7cf39", "metadata": {}, "outputs": [], "source": [ "def load_dataset_2(directory_path, file_name):\n", " \"\"\"\n", " This function loads csv file\n", " \"\"\"\n", " file_path = \"bdc2324-data\" + \"/\" + directory_path + \"/\" + directory_path + file_name + \".csv\"\n", " with fs.open(file_path, mode=\"rb\") as file_in:\n", " df = pd.read_csv(file_in, sep=\",\")\n", "\n", " # drop na :\n", " #df = df.dropna(axis=1, thresh=len(df))\n", " # if identifier in table : delete it\n", " if 'identifier' in df.columns:\n", " df = df.drop(columns = 'identifier')\n", " return df" ] }, { "cell_type": "code", "execution_count": 5, "id": "31ab76f0-fbb1-46f6-b359-97228620c207", "metadata": {}, "outputs": [], "source": [ "def export_in_temporary(df, output_name):\n", " print('Export of dataset :', output_name)\n", " FILE_PATH_OUT_S3 = \"ajoubrel-ensae/Temporary\" + \"/\" + output_name + '.csv'\n", " with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n", " df.to_csv(file_out, index = False)" ] }, { "cell_type": "code", "execution_count": 15, "id": "108fc5ef-c56a-4f03-a867-943d9d6492fd", "metadata": {}, "outputs": [], "source": [ "def save_file_s3(File_name, type_of_activity):\n", " image_buffer = io.BytesIO()\n", " plt.savefig(image_buffer, format='png')\n", " image_buffer.seek(0)\n", " FILE_PATH = f\"projet-bdc2324-team1/stat_desc/{type_of_activity}/\"\n", " FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '.png'\n", " with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:\n", " s3_file.write(image_buffer.read())\n", " plt.close()" ] }, { "cell_type": "code", "execution_count": 37, "id": "c99b9cb7-00ab-41cf-bde7-38676f5a3d02", "metadata": {}, "outputs": [], "source": [ "def taux_partner(campany_nb) :\n", "\n", " is_partner = load_dataset_2(campany_nb, 'customersplus')[['is_partner']].astype(int)\n", " percentage_partner = (is_partner['is_partner'].mean()) * 100\n", " \n", " return percentage_partner\n", " \n" ] }, { "cell_type": "code", "execution_count": 43, "id": "6facc27e-f95d-49c5-afe0-8c34b3a0cb94", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.0\n" ] } ], "source": [ "a = 0\n", "for nb in [\"1\", \"2\", \"3\", \"4\", \"5\", \"6\", \"7\", \"8\", \"9\", \"10\", \"11\", \"12\", \"13\", \"14\"]:\n", " a += taux_partner(nb)\n", "\n", "print(a/14)" ] }, { "cell_type": "markdown", "id": "ccf597b0-b459-4ea5-baf0-5ba8c90915e4", "metadata": {}, "source": [ "# Cleaning target area and tags" ] }, { "cell_type": "code", "execution_count": 14, "id": "fd88e294-e038-4cec-ad94-2bbbc10a4059", "metadata": {}, "outputs": [], "source": [ "def concatenate_names(names):\n", " return ', '.join(names)\n", "\n", "def targets_KPI(df_target = None):\n", " \n", " df_target['target_name'] = df_target['target_name'].fillna('').str.lower()\n", "\n", " # Target name cotegory musees / \n", " df_target['target_jeune'] = df_target['target_name'].str.contains('|'.join(['jeune', 'pass_culture', 'etudiant', '12-25 ans', 'student', 'jeunesse']), case=False).astype(int)\n", " df_target['target_optin'] = df_target['target_name'].str.contains('|'.join(['optin' ,'opt-in']), case=False).astype(int)\n", " df_target['target_optout'] = df_target['target_name'].str.contains('|'.join(['optout', 'unsubscribed']), case=False).astype(int)\n", " df_target['target_scolaire'] = df_target['target_name'].str.contains('|'.join(['scolaire' , 'enseignant', 'chercheur', 'schulen', 'école']), case=False).astype(int)\n", " df_target['target_entreprise'] = df_target['target_name'].str.contains('|'.join(['b2b', 'btob', 'cse']), case=False).astype(int)\n", " df_target['target_famille'] = df_target['target_name'].str.contains('|'.join(['famille', 'enfants', 'family']), case=False).astype(int)\n", " df_target['target_newsletter'] = df_target['target_name'].str.contains('|'.join(['nl', 'newsletter']), case=False).astype(int)\n", " \n", " # Target name category for sport compagnies\n", " df_target['target_abonne'] = ((\n", " df_target['target_name']\n", " .str.contains('|'.join(['abo', 'adh']), case=False)\n", " & ~df_target['target_name'].str.contains('|'.join(['hors abo', 'anciens abo']), case=False)\n", " ).astype(int))\n", " \n", " df_target_categorie = df_target.groupby('customer_id')[['target_jeune', 'target_optin', 'target_optout', 'target_scolaire', 'target_entreprise', 'target_famille', 'target_newsletter', 'target_abonne']].max()\n", " \n", " target_agg = df_target.groupby('customer_id').agg(\n", " nb_targets=('target_name', 'nunique') # Utilisation de tuples pour spécifier les noms de colonnes\n", " # all_targets=('target_name', concatenate_names),\n", " # all_target_types=('target_type_name', concatenate_names)\n", " ).reset_index()\n", "\n", " target_agg['nb_targets'] = (target_agg['nb_targets'] - (target_agg['nb_targets'].mean())) / (target_agg['nb_targets'].std())\n", " \n", " target_agg = pd.merge(target_agg, df_target_categorie, how='left', on='customer_id')\n", " \n", " return target_agg" ] }, { "cell_type": "code", "execution_count": 15, "id": "1b124018-9637-463e-b512-15743ec9480b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "File path : projet-bdc2324-team1/0_Input/Company_5/target_information.csv\n" ] }, { "data": { "text/html": [ "
\n", " | customer_id | \n", "nb_targets | \n", "target_jeune | \n", "target_optin | \n", "target_optout | \n", "target_scolaire | \n", "target_entreprise | \n", "target_famille | \n", "target_newsletter | \n", "target_abonne | \n", "
---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "160516 | \n", "6.938264 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "
1 | \n", "160517 | \n", "10.357387 | \n", "0 | \n", "1 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "
2 | \n", "160518 | \n", "5.228703 | \n", "0 | \n", "1 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "
3 | \n", "160519 | \n", "6.083483 | \n", "0 | \n", "1 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "1 | \n", "
4 | \n", "160520 | \n", "2.949288 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
471205 | \n", "6405875 | \n", "-0.754762 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
471206 | \n", "6405905 | \n", "-0.469835 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
471207 | \n", "6405909 | \n", "-0.754762 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
471208 | \n", "6405917 | \n", "-0.754762 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
471209 | \n", "6405963 | \n", "-0.754762 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
471210 rows × 10 columns
\n", "\n", " | category_name | \n", "list_target_name | \n", "
---|---|---|
0 | \n", "target_jeune | \n", "[jeunesses vaudoises, etudiant hors ssc 22-23, student supporter club, etudiants hors epfl, student supporter club ehl, etudiants] | \n", "
1 | \n", "target_optin | \n", "[] | \n", "
2 | \n", "target_optout | \n", "[] | \n", "
3 | \n", "target_scolaire | \n", "[] | \n", "
4 | \n", "target_entreprise | \n", "[prospects b2b, prospects survey b2b, b2b à enlever, prospect b2b fc 06.10, prospect b2b rk 06.10] | \n", "
5 | \n", "target_famille | \n", "[family corner - 20.11.22, family corner - saison 19-20] | \n", "
6 | \n", "target_newsletter | \n", "[consentements nl lhc, newsletter 2022, b2b à enlever, abonnés newsletter - saison 21-22] | \n", "
7 | \n", "target_abonne | \n", "[abonnés 23/24, non renouvellement abo 23-24 debout (23.06), résas abos debout - 29.06, abonnés - assis, sondage reconduction abos, sondage nouveaux abos, abonnés b2c - relance 1, abonnés b2c - relance 2, relance abos assis, non renouvellement abo 23-24 assis (23.06), résas abos assis - 29.06, abonnés - playoffs, abonnés - debout, abonnés non vip - saison 22-23, avantage abonné - ticket, paiements abos, campagneabosconcours - abonnés 21-22 en attente, campagneabosconcours - abonnés 22-23, nouveaux abonnés - saison 22-23, abonnements - relance 15.04, abonnements - relance 13.04, abonnements - relance 11.04, abonnements - relance 07.04, abonnés newsletter - saison 21-22, abonnés 1-3 ans, abonnés 1-3 ans - relance, abonnés - non-renouvellement 22-23, abonnés - renoncement playoffs 22, abonnés 5 ans - relance, abonnés - version finale] | \n", "