{ "cells": [ { "cell_type": "markdown", "id": "5bf5c226", "metadata": {}, "source": [ "# Business Data Challenge - Team 1" ] }, { "cell_type": "code", "execution_count": 1, "id": "b1a5b9d3", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import os\n", "import s3fs\n", "import re\n", "import warnings" ] }, { "cell_type": "markdown", "id": "ecfa2219", "metadata": {}, "source": [ "Configuration de l'accès aux données" ] }, { "cell_type": "code", "execution_count": 2, "id": "1a094277", "metadata": {}, "outputs": [], "source": [ "# Create filesystem object\n", "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})" ] }, { "cell_type": "code", "execution_count": 3, "id": "30d77451-2df6-4c07-8b15-66e0e990ff03", "metadata": {}, "outputs": [], "source": [ "# Import cleaning and merge functions\n", "\n", "exec(open('0_Cleaning_and_merge_functions.py').read())\n", "\n", "exec(open('0_KPI_functions.py').read())\n", "\n", "# Ignore warning\n", "warnings.filterwarnings('ignore')\n" ] }, { "cell_type": "code", "execution_count": 4, "id": "f1b44d3e-76bb-4860-b9db-a2840db7cf39", "metadata": {}, "outputs": [], "source": [ "def load_dataset_2(directory_path, file_name):\n", " \"\"\"\n", " This function loads csv file\n", " \"\"\"\n", " file_path = \"bdc2324-data\" + \"/\" + directory_path + \"/\" + directory_path + file_name + \".csv\"\n", " with fs.open(file_path, mode=\"rb\") as file_in:\n", " df = pd.read_csv(file_in, sep=\",\")\n", "\n", " # drop na :\n", " #df = df.dropna(axis=1, thresh=len(df))\n", " # if identifier in table : delete it\n", " if 'identifier' in df.columns:\n", " df = df.drop(columns = 'identifier')\n", " return df" ] }, { "cell_type": "code", "execution_count": 5, "id": "31ab76f0-fbb1-46f6-b359-97228620c207", "metadata": {}, "outputs": [], "source": [ "def export_in_temporary(df, output_name):\n", " print('Export of dataset :', output_name)\n", " FILE_PATH_OUT_S3 = \"ajoubrel-ensae/Temporary\" + \"/\" + output_name + '.csv'\n", " with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n", " df.to_csv(file_out, index = False)" ] }, { "cell_type": "markdown", "id": "ccf597b0-b459-4ea5-baf0-5ba8c90915e4", "metadata": {}, "source": [ "# Cleaning target area and tags" ] }, { "cell_type": "code", "execution_count": 14, "id": "fd88e294-e038-4cec-ad94-2bbbc10a4059", "metadata": {}, "outputs": [], "source": [ "def concatenate_names(names):\n", " return ', '.join(names)\n", "\n", "def targets_KPI(df_target = None):\n", " \n", " df_target['target_name'] = df_target['target_name'].fillna('').str.lower()\n", "\n", " # Target name cotegory musees / \n", " df_target['target_jeune'] = df_target['target_name'].str.contains('|'.join(['jeune', 'pass_culture', 'etudiant', '12-25 ans', 'student', 'jeunesse']), case=False).astype(int)\n", " df_target['target_optin'] = df_target['target_name'].str.contains('|'.join(['optin' ,'opt-in']), case=False).astype(int)\n", " df_target['target_optout'] = df_target['target_name'].str.contains('|'.join(['optout', 'unsubscribed']), case=False).astype(int)\n", " df_target['target_scolaire'] = df_target['target_name'].str.contains('|'.join(['scolaire' , 'enseignant', 'chercheur', 'schulen', 'école']), case=False).astype(int)\n", " df_target['target_entreprise'] = df_target['target_name'].str.contains('|'.join(['b2b', 'btob', 'cse']), case=False).astype(int)\n", " df_target['target_famille'] = df_target['target_name'].str.contains('|'.join(['famille', 'enfants', 'family']), case=False).astype(int)\n", " df_target['target_newsletter'] = df_target['target_name'].str.contains('|'.join(['nl', 'newsletter']), case=False).astype(int)\n", " \n", " # Target name category for sport compagnies\n", " df_target['target_abonne'] = ((\n", " df_target['target_name']\n", " .str.contains('|'.join(['abo', 'adh']), case=False)\n", " & ~df_target['target_name'].str.contains('|'.join(['hors abo', 'anciens abo']), case=False)\n", " ).astype(int))\n", " \n", " df_target_categorie = df_target.groupby('customer_id')[['target_jeune', 'target_optin', 'target_optout', 'target_scolaire', 'target_entreprise', 'target_famille', 'target_newsletter', 'target_abonne']].max()\n", " \n", " target_agg = df_target.groupby('customer_id').agg(\n", " nb_targets=('target_name', 'nunique') # Utilisation de tuples pour spécifier les noms de colonnes\n", " # all_targets=('target_name', concatenate_names),\n", " # all_target_types=('target_type_name', concatenate_names)\n", " ).reset_index()\n", "\n", " target_agg['nb_targets'] = (target_agg['nb_targets'] - (target_agg['nb_targets'].mean())) / (target_agg['nb_targets'].std())\n", " \n", " target_agg = pd.merge(target_agg, df_target_categorie, how='left', on='customer_id')\n", " \n", " return target_agg" ] }, { "cell_type": "code", "execution_count": 15, "id": "1b124018-9637-463e-b512-15743ec9480b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "File path : projet-bdc2324-team1/0_Input/Company_5/target_information.csv\n" ] }, { "data": { "text/html": [ "
\n", " | customer_id | \n", "nb_targets | \n", "target_jeune | \n", "target_optin | \n", "target_optout | \n", "target_scolaire | \n", "target_entreprise | \n", "target_famille | \n", "target_newsletter | \n", "target_abonne | \n", "
---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "160516 | \n", "6.938264 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "
1 | \n", "160517 | \n", "10.357387 | \n", "0 | \n", "1 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "
2 | \n", "160518 | \n", "5.228703 | \n", "0 | \n", "1 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "
3 | \n", "160519 | \n", "6.083483 | \n", "0 | \n", "1 | \n", "1 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "1 | \n", "
4 | \n", "160520 | \n", "2.949288 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
471205 | \n", "6405875 | \n", "-0.754762 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
471206 | \n", "6405905 | \n", "-0.469835 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
471207 | \n", "6405909 | \n", "-0.754762 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
471208 | \n", "6405917 | \n", "-0.754762 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
471209 | \n", "6405963 | \n", "-0.754762 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
471210 rows × 10 columns
\n", "