BDC-team-1/Exploration_billet_AJ.ipynb

2194 lines
60 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"id": "5bf5c226",
"metadata": {},
"source": [
"# Business Data Challenge - Team 1"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "b1a5b9d3",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import os\n",
"import s3fs\n",
"import re\n",
"import warnings"
]
},
{
"cell_type": "markdown",
"id": "ecfa2219",
"metadata": {},
"source": [
"Configuration de l'accès aux données"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "1a094277",
"metadata": {},
"outputs": [],
"source": [
"# Create filesystem object\n",
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "30d77451-2df6-4c07-8b15-66e0e990ff03",
"metadata": {},
"outputs": [],
"source": [
"# Import cleaning and merge functions\n",
"\n",
"exec(open('0_Cleaning_and_merge_functions.py').read())\n",
"\n",
"exec(open('0_KPI_functions.py').read())\n",
"\n",
"# Ignore warning\n",
"warnings.filterwarnings('ignore')\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "f1b44d3e-76bb-4860-b9db-a2840db7cf39",
"metadata": {},
"outputs": [],
"source": [
"def load_dataset_2(directory_path, file_name):\n",
" \"\"\"\n",
" This function loads csv file\n",
" \"\"\"\n",
" file_path = \"bdc2324-data\" + \"/\" + directory_path + \"/\" + directory_path + file_name + \".csv\"\n",
" with fs.open(file_path, mode=\"rb\") as file_in:\n",
" df = pd.read_csv(file_in, sep=\",\")\n",
"\n",
" # drop na :\n",
" #df = df.dropna(axis=1, thresh=len(df))\n",
" # if identifier in table : delete it\n",
" if 'identifier' in df.columns:\n",
" df = df.drop(columns = 'identifier')\n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "31ab76f0-fbb1-46f6-b359-97228620c207",
"metadata": {},
"outputs": [],
"source": [
"def export_in_temporary(df, output_name):\n",
" print('Export of dataset :', output_name)\n",
" FILE_PATH_OUT_S3 = \"ajoubrel-ensae/Temporary\" + \"/\" + output_name + '.csv'\n",
" with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n",
" df.to_csv(file_out, index = False)"
]
},
{
"cell_type": "markdown",
"id": "ccf597b0-b459-4ea5-baf0-5ba8c90915e4",
"metadata": {},
"source": [
"# Cleaning target area and tags"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "fd88e294-e038-4cec-ad94-2bbbc10a4059",
"metadata": {},
"outputs": [],
"source": [
"def concatenate_names(names):\n",
" return ', '.join(names)\n",
"\n",
"def targets_KPI(df_target = None):\n",
" \n",
" df_target['target_name'] = df_target['target_name'].fillna('').str.lower()\n",
"\n",
" # Target name cotegory musees / \n",
" df_target['target_jeune'] = df_target['target_name'].str.contains('|'.join(['jeune', 'pass_culture', 'etudiant', '12-25 ans', 'student', 'jeunesse']), case=False).astype(int)\n",
" df_target['target_optin'] = df_target['target_name'].str.contains('|'.join(['optin' ,'opt-in']), case=False).astype(int)\n",
" df_target['target_optout'] = df_target['target_name'].str.contains('|'.join(['optout', 'unsubscribed']), case=False).astype(int)\n",
" df_target['target_scolaire'] = df_target['target_name'].str.contains('|'.join(['scolaire' , 'enseignant', 'chercheur', 'schulen', 'école']), case=False).astype(int)\n",
" df_target['target_entreprise'] = df_target['target_name'].str.contains('|'.join(['b2b', 'btob', 'cse']), case=False).astype(int)\n",
" df_target['target_famille'] = df_target['target_name'].str.contains('|'.join(['famille', 'enfants', 'family']), case=False).astype(int)\n",
" df_target['target_newsletter'] = df_target['target_name'].str.contains('|'.join(['nl', 'newsletter']), case=False).astype(int)\n",
" \n",
" # Target name category for sport compagnies\n",
" df_target['target_abonne'] = ((\n",
" df_target['target_name']\n",
" .str.contains('|'.join(['abo', 'adh']), case=False)\n",
" & ~df_target['target_name'].str.contains('|'.join(['hors abo', 'anciens abo']), case=False)\n",
" ).astype(int))\n",
" \n",
" df_target_categorie = df_target.groupby('customer_id')[['target_jeune', 'target_optin', 'target_optout', 'target_scolaire', 'target_entreprise', 'target_famille', 'target_newsletter', 'target_abonne']].max()\n",
" \n",
" target_agg = df_target.groupby('customer_id').agg(\n",
" nb_targets=('target_name', 'nunique') # Utilisation de tuples pour spécifier les noms de colonnes\n",
" # all_targets=('target_name', concatenate_names),\n",
" # all_target_types=('target_type_name', concatenate_names)\n",
" ).reset_index()\n",
"\n",
" target_agg['nb_targets'] = (target_agg['nb_targets'] - (target_agg['nb_targets'].mean())) / (target_agg['nb_targets'].std())\n",
" \n",
" target_agg = pd.merge(target_agg, df_target_categorie, how='left', on='customer_id')\n",
" \n",
" return target_agg"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "1b124018-9637-463e-b512-15743ec9480b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : projet-bdc2324-team1/0_Input/Company_5/target_information.csv\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>customer_id</th>\n",
" <th>nb_targets</th>\n",
" <th>target_jeune</th>\n",
" <th>target_optin</th>\n",
" <th>target_optout</th>\n",
" <th>target_scolaire</th>\n",
" <th>target_entreprise</th>\n",
" <th>target_famille</th>\n",
" <th>target_newsletter</th>\n",
" <th>target_abonne</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>160516</td>\n",
" <td>6.938264</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>160517</td>\n",
" <td>10.357387</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>160518</td>\n",
" <td>5.228703</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>160519</td>\n",
" <td>6.083483</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>160520</td>\n",
" <td>2.949288</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>471205</th>\n",
" <td>6405875</td>\n",
" <td>-0.754762</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>471206</th>\n",
" <td>6405905</td>\n",
" <td>-0.469835</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>471207</th>\n",
" <td>6405909</td>\n",
" <td>-0.754762</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>471208</th>\n",
" <td>6405917</td>\n",
" <td>-0.754762</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>471209</th>\n",
" <td>6405963</td>\n",
" <td>-0.754762</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>471210 rows × 10 columns</p>\n",
"</div>"
],
"text/plain": [
" customer_id nb_targets target_jeune target_optin target_optout \\\n",
"0 160516 6.938264 0 1 0 \n",
"1 160517 10.357387 0 1 1 \n",
"2 160518 5.228703 0 1 1 \n",
"3 160519 6.083483 0 1 1 \n",
"4 160520 2.949288 0 1 0 \n",
"... ... ... ... ... ... \n",
"471205 6405875 -0.754762 0 0 1 \n",
"471206 6405905 -0.469835 0 0 1 \n",
"471207 6405909 -0.754762 0 0 1 \n",
"471208 6405917 -0.754762 0 0 1 \n",
"471209 6405963 -0.754762 0 0 1 \n",
"\n",
" target_scolaire target_entreprise target_famille target_newsletter \\\n",
"0 0 1 0 0 \n",
"1 0 0 0 0 \n",
"2 0 0 0 0 \n",
"3 0 0 1 0 \n",
"4 0 0 0 0 \n",
"... ... ... ... ... \n",
"471205 0 0 0 0 \n",
"471206 0 0 0 0 \n",
"471207 0 0 0 0 \n",
"471208 0 0 0 0 \n",
"471209 0 0 0 0 \n",
"\n",
" target_abonne \n",
"0 1 \n",
"1 1 \n",
"2 1 \n",
"3 1 \n",
"4 1 \n",
"... ... \n",
"471205 0 \n",
"471206 0 \n",
"471207 0 \n",
"471208 0 \n",
"471209 0 \n",
"\n",
"[471210 rows x 10 columns]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"targets_KPI(display_input_databases('5', file_name = \"target_information\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c75efea3-b5e8-4a7a-bed4-dd64ae9ff9f2",
"metadata": {},
"outputs": [],
"source": [
"#export_in_temporary(target_agg, 'Target_kpi_concatenate')"
]
},
{
"cell_type": "markdown",
"id": "5d91263e-8a97-4cb1-8d94-db8ab0b77cdf",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"# Brouillon"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c5e864b1-adad-4267-b956-3f7ef371d677",
"metadata": {},
"outputs": [],
"source": [
"\n",
"def display_covering_time(df, company, datecover):\n",
" \"\"\"\n",
" This function draws the time coverage of each company\n",
" \"\"\"\n",
" min_date = df['purchase_date'].min().strftime(\"%Y-%m-%d\")\n",
" max_date = df['purchase_date'].max().strftime(\"%Y-%m-%d\")\n",
" datecover[company] = [datetime.strptime(min_date, \"%Y-%m-%d\") + timedelta(days=x) for x in range((datetime.strptime(max_date, \"%Y-%m-%d\") - datetime.strptime(min_date, \"%Y-%m-%d\")).days)]\n",
" print(f'Couverture Company {company} : {min_date} - {max_date}')\n",
" return datecover\n",
"\n",
"\n",
"def compute_time_intersection(datecover):\n",
" \"\"\"\n",
" This function returns the time coverage for all companies\n",
" \"\"\"\n",
" timestamps_sets = [set(timestamps) for timestamps in datecover.values()]\n",
" intersection = set.intersection(*timestamps_sets)\n",
" intersection_list = list(intersection)\n",
" formated_dates = [dt.strftime(\"%Y-%m-%d\") for dt in intersection_list]\n",
" return sorted(formated_dates)\n",
"\n",
"\n",
"def df_coverage_modelization(sport, coverage_features = 0.7):\n",
" \"\"\"\n",
" This function returns start_date, end_of_features and final dates\n",
" that help to construct train and test datasets\n",
" \"\"\"\n",
" datecover = {}\n",
" for company in sport:\n",
" df_products_purchased_reduced = display_input_databases(company, file_name = \"products_purchased_reduced\",\n",
" datetime_col = ['purchase_date'])\n",
" datecover = display_covering_time(df_products_purchased_reduced, company, datecover)\n",
" #print(datecover.keys())\n",
" dt_coverage = compute_time_intersection(datecover)\n",
" start_date = dt_coverage[0]\n",
" end_of_features = dt_coverage[int(0.7 * len(dt_coverage))]\n",
" final_date = dt_coverage[-1]\n",
" return start_date, end_of_features, final_date\n",
" "
]
},
{
"cell_type": "markdown",
"id": "2435097a-95a5-43e1-84d0-7f6b701441ba",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"# Bases non communes : mise à plat"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f8f988fb-5aab-4b57-80d1-e242f7e5b384",
"metadata": {},
"outputs": [],
"source": [
"companies = {'musee' : ['1', '2', '3', '4'],\n",
" 'sport': ['5', '6', '7', '8', '9'],\n",
" 'musique' : ['10', '11', '12', '13', '14']}\n",
"\n",
"all_companies = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "35ac004f-c191-4f45-a4b1-6d993d9ec38c",
"metadata": {},
"outputs": [],
"source": [
"companies_databases = pd.DataFrame()\n",
"\n",
"for i in all_companies:\n",
" company_databases = pd.DataFrame({'company_number' : [i]})\n",
"\n",
" BUCKET = \"bdc2324-data/\"+i\n",
" for base in fs.ls(BUCKET):\n",
" match = re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', base)\n",
" if match:\n",
" nom_base = match.group(3)\n",
" company_databases[nom_base] = 1\n",
"\n",
" companies_databases = pd.concat([companies_databases, company_databases])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8986e477-e6c5-4d6c-83b2-2c90c134b599",
"metadata": {},
"outputs": [],
"source": [
"pd.set_option(\"display.max_columns\", None)\n",
"companies_databases\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8fecc3bb-4c03-4144-97c5-615224d9729e",
"metadata": {},
"outputs": [],
"source": [
"pd.reset_option(\"display.max_columns\")"
]
},
{
"cell_type": "markdown",
"id": "0294ce71-840e-458b-8ffa-cadabbc6da21",
"metadata": {},
"source": [
"# Debut Travail 25/02"
]
},
{
"cell_type": "markdown",
"id": "ca2c8b6a-4965-422e-ba7c-66423a464fc1",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## Base communes au types Musée"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dbce1124-9a22-4502-a47a-fc3d0e2db70b",
"metadata": {},
"outputs": [],
"source": [
"companies['musee']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5080f66e-f779-410a-876d-b4fe2795e17e",
"metadata": {},
"outputs": [],
"source": [
"for i in companies['musique']:\n",
" BUCKET = \"bdc2324-data/\"+i\n",
" liste_base = []\n",
" for base in fs.ls(BUCKET):\n",
" match = re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', base)\n",
" if match:\n",
" nom_base = match.group(3)\n",
" liste_base.append(nom_base)\n",
" globals()['base_'+i] = liste_base\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "abd477e1-7479-4c88-a5aa-f987af3f5b79",
"metadata": {},
"outputs": [],
"source": [
"# Trouver l'intersection entre les cinq listes\n",
"intersection = set(base_1).intersection(base_2, base_3, base_4, base_101)\n",
"\n",
"# Convertir le résultat en liste si nécessaire\n",
"intersection_liste = list(intersection)\n",
"\n",
"print(intersection_liste)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8d93888f-a511-4ee5-8bc3-d5173a7f119e",
"metadata": {},
"outputs": [],
"source": [
"# Trouver l'intersection entre les cinq listes\n",
"intersection = set(base_10).intersection(base_12, base_13, base_14, base_11)\n",
"\n",
"# Convertir le résultat en liste si nécessaire\n",
"intersection_liste = list(intersection)\n",
"\n",
"print(intersection_liste)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "10e89669-42bb-4652-a4bc-1a3d1caf4d1a",
"metadata": {},
"outputs": [],
"source": [
"len(intersection_liste)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7d058b21-a538-4f59-aefb-ef7966f73fdc",
"metadata": {},
"outputs": [],
"source": [
"df1_tags = load_dataset_2(\"1\", \"tags\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa441f99-733c-4675-8676-bed4682d3324",
"metadata": {},
"outputs": [],
"source": [
"df1_structure_tag_mappings = load_dataset_2(\"1\", 'structure_tag_mappings')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6767a750-14a4-4c05-903e-d2f07170825b",
"metadata": {},
"outputs": [],
"source": [
"df1_customersplus = load_dataset_2(\"1\", \"customersplus\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "125e9145-a815-46fd-bdf4-07589508b259",
"metadata": {},
"outputs": [],
"source": [
"df1_customersplus.groupby('structure_id')['id'].count().reset_index().sort_values('id', ascending=False).head(20)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c17a6976-792f-474d-bcff-c89396eddb3f",
"metadata": {},
"outputs": [],
"source": [
"df1_customersplus['structure_id'].isna().sum() / len(df1_customersplus['structure_id'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ecfc155a-cb42-46ec-8da5-33fdcd087355",
"metadata": {},
"outputs": [],
"source": [
"len(df1_structure_tag_mappings)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "071410b8-950d-4fcc-b2b9-57415253c286",
"metadata": {},
"outputs": [],
"source": [
"df1_structure_tag_mappings.groupby('tag_id')['structure_id'].count().reset_index().sort_values('structure_id', ascending=False).head(20)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f48d27a9-14e4-4bb9-a60a-73e9438b58fc",
"metadata": {},
"outputs": [],
"source": [
"?np.sort_values()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "14eaa0ea-02cc-430b-ab9b-38e6637810c3",
"metadata": {},
"outputs": [],
"source": [
"def info_colonnes_dataframe(df):\n",
" # Créer une liste pour stocker les informations sur chaque colonne\n",
" infos_colonnes = []\n",
"\n",
" # Parcourir les colonnes du DataFrame\n",
" for nom_colonne, serie in df.items(): # Utiliser items() au lieu de iteritems()\n",
" # Calculer le taux de valeurs manquantes\n",
" taux_na = serie.isna().mean() * 100\n",
"\n",
" # Ajouter les informations à la liste\n",
" infos_colonnes.append({\n",
" 'Nom_colonne': nom_colonne,\n",
" 'Type_colonne': str(serie.dtype),\n",
" 'Taux_NA': taux_na\n",
" })\n",
"\n",
" # Créer une nouvelle DataFrame à partir de la liste d'informations\n",
" df_infos_colonnes = pd.DataFrame(infos_colonnes)\n",
"\n",
" return df_infos_colonnes"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6b031c32-d4c8-42a5-9a71-a7810f9bf8d8",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"info_colonnes_dataframe(df1_tags)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e1a87f27-c4d4-4832-ac20-0c3c54aa4980",
"metadata": {},
"outputs": [],
"source": [
"info_colonnes_dataframe(df1_structure_tag_mappings)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fa5c65a8-2f74-4f3f-85fc-9ac91e0bb361",
"metadata": {},
"outputs": [],
"source": [
"pd.set_option('display.max_colwidth', None)\n",
"\n",
"print(df1_tags['name'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a59bf932-5b54-4600-81f5-c55ac93ae510",
"metadata": {},
"outputs": [],
"source": [
"pd.set_option('display.max_rows', None)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a4ab298e-2cae-4865-9f00-4caff5f75ea1",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"print(df1_tags['name'])"
]
},
{
"cell_type": "markdown",
"id": "76bffba1-5f7e-4308-9224-437ca66148f8",
"metadata": {},
"source": [
"## KPI sur target_type"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f6daf22e-6583-4431-a467-660a1dd4e5a4",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "d91d5895",
"metadata": {},
"outputs": [],
"source": [
"pd.set_option('display.max_colwidth', None)\n"
]
},
{
"cell_type": "markdown",
"id": "c58b17d3",
"metadata": {},
"source": [
"Raisonnement : on prends les target_type qui représente 90% des clients et on fait des catégories dessus."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6930bff5",
"metadata": {},
"outputs": [],
"source": [
"def print_main_target(tenant_id, nb_print = 40):\n",
" df_target = display_input_databases(tenant_id, \"target_information\")\n",
"\n",
" print('Nombre de ciblage : ', len(df_target))\n",
" nb_customers = df_target['customer_id'].nunique()\n",
" print('Nombre de client avec étiquette target : ', nb_customers) \n",
"\n",
" nb_custumers_per_target = df_target.groupby(\"target_name\")['customer_id'].count().reset_index().sort_values('customer_id', ascending=False)\n",
" nb_custumers_per_target['cumulative_customers'] = nb_custumers_per_target['customer_id'].cumsum()/len(df_target)\n",
" nb_custumers_per_target['customer_id'] = nb_custumers_per_target['customer_id']/nb_customers\n",
"\n",
" return nb_custumers_per_target.head(nb_print)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1e7ee1a0",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"pd.set_option(\"max_colwidth\", None)\n",
"print_main_target('1', 60)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "19f3a2dd-ba3d-4dec-8e10-fed544ab6a53",
"metadata": {},
"outputs": [],
"source": [
"pd.reset_option('display.max_rows')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b57a28ac",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"print_main_target('2', 25)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9a65991f",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"print_main_target('3', 70)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5f34b8bf",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"print_main_target('4', 100)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "52b24d66-92ad-4421-a62b-5cba837f1893",
"metadata": {},
"outputs": [],
"source": [
"pd.set_option('display.max_rows', None)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "40fe3676",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"\n",
"\n",
"print_main_target('5', 100)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "820d3600-379b-4245-a977-f1f1fa1f1839",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"print_main_target('6', 100)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "86f64a1b-763a-4e43-9601-a38c80392d47",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"print_main_target('7', 100)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fbf2ea42-515a-4cdf-a4c1-50f99c379ed9",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"print_main_target('8', 100)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9684045c-4e25-4952-b099-a559baa5d749",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"print_main_target('9', 100)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cf8f7816-e7f3-4b7a-a987-8350a76eb140",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"print_main_target('10', 100)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "76c818a5-3c52-4d97-ac81-b7f3f89092bd",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"print_main_target('11', 100)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "603b11e4-5d76-4699-a1b2-e795929edc04",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"print_main_target('12', 100)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fa93aecd-d117-481e-8507-15e49937ce14",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"print_main_target('13', 100)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a115ebcf-4488-47f3-9d7e-75a1fca52f0f",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"print_main_target('14', 100)\n"
]
},
{
"cell_type": "markdown",
"id": "605cced5-052f-4a99-ac26-020c5d2ab633",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## KPI sur tags"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "916c3e2b-04d3-4877-b894-8f26f10d926e",
"metadata": {},
"outputs": [],
"source": [
"customersplus = load_dataset_2(\"4\", \"customersplus\")[['id', 'structure_id']]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "46847b24-15a4-464e-969f-f16ed3653f1f",
"metadata": {},
"outputs": [],
"source": [
"structure_tag_mappings = load_dataset_2('4', \"structure_tag_mappings\")[['structure_id', 'tag_id']]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3c10c69d-735f-453e-96bf-750697d965d0",
"metadata": {},
"outputs": [],
"source": [
"customersplus[customersplus['structure_id'].notna()]['structure_id'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9b0e77b3-5f16-4484-9564-7d3826583418",
"metadata": {},
"outputs": [],
"source": [
"len(customersplus[customersplus['structure_id'].notna()])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dfa27722-37f9-435a-8221-8aa6f9a4a107",
"metadata": {},
"outputs": [],
"source": [
"structure_tag_mappings['structure_id'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2daabdd5-31e3-4918-9856-9bbc30cde602",
"metadata": {},
"outputs": [],
"source": [
"def tags_information(tenant_id, first_tags):\n",
"\n",
" customersplus = load_dataset_2(tenant_id, \"customersplus\")[['id', 'structure_id']]\n",
" customersplus.rename(columns = {'id' : 'customer_id'}, inplace = True)\n",
" tags = load_dataset_2(tenant_id, \"tags\")[['id', 'name']]\n",
" tags.rename(columns = {'id' : 'tag_id', 'name' : 'tag_name'}, inplace = True)\n",
" structure_tag_mappings = load_dataset_2(tenant_id, \"structure_tag_mappings\")[['structure_id', 'tag_id']]\n",
" \n",
" customer_tags = pd.merge(customersplus, structure_tag_mappings, on = 'structure_id', how = 'left')\n",
" customer_tags = pd.merge(customer_tags, tags, on = 'tag_id', how = 'inner')\n",
" \n",
" nb_customers_with_tag = customer_tags['customer_id'].nunique()\n",
" \n",
" print('Nombre de client avec tag : ', nb_customers_with_tag)\n",
" print('Proportion de clients avec tags : ', nb_customers_with_tag/len(customersplus))\n",
" print('Moyenne de tags par client : ', len(customer_tags)/nb_customers_with_tag)\n",
" \n",
" info = customer_tags.groupby(['tag_id', 'tag_name'])['customer_id'].count().reset_index().sort_values('customer_id', ascending = False).head(first_tags)\n",
"\n",
" return info"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0b9f5f71-a927-4cc8-bb0c-9538e28d3553",
"metadata": {},
"outputs": [],
"source": [
"tags_information(\"1\", 20)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bd5bef41-1774-4601-86b5-b7c1aea8f1d2",
"metadata": {},
"outputs": [],
"source": [
"tags_information(\"2\", 20)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7c2dc3e6-1418-44db-a8c0-4a9d59ec5232",
"metadata": {},
"outputs": [],
"source": [
"load_dataset_2(\"2\", \"tags\")[['id', 'name']]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c7b2c670-7122-4f67-b1aa-8c80a10f16d8",
"metadata": {},
"outputs": [],
"source": [
"tags_information(\"3\", 20)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "76639995-252d-4a58-83d8-c0c00900c3a9",
"metadata": {},
"outputs": [],
"source": [
"tags_information(\"4\", 20)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "07e91791-d4d4-42b1-ac18-22d3b0b9f7bd",
"metadata": {},
"outputs": [],
"source": [
"tags_information(\"101\", 20)"
]
},
{
"cell_type": "markdown",
"id": "87d131cd-ead0-4ef4-a8ee-b09022d08ffa",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## KPI product"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "26582be9-cfd1-48ea-a0a7-31101fdeb9d1",
"metadata": {},
"outputs": [],
"source": [
"tenant_id = \"1\"\n",
"\n",
"df_product = display_databases(tenant_id, file_name = \"products_purchased_reduced\", datetime_col = ['purchase_date'])\n",
"\n",
"df_product.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "533bf499-dd56-4d29-b261-ca1e4928c9c7",
"metadata": {},
"outputs": [],
"source": [
"nb_tickets_per_events = df_product.groupby(['name_event_types', 'name_events'])['ticket_id'].count().reset_index().sort_values('ticket_id', ascending = False)\n",
"nb_tickets_per_events['prop_tickets'] = round(nb_tickets_per_events['ticket_id']/len(df_product), 3)\n",
"nb_tickets_per_events"
]
},
{
"cell_type": "markdown",
"id": "1ede9eaa-7f0a-4856-9349-b2747d6a4901",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"# Fin travail 25/02"
]
},
{
"cell_type": "markdown",
"id": "c437eaec",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"# Exemple sur Company 1"
]
},
{
"cell_type": "markdown",
"id": "a1c1fc39",
"metadata": {},
"source": [
"## Chargement données"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "66f8c17b",
"metadata": {},
"outputs": [],
"source": [
"BUCKET = \"bdc2324-data/1\"\n",
"liste_database = fs.ls(BUCKET)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c08e6798",
"metadata": {},
"outputs": [],
"source": [
"liste_database_select = ['suppliers', 'ticket', 'purchase', 'consumption', 'type_ofs']\n",
"\n",
"# Filtrer la liste pour les éléments contenant au moins un élément de la liste à tester\n",
"liste_database_filtered = [element for element in liste_database if any(element_part in element for element_part in liste_database_select)]\n",
"\n",
"# Afficher le résultat\n",
"print(liste_database_filtered)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "675f518d",
"metadata": {},
"outputs": [],
"source": [
"# loop to create dataframes from liste\n",
"files_path = liste_database\n",
"\n",
"client_number = files_path[0].split(\"/\")[1]\n",
"df_prefix = \"df\" + str(client_number) + \"_\"\n",
"\n",
"for i in range(len(files_path)) :\n",
" current_path = files_path[i]\n",
" with fs.open(current_path, mode=\"rb\") as file_in:\n",
" df = pd.read_csv(file_in)\n",
" # the pattern of the name is df1xxx\n",
" nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n",
" globals()[nom_dataframe] = df"
]
},
{
"cell_type": "markdown",
"id": "e855f403",
"metadata": {},
"source": [
"## customersplus.csv"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "91a8f8c4",
"metadata": {},
"outputs": [],
"source": [
"a = pd.DataFrame(df1_customersplus.info())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2fda171d",
"metadata": {},
"outputs": [],
"source": [
"def info_colonnes_dataframe(df):\n",
" # Créer une liste pour stocker les informations sur chaque colonne\n",
" infos_colonnes = []\n",
"\n",
" # Parcourir les colonnes du DataFrame\n",
" for nom_colonne, serie in df.items(): # Utiliser items() au lieu de iteritems()\n",
" # Calculer le taux de valeurs manquantes\n",
" taux_na = serie.isna().mean() * 100\n",
"\n",
" # Ajouter les informations à la liste\n",
" infos_colonnes.append({\n",
" 'Nom_colonne': nom_colonne,\n",
" 'Type_colonne': str(serie.dtype),\n",
" 'Taux_NA': taux_na\n",
" })\n",
"\n",
" # Créer une nouvelle DataFrame à partir de la liste d'informations\n",
" df_infos_colonnes = pd.DataFrame(infos_colonnes)\n",
"\n",
" return df_infos_colonnes"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "205eeeab",
"metadata": {},
"outputs": [],
"source": [
"def cleaning_date(df, column_name):\n",
" \"\"\"\n",
" Nettoie la colonne spécifiée du DataFrame en convertissant les valeurs en datetime avec le format ISO8601.\n",
"\n",
" Parameters:\n",
" - df: DataFrame\n",
" Le DataFrame contenant la colonne à nettoyer.\n",
" - column_name: str\n",
" Le nom de la colonne à nettoyer.\n",
"\n",
" Returns:\n",
" - DataFrame\n",
" Le DataFrame modifié avec la colonne nettoyée.\n",
" \"\"\"\n",
" df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "634282c5",
"metadata": {},
"outputs": [],
"source": [
"a = info_colonnes_dataframe(df1_customersplus)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0e8d4133",
"metadata": {},
"outputs": [],
"source": [
"a"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1268ad5a",
"metadata": {},
"outputs": [],
"source": [
"a = pd.DataFrame(df1_customersplus.isna().sum()/len(df1_customersplus)*100)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bd41dc80",
"metadata": {},
"outputs": [],
"source": [
"# Selection des variables\n",
"df1_customersplus_clean = df1_customersplus.copy()\n",
"\n",
"cleaning_date(df1_customersplus_clean, 'first_buying_date')\n",
"cleaning_date(df1_customersplus_clean, 'last_visiting_date')\n",
"\n",
"df1_customersplus_clean.drop(['lastname', 'firstname', 'email', 'civility', 'note', 'created_at', 'updated_at', 'deleted_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode', 'last_visiting_date'], axis = 1, inplace=True)\n",
"df1_customersplus_clean.rename(columns = {'id' : 'customer_id'}, inplace = True)\n",
"\n"
]
},
{
"cell_type": "markdown",
"id": "64d0f76b",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## tickets.csv"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7e683711",
"metadata": {},
"outputs": [],
"source": [
"df1_tickets"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e7b9a52e",
"metadata": {},
"outputs": [],
"source": [
"df1_tickets.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "568280e8",
"metadata": {},
"outputs": [],
"source": [
"df1_tickets.isna().sum()/len(df1_tickets)*100"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "29ecec90",
"metadata": {},
"outputs": [],
"source": [
"# Selection des variables\n",
"df1_tickets_clean = df1_tickets.drop(['lastname', 'firstname', 'email', 'created_at', 'updated_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode'], axis = 1, inplace=True)\n",
"df1_tickets_clean.rename(columns = {'id' : 'customer_id'}, inplace = True)"
]
},
{
"cell_type": "markdown",
"id": "22bb5de4",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## suppliers.csv"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6a9a91f4",
"metadata": {},
"outputs": [],
"source": [
"df1_suppliers"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bab4758a",
"metadata": {},
"outputs": [],
"source": [
"df1_suppliers.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b5fff251",
"metadata": {},
"outputs": [],
"source": [
"df1_suppliers.isna().sum()/len(df1_suppliers)*100"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8b09e2a3",
"metadata": {},
"outputs": [],
"source": [
"# Selection des variables\n",
"df1_suppliers_clean = df1_suppliers[['id', 'name']]\n",
"df1_suppliers_clean.rename(columns = {'name' : 'supplier_name'}, inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ecee7cdc",
"metadata": {},
"outputs": [],
"source": [
"df1_suppliers_clean"
]
},
{
"cell_type": "markdown",
"id": "c8e6e69b",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## type_ofs.csv"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1a6cff1f",
"metadata": {},
"outputs": [],
"source": [
"df1_type_ofs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "93630b41",
"metadata": {},
"outputs": [],
"source": [
"df1_type_ofs.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4f94481a",
"metadata": {},
"outputs": [],
"source": [
"# Selection des variables\n",
"df1_type_ofs_clean = df1_type_ofs[['id', 'name', 'children']]\n",
"df1_type_ofs_clean.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)"
]
},
{
"cell_type": "markdown",
"id": "1b2811e2",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## purchases.csv"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2455d2e1",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"df1_purchases"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5f9a159d",
"metadata": {},
"outputs": [],
"source": [
"df1_purchases.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "db201bf7",
"metadata": {},
"outputs": [],
"source": [
"# Nettoyage purchase_date\n",
"df1_purchases['purchase_date'] = pd.to_datetime(df1_purchases['purchase_date'], utc = True)\n",
"df1_purchases['purchase_date'] = pd.to_datetime(df1_purchases['purchase_date'], format = 'ISO8601')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bd436fca",
"metadata": {},
"outputs": [],
"source": [
"df1_purchases.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "83435862",
"metadata": {},
"outputs": [],
"source": [
"# Selection des variables\n",
"df1_purchases_clean = df1_purchases[['id', 'purchase_date', 'customer_id']]"
]
},
{
"cell_type": "markdown",
"id": "f210e730",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## Fusion de l'ensemble des données billétiques"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1f8b3aa7",
"metadata": {},
"outputs": [],
"source": [
"# Fusion avec fournisseurs\n",
"df1_ticket_information = pd.merge(df1_tickets_clean, df1_suppliers_clean, left_on = 'supplier_id', right_on = 'id', how = 'inner')\n",
"df1_ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)\n",
"\n",
"# Fusion avec type de tickets\n",
"df1_ticket_information = pd.merge(df1_ticket_information, df1_type_ofs_clean, left_on = 'type_of', right_on = 'id', how = 'inner')\n",
"df1_ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)\n",
"\n",
"# Fusion avec achats\n",
"df1_ticket_information = pd.merge(df1_ticket_information, df1_purchases_clean, left_on = 'purchase_id', right_on = 'id', how = 'inner')\n",
"df1_ticket_information.drop(['purchase_id', 'id'], axis = 1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "83a4d021",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"df1_ticket_information"
]
},
{
"cell_type": "markdown",
"id": "56e6ebd1",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"# Utilisation de fonctions"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "88fcde4b",
"metadata": {},
"outputs": [],
"source": [
"# Créer un DataFrame exemple\n",
"df_not_clean = df1_campaign_stats[['opened_at']].head(20)\n",
"\n",
"# Appliquer la fonction pour nettoyer la colonne 'purchase_date' de manière vectorisée\n",
"df_clean = cleaning_date(df_not_clean, 'opened_at')\n",
"df_clean.rename(columns = {'opened_at' : 'opened_at_clean'}, inplace = True)\n",
"\n",
"test = pd.concat([df1_campaign_stats[['opened_at']].head(20), df_clean], axis=1)\n",
"\n",
"test.info()"
]
},
{
"cell_type": "markdown",
"id": "818f69db",
"metadata": {},
"source": [
"## Nettoyage, selection et fusion"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c9654eda",
"metadata": {},
"outputs": [],
"source": [
"df1_ticket_information"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7f2b620c",
"metadata": {},
"outputs": [],
"source": [
"df1_ticket_information.info()"
]
},
{
"cell_type": "markdown",
"id": "637bdb72",
"metadata": {},
"source": [
"# Customer information"
]
},
{
"cell_type": "markdown",
"id": "14c52894",
"metadata": {},
"source": [
"## Target area"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d83abfbf",
"metadata": {},
"outputs": [],
"source": [
"# Target.csv cleaning\n",
"df1_targets_clean = df1_targets[[\"id\", \"target_type_id\", \"name\"]]\n",
"df1_targets_clean.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n",
"\n",
"# target_type cleaning\n",
"df1_target_types_clean = df1_target_types[[\"id\",\"is_import\",\"name\"]].add_prefix(\"target_type_\")\n",
"\n",
"#customer_target_mappings cleaning\n",
"df1_customer_target_mappings_clean = df1_customer_target_mappings[[\"id\", \"customer_id\", \"target_id\"]]\n",
"\n",
"# Merge target et target_type\n",
"df1_targets_full = pd.merge(df1_targets_clean, df1_target_types_clean, left_on='target_type_id', right_on='target_type_id', how='inner')\n",
"df1_targets_full.drop(['target_type_id'], axis = 1, inplace=True)\n",
"\n",
"# Merge\n",
"df1_targets_full = pd.merge(df1_customer_target_mappings_clean, df1_targets_full, left_on='target_id', right_on='target_id', how='inner')\n",
"df1_targets_full.drop(['target_id'], axis = 1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "90d71b2c",
"metadata": {},
"outputs": [],
"source": [
"df1_targets_test = df1_targets_full[['id', 'customer_id']].groupby(['customer_id']).count()\n",
"len(df1_targets_test[df1_targets_test['id'] > 1]) / len(df1_targets_test)\n",
"\n",
"# 99,6% des 151 000 client visés sont catégorisés plusieurs fois et en moyenne 5 fois... \n",
"df1_targets_test.mean()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2301de1e",
"metadata": {},
"outputs": [],
"source": [
"df1_targets_full.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "75fbc2f7",
"metadata": {},
"outputs": [],
"source": [
"# Catégorisation des target_name\n",
"import pandas as pd\n",
"import nltk\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.corpus import stopwords\n",
"from nltk.stem import WordNetLemmatizer\n",
"from nltk.probability import FreqDist\n",
"\n",
"# Téléchargement des ressources nécessaires\n",
"nltk.download('punkt')\n",
"nltk.download('stopwords')\n",
"nltk.download('wordnet')\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "55cddf92",
"metadata": {},
"outputs": [],
"source": [
"# Définition des fonctions de tokenisation, suppression des mots vides et lemmatisation\n",
"def preprocess_text(texte):\n",
" # Concaténation des éléments de la liste en une seule chaîne de caractères\n",
" texte_concat = ' '.join(texte)\n",
" \n",
" # Tokenisation des mots\n",
" tokens = word_tokenize(texte_concat.lower())\n",
" \n",
" # Suppression des mots vides (stopwords)\n",
" stop_words = set(stopwords.words('french'))\n",
" filtered_tokens = [word for word in tokens if word not in stop_words]\n",
" \n",
" # Lemmatisation des mots\n",
" lemmatizer = WordNetLemmatizer()\n",
" lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]\n",
" \n",
" return lemmatized_tokens\n",
"\n",
"\n",
"# Appliquer le prétraitement à la colonne de texte\n",
"df1_targets_full['target_name_tokened'] = df1_targets_full['target_name'].apply(preprocess_text)\n",
"\n",
"# Concaténer les listes de mots pour obtenir une liste de tous les mots dans le corpus\n",
"all_words = [word for tokens in df1_targets_full['target_name_tokened'] for word in tokens]\n",
"\n",
"# Calculer la fréquence des mots\n",
"freq_dist = FreqDist(all_words)\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7fd98a85",
"metadata": {},
"outputs": [],
"source": [
"# Affichage des mots les plus fréquents\n",
"print(\"Mots les plus fréquents:\")\n",
"for mot, freq in freq_dist.most_common(15):\n",
" print(f\"{mot}: {freq}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cf94bb1d",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import nltk\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.corpus import stopwords\n",
"from nltk.stem import WordNetLemmatizer\n",
"\n",
"# Téléchargement des ressources nécessaires\n",
"nltk.download('punkt')\n",
"nltk.download('stopwords')\n",
"nltk.download('wordnet')\n",
"\n",
"# Création de la DataFrame d'exemple\n",
"data = {'texte': [\"Le chat noir mange une souris.\", \"Le chien blanc aboie.\"]}\n",
"df = pd.DataFrame(data)\n",
"\n",
"# Fonction pour prétraiter le texte\n",
"def preprocess_text(texte):\n",
" # Concaténation des éléments de la liste en une seule chaîne de caractères\n",
" texte_concat = ' '.join(texte)\n",
" \n",
" # Tokenisation des mots\n",
" tokens = word_tokenize(texte_concat.lower())\n",
" \n",
" # Suppression des mots vides (stopwords)\n",
" stop_words = set(stopwords.words('french'))\n",
" filtered_tokens = [word for word in tokens if word not in stop_words]\n",
" \n",
" # Lemmatisation des mots\n",
" lemmatizer = WordNetLemmatizer()\n",
" lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]\n",
" \n",
" return lemmatized_tokens\n",
"\n",
"# Appliquer la fonction de prétraitement à la colonne de texte\n",
"df['texte_preprocessed'] = df['texte'].apply(preprocess_text)\n",
"\n",
"# Afficher le résultat\n",
"print(df)\n"
]
},
{
"cell_type": "markdown",
"id": "711d3884",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## Campaign area"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c25b5295",
"metadata": {},
"outputs": [],
"source": [
"# campaign_stats cleaning \n",
"df1_campaign_stats_clean = df1_campaign_stats[[\"id\", \"campaign_id\", \"customer_id\", \"opened_at\", \"sent_at\", \"delivered_at\"]]\n",
"cleaning_date(df1_campaign_stats_clean, 'opened_at')\n",
"cleaning_date(df1_campaign_stats_clean, 'sent_at')\n",
"cleaning_date(df1_campaign_stats_clean, 'delivered_at')\n",
"\n",
"# campaigns cleaning\n",
"df1_campaigns_clean = df1_campaigns[[\"id\", \"name\", \"service_id\", \"sent_at\"]].add_prefix(\"campaign_\")\n",
"cleaning_date(df1_campaigns_clean, 'campaign_sent_at')\n",
"\n",
"# Merge \n",
"df1_campaigns_full = pd.merge(df1_campaign_stats_clean, df1_campaigns_clean, on = \"campaign_id\", how = \"left\")\n",
"df1_campaigns_full.drop(['campaign_id'], axis = 1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2a3de6a5",
"metadata": {},
"outputs": [],
"source": [
"df1_campaigns_full.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3fc1f446",
"metadata": {},
"outputs": [],
"source": [
"df1_campaigns_information"
]
},
{
"cell_type": "markdown",
"id": "20e69ee3",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## Link area"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d9cbdbce",
"metadata": {},
"outputs": [],
"source": [
"df1_campaigns"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c07459f0",
"metadata": {},
"outputs": [],
"source": [
"df1_link_stats"
]
},
{
"cell_type": "markdown",
"id": "80ae4c42",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## Supplier"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b50b8f95",
"metadata": {},
"outputs": [],
"source": [
"# Fonction d'exploration pour suppliers.csv = label itr et commission inconnues\n",
"def suppliers_exploration(suppliers = None) : \n",
" \n",
" # Taux de NaN pour ces colonnes\n",
" label_na = suppliers['label'].isna().sum()/len(suppliers)*100\n",
" itr_na = suppliers['itr'].isna().sum()/len(suppliers)*100\n",
" commission_na = suppliers['commission'].isna().sum()/len(suppliers)*100\n",
"\n",
" suppliers_desc = pd.DataFrame({'nb_suppliers' : [suppliers['name'].nunique()],\n",
" 'label_na' : [label_na],\n",
" 'itr_na' : [itr_na],\n",
" 'commission_na' : [commission_na]})\n",
"\n",
" return suppliers_desc"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7e292935",
"metadata": {},
"outputs": [],
"source": [
"df1_suppliers_desc = suppliers_exploration(suppliers = df1_suppliers)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "05b6f2b0",
"metadata": {},
"outputs": [],
"source": [
"df1_suppliers_desc"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c9324d80",
"metadata": {},
"outputs": [],
"source": [
"BUCKET = \"bdc2324-data\"\n",
"liste_folders = fs.ls(BUCKET)\n",
"\n",
"liste_files = []\n",
"for company_folder in liste_folders : \n",
" liste_files.extend(fs.ls(company_folder))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "10304058",
"metadata": {},
"outputs": [],
"source": [
"liste_database_select = ['suppliers']\n",
"\n",
"# Filtrer la liste pour les éléments contenant au moins un élément de la liste à tester\n",
"liste_suppliers = [element for element in liste_files if any(element_part in element for element_part in liste_database_select)]\n",
"\n",
"# Afficher le résultat\n",
"print(liste_suppliers)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ffa423e5",
"metadata": {},
"outputs": [],
"source": [
"# loop to create dataframes from file 2\n",
"def database_loading(database_name = None):\n",
" files_path = database_name\n",
" \n",
" client_number = files_path.split(\"/\")[1]\n",
" df_prefix = \"df\" + str(client_number) + \"_\"\n",
" \n",
" current_path = files_path\n",
" with fs.open(current_path, mode=\"rb\") as file_in:\n",
" df = pd.read_csv(file_in)\n",
"\n",
" return df, client_number"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "70bdc88d",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "6a0f567d",
"metadata": {},
"outputs": [],
"source": [
"df_all = pd.DataFrame()\n",
"\n",
"for link in liste_suppliers:\n",
" \n",
" df_supplier, tenant_id = database_loading(link)\n",
" \n",
" df_supplier['tenant_id'] = int(tenant_id)\n",
"\n",
" df_all = pd.concat([df_all, df_supplier], axis = 0)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1522d8cd",
"metadata": {},
"outputs": [],
"source": [
"# df_all[df_all['tenant_id'] == 101]['name'].unique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b0e42a61",
"metadata": {},
"outputs": [],
"source": [
"liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online'] \n",
"# vad = vente à distance\n",
"df_all['name'] = df_all['name'].fillna('')\n",
"\n",
"df_all['canal_vente_internet'] = df_all['name'].str.contains('|'.join(liste_mots), case=False).astype(int)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d299ae91",
"metadata": {},
"outputs": [],
"source": [
"df_all.groupby('tenant_id')['canal_vente_internet'].max()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}