2024-01-10 19:19:51 +01:00
{
"cells": [
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "5bf5c226",
2024-01-10 19:19:51 +01:00
"metadata": {},
"source": [
"# Business Data Challenge - Team 1"
]
},
{
"cell_type": "code",
2024-02-26 22:47:36 +01:00
"execution_count": 1,
2024-02-10 22:46:56 +01:00
"id": "b1a5b9d3",
2024-01-10 19:19:51 +01:00
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
2024-01-13 10:38:10 +01:00
"import numpy as np\n",
"import os\n",
"import s3fs\n",
2024-02-25 23:53:10 +01:00
"import re\n",
"import warnings"
2024-01-10 19:19:51 +01:00
]
},
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "ecfa2219",
2024-01-10 19:19:51 +01:00
"metadata": {},
"source": [
"Configuration de l'accès aux données"
]
},
{
"cell_type": "code",
2024-02-26 22:47:36 +01:00
"execution_count": 2,
2024-02-10 22:46:56 +01:00
"id": "1a094277",
2024-01-10 19:19:51 +01:00
"metadata": {},
"outputs": [],
"source": [
"# Create filesystem object\n",
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
2024-01-13 10:38:10 +01:00
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})"
2024-01-10 19:19:51 +01:00
]
},
2024-02-25 18:33:24 +01:00
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": 3,
2024-02-25 18:33:24 +01:00
"id": "30d77451-2df6-4c07-8b15-66e0e990ff03",
"metadata": {},
"outputs": [],
"source": [
"# Import cleaning and merge functions\n",
2024-03-13 23:24:38 +01:00
"\n",
"exec(open('0_Cleaning_and_merge_functions.py').read())\n",
"\n",
2024-02-25 18:33:24 +01:00
"exec(open('0_KPI_functions.py').read())\n",
"\n",
"# Ignore warning\n",
"warnings.filterwarnings('ignore')\n"
]
},
{
"cell_type": "code",
2024-02-26 22:47:36 +01:00
"execution_count": 4,
2024-02-25 18:33:24 +01:00
"id": "f1b44d3e-76bb-4860-b9db-a2840db7cf39",
"metadata": {},
"outputs": [],
"source": [
"def load_dataset_2(directory_path, file_name):\n",
" \"\"\"\n",
" This function loads csv file\n",
" \"\"\"\n",
" file_path = \"bdc2324-data\" + \"/\" + directory_path + \"/\" + directory_path + file_name + \".csv\"\n",
" with fs.open(file_path, mode=\"rb\") as file_in:\n",
" df = pd.read_csv(file_in, sep=\",\")\n",
"\n",
" # drop na :\n",
" #df = df.dropna(axis=1, thresh=len(df))\n",
" # if identifier in table : delete it\n",
" if 'identifier' in df.columns:\n",
" df = df.drop(columns = 'identifier')\n",
" return df"
]
},
{
2024-03-13 23:24:38 +01:00
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": 5,
2024-03-13 23:24:38 +01:00
"id": "31ab76f0-fbb1-46f6-b359-97228620c207",
2024-03-04 23:30:25 +01:00
"metadata": {},
2024-03-13 23:24:38 +01:00
"outputs": [],
2024-02-25 18:33:24 +01:00
"source": [
2024-03-13 23:24:38 +01:00
"def export_in_temporary(df, output_name):\n",
" print('Export of dataset :', output_name)\n",
2024-03-23 12:51:18 +01:00
" FILE_PATH_OUT_S3 = \"ajoubrel-ensae/Temporary\" + \"/\" + output_name + '.csv'\n",
2024-03-13 23:24:38 +01:00
" with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n",
" df.to_csv(file_out, index = False)"
2024-02-25 18:33:24 +01:00
]
},
{
2024-03-13 23:24:38 +01:00
"cell_type": "markdown",
"id": "ccf597b0-b459-4ea5-baf0-5ba8c90915e4",
2024-02-25 18:33:24 +01:00
"metadata": {},
"source": [
2024-03-13 23:24:38 +01:00
"# Cleaning target area and tags"
2024-02-25 18:33:24 +01:00
]
},
{
"cell_type": "code",
2024-03-23 12:51:18 +01:00
"execution_count": 14,
"id": "fd88e294-e038-4cec-ad94-2bbbc10a4059",
2024-02-25 18:33:24 +01:00
"metadata": {},
2024-03-23 10:48:47 +01:00
"outputs": [],
2024-02-25 18:33:24 +01:00
"source": [
2024-03-23 12:51:18 +01:00
"def concatenate_names(names):\n",
" return ', '.join(names)\n",
"\n",
"def targets_KPI(df_target = None):\n",
" \n",
" df_target['target_name'] = df_target['target_name'].fillna('').str.lower()\n",
"\n",
" # Target name cotegory musees / \n",
" df_target['target_jeune'] = df_target['target_name'].str.contains('|'.join(['jeune', 'pass_culture', 'etudiant', '12-25 ans', 'student', 'jeunesse']), case=False).astype(int)\n",
" df_target['target_optin'] = df_target['target_name'].str.contains('|'.join(['optin' ,'opt-in']), case=False).astype(int)\n",
" df_target['target_optout'] = df_target['target_name'].str.contains('|'.join(['optout', 'unsubscribed']), case=False).astype(int)\n",
" df_target['target_scolaire'] = df_target['target_name'].str.contains('|'.join(['scolaire' , 'enseignant', 'chercheur', 'schulen', 'école']), case=False).astype(int)\n",
" df_target['target_entreprise'] = df_target['target_name'].str.contains('|'.join(['b2b', 'btob', 'cse']), case=False).astype(int)\n",
" df_target['target_famille'] = df_target['target_name'].str.contains('|'.join(['famille', 'enfants', 'family']), case=False).astype(int)\n",
" df_target['target_newsletter'] = df_target['target_name'].str.contains('|'.join(['nl', 'newsletter']), case=False).astype(int)\n",
" \n",
" # Target name category for sport compagnies\n",
" df_target['target_abonne'] = ((\n",
" df_target['target_name']\n",
" .str.contains('|'.join(['abo', 'adh']), case=False)\n",
" & ~df_target['target_name'].str.contains('|'.join(['hors abo', 'anciens abo']), case=False)\n",
" ).astype(int))\n",
" \n",
" df_target_categorie = df_target.groupby('customer_id')[['target_jeune', 'target_optin', 'target_optout', 'target_scolaire', 'target_entreprise', 'target_famille', 'target_newsletter', 'target_abonne']].max()\n",
" \n",
" target_agg = df_target.groupby('customer_id').agg(\n",
" nb_targets=('target_name', 'nunique') # Utilisation de tuples pour spécifier les noms de colonnes\n",
" # all_targets=('target_name', concatenate_names),\n",
" # all_target_types=('target_type_name', concatenate_names)\n",
" ).reset_index()\n",
"\n",
" target_agg['nb_targets'] = (target_agg['nb_targets'] - (target_agg['nb_targets'].mean())) / (target_agg['nb_targets'].std())\n",
" \n",
" target_agg = pd.merge(target_agg, df_target_categorie, how='left', on='customer_id')\n",
" \n",
" return target_agg"
2024-02-25 18:33:24 +01:00
]
},
{
"cell_type": "code",
2024-03-23 12:51:18 +01:00
"execution_count": 15,
"id": "1b124018-9637-463e-b512-15743ec9480b",
2024-02-25 18:33:24 +01:00
"metadata": {},
2024-03-13 23:24:38 +01:00
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
2024-03-23 12:51:18 +01:00
"File path : projet-bdc2324-team1/0_Input/Company_5/target_information.csv\n"
2024-03-13 23:24:38 +01:00
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
2024-03-23 12:51:18 +01:00
" <th>customer_id</th>\n",
" <th>nb_targets</th>\n",
2024-03-23 10:48:47 +01:00
" <th>target_jeune</th>\n",
" <th>target_optin</th>\n",
" <th>target_optout</th>\n",
" <th>target_scolaire</th>\n",
" <th>target_entreprise</th>\n",
" <th>target_famille</th>\n",
" <th>target_newsletter</th>\n",
2024-03-23 12:51:18 +01:00
" <th>target_abonne</th>\n",
2024-03-13 23:24:38 +01:00
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
2024-03-23 12:51:18 +01:00
" <th>0</th>\n",
" <td>160516</td>\n",
" <td>6.938264</td>\n",
" <td>0</td>\n",
2024-03-23 10:48:47 +01:00
" <td>1</td>\n",
2024-03-23 12:51:18 +01:00
" <td>0</td>\n",
" <td>0</td>\n",
2024-03-23 10:48:47 +01:00
" <td>1</td>\n",
2024-03-23 12:51:18 +01:00
" <td>0</td>\n",
" <td>0</td>\n",
2024-03-23 10:48:47 +01:00
" <td>1</td>\n",
2024-03-23 12:51:18 +01:00
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>160517</td>\n",
" <td>10.357387</td>\n",
" <td>0</td>\n",
2024-03-23 10:48:47 +01:00
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
2024-03-23 12:51:18 +01:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
2024-03-13 23:24:38 +01:00
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
2024-03-23 12:51:18 +01:00
" <td>160518</td>\n",
" <td>5.228703</td>\n",
" <td>0</td>\n",
2024-03-23 10:48:47 +01:00
" <td>1</td>\n",
" <td>1</td>\n",
2024-03-23 12:51:18 +01:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>160519</td>\n",
" <td>6.083483</td>\n",
" <td>0</td>\n",
2024-03-23 10:48:47 +01:00
" <td>1</td>\n",
" <td>1</td>\n",
2024-03-23 12:51:18 +01:00
" <td>0</td>\n",
" <td>0</td>\n",
2024-03-23 10:48:47 +01:00
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
2024-03-13 23:24:38 +01:00
" </tr>\n",
" <tr>\n",
2024-03-23 12:51:18 +01:00
" <th>4</th>\n",
" <td>160520</td>\n",
" <td>2.949288</td>\n",
" <td>0</td>\n",
2024-03-23 10:48:47 +01:00
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
2024-03-23 12:51:18 +01:00
" <td>0</td>\n",
2024-03-23 10:48:47 +01:00
" <td>1</td>\n",
2024-03-13 23:24:38 +01:00
" </tr>\n",
" <tr>\n",
2024-03-23 12:51:18 +01:00
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>471205</th>\n",
" <td>6405875</td>\n",
" <td>-0.754762</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>471206</th>\n",
" <td>6405905</td>\n",
" <td>-0.469835</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
2024-03-13 23:24:38 +01:00
" <td>1</td>\n",
2024-03-23 12:51:18 +01:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>471207</th>\n",
" <td>6405909</td>\n",
" <td>-0.754762</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
2024-03-13 23:24:38 +01:00
" <td>1</td>\n",
2024-03-23 10:48:47 +01:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
2024-03-13 23:24:38 +01:00
" </tr>\n",
" <tr>\n",
2024-03-23 12:51:18 +01:00
" <th>471208</th>\n",
" <td>6405917</td>\n",
" <td>-0.754762</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
2024-03-23 10:48:47 +01:00
" <td>1</td>\n",
2024-03-23 12:51:18 +01:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>471209</th>\n",
" <td>6405963</td>\n",
" <td>-0.754762</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
2024-03-23 10:48:47 +01:00
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
2024-03-13 23:24:38 +01:00
" </tr>\n",
" </tbody>\n",
"</table>\n",
2024-03-23 12:51:18 +01:00
"<p>471210 rows × 10 columns</p>\n",
2024-03-13 23:24:38 +01:00
"</div>"
],
"text/plain": [
2024-03-23 12:51:18 +01:00
" customer_id nb_targets target_jeune target_optin target_optout \\\n",
"0 160516 6.938264 0 1 0 \n",
"1 160517 10.357387 0 1 1 \n",
"2 160518 5.228703 0 1 1 \n",
"3 160519 6.083483 0 1 1 \n",
"4 160520 2.949288 0 1 0 \n",
"... ... ... ... ... ... \n",
"471205 6405875 -0.754762 0 0 1 \n",
"471206 6405905 -0.469835 0 0 1 \n",
"471207 6405909 -0.754762 0 0 1 \n",
"471208 6405917 -0.754762 0 0 1 \n",
"471209 6405963 -0.754762 0 0 1 \n",
"\n",
" target_scolaire target_entreprise target_famille target_newsletter \\\n",
"0 0 1 0 0 \n",
"1 0 0 0 0 \n",
"2 0 0 0 0 \n",
"3 0 0 1 0 \n",
"4 0 0 0 0 \n",
"... ... ... ... ... \n",
"471205 0 0 0 0 \n",
"471206 0 0 0 0 \n",
"471207 0 0 0 0 \n",
"471208 0 0 0 0 \n",
"471209 0 0 0 0 \n",
2024-03-13 23:24:38 +01:00
"\n",
2024-03-23 12:51:18 +01:00
" target_abonne \n",
"0 1 \n",
"1 1 \n",
"2 1 \n",
"3 1 \n",
"4 1 \n",
"... ... \n",
"471205 0 \n",
"471206 0 \n",
"471207 0 \n",
"471208 0 \n",
"471209 0 \n",
"\n",
"[471210 rows x 10 columns]"
2024-03-13 23:24:38 +01:00
]
},
2024-03-23 12:51:18 +01:00
"execution_count": 15,
2024-03-13 23:24:38 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
2024-02-25 18:33:24 +01:00
"source": [
2024-03-23 12:51:18 +01:00
"targets_KPI(display_input_databases('5', file_name = \"target_information\"))"
2024-02-25 18:33:24 +01:00
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-03-13 23:24:38 +01:00
"id": "c75efea3-b5e8-4a7a-bed4-dd64ae9ff9f2",
2024-02-25 18:33:24 +01:00
"metadata": {},
2024-03-23 10:48:47 +01:00
"outputs": [],
2024-02-25 18:33:24 +01:00
"source": [
2024-03-23 10:48:47 +01:00
"#export_in_temporary(target_agg, 'Target_kpi_concatenate')"
2024-02-25 18:33:24 +01:00
]
},
2024-03-14 23:35:25 +01:00
{
2024-03-23 10:48:47 +01:00
"cell_type": "markdown",
"id": "5d91263e-8a97-4cb1-8d94-db8ab0b77cdf",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
2024-03-14 23:35:25 +01:00
"source": [
2024-03-23 10:48:47 +01:00
"# Brouillon"
2024-03-14 23:35:25 +01:00
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c5e864b1-adad-4267-b956-3f7ef371d677",
"metadata": {},
"outputs": [],
"source": [
"\n",
"def display_covering_time(df, company, datecover):\n",
" \"\"\"\n",
" This function draws the time coverage of each company\n",
" \"\"\"\n",
" min_date = df['purchase_date'].min().strftime(\"%Y-%m-%d\")\n",
" max_date = df['purchase_date'].max().strftime(\"%Y-%m-%d\")\n",
" datecover[company] = [datetime.strptime(min_date, \"%Y-%m-%d\") + timedelta(days=x) for x in range((datetime.strptime(max_date, \"%Y-%m-%d\") - datetime.strptime(min_date, \"%Y-%m-%d\")).days)]\n",
" print(f'Couverture Company {company} : {min_date} - {max_date}')\n",
" return datecover\n",
"\n",
"\n",
"def compute_time_intersection(datecover):\n",
" \"\"\"\n",
" This function returns the time coverage for all companies\n",
" \"\"\"\n",
" timestamps_sets = [set(timestamps) for timestamps in datecover.values()]\n",
" intersection = set.intersection(*timestamps_sets)\n",
" intersection_list = list(intersection)\n",
" formated_dates = [dt.strftime(\"%Y-%m-%d\") for dt in intersection_list]\n",
" return sorted(formated_dates)\n",
"\n",
"\n",
"def df_coverage_modelization(sport, coverage_features = 0.7):\n",
" \"\"\"\n",
" This function returns start_date, end_of_features and final dates\n",
" that help to construct train and test datasets\n",
" \"\"\"\n",
" datecover = {}\n",
" for company in sport:\n",
" df_products_purchased_reduced = display_input_databases(company, file_name = \"products_purchased_reduced\",\n",
" datetime_col = ['purchase_date'])\n",
" datecover = display_covering_time(df_products_purchased_reduced, company, datecover)\n",
" #print(datecover.keys())\n",
" dt_coverage = compute_time_intersection(datecover)\n",
" start_date = dt_coverage[0]\n",
" end_of_features = dt_coverage[int(0.7 * len(dt_coverage))]\n",
" final_date = dt_coverage[-1]\n",
" return start_date, end_of_features, final_date\n",
" "
]
},
2024-02-25 18:33:24 +01:00
{
"cell_type": "markdown",
2024-03-13 23:24:38 +01:00
"id": "2435097a-95a5-43e1-84d0-7f6b701441ba",
2024-02-26 22:47:36 +01:00
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
2024-02-25 18:33:24 +01:00
"source": [
2024-03-13 23:24:38 +01:00
"# Bases non communes : mise à plat"
2024-02-25 23:53:10 +01:00
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-03-13 23:24:38 +01:00
"id": "f8f988fb-5aab-4b57-80d1-e242f7e5b384",
2024-02-25 23:53:10 +01:00
"metadata": {},
"outputs": [],
"source": [
2024-03-13 23:24:38 +01:00
"companies = {'musee' : ['1', '2', '3', '4'],\n",
" 'sport': ['5', '6', '7', '8', '9'],\n",
" 'musique' : ['10', '11', '12', '13', '14']}\n",
2024-02-25 23:53:10 +01:00
"\n",
2024-03-13 23:24:38 +01:00
"all_companies = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14']"
2024-02-25 23:53:10 +01:00
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-03-13 23:24:38 +01:00
"id": "35ac004f-c191-4f45-a4b1-6d993d9ec38c",
2024-02-25 23:53:10 +01:00
"metadata": {},
"outputs": [],
"source": [
2024-03-13 23:24:38 +01:00
"companies_databases = pd.DataFrame()\n",
2024-02-25 23:53:10 +01:00
"\n",
2024-03-13 23:24:38 +01:00
"for i in all_companies:\n",
" company_databases = pd.DataFrame({'company_number' : [i]})\n",
2024-02-25 23:53:10 +01:00
"\n",
2024-03-13 23:24:38 +01:00
" BUCKET = \"bdc2324-data/\"+i\n",
" for base in fs.ls(BUCKET):\n",
" match = re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', base)\n",
" if match:\n",
" nom_base = match.group(3)\n",
" company_databases[nom_base] = 1\n",
2024-02-25 23:53:10 +01:00
"\n",
2024-03-13 23:24:38 +01:00
" companies_databases = pd.concat([companies_databases, company_databases])"
2024-02-26 22:47:36 +01:00
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-03-13 23:24:38 +01:00
"id": "8986e477-e6c5-4d6c-83b2-2c90c134b599",
2024-02-26 22:47:36 +01:00
"metadata": {},
2024-03-23 10:48:47 +01:00
"outputs": [],
2024-03-13 23:24:38 +01:00
"source": [
"pd.set_option(\"display.max_columns\", None)\n",
"companies_databases\n"
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-03-13 23:24:38 +01:00
"id": "8fecc3bb-4c03-4144-97c5-615224d9729e",
"metadata": {},
"outputs": [],
"source": [
"pd.reset_option(\"display.max_columns\")"
]
},
{
"cell_type": "markdown",
"id": "0294ce71-840e-458b-8ffa-cadabbc6da21",
"metadata": {},
"source": [
"# Debut Travail 25/02"
]
},
{
"cell_type": "markdown",
"id": "ca2c8b6a-4965-422e-ba7c-66423a464fc1",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## Base communes au types Musée"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dbce1124-9a22-4502-a47a-fc3d0e2db70b",
"metadata": {},
"outputs": [],
"source": [
"companies['musee']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5080f66e-f779-410a-876d-b4fe2795e17e",
"metadata": {},
"outputs": [],
"source": [
"for i in companies['musique']:\n",
" BUCKET = \"bdc2324-data/\"+i\n",
" liste_base = []\n",
" for base in fs.ls(BUCKET):\n",
" match = re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', base)\n",
" if match:\n",
" nom_base = match.group(3)\n",
" liste_base.append(nom_base)\n",
" globals()['base_'+i] = liste_base\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "abd477e1-7479-4c88-a5aa-f987af3f5b79",
"metadata": {},
"outputs": [],
"source": [
"# Trouver l'intersection entre les cinq listes\n",
"intersection = set(base_1).intersection(base_2, base_3, base_4, base_101)\n",
"\n",
"# Convertir le résultat en liste si nécessaire\n",
"intersection_liste = list(intersection)\n",
"\n",
"print(intersection_liste)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8d93888f-a511-4ee5-8bc3-d5173a7f119e",
"metadata": {},
"outputs": [],
"source": [
"# Trouver l'intersection entre les cinq listes\n",
"intersection = set(base_10).intersection(base_12, base_13, base_14, base_11)\n",
"\n",
"# Convertir le résultat en liste si nécessaire\n",
"intersection_liste = list(intersection)\n",
"\n",
"print(intersection_liste)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "10e89669-42bb-4652-a4bc-1a3d1caf4d1a",
"metadata": {},
"outputs": [],
"source": [
"len(intersection_liste)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7d058b21-a538-4f59-aefb-ef7966f73fdc",
"metadata": {},
"outputs": [],
"source": [
"df1_tags = load_dataset_2(\"1\", \"tags\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa441f99-733c-4675-8676-bed4682d3324",
"metadata": {},
"outputs": [],
"source": [
"df1_structure_tag_mappings = load_dataset_2(\"1\", 'structure_tag_mappings')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6767a750-14a4-4c05-903e-d2f07170825b",
"metadata": {},
"outputs": [],
"source": [
"df1_customersplus = load_dataset_2(\"1\", \"customersplus\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "125e9145-a815-46fd-bdf4-07589508b259",
"metadata": {},
"outputs": [],
"source": [
"df1_customersplus.groupby('structure_id')['id'].count().reset_index().sort_values('id', ascending=False).head(20)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c17a6976-792f-474d-bcff-c89396eddb3f",
"metadata": {},
"outputs": [],
"source": [
"df1_customersplus['structure_id'].isna().sum() / len(df1_customersplus['structure_id'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ecfc155a-cb42-46ec-8da5-33fdcd087355",
"metadata": {},
"outputs": [],
"source": [
"len(df1_structure_tag_mappings)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "071410b8-950d-4fcc-b2b9-57415253c286",
"metadata": {},
"outputs": [],
"source": [
"df1_structure_tag_mappings.groupby('tag_id')['structure_id'].count().reset_index().sort_values('structure_id', ascending=False).head(20)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f48d27a9-14e4-4bb9-a60a-73e9438b58fc",
"metadata": {},
"outputs": [],
"source": [
"?np.sort_values()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "14eaa0ea-02cc-430b-ab9b-38e6637810c3",
"metadata": {},
"outputs": [],
"source": [
"def info_colonnes_dataframe(df):\n",
" # Créer une liste pour stocker les informations sur chaque colonne\n",
" infos_colonnes = []\n",
"\n",
" # Parcourir les colonnes du DataFrame\n",
" for nom_colonne, serie in df.items(): # Utiliser items() au lieu de iteritems()\n",
" # Calculer le taux de valeurs manquantes\n",
" taux_na = serie.isna().mean() * 100\n",
"\n",
" # Ajouter les informations à la liste\n",
" infos_colonnes.append({\n",
" 'Nom_colonne': nom_colonne,\n",
" 'Type_colonne': str(serie.dtype),\n",
" 'Taux_NA': taux_na\n",
" })\n",
"\n",
" # Créer une nouvelle DataFrame à partir de la liste d'informations\n",
" df_infos_colonnes = pd.DataFrame(infos_colonnes)\n",
"\n",
" return df_infos_colonnes"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6b031c32-d4c8-42a5-9a71-a7810f9bf8d8",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"info_colonnes_dataframe(df1_tags)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e1a87f27-c4d4-4832-ac20-0c3c54aa4980",
"metadata": {},
"outputs": [],
"source": [
"info_colonnes_dataframe(df1_structure_tag_mappings)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fa5c65a8-2f74-4f3f-85fc-9ac91e0bb361",
"metadata": {},
"outputs": [],
"source": [
"pd.set_option('display.max_colwidth', None)\n",
"\n",
"print(df1_tags['name'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a59bf932-5b54-4600-81f5-c55ac93ae510",
"metadata": {},
"outputs": [],
"source": [
"pd.set_option('display.max_rows', None)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a4ab298e-2cae-4865-9f00-4caff5f75ea1",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"print(df1_tags['name'])"
]
},
{
"cell_type": "markdown",
"id": "76bffba1-5f7e-4308-9224-437ca66148f8",
"metadata": {},
"source": [
"## KPI sur target_type"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f6daf22e-6583-4431-a467-660a1dd4e5a4",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
2024-03-23 12:51:18 +01:00
"execution_count": null,
2024-03-13 23:24:38 +01:00
"id": "d91d5895",
"metadata": {},
"outputs": [],
"source": [
"pd.set_option('display.max_colwidth', None)\n"
]
},
{
"cell_type": "markdown",
"id": "c58b17d3",
"metadata": {},
"source": [
"Raisonnement : on prends les target_type qui représente 90% des clients et on fait des catégories dessus."
]
},
{
"cell_type": "code",
2024-03-23 12:51:18 +01:00
"execution_count": null,
2024-03-13 23:24:38 +01:00
"id": "6930bff5",
"metadata": {},
"outputs": [],
"source": [
"def print_main_target(tenant_id, nb_print = 40):\n",
2024-03-23 10:48:47 +01:00
" df_target = display_input_databases(tenant_id, \"target_information\")\n",
2024-03-13 23:24:38 +01:00
"\n",
" print('Nombre de ciblage : ', len(df_target))\n",
" nb_customers = df_target['customer_id'].nunique()\n",
" print('Nombre de client avec étiquette target : ', nb_customers) \n",
"\n",
" nb_custumers_per_target = df_target.groupby(\"target_name\")['customer_id'].count().reset_index().sort_values('customer_id', ascending=False)\n",
" nb_custumers_per_target['cumulative_customers'] = nb_custumers_per_target['customer_id'].cumsum()/len(df_target)\n",
" nb_custumers_per_target['customer_id'] = nb_custumers_per_target['customer_id']/nb_customers\n",
"\n",
" return nb_custumers_per_target.head(nb_print)"
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-03-23 12:51:18 +01:00
"id": "1e7ee1a0",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"pd.set_option(\"max_colwidth\", None)\n",
"print_main_target('1', 60)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "19f3a2dd-ba3d-4dec-8e10-fed544ab6a53",
"metadata": {},
"outputs": [],
"source": [
"pd.reset_option('display.max_rows')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b57a28ac",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"print_main_target('2', 25)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9a65991f",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"print_main_target('3', 70)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5f34b8bf",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"print_main_target('4', 100)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "52b24d66-92ad-4421-a62b-5cba837f1893",
2024-03-13 23:24:38 +01:00
"metadata": {},
2024-03-23 10:48:47 +01:00
"outputs": [],
"source": [
"pd.set_option('display.max_rows', None)"
]
},
{
"cell_type": "code",
2024-03-23 12:51:18 +01:00
"execution_count": null,
"id": "40fe3676",
2024-03-23 10:48:47 +01:00
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
2024-03-23 12:51:18 +01:00
"\n",
"\n",
"print_main_target('5', 100)"
2024-03-23 10:48:47 +01:00
]
},
{
"cell_type": "code",
"execution_count": null,
2024-03-23 12:51:18 +01:00
"id": "820d3600-379b-4245-a977-f1f1fa1f1839",
"metadata": {
"scrolled": true
},
2024-03-23 10:48:47 +01:00
"outputs": [],
"source": [
2024-03-23 12:51:18 +01:00
"print_main_target('6', 100)"
2024-03-23 10:48:47 +01:00
]
},
{
"cell_type": "code",
"execution_count": null,
2024-03-23 12:51:18 +01:00
"id": "86f64a1b-763a-4e43-9601-a38c80392d47",
2024-03-23 10:48:47 +01:00
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
2024-03-23 12:51:18 +01:00
"print_main_target('7', 100)"
2024-03-23 10:48:47 +01:00
]
},
{
"cell_type": "code",
"execution_count": null,
2024-03-23 12:51:18 +01:00
"id": "fbf2ea42-515a-4cdf-a4c1-50f99c379ed9",
2024-03-23 10:48:47 +01:00
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
2024-03-23 12:51:18 +01:00
"print_main_target('8', 100)"
2024-03-23 10:48:47 +01:00
]
},
{
"cell_type": "code",
"execution_count": null,
2024-03-23 12:51:18 +01:00
"id": "9684045c-4e25-4952-b099-a559baa5d749",
2024-03-23 10:48:47 +01:00
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
2024-03-23 12:51:18 +01:00
"print_main_target('9', 100)"
2024-03-23 10:48:47 +01:00
]
},
{
"cell_type": "code",
"execution_count": null,
2024-03-23 12:51:18 +01:00
"id": "cf8f7816-e7f3-4b7a-a987-8350a76eb140",
2024-03-23 10:48:47 +01:00
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
2024-03-23 12:51:18 +01:00
"print_main_target('10', 100)"
2024-03-23 10:48:47 +01:00
]
},
{
"cell_type": "code",
"execution_count": null,
2024-03-23 12:51:18 +01:00
"id": "76c818a5-3c52-4d97-ac81-b7f3f89092bd",
2024-03-23 10:48:47 +01:00
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
2024-03-23 12:51:18 +01:00
"print_main_target('11', 100)\n"
2024-03-23 10:48:47 +01:00
]
},
{
"cell_type": "code",
"execution_count": null,
2024-03-23 12:51:18 +01:00
"id": "603b11e4-5d76-4699-a1b2-e795929edc04",
2024-03-23 10:48:47 +01:00
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
2024-03-23 12:51:18 +01:00
"print_main_target('12', 100)\n"
2024-03-23 10:48:47 +01:00
]
},
{
"cell_type": "code",
"execution_count": null,
2024-03-23 12:51:18 +01:00
"id": "fa93aecd-d117-481e-8507-15e49937ce14",
"metadata": {
"scrolled": true
},
2024-03-23 10:48:47 +01:00
"outputs": [],
"source": [
2024-03-23 12:51:18 +01:00
"print_main_target('13', 100)\n"
2024-03-23 10:48:47 +01:00
]
},
{
"cell_type": "code",
2024-03-23 12:51:18 +01:00
"execution_count": null,
"id": "a115ebcf-4488-47f3-9d7e-75a1fca52f0f",
2024-03-23 10:48:47 +01:00
"metadata": {
"scrolled": true
},
2024-03-23 12:51:18 +01:00
"outputs": [],
2024-03-23 10:48:47 +01:00
"source": [
"print_main_target('14', 100)\n"
]
},
{
"cell_type": "markdown",
"id": "605cced5-052f-4a99-ac26-020c5d2ab633",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## KPI sur tags"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "916c3e2b-04d3-4877-b894-8f26f10d926e",
"metadata": {},
"outputs": [],
"source": [
"customersplus = load_dataset_2(\"4\", \"customersplus\")[['id', 'structure_id']]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "46847b24-15a4-464e-969f-f16ed3653f1f",
"metadata": {},
"outputs": [],
"source": [
"structure_tag_mappings = load_dataset_2('4', \"structure_tag_mappings\")[['structure_id', 'tag_id']]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3c10c69d-735f-453e-96bf-750697d965d0",
"metadata": {},
"outputs": [],
"source": [
"customersplus[customersplus['structure_id'].notna()]['structure_id'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9b0e77b3-5f16-4484-9564-7d3826583418",
"metadata": {},
"outputs": [],
"source": [
"len(customersplus[customersplus['structure_id'].notna()])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dfa27722-37f9-435a-8221-8aa6f9a4a107",
"metadata": {},
"outputs": [],
"source": [
"structure_tag_mappings['structure_id'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2daabdd5-31e3-4918-9856-9bbc30cde602",
"metadata": {},
"outputs": [],
"source": [
"def tags_information(tenant_id, first_tags):\n",
"\n",
" customersplus = load_dataset_2(tenant_id, \"customersplus\")[['id', 'structure_id']]\n",
" customersplus.rename(columns = {'id' : 'customer_id'}, inplace = True)\n",
" tags = load_dataset_2(tenant_id, \"tags\")[['id', 'name']]\n",
" tags.rename(columns = {'id' : 'tag_id', 'name' : 'tag_name'}, inplace = True)\n",
" structure_tag_mappings = load_dataset_2(tenant_id, \"structure_tag_mappings\")[['structure_id', 'tag_id']]\n",
" \n",
" customer_tags = pd.merge(customersplus, structure_tag_mappings, on = 'structure_id', how = 'left')\n",
" customer_tags = pd.merge(customer_tags, tags, on = 'tag_id', how = 'inner')\n",
" \n",
" nb_customers_with_tag = customer_tags['customer_id'].nunique()\n",
" \n",
" print('Nombre de client avec tag : ', nb_customers_with_tag)\n",
" print('Proportion de clients avec tags : ', nb_customers_with_tag/len(customersplus))\n",
" print('Moyenne de tags par client : ', len(customer_tags)/nb_customers_with_tag)\n",
" \n",
" info = customer_tags.groupby(['tag_id', 'tag_name'])['customer_id'].count().reset_index().sort_values('customer_id', ascending = False).head(first_tags)\n",
"\n",
" return info"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0b9f5f71-a927-4cc8-bb0c-9538e28d3553",
"metadata": {},
"outputs": [],
"source": [
"tags_information(\"1\", 20)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bd5bef41-1774-4601-86b5-b7c1aea8f1d2",
"metadata": {},
"outputs": [],
"source": [
"tags_information(\"2\", 20)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7c2dc3e6-1418-44db-a8c0-4a9d59ec5232",
"metadata": {},
"outputs": [],
"source": [
"load_dataset_2(\"2\", \"tags\")[['id', 'name']]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c7b2c670-7122-4f67-b1aa-8c80a10f16d8",
"metadata": {},
"outputs": [],
"source": [
"tags_information(\"3\", 20)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "76639995-252d-4a58-83d8-c0c00900c3a9",
"metadata": {},
"outputs": [],
"source": [
"tags_information(\"4\", 20)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "07e91791-d4d4-42b1-ac18-22d3b0b9f7bd",
"metadata": {},
"outputs": [],
"source": [
"tags_information(\"101\", 20)"
]
},
{
"cell_type": "markdown",
"id": "87d131cd-ead0-4ef4-a8ee-b09022d08ffa",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## KPI product"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "26582be9-cfd1-48ea-a0a7-31101fdeb9d1",
"metadata": {},
"outputs": [],
"source": [
"tenant_id = \"1\"\n",
"\n",
"df_product = display_databases(tenant_id, file_name = \"products_purchased_reduced\", datetime_col = ['purchase_date'])\n",
"\n",
"df_product.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "533bf499-dd56-4d29-b261-ca1e4928c9c7",
"metadata": {},
"outputs": [],
2024-02-28 21:57:28 +01:00
"source": [
"nb_tickets_per_events = df_product.groupby(['name_event_types', 'name_events'])['ticket_id'].count().reset_index().sort_values('ticket_id', ascending = False)\n",
"nb_tickets_per_events['prop_tickets'] = round(nb_tickets_per_events['ticket_id']/len(df_product), 3)\n",
"nb_tickets_per_events"
]
},
2024-02-25 18:33:24 +01:00
{
"cell_type": "markdown",
"id": "1ede9eaa-7f0a-4856-9349-b2747d6a4901",
2024-03-13 23:24:38 +01:00
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
2024-02-25 18:33:24 +01:00
"source": [
"# Fin travail 25/02"
]
},
{
"cell_type": "markdown",
"id": "c437eaec",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
2024-01-10 19:19:51 +01:00
"source": [
2024-01-13 10:38:10 +01:00
"# Exemple sur Company 1"
]
},
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "a1c1fc39",
2024-01-13 10:38:10 +01:00
"metadata": {},
"source": [
"## Chargement données"
2024-01-10 19:19:51 +01:00
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "66f8c17b",
2024-01-10 19:19:51 +01:00
"metadata": {},
2024-01-13 10:38:10 +01:00
"outputs": [],
"source": [
"BUCKET = \"bdc2324-data/1\"\n",
"liste_database = fs.ls(BUCKET)"
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "c08e6798",
2024-01-13 10:38:10 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-01-13 10:38:10 +01:00
"source": [
2024-01-13 14:14:11 +01:00
"liste_database_select = ['suppliers', 'ticket', 'purchase', 'consumption', 'type_ofs']\n",
2024-01-13 10:38:10 +01:00
"\n",
"# Filtrer la liste pour les éléments contenant au moins un élément de la liste à tester\n",
"liste_database_filtered = [element for element in liste_database if any(element_part in element for element_part in liste_database_select)]\n",
"\n",
"# Afficher le résultat\n",
"print(liste_database_filtered)"
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "675f518d",
2024-01-13 10:38:10 +01:00
"metadata": {},
2024-03-23 10:48:47 +01:00
"outputs": [],
2024-01-13 10:38:10 +01:00
"source": [
2024-02-04 16:02:01 +01:00
"# loop to create dataframes from liste\n",
"files_path = liste_database\n",
2024-01-13 10:38:10 +01:00
"\n",
"client_number = files_path[0].split(\"/\")[1]\n",
"df_prefix = \"df\" + str(client_number) + \"_\"\n",
"\n",
"for i in range(len(files_path)) :\n",
" current_path = files_path[i]\n",
" with fs.open(current_path, mode=\"rb\") as file_in:\n",
" df = pd.read_csv(file_in)\n",
" # the pattern of the name is df1xxx\n",
" nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n",
" globals()[nom_dataframe] = df"
]
},
2024-02-05 22:04:02 +01:00
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "e855f403",
2024-02-19 23:11:28 +01:00
"metadata": {},
2024-02-05 22:04:02 +01:00
"source": [
"## customersplus.csv"
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "91a8f8c4",
2024-02-05 22:04:02 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-02-05 22:04:02 +01:00
"source": [
"a = pd.DataFrame(df1_customersplus.info())"
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "2fda171d",
2024-02-05 22:04:02 +01:00
"metadata": {},
"outputs": [],
"source": [
"def info_colonnes_dataframe(df):\n",
" # Créer une liste pour stocker les informations sur chaque colonne\n",
" infos_colonnes = []\n",
"\n",
" # Parcourir les colonnes du DataFrame\n",
" for nom_colonne, serie in df.items(): # Utiliser items() au lieu de iteritems()\n",
" # Calculer le taux de valeurs manquantes\n",
" taux_na = serie.isna().mean() * 100\n",
"\n",
" # Ajouter les informations à la liste\n",
" infos_colonnes.append({\n",
" 'Nom_colonne': nom_colonne,\n",
" 'Type_colonne': str(serie.dtype),\n",
" 'Taux_NA': taux_na\n",
" })\n",
"\n",
" # Créer une nouvelle DataFrame à partir de la liste d'informations\n",
" df_infos_colonnes = pd.DataFrame(infos_colonnes)\n",
"\n",
" return df_infos_colonnes"
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "205eeeab",
2024-02-05 22:04:02 +01:00
"metadata": {},
"outputs": [],
"source": [
"def cleaning_date(df, column_name):\n",
" \"\"\"\n",
" Nettoie la colonne spécifiée du DataFrame en convertissant les valeurs en datetime avec le format ISO8601.\n",
"\n",
" Parameters:\n",
" - df: DataFrame\n",
" Le DataFrame contenant la colonne à nettoyer.\n",
" - column_name: str\n",
" Le nom de la colonne à nettoyer.\n",
"\n",
" Returns:\n",
" - DataFrame\n",
" Le DataFrame modifié avec la colonne nettoyée.\n",
" \"\"\"\n",
" df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
" return df"
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "634282c5",
2024-02-05 22:04:02 +01:00
"metadata": {},
"outputs": [],
"source": [
"a = info_colonnes_dataframe(df1_customersplus)"
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "0e8d4133",
2024-02-05 22:04:02 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-02-05 22:04:02 +01:00
"source": [
"a"
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "1268ad5a",
2024-02-05 22:04:02 +01:00
"metadata": {},
"outputs": [],
"source": [
"a = pd.DataFrame(df1_customersplus.isna().sum()/len(df1_customersplus)*100)"
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "bd41dc80",
2024-02-05 22:04:02 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-02-05 22:04:02 +01:00
"source": [
"# Selection des variables\n",
"df1_customersplus_clean = df1_customersplus.copy()\n",
"\n",
"cleaning_date(df1_customersplus_clean, 'first_buying_date')\n",
"cleaning_date(df1_customersplus_clean, 'last_visiting_date')\n",
"\n",
"df1_customersplus_clean.drop(['lastname', 'firstname', 'email', 'civility', 'note', 'created_at', 'updated_at', 'deleted_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode', 'last_visiting_date'], axis = 1, inplace=True)\n",
"df1_customersplus_clean.rename(columns = {'id' : 'customer_id'}, inplace = True)\n",
"\n"
]
},
2024-01-13 14:47:24 +01:00
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "64d0f76b",
2024-02-04 16:02:01 +01:00
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
2024-01-13 14:47:24 +01:00
"source": [
"## tickets.csv"
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "7e683711",
2024-01-13 14:47:24 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-01-13 14:47:24 +01:00
"source": [
"df1_tickets"
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "e7b9a52e",
2024-01-13 14:47:24 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-01-13 14:47:24 +01:00
"source": [
"df1_tickets.info()"
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "568280e8",
2024-01-13 14:47:24 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-01-13 14:47:24 +01:00
"source": [
"df1_tickets.isna().sum()/len(df1_tickets)*100"
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "29ecec90",
2024-01-13 14:47:24 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-01-13 14:47:24 +01:00
"source": [
"# Selection des variables\n",
2024-02-05 22:04:02 +01:00
"df1_tickets_clean = df1_tickets.drop(['lastname', 'firstname', 'email', 'created_at', 'updated_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode'], axis = 1, inplace=True)\n",
"df1_tickets_clean.rename(columns = {'id' : 'customer_id'}, inplace = True)"
2024-01-13 14:47:24 +01:00
]
},
2024-01-13 10:38:10 +01:00
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "22bb5de4",
2024-02-10 13:23:44 +01:00
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
2024-01-13 10:38:10 +01:00
"source": [
"## suppliers.csv"
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "6a9a91f4",
2024-01-13 10:38:10 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-01-13 10:38:10 +01:00
"source": [
"df1_suppliers"
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "bab4758a",
2024-01-13 10:38:10 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-01-13 10:38:10 +01:00
"source": [
"df1_suppliers.info()"
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "b5fff251",
2024-01-13 10:38:10 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-01-10 19:19:51 +01:00
"source": [
2024-01-13 14:47:24 +01:00
"df1_suppliers.isna().sum()/len(df1_suppliers)*100"
2024-01-13 10:38:10 +01:00
]
},
2024-01-13 14:14:11 +01:00
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "8b09e2a3",
2024-01-13 14:14:11 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-01-10 19:19:51 +01:00
"source": [
2024-01-13 14:47:24 +01:00
"# Selection des variables\n",
"df1_suppliers_clean = df1_suppliers[['id', 'name']]\n",
"df1_suppliers_clean.rename(columns = {'name' : 'supplier_name'}, inplace = True)"
2024-01-13 10:38:10 +01:00
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "ecee7cdc",
2024-01-10 19:19:51 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-01-10 19:19:51 +01:00
"source": [
2024-01-13 14:47:24 +01:00
"df1_suppliers_clean"
2024-01-10 19:19:51 +01:00
]
},
{
2024-01-13 14:47:24 +01:00
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "c8e6e69b",
2024-02-04 16:02:01 +01:00
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
2024-01-13 14:47:24 +01:00
"source": [
"## type_ofs.csv"
2024-01-10 19:19:51 +01:00
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "1a6cff1f",
2024-01-10 19:19:51 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-01-10 19:19:51 +01:00
"source": [
2024-01-13 14:47:24 +01:00
"df1_type_ofs"
2024-01-10 19:19:51 +01:00
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "93630b41",
2024-01-10 19:19:51 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-01-10 19:19:51 +01:00
"source": [
2024-01-13 14:47:24 +01:00
"df1_type_ofs.info()"
2024-01-10 19:19:51 +01:00
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "4f94481a",
2024-01-13 14:47:24 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-01-13 14:47:24 +01:00
"source": [
"# Selection des variables\n",
"df1_type_ofs_clean = df1_type_ofs[['id', 'name', 'children']]\n",
"df1_type_ofs_clean.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)"
]
},
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "1b2811e2",
2024-02-04 16:02:01 +01:00
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
2024-01-13 14:47:24 +01:00
"source": [
"## purchases.csv"
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "2455d2e1",
2024-01-13 14:47:24 +01:00
"metadata": {
"scrolled": true
},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-01-10 19:19:51 +01:00
"source": [
2024-01-13 14:47:24 +01:00
"df1_purchases"
2024-01-10 19:19:51 +01:00
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "5f9a159d",
2024-01-10 19:19:51 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-01-10 19:19:51 +01:00
"source": [
2024-01-13 14:47:24 +01:00
"df1_purchases.info()"
2024-01-10 19:19:51 +01:00
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "db201bf7",
2024-01-10 19:19:51 +01:00
"metadata": {},
2024-01-13 14:47:24 +01:00
"outputs": [],
2024-01-10 19:19:51 +01:00
"source": [
2024-01-13 14:47:24 +01:00
"# Nettoyage purchase_date\n",
"df1_purchases['purchase_date'] = pd.to_datetime(df1_purchases['purchase_date'], utc = True)\n",
"df1_purchases['purchase_date'] = pd.to_datetime(df1_purchases['purchase_date'], format = 'ISO8601')"
2024-01-10 19:19:51 +01:00
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "bd436fca",
2024-01-10 19:19:51 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-01-10 19:19:51 +01:00
"source": [
2024-01-13 14:47:24 +01:00
"df1_purchases.info()"
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "83435862",
2024-01-13 14:47:24 +01:00
"metadata": {},
"outputs": [],
"source": [
"# Selection des variables\n",
"df1_purchases_clean = df1_purchases[['id', 'purchase_date', 'customer_id']]"
]
},
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "f210e730",
2024-02-10 13:23:44 +01:00
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
2024-01-13 14:47:24 +01:00
"source": [
"## Fusion de l'ensemble des données billétiques"
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "1f8b3aa7",
2024-01-13 14:47:24 +01:00
"metadata": {},
"outputs": [],
"source": [
"# Fusion avec fournisseurs\n",
"df1_ticket_information = pd.merge(df1_tickets_clean, df1_suppliers_clean, left_on = 'supplier_id', right_on = 'id', how = 'inner')\n",
"df1_ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)\n",
2024-01-10 19:19:51 +01:00
"\n",
2024-01-13 14:47:24 +01:00
"# Fusion avec type de tickets\n",
"df1_ticket_information = pd.merge(df1_ticket_information, df1_type_ofs_clean, left_on = 'type_of', right_on = 'id', how = 'inner')\n",
"df1_ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)\n",
2024-01-10 19:19:51 +01:00
"\n",
2024-01-13 14:47:24 +01:00
"# Fusion avec achats\n",
"df1_ticket_information = pd.merge(df1_ticket_information, df1_purchases_clean, left_on = 'purchase_id', right_on = 'id', how = 'inner')\n",
"df1_ticket_information.drop(['purchase_id', 'id'], axis = 1, inplace=True)"
2024-01-10 19:19:51 +01:00
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "83a4d021",
2024-01-14 17:38:16 +01:00
"metadata": {
"scrolled": true
},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-01-14 17:38:16 +01:00
"source": [
"df1_ticket_information"
]
},
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "56e6ebd1",
2024-02-10 13:23:44 +01:00
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
2024-01-14 17:38:16 +01:00
"source": [
"# Utilisation de fonctions"
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "88fcde4b",
2024-01-10 19:19:51 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-01-14 17:38:16 +01:00
"source": [
2024-02-04 16:02:01 +01:00
"# Créer un DataFrame exemple\n",
"df_not_clean = df1_campaign_stats[['opened_at']].head(20)\n",
"\n",
"# Appliquer la fonction pour nettoyer la colonne 'purchase_date' de manière vectorisée\n",
"df_clean = cleaning_date(df_not_clean, 'opened_at')\n",
"df_clean.rename(columns = {'opened_at' : 'opened_at_clean'}, inplace = True)\n",
"\n",
"test = pd.concat([df1_campaign_stats[['opened_at']].head(20), df_clean], axis=1)\n",
"\n",
"test.info()"
]
},
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "818f69db",
2024-02-04 16:02:01 +01:00
"metadata": {},
"source": [
"## Nettoyage, selection et fusion"
2024-01-14 17:38:16 +01:00
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "c9654eda",
2024-02-04 16:02:01 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-01-10 19:19:51 +01:00
"source": [
2024-01-13 14:47:24 +01:00
"df1_ticket_information"
2024-01-10 19:19:51 +01:00
]
2024-01-13 14:47:24 +01:00
},
2024-02-04 16:02:01 +01:00
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "7f2b620c",
2024-02-04 16:02:01 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-02-04 16:02:01 +01:00
"source": [
"df1_ticket_information.info()"
]
},
2024-01-14 17:38:16 +01:00
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "637bdb72",
2024-01-14 17:38:16 +01:00
"metadata": {},
"source": [
2024-02-04 16:02:01 +01:00
"# Customer information"
2024-01-14 17:38:16 +01:00
]
},
{
2024-02-04 16:02:01 +01:00
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "14c52894",
2024-03-13 23:24:38 +01:00
"metadata": {},
2024-01-14 17:38:16 +01:00
"source": [
2024-02-04 16:02:01 +01:00
"## Target area"
2024-01-14 17:38:16 +01:00
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "d83abfbf",
2024-01-14 17:38:16 +01:00
"metadata": {},
2024-03-23 10:48:47 +01:00
"outputs": [],
2024-01-14 17:38:16 +01:00
"source": [
2024-02-04 16:02:01 +01:00
"# Target.csv cleaning\n",
"df1_targets_clean = df1_targets[[\"id\", \"target_type_id\", \"name\"]]\n",
"df1_targets_clean.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n",
"\n",
"# target_type cleaning\n",
"df1_target_types_clean = df1_target_types[[\"id\",\"is_import\",\"name\"]].add_prefix(\"target_type_\")\n",
"\n",
"#customer_target_mappings cleaning\n",
"df1_customer_target_mappings_clean = df1_customer_target_mappings[[\"id\", \"customer_id\", \"target_id\"]]\n",
"\n",
"# Merge target et target_type\n",
"df1_targets_full = pd.merge(df1_targets_clean, df1_target_types_clean, left_on='target_type_id', right_on='target_type_id', how='inner')\n",
"df1_targets_full.drop(['target_type_id'], axis = 1, inplace=True)\n",
"\n",
"# Merge\n",
"df1_targets_full = pd.merge(df1_customer_target_mappings_clean, df1_targets_full, left_on='target_id', right_on='target_id', how='inner')\n",
"df1_targets_full.drop(['target_id'], axis = 1, inplace=True)"
2024-01-14 17:38:16 +01:00
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "90d71b2c",
2024-01-14 17:38:16 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-02-04 16:02:01 +01:00
"source": [
"df1_targets_test = df1_targets_full[['id', 'customer_id']].groupby(['customer_id']).count()\n",
"len(df1_targets_test[df1_targets_test['id'] > 1]) / len(df1_targets_test)\n",
"\n",
"# 99,6% des 151 000 client visés sont catégorisés plusieurs fois et en moyenne 5 fois... \n",
"df1_targets_test.mean()\n"
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "2301de1e",
2024-02-07 23:28:55 +01:00
"metadata": {},
2024-03-23 10:48:47 +01:00
"outputs": [],
2024-02-04 16:02:01 +01:00
"source": [
2024-02-07 23:28:55 +01:00
"df1_targets_full.head()"
2024-02-04 16:02:01 +01:00
]
},
{
2024-02-07 23:28:55 +01:00
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "75fbc2f7",
2024-02-07 23:28:55 +01:00
"metadata": {},
2024-03-23 10:48:47 +01:00
"outputs": [],
2024-02-04 16:02:01 +01:00
"source": [
2024-02-07 23:28:55 +01:00
"# Catégorisation des target_name\n",
"import pandas as pd\n",
"import nltk\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.corpus import stopwords\n",
"from nltk.stem import WordNetLemmatizer\n",
"from nltk.probability import FreqDist\n",
"\n",
"# Téléchargement des ressources nécessaires\n",
"nltk.download('punkt')\n",
"nltk.download('stopwords')\n",
"nltk.download('wordnet')\n",
"\n"
2024-02-04 16:02:01 +01:00
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "55cddf92",
2024-02-04 16:02:01 +01:00
"metadata": {},
2024-03-23 10:48:47 +01:00
"outputs": [],
2024-02-04 16:02:01 +01:00
"source": [
2024-02-07 23:28:55 +01:00
"# Définition des fonctions de tokenisation, suppression des mots vides et lemmatisation\n",
"def preprocess_text(texte):\n",
" # Concaténation des éléments de la liste en une seule chaîne de caractères\n",
" texte_concat = ' '.join(texte)\n",
" \n",
" # Tokenisation des mots\n",
" tokens = word_tokenize(texte_concat.lower())\n",
" \n",
" # Suppression des mots vides (stopwords)\n",
" stop_words = set(stopwords.words('french'))\n",
" filtered_tokens = [word for word in tokens if word not in stop_words]\n",
" \n",
" # Lemmatisation des mots\n",
" lemmatizer = WordNetLemmatizer()\n",
" lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]\n",
" \n",
" return lemmatized_tokens\n",
2024-02-04 16:02:01 +01:00
"\n",
"\n",
2024-02-07 23:28:55 +01:00
"# Appliquer le prétraitement à la colonne de texte\n",
"df1_targets_full['target_name_tokened'] = df1_targets_full['target_name'].apply(preprocess_text)\n",
"\n",
"# Concaténer les listes de mots pour obtenir une liste de tous les mots dans le corpus\n",
"all_words = [word for tokens in df1_targets_full['target_name_tokened'] for word in tokens]\n",
"\n",
"# Calculer la fréquence des mots\n",
"freq_dist = FreqDist(all_words)\n",
"\n",
"\n"
2024-02-04 16:02:01 +01:00
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "7fd98a85",
2024-02-04 16:02:01 +01:00
"metadata": {},
2024-03-23 10:48:47 +01:00
"outputs": [],
2024-02-04 16:02:01 +01:00
"source": [
2024-02-07 23:28:55 +01:00
"# Affichage des mots les plus fréquents\n",
"print(\"Mots les plus fréquents:\")\n",
"for mot, freq in freq_dist.most_common(15):\n",
" print(f\"{mot}: {freq}\")"
2024-02-04 16:02:01 +01:00
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "cf94bb1d",
2024-02-04 16:02:01 +01:00
"metadata": {},
2024-03-23 10:48:47 +01:00
"outputs": [],
2024-02-07 23:28:55 +01:00
"source": [
"import pandas as pd\n",
"import nltk\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.corpus import stopwords\n",
"from nltk.stem import WordNetLemmatizer\n",
"\n",
"# Téléchargement des ressources nécessaires\n",
"nltk.download('punkt')\n",
"nltk.download('stopwords')\n",
"nltk.download('wordnet')\n",
"\n",
"# Création de la DataFrame d'exemple\n",
"data = {'texte': [\"Le chat noir mange une souris.\", \"Le chien blanc aboie.\"]}\n",
"df = pd.DataFrame(data)\n",
"\n",
"# Fonction pour prétraiter le texte\n",
"def preprocess_text(texte):\n",
" # Concaténation des éléments de la liste en une seule chaîne de caractères\n",
" texte_concat = ' '.join(texte)\n",
" \n",
" # Tokenisation des mots\n",
" tokens = word_tokenize(texte_concat.lower())\n",
" \n",
" # Suppression des mots vides (stopwords)\n",
" stop_words = set(stopwords.words('french'))\n",
" filtered_tokens = [word for word in tokens if word not in stop_words]\n",
" \n",
" # Lemmatisation des mots\n",
" lemmatizer = WordNetLemmatizer()\n",
" lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]\n",
" \n",
" return lemmatized_tokens\n",
"\n",
2024-02-10 22:46:56 +01:00
"# Appliquer la fonction de prétraitement à la colonne de texte\n",
"df['texte_preprocessed'] = df['texte'].apply(preprocess_text)\n",
"\n",
"# Afficher le résultat\n",
"print(df)\n"
]
},
{
"cell_type": "markdown",
"id": "711d3884",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## Campaign area"
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "c25b5295",
"metadata": {},
"outputs": [],
"source": [
"# campaign_stats cleaning \n",
"df1_campaign_stats_clean = df1_campaign_stats[[\"id\", \"campaign_id\", \"customer_id\", \"opened_at\", \"sent_at\", \"delivered_at\"]]\n",
"cleaning_date(df1_campaign_stats_clean, 'opened_at')\n",
"cleaning_date(df1_campaign_stats_clean, 'sent_at')\n",
"cleaning_date(df1_campaign_stats_clean, 'delivered_at')\n",
"\n",
"# campaigns cleaning\n",
"df1_campaigns_clean = df1_campaigns[[\"id\", \"name\", \"service_id\", \"sent_at\"]].add_prefix(\"campaign_\")\n",
"cleaning_date(df1_campaigns_clean, 'campaign_sent_at')\n",
"\n",
"# Merge \n",
"df1_campaigns_full = pd.merge(df1_campaign_stats_clean, df1_campaigns_clean, on = \"campaign_id\", how = \"left\")\n",
"df1_campaigns_full.drop(['campaign_id'], axis = 1, inplace=True)"
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "2a3de6a5",
"metadata": {},
"outputs": [],
"source": [
"df1_campaigns_full.info()"
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "3fc1f446",
"metadata": {},
"outputs": [],
2024-02-04 16:02:01 +01:00
"source": [
"df1_campaigns_information"
]
},
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "20e69ee3",
2024-02-04 16:02:01 +01:00
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## Link area"
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "d9cbdbce",
2024-02-04 16:02:01 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-02-04 16:02:01 +01:00
"source": [
"df1_campaigns"
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "c07459f0",
2024-02-04 16:02:01 +01:00
"metadata": {},
2024-02-10 22:46:56 +01:00
"outputs": [],
2024-02-04 16:02:01 +01:00
"source": [
"df1_link_stats"
]
},
{
"cell_type": "markdown",
2024-02-10 22:46:56 +01:00
"id": "80ae4c42",
2024-03-04 23:30:25 +01:00
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
2024-02-04 16:02:01 +01:00
"source": [
2024-03-04 23:30:25 +01:00
"## Supplier"
2024-02-04 16:02:01 +01:00
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "b50b8f95",
2024-02-04 16:02:01 +01:00
"metadata": {},
"outputs": [],
"source": [
"# Fonction d'exploration pour suppliers.csv = label itr et commission inconnues\n",
"def suppliers_exploration(suppliers = None) : \n",
" \n",
" # Taux de NaN pour ces colonnes\n",
" label_na = suppliers['label'].isna().sum()/len(suppliers)*100\n",
" itr_na = suppliers['itr'].isna().sum()/len(suppliers)*100\n",
" commission_na = suppliers['commission'].isna().sum()/len(suppliers)*100\n",
"\n",
" suppliers_desc = pd.DataFrame({'nb_suppliers' : [suppliers['name'].nunique()],\n",
" 'label_na' : [label_na],\n",
" 'itr_na' : [itr_na],\n",
" 'commission_na' : [commission_na]})\n",
"\n",
" return suppliers_desc"
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "7e292935",
2024-02-04 16:02:01 +01:00
"metadata": {},
"outputs": [],
"source": [
"df1_suppliers_desc = suppliers_exploration(suppliers = df1_suppliers)"
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "05b6f2b0",
2024-02-04 16:02:01 +01:00
"metadata": {},
2024-03-23 10:48:47 +01:00
"outputs": [],
2024-01-14 17:38:16 +01:00
"source": [
"df1_suppliers_desc"
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "c9324d80",
2024-01-14 17:38:16 +01:00
"metadata": {},
"outputs": [],
"source": [
"BUCKET = \"bdc2324-data\"\n",
"liste_folders = fs.ls(BUCKET)\n",
"\n",
"liste_files = []\n",
"for company_folder in liste_folders : \n",
" liste_files.extend(fs.ls(company_folder))"
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "10304058",
2024-01-14 17:38:16 +01:00
"metadata": {},
2024-03-23 10:48:47 +01:00
"outputs": [],
2024-01-14 17:38:16 +01:00
"source": [
"liste_database_select = ['suppliers']\n",
"\n",
"# Filtrer la liste pour les éléments contenant au moins un élément de la liste à tester\n",
"liste_suppliers = [element for element in liste_files if any(element_part in element for element_part in liste_database_select)]\n",
"\n",
"# Afficher le résultat\n",
"print(liste_suppliers)"
]
},
2024-01-13 14:47:24 +01:00
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "ffa423e5",
2024-01-13 14:47:24 +01:00
"metadata": {},
"outputs": [],
2024-01-14 17:38:16 +01:00
"source": [
"# loop to create dataframes from file 2\n",
"def database_loading(database_name = None):\n",
" files_path = database_name\n",
" \n",
2024-02-10 13:23:44 +01:00
" client_number = files_path.split(\"/\")[1]\n",
2024-01-14 17:38:16 +01:00
" df_prefix = \"df\" + str(client_number) + \"_\"\n",
" \n",
2024-02-10 13:23:44 +01:00
" current_path = files_path\n",
" with fs.open(current_path, mode=\"rb\") as file_in:\n",
" df = pd.read_csv(file_in)\n",
"\n",
" return df, client_number"
]
},
{
"cell_type": "code",
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "70bdc88d",
2024-02-10 13:23:44 +01:00
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "6a0f567d",
2024-02-10 13:23:44 +01:00
"metadata": {},
"outputs": [],
"source": [
"df_all = pd.DataFrame()\n",
"\n",
"for link in liste_suppliers:\n",
" \n",
" df_supplier, tenant_id = database_loading(link)\n",
" \n",
" df_supplier['tenant_id'] = int(tenant_id)\n",
2024-01-14 17:38:16 +01:00
"\n",
2024-02-10 13:23:44 +01:00
" df_all = pd.concat([df_all, df_supplier], axis = 0)\n",
2024-01-14 17:38:16 +01:00
" "
]
2024-02-10 13:23:44 +01:00
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "1522d8cd",
2024-02-10 13:23:44 +01:00
"metadata": {},
"outputs": [],
"source": [
"# df_all[df_all['tenant_id'] == 101]['name'].unique()"
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "b0e42a61",
2024-02-10 13:23:44 +01:00
"metadata": {},
"outputs": [],
"source": [
"liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online'] \n",
"# vad = vente à distance\n",
"df_all['name'] = df_all['name'].fillna('')\n",
"\n",
"df_all['canal_vente_internet'] = df_all['name'].str.contains('|'.join(liste_mots), case=False).astype(int)\n"
]
},
{
"cell_type": "code",
2024-03-23 10:48:47 +01:00
"execution_count": null,
2024-02-10 22:46:56 +01:00
"id": "d299ae91",
2024-02-10 13:23:44 +01:00
"metadata": {},
2024-03-23 10:48:47 +01:00
"outputs": [],
2024-02-10 13:23:44 +01:00
"source": [
"df_all.groupby('tenant_id')['canal_vente_internet'].max()"
]
2024-01-10 19:19:51 +01:00
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
2024-02-19 23:11:28 +01:00
"version": "3.11.6"
2024-01-10 19:19:51 +01:00
}
},
"nbformat": 4,
"nbformat_minor": 5
}