BDC-team-1/Exploration_billet_AJ.ipynb
2024-03-14 22:35:25 +00:00

6593 lines
224 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"id": "5bf5c226",
"metadata": {},
"source": [
"# Business Data Challenge - Team 1"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "b1a5b9d3",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import os\n",
"import s3fs\n",
"import re\n",
"import warnings"
]
},
{
"cell_type": "markdown",
"id": "ecfa2219",
"metadata": {},
"source": [
"Configuration de l'accès aux données"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "1a094277",
"metadata": {},
"outputs": [],
"source": [
"# Create filesystem object\n",
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "30d77451-2df6-4c07-8b15-66e0e990ff03",
"metadata": {},
"outputs": [],
"source": [
"# Create filesystem object\n",
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n",
"\n",
"\n",
"# Import cleaning and merge functions\n",
"\n",
"exec(open('0_Cleaning_and_merge_functions.py').read())\n",
"\n",
"exec(open('0_KPI_functions.py').read())\n",
"\n",
"# Ignore warning\n",
"warnings.filterwarnings('ignore')\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "f1b44d3e-76bb-4860-b9db-a2840db7cf39",
"metadata": {},
"outputs": [],
"source": [
"def load_dataset_2(directory_path, file_name):\n",
" \"\"\"\n",
" This function loads csv file\n",
" \"\"\"\n",
" file_path = \"bdc2324-data\" + \"/\" + directory_path + \"/\" + directory_path + file_name + \".csv\"\n",
" with fs.open(file_path, mode=\"rb\") as file_in:\n",
" df = pd.read_csv(file_in, sep=\",\")\n",
"\n",
" # drop na :\n",
" #df = df.dropna(axis=1, thresh=len(df))\n",
" # if identifier in table : delete it\n",
" if 'identifier' in df.columns:\n",
" df = df.drop(columns = 'identifier')\n",
" return df"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "31ab76f0-fbb1-46f6-b359-97228620c207",
"metadata": {},
"outputs": [],
"source": [
"def export_in_temporary(df, output_name):\n",
" print('Export of dataset :', output_name)\n",
" FILE_PATH_OUT_S3 = \"projet-bdc2324-team1/Temporary\" + \"/\" + output_name + '.csv'\n",
" with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n",
" df.to_csv(file_out, index = False)"
]
},
{
"cell_type": "markdown",
"id": "ccf597b0-b459-4ea5-baf0-5ba8c90915e4",
"metadata": {},
"source": [
"# Cleaning target area and tags"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "28316e1d-7892-4506-9d53-0695e71aa7bc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : bdc2324-data/1/1targets.csv\n",
"Shape : (287, 5)\n",
"Number of columns : 3\n",
"Columns : Index(['id', 'target_type_id', 'name'], dtype='object')\n",
"File path : bdc2324-data/1/1target_types.csv\n",
"Shape : (4, 6)\n",
"Number of columns : 4\n",
"Columns : Index(['id', 'identifier', 'is_import', 'name'], dtype='object')\n",
"File path : bdc2324-data/1/1customer_target_mappings.csv\n",
"Shape : (768024, 7)\n",
"Number of columns : 5\n",
"Columns : Index(['id', 'customer_id', 'target_id', 'name', 'extra_field'], dtype='object')\n"
]
}
],
"source": [
"target_example = preprocessing_target_area('1')"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "da467695-ce37-485d-94ab-f1499d56c3a3",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>customer_id</th>\n",
" <th>target_name</th>\n",
" <th>target_type_is_import</th>\n",
" <th>target_type_name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1184824</td>\n",
" <td>645400</td>\n",
" <td>DDCP PROMO Réseau livres</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1184825</td>\n",
" <td>645400</td>\n",
" <td>Inscrits NL générale site web</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1184828</td>\n",
" <td>645402</td>\n",
" <td>DDCP PROMO Art contemporain</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1184829</td>\n",
" <td>645403</td>\n",
" <td>DDCP PROMO Art contemporain</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1295770</td>\n",
" <td>647301</td>\n",
" <td>Votre première liste</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>768019</th>\n",
" <td>2737545</td>\n",
" <td>666983</td>\n",
" <td>Inscrits NL générale site web</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>768020</th>\n",
" <td>2737546</td>\n",
" <td>666983</td>\n",
" <td>Votre première liste</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>768021</th>\n",
" <td>2737575</td>\n",
" <td>666986</td>\n",
" <td>Votre première liste</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>768022</th>\n",
" <td>2737576</td>\n",
" <td>666987</td>\n",
" <td>Inscrits NL générale site web</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>768023</th>\n",
" <td>2737577</td>\n",
" <td>666987</td>\n",
" <td>Votre première liste</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>768024 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" id customer_id target_name \\\n",
"0 1184824 645400 DDCP PROMO Réseau livres \n",
"1 1184825 645400 Inscrits NL générale site web \n",
"2 1184828 645402 DDCP PROMO Art contemporain \n",
"3 1184829 645403 DDCP PROMO Art contemporain \n",
"4 1295770 647301 Votre première liste \n",
"... ... ... ... \n",
"768019 2737545 666983 Inscrits NL générale site web \n",
"768020 2737546 666983 Votre première liste \n",
"768021 2737575 666986 Votre première liste \n",
"768022 2737576 666987 Inscrits NL générale site web \n",
"768023 2737577 666987 Votre première liste \n",
"\n",
" target_type_is_import target_type_name \n",
"0 False manual_static_filter \n",
"1 False manual_static_filter \n",
"2 False manual_static_filter \n",
"3 False manual_static_filter \n",
"4 False manual_static_filter \n",
"... ... ... \n",
"768019 False manual_static_filter \n",
"768020 False manual_static_filter \n",
"768021 False manual_static_filter \n",
"768022 False manual_static_filter \n",
"768023 False manual_static_filter \n",
"\n",
"[768024 rows x 5 columns]"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"target_example"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "fd88e294-e038-4cec-ad94-2bbbc10a4059",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : projet-bdc2324-team1/0_Input/Company_1/target_information.csv\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>customer_id</th>\n",
" <th>nb_targets</th>\n",
" <th>all_targets</th>\n",
" <th>all_target_types</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>28</td>\n",
" <td>consentement optin jeune public, DDCP rentrée ...</td>\n",
" <td>manual_static_filter, manual_static_filter, ma...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>7</td>\n",
" <td>consentement optin jeune public, consentement ...</td>\n",
" <td>manual_static_filter, manual_static_filter, ma...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>11</td>\n",
" <td>traversee du port de commerce (gagnant et perd...</td>\n",
" <td>manual_static_filter, manual_static_filter, ma...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>6</td>\n",
" <td>Arenametrix_bascule tel vers sib, consentement...</td>\n",
" <td>manual_static_filter, manual_static_filter, ma...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>4</td>\n",
" <td>Arenametrix_bascule tel vers sib, consentement...</td>\n",
" <td>manual_static_filter, manual_static_filter, ma...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>151154</th>\n",
" <td>1256136</td>\n",
" <td>1</td>\n",
" <td>consentement optin b2c</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>151155</th>\n",
" <td>1256137</td>\n",
" <td>1</td>\n",
" <td>consentement optin b2c</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>151156</th>\n",
" <td>1256138</td>\n",
" <td>3</td>\n",
" <td>Inscrits NL jeune public site web, Inscrits NL...</td>\n",
" <td>manual_static_filter, manual_static_filter, ma...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>151157</th>\n",
" <td>1256139</td>\n",
" <td>3</td>\n",
" <td>Inscrits NL jeune public site web, Inscrits NL...</td>\n",
" <td>manual_static_filter, manual_static_filter, ma...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>151158</th>\n",
" <td>1256140</td>\n",
" <td>2</td>\n",
" <td>DRE MucemLab, consentement optin dre</td>\n",
" <td>manual_static_filter, manual_static_filter</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>151159 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" customer_id nb_targets \\\n",
"0 1 28 \n",
"1 2 7 \n",
"2 3 11 \n",
"3 4 6 \n",
"4 5 4 \n",
"... ... ... \n",
"151154 1256136 1 \n",
"151155 1256137 1 \n",
"151156 1256138 3 \n",
"151157 1256139 3 \n",
"151158 1256140 2 \n",
"\n",
" all_targets \\\n",
"0 consentement optin jeune public, DDCP rentrée ... \n",
"1 consentement optin jeune public, consentement ... \n",
"2 traversee du port de commerce (gagnant et perd... \n",
"3 Arenametrix_bascule tel vers sib, consentement... \n",
"4 Arenametrix_bascule tel vers sib, consentement... \n",
"... ... \n",
"151154 consentement optin b2c \n",
"151155 consentement optin b2c \n",
"151156 Inscrits NL jeune public site web, Inscrits NL... \n",
"151157 Inscrits NL jeune public site web, Inscrits NL... \n",
"151158 DRE MucemLab, consentement optin dre \n",
"\n",
" all_target_types \n",
"0 manual_static_filter, manual_static_filter, ma... \n",
"1 manual_static_filter, manual_static_filter, ma... \n",
"2 manual_static_filter, manual_static_filter, ma... \n",
"3 manual_static_filter, manual_static_filter, ma... \n",
"4 manual_static_filter, manual_static_filter, ma... \n",
"... ... \n",
"151154 manual_static_filter \n",
"151155 manual_static_filter \n",
"151156 manual_static_filter, manual_static_filter, ma... \n",
"151157 manual_static_filter, manual_static_filter, ma... \n",
"151158 manual_static_filter, manual_static_filter \n",
"\n",
"[151159 rows x 4 columns]"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tenant_id = '1'\n",
"\n",
"def concatenate_names(names):\n",
" return ', '.join(names)\n",
" \n",
"target_example =display_databases(tenant_id, \"target_information\")\n",
"\n",
"target_example['target_name'] = target_example['target_name'].fillna('').str.lower()\n",
"\n",
"\n",
"target_example['jeune'] = target_example['target_name'].str.contains('|'.join(['jeune', 'pass_culture']), case=False).astype(int)\n",
"target_example['optin'] = target_example['target_name'].str.contains('|'.join(['optin' ,'opt-in']), case=False).astype(int)\n",
"target_example['optout'] = target_example['target_name'].str.contains('|'.join(['optout']), case=False).astype(int)\n",
"target_example['scolaire'] = target_example['target_name'].str.contains('|'.join(['scolaire' , 'enseignant', 'chercheur', 'schulen', 'école']), case=False).astype(int)\n",
"target_example['entreprise'] = target_example['target_name'].str.contains('|'.join(['b2b']), case=False).astype(int)\n",
"target_example['famille'] = target_example['target_name'].str.contains('|'.join(['famille', 'enfants']), case=False).astype(int)\n",
"target_example['newsletter'] = target_example['target_name'].str.contains('|'.join(['nl', 'newsletter']), case=False).astype(int)\n",
"\n",
"\n",
"\n",
"target_agg = target_example.groupby('customer_id').agg(\n",
" nb_targets=('target_name', 'nunique'), # Utilisation de tuples pour spécifier les noms de colonnes\n",
" all_targets=('target_name', concatenate_names),\n",
" all_target_types=('target_type_name', concatenate_names)\n",
").reset_index()\n",
"target_agg"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "c75efea3-b5e8-4a7a-bed4-dd64ae9ff9f2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Export of dataset : Target_kpi_concatenate\n"
]
}
],
"source": [
"export_in_temporary(target_agg, 'Target_kpi_concatenate')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cb6f06e6-78de-4b8d-a103-8366eff0493a",
"metadata": {},
"outputs": [],
"source": [
"v"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c5e864b1-adad-4267-b956-3f7ef371d677",
"metadata": {},
"outputs": [],
"source": [
"\n",
"def display_covering_time(df, company, datecover):\n",
" \"\"\"\n",
" This function draws the time coverage of each company\n",
" \"\"\"\n",
" min_date = df['purchase_date'].min().strftime(\"%Y-%m-%d\")\n",
" max_date = df['purchase_date'].max().strftime(\"%Y-%m-%d\")\n",
" datecover[company] = [datetime.strptime(min_date, \"%Y-%m-%d\") + timedelta(days=x) for x in range((datetime.strptime(max_date, \"%Y-%m-%d\") - datetime.strptime(min_date, \"%Y-%m-%d\")).days)]\n",
" print(f'Couverture Company {company} : {min_date} - {max_date}')\n",
" return datecover\n",
"\n",
"\n",
"def compute_time_intersection(datecover):\n",
" \"\"\"\n",
" This function returns the time coverage for all companies\n",
" \"\"\"\n",
" timestamps_sets = [set(timestamps) for timestamps in datecover.values()]\n",
" intersection = set.intersection(*timestamps_sets)\n",
" intersection_list = list(intersection)\n",
" formated_dates = [dt.strftime(\"%Y-%m-%d\") for dt in intersection_list]\n",
" return sorted(formated_dates)\n",
"\n",
"\n",
"def df_coverage_modelization(sport, coverage_features = 0.7):\n",
" \"\"\"\n",
" This function returns start_date, end_of_features and final dates\n",
" that help to construct train and test datasets\n",
" \"\"\"\n",
" datecover = {}\n",
" for company in sport:\n",
" df_products_purchased_reduced = display_input_databases(company, file_name = \"products_purchased_reduced\",\n",
" datetime_col = ['purchase_date'])\n",
" datecover = display_covering_time(df_products_purchased_reduced, company, datecover)\n",
" #print(datecover.keys())\n",
" dt_coverage = compute_time_intersection(datecover)\n",
" start_date = dt_coverage[0]\n",
" end_of_features = dt_coverage[int(0.7 * len(dt_coverage))]\n",
" final_date = dt_coverage[-1]\n",
" return start_date, end_of_features, final_date\n",
" "
]
},
{
"cell_type": "markdown",
"id": "2435097a-95a5-43e1-84d0-7f6b701441ba",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"# Bases non communes : mise à plat"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "f8f988fb-5aab-4b57-80d1-e242f7e5b384",
"metadata": {},
"outputs": [],
"source": [
"companies = {'musee' : ['1', '2', '3', '4'],\n",
" 'sport': ['5', '6', '7', '8', '9'],\n",
" 'musique' : ['10', '11', '12', '13', '14']}\n",
"\n",
"all_companies = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14']"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "35ac004f-c191-4f45-a4b1-6d993d9ec38c",
"metadata": {},
"outputs": [],
"source": [
"companies_databases = pd.DataFrame()\n",
"\n",
"for i in all_companies:\n",
" company_databases = pd.DataFrame({'company_number' : [i]})\n",
"\n",
" BUCKET = \"bdc2324-data/\"+i\n",
" for base in fs.ls(BUCKET):\n",
" match = re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', base)\n",
" if match:\n",
" nom_base = match.group(3)\n",
" company_databases[nom_base] = 1\n",
"\n",
" companies_databases = pd.concat([companies_databases, company_databases])"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "8986e477-e6c5-4d6c-83b2-2c90c134b599",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>company_number</th>\n",
" <th>campaign_stats</th>\n",
" <th>campaigns</th>\n",
" <th>categories</th>\n",
" <th>countries</th>\n",
" <th>currencies</th>\n",
" <th>customer_target_mappings</th>\n",
" <th>customersplus</th>\n",
" <th>event_types</th>\n",
" <th>events</th>\n",
" <th>facilities</th>\n",
" <th>link_stats</th>\n",
" <th>pricing_formulas</th>\n",
" <th>product_packs</th>\n",
" <th>products</th>\n",
" <th>products_groups</th>\n",
" <th>purchases</th>\n",
" <th>representation_category_capacities</th>\n",
" <th>representations</th>\n",
" <th>seasons</th>\n",
" <th>structure_tag_mappings</th>\n",
" <th>suppliers</th>\n",
" <th>tags</th>\n",
" <th>target_types</th>\n",
" <th>targets</th>\n",
" <th>tickets</th>\n",
" <th>type_of_categories</th>\n",
" <th>type_of_pricing_formulas</th>\n",
" <th>type_ofs</th>\n",
" <th>contribution_sites</th>\n",
" <th>contributions</th>\n",
" <th>consumptions</th>\n",
" <th>representation_types</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>7</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>8</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>9</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>10</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>1</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>11</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>12</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>13</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>14</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" company_number campaign_stats campaigns categories countries \\\n",
"0 1 1 1 1 1 \n",
"0 2 1 1 1 1 \n",
"0 3 1 1 1 1 \n",
"0 4 1 1 1 1 \n",
"0 5 1 1 1 1 \n",
"0 6 1 1 1 1 \n",
"0 7 1 1 1 1 \n",
"0 8 1 1 1 1 \n",
"0 9 1 1 1 1 \n",
"0 10 1 1 1 1 \n",
"0 11 1 1 1 1 \n",
"0 12 1 1 1 1 \n",
"0 13 1 1 1 1 \n",
"0 14 1 1 1 1 \n",
"\n",
" currencies customer_target_mappings customersplus event_types events \\\n",
"0 1 1 1 1 1 \n",
"0 1 1 1 1 1 \n",
"0 1 1 1 1 1 \n",
"0 1 1 1 1 1 \n",
"0 1 1 1 1 1 \n",
"0 1 1 1 1 1 \n",
"0 1 1 1 1 1 \n",
"0 1 1 1 1 1 \n",
"0 1 1 1 1 1 \n",
"0 1 1 1 1 1 \n",
"0 1 1 1 1 1 \n",
"0 1 1 1 1 1 \n",
"0 1 1 1 1 1 \n",
"0 1 1 1 1 1 \n",
"\n",
" facilities link_stats pricing_formulas product_packs products \\\n",
"0 1 1 1 1 1 \n",
"0 1 1 1 1 1 \n",
"0 1 1 1 1 1 \n",
"0 1 1 1 1 1 \n",
"0 1 1 1 1 1 \n",
"0 1 1 1 1 1 \n",
"0 1 1 1 1 1 \n",
"0 1 1 1 1 1 \n",
"0 1 1 1 1 1 \n",
"0 1 1 1 1 1 \n",
"0 1 1 1 1 1 \n",
"0 1 1 1 1 1 \n",
"0 1 1 1 1 1 \n",
"0 1 1 1 1 1 \n",
"\n",
" products_groups purchases representation_category_capacities \\\n",
"0 1 1 1 \n",
"0 1 1 1 \n",
"0 1 1 1 \n",
"0 1 1 1 \n",
"0 1 1 1 \n",
"0 1 1 1 \n",
"0 1 1 1 \n",
"0 1 1 1 \n",
"0 1 1 1 \n",
"0 1 1 1 \n",
"0 1 1 1 \n",
"0 1 1 1 \n",
"0 1 1 1 \n",
"0 1 1 1 \n",
"\n",
" representations seasons structure_tag_mappings suppliers tags \\\n",
"0 1 1 1.0 1 1.0 \n",
"0 1 1 1.0 1 1.0 \n",
"0 1 1 1.0 1 1.0 \n",
"0 1 1 1.0 1 1.0 \n",
"0 1 1 NaN 1 NaN \n",
"0 1 1 1.0 1 1.0 \n",
"0 1 1 1.0 1 1.0 \n",
"0 1 1 NaN 1 NaN \n",
"0 1 1 NaN 1 NaN \n",
"0 1 1 NaN 1 1.0 \n",
"0 1 1 1.0 1 1.0 \n",
"0 1 1 NaN 1 NaN \n",
"0 1 1 1.0 1 1.0 \n",
"0 1 1 NaN 1 NaN \n",
"\n",
" target_types targets tickets type_of_categories \\\n",
"0 1 1 1 1.0 \n",
"0 1 1 1 NaN \n",
"0 1 1 1 NaN \n",
"0 1 1 1 NaN \n",
"0 1 1 1 NaN \n",
"0 1 1 1 NaN \n",
"0 1 1 1 1.0 \n",
"0 1 1 1 1.0 \n",
"0 1 1 1 NaN \n",
"0 1 1 1 NaN \n",
"0 1 1 1 NaN \n",
"0 1 1 1 NaN \n",
"0 1 1 1 NaN \n",
"0 1 1 1 1.0 \n",
"\n",
" type_of_pricing_formulas type_ofs contribution_sites contributions \\\n",
"0 1.0 1.0 NaN NaN \n",
"0 NaN NaN 1.0 1.0 \n",
"0 NaN NaN 1.0 1.0 \n",
"0 1.0 1.0 1.0 1.0 \n",
"0 NaN NaN NaN NaN \n",
"0 1.0 1.0 NaN NaN \n",
"0 1.0 1.0 NaN NaN \n",
"0 1.0 1.0 NaN NaN \n",
"0 NaN NaN NaN NaN \n",
"0 1.0 1.0 NaN NaN \n",
"0 NaN NaN NaN NaN \n",
"0 NaN 1.0 NaN NaN \n",
"0 NaN NaN NaN NaN \n",
"0 1.0 1.0 NaN NaN \n",
"\n",
" consumptions representation_types \n",
"0 NaN NaN \n",
"0 NaN NaN \n",
"0 1.0 NaN \n",
"0 NaN NaN \n",
"0 1.0 NaN \n",
"0 1.0 NaN \n",
"0 1.0 1.0 \n",
"0 NaN NaN \n",
"0 NaN NaN \n",
"0 NaN 1.0 \n",
"0 NaN NaN \n",
"0 1.0 NaN \n",
"0 NaN 1.0 \n",
"0 NaN 1.0 "
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.set_option(\"display.max_columns\", None)\n",
"companies_databases\n"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "8fecc3bb-4c03-4144-97c5-615224d9729e",
"metadata": {},
"outputs": [],
"source": [
"pd.reset_option(\"display.max_columns\")"
]
},
{
"cell_type": "markdown",
"id": "0294ce71-840e-458b-8ffa-cadabbc6da21",
"metadata": {},
"source": [
"# Debut Travail 25/02"
]
},
{
"cell_type": "markdown",
"id": "ca2c8b6a-4965-422e-ba7c-66423a464fc1",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## Base communes au types Musée"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dbce1124-9a22-4502-a47a-fc3d0e2db70b",
"metadata": {},
"outputs": [],
"source": [
"companies['musee']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5080f66e-f779-410a-876d-b4fe2795e17e",
"metadata": {},
"outputs": [],
"source": [
"for i in companies['musique']:\n",
" BUCKET = \"bdc2324-data/\"+i\n",
" liste_base = []\n",
" for base in fs.ls(BUCKET):\n",
" match = re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', base)\n",
" if match:\n",
" nom_base = match.group(3)\n",
" liste_base.append(nom_base)\n",
" globals()['base_'+i] = liste_base\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "abd477e1-7479-4c88-a5aa-f987af3f5b79",
"metadata": {},
"outputs": [],
"source": [
"# Trouver l'intersection entre les cinq listes\n",
"intersection = set(base_1).intersection(base_2, base_3, base_4, base_101)\n",
"\n",
"# Convertir le résultat en liste si nécessaire\n",
"intersection_liste = list(intersection)\n",
"\n",
"print(intersection_liste)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8d93888f-a511-4ee5-8bc3-d5173a7f119e",
"metadata": {},
"outputs": [],
"source": [
"# Trouver l'intersection entre les cinq listes\n",
"intersection = set(base_10).intersection(base_12, base_13, base_14, base_11)\n",
"\n",
"# Convertir le résultat en liste si nécessaire\n",
"intersection_liste = list(intersection)\n",
"\n",
"print(intersection_liste)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "10e89669-42bb-4652-a4bc-1a3d1caf4d1a",
"metadata": {},
"outputs": [],
"source": [
"len(intersection_liste)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7d058b21-a538-4f59-aefb-ef7966f73fdc",
"metadata": {},
"outputs": [],
"source": [
"df1_tags = load_dataset_2(\"1\", \"tags\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa441f99-733c-4675-8676-bed4682d3324",
"metadata": {},
"outputs": [],
"source": [
"df1_structure_tag_mappings = load_dataset_2(\"1\", 'structure_tag_mappings')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6767a750-14a4-4c05-903e-d2f07170825b",
"metadata": {},
"outputs": [],
"source": [
"df1_customersplus = load_dataset_2(\"1\", \"customersplus\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "125e9145-a815-46fd-bdf4-07589508b259",
"metadata": {},
"outputs": [],
"source": [
"df1_customersplus.groupby('structure_id')['id'].count().reset_index().sort_values('id', ascending=False).head(20)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c17a6976-792f-474d-bcff-c89396eddb3f",
"metadata": {},
"outputs": [],
"source": [
"df1_customersplus['structure_id'].isna().sum() / len(df1_customersplus['structure_id'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ecfc155a-cb42-46ec-8da5-33fdcd087355",
"metadata": {},
"outputs": [],
"source": [
"len(df1_structure_tag_mappings)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "071410b8-950d-4fcc-b2b9-57415253c286",
"metadata": {},
"outputs": [],
"source": [
"df1_structure_tag_mappings.groupby('tag_id')['structure_id'].count().reset_index().sort_values('structure_id', ascending=False).head(20)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f48d27a9-14e4-4bb9-a60a-73e9438b58fc",
"metadata": {},
"outputs": [],
"source": [
"?np.sort_values()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "14eaa0ea-02cc-430b-ab9b-38e6637810c3",
"metadata": {},
"outputs": [],
"source": [
"def info_colonnes_dataframe(df):\n",
" # Créer une liste pour stocker les informations sur chaque colonne\n",
" infos_colonnes = []\n",
"\n",
" # Parcourir les colonnes du DataFrame\n",
" for nom_colonne, serie in df.items(): # Utiliser items() au lieu de iteritems()\n",
" # Calculer le taux de valeurs manquantes\n",
" taux_na = serie.isna().mean() * 100\n",
"\n",
" # Ajouter les informations à la liste\n",
" infos_colonnes.append({\n",
" 'Nom_colonne': nom_colonne,\n",
" 'Type_colonne': str(serie.dtype),\n",
" 'Taux_NA': taux_na\n",
" })\n",
"\n",
" # Créer une nouvelle DataFrame à partir de la liste d'informations\n",
" df_infos_colonnes = pd.DataFrame(infos_colonnes)\n",
"\n",
" return df_infos_colonnes"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6b031c32-d4c8-42a5-9a71-a7810f9bf8d8",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"info_colonnes_dataframe(df1_tags)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e1a87f27-c4d4-4832-ac20-0c3c54aa4980",
"metadata": {},
"outputs": [],
"source": [
"info_colonnes_dataframe(df1_structure_tag_mappings)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fa5c65a8-2f74-4f3f-85fc-9ac91e0bb361",
"metadata": {},
"outputs": [],
"source": [
"pd.set_option('display.max_colwidth', None)\n",
"\n",
"print(df1_tags['name'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a59bf932-5b54-4600-81f5-c55ac93ae510",
"metadata": {},
"outputs": [],
"source": [
"pd.set_option('display.max_rows', None)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a4ab298e-2cae-4865-9f00-4caff5f75ea1",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"print(df1_tags['name'])"
]
},
{
"cell_type": "markdown",
"id": "76bffba1-5f7e-4308-9224-437ca66148f8",
"metadata": {},
"source": [
"## KPI sur target_type"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f6daf22e-6583-4431-a467-660a1dd4e5a4",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "d91d5895",
"metadata": {},
"outputs": [],
"source": [
"pd.set_option('display.max_colwidth', None)\n"
]
},
{
"cell_type": "markdown",
"id": "c58b17d3",
"metadata": {},
"source": [
"Raisonnement : on prends les target_type qui représente 90% des clients et on fait des catégories dessus."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d74426b3",
"metadata": {},
"outputs": [],
"source": [
"targets = load_dataset_2(\"3\", \"targets\")\n",
"target_types = load_dataset_2(\"3\", \"target_types\")\n",
"\n",
"# target_all = pd.merge(targets, target_types, left_on= 'target_type_id', right_on= 'id' ,how = 'inner')\n"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "6930bff5",
"metadata": {},
"outputs": [],
"source": [
"def print_main_target(tenant_id, nb_print = 40):\n",
" df_target = display_databases(tenant_id, \"target_information\")\n",
"\n",
" print('Nombre de ciblage : ', len(df_target))\n",
" nb_customers = df_target['customer_id'].nunique()\n",
" print('Nombre de client avec étiquette target : ', nb_customers) \n",
"\n",
" nb_custumers_per_target = df_target.groupby(\"target_name\")['customer_id'].count().reset_index().sort_values('customer_id', ascending=False)\n",
" nb_custumers_per_target['cumulative_customers'] = nb_custumers_per_target['customer_id'].cumsum()/len(df_target)\n",
" nb_custumers_per_target['customer_id'] = nb_custumers_per_target['customer_id']/nb_customers\n",
"\n",
" return nb_custumers_per_target.head(nb_print)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "1e7ee1a0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : projet-bdc2324-team1/0_Input/Company_1/target_information.csv\n",
"Nombre de ciblage : 768024\n",
"Nombre de client avec étiquette target : 151159\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>target_name</th>\n",
" <th>customer_id</th>\n",
" <th>cumulative_customers</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>161</th>\n",
" <td>consentement optin mediation specialisee</td>\n",
" <td>0.992333</td>\n",
" <td>0.195306</td>\n",
" </tr>\n",
" <tr>\n",
" <th>160</th>\n",
" <td>consentement optin jeune public</td>\n",
" <td>0.992194</td>\n",
" <td>0.390585</td>\n",
" </tr>\n",
" <tr>\n",
" <th>158</th>\n",
" <td>consentement optin b2c</td>\n",
" <td>0.720493</td>\n",
" <td>0.532390</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Arenametrix_bascule tel vers sib</td>\n",
" <td>0.232973</td>\n",
" <td>0.578242</td>\n",
" </tr>\n",
" <tr>\n",
" <th>165</th>\n",
" <td>consentement optout b2c</td>\n",
" <td>0.228389</td>\n",
" <td>0.623193</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>COM Inscrits NL générale (historique)</td>\n",
" <td>0.152191</td>\n",
" <td>0.653146</td>\n",
" </tr>\n",
" <tr>\n",
" <th>162</th>\n",
" <td>consentement optin newsletter generale</td>\n",
" <td>0.146171</td>\n",
" <td>0.681915</td>\n",
" </tr>\n",
" <tr>\n",
" <th>169</th>\n",
" <td>consentement optout newsletter generale</td>\n",
" <td>0.124736</td>\n",
" <td>0.706465</td>\n",
" </tr>\n",
" <tr>\n",
" <th>170</th>\n",
" <td>consentement optout scolaires</td>\n",
" <td>0.104155</td>\n",
" <td>0.726964</td>\n",
" </tr>\n",
" <tr>\n",
" <th>166</th>\n",
" <td>consentement optout dre</td>\n",
" <td>0.094788</td>\n",
" <td>0.745620</td>\n",
" </tr>\n",
" <tr>\n",
" <th>164</th>\n",
" <td>consentement optout b2b</td>\n",
" <td>0.094067</td>\n",
" <td>0.764134</td>\n",
" </tr>\n",
" <tr>\n",
" <th>126</th>\n",
" <td>Inscrits NL générale (export_291019 + operation_videomaton)</td>\n",
" <td>0.093187</td>\n",
" <td>0.782474</td>\n",
" </tr>\n",
" <tr>\n",
" <th>157</th>\n",
" <td>consentement optin b2b</td>\n",
" <td>0.084249</td>\n",
" <td>0.799056</td>\n",
" </tr>\n",
" <tr>\n",
" <th>216</th>\n",
" <td>ddcp_visiteurs dps 010622</td>\n",
" <td>0.081735</td>\n",
" <td>0.815142</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>Contacts_prenomsdoubles</td>\n",
" <td>0.077025</td>\n",
" <td>0.830302</td>\n",
" </tr>\n",
" <tr>\n",
" <th>115</th>\n",
" <td>FORMATION _ acheteurs optin last year</td>\n",
" <td>0.069364</td>\n",
" <td>0.843954</td>\n",
" </tr>\n",
" <tr>\n",
" <th>214</th>\n",
" <td>ddcp_promo_visiteurs occasionnels_musee_8mois</td>\n",
" <td>0.043927</td>\n",
" <td>0.852600</td>\n",
" </tr>\n",
" <tr>\n",
" <th>189</th>\n",
" <td>ddcp_promo_md_musée_dps 011019</td>\n",
" <td>0.039759</td>\n",
" <td>0.860425</td>\n",
" </tr>\n",
" <tr>\n",
" <th>188</th>\n",
" <td>ddcp_promo_MD_billet_musée_oct_2019_agarder2</td>\n",
" <td>0.036266</td>\n",
" <td>0.867563</td>\n",
" </tr>\n",
" <tr>\n",
" <th>163</th>\n",
" <td>consentement optin scolaires</td>\n",
" <td>0.032079</td>\n",
" <td>0.873876</td>\n",
" </tr>\n",
" <tr>\n",
" <th>159</th>\n",
" <td>consentement optin dre</td>\n",
" <td>0.029949</td>\n",
" <td>0.879771</td>\n",
" </tr>\n",
" <tr>\n",
" <th>34</th>\n",
" <td>DDCP Newsletter enseignants</td>\n",
" <td>0.029836</td>\n",
" <td>0.885643</td>\n",
" </tr>\n",
" <tr>\n",
" <th>36</th>\n",
" <td>DDCP Newsletter jeune public</td>\n",
" <td>0.025549</td>\n",
" <td>0.890671</td>\n",
" </tr>\n",
" <tr>\n",
" <th>127</th>\n",
" <td>Inscrits NL générale site web</td>\n",
" <td>0.024689</td>\n",
" <td>0.895531</td>\n",
" </tr>\n",
" <tr>\n",
" <th>145</th>\n",
" <td>Votre première liste</td>\n",
" <td>0.024577</td>\n",
" <td>0.900368</td>\n",
" </tr>\n",
" <tr>\n",
" <th>61</th>\n",
" <td>DDCP billets famille</td>\n",
" <td>0.023876</td>\n",
" <td>0.905067</td>\n",
" </tr>\n",
" <tr>\n",
" <th>106</th>\n",
" <td>DRE MucemLab</td>\n",
" <td>0.015229</td>\n",
" <td>0.908064</td>\n",
" </tr>\n",
" <tr>\n",
" <th>39</th>\n",
" <td>DDCP Newsletter relais champ social</td>\n",
" <td>0.015017</td>\n",
" <td>0.911020</td>\n",
" </tr>\n",
" <tr>\n",
" <th>110</th>\n",
" <td>DRE institutionnels</td>\n",
" <td>0.014746</td>\n",
" <td>0.913922</td>\n",
" </tr>\n",
" <tr>\n",
" <th>48</th>\n",
" <td>DDCP PROMO Participants ateliers (adultes et enfants)</td>\n",
" <td>0.012927</td>\n",
" <td>0.916466</td>\n",
" </tr>\n",
" <tr>\n",
" <th>74</th>\n",
" <td>DDCP promo Plan B 2019 (concerts)</td>\n",
" <td>0.012887</td>\n",
" <td>0.919003</td>\n",
" </tr>\n",
" <tr>\n",
" <th>72</th>\n",
" <td>DDCP promo MD pass musées dps oct 2018</td>\n",
" <td>0.011809</td>\n",
" <td>0.921327</td>\n",
" </tr>\n",
" <tr>\n",
" <th>94</th>\n",
" <td>DDCP rentrée culturelle 2023</td>\n",
" <td>0.011624</td>\n",
" <td>0.923614</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>DDCP MD Procès du Siècle</td>\n",
" <td>0.011141</td>\n",
" <td>0.925807</td>\n",
" </tr>\n",
" <tr>\n",
" <th>186</th>\n",
" <td>ddcp_md_scene_ouverte_au_talent</td>\n",
" <td>0.010433</td>\n",
" <td>0.927860</td>\n",
" </tr>\n",
" <tr>\n",
" <th>108</th>\n",
" <td>DRE chercheurs</td>\n",
" <td>0.010300</td>\n",
" <td>0.929888</td>\n",
" </tr>\n",
" <tr>\n",
" <th>220</th>\n",
" <td>festival_jean_rouch</td>\n",
" <td>0.009937</td>\n",
" <td>0.931843</td>\n",
" </tr>\n",
" <tr>\n",
" <th>105</th>\n",
" <td>DRE Festival Jean Rouch</td>\n",
" <td>0.009937</td>\n",
" <td>0.933799</td>\n",
" </tr>\n",
" <tr>\n",
" <th>275</th>\n",
" <td>structures_etiquette champ social</td>\n",
" <td>0.009844</td>\n",
" <td>0.935736</td>\n",
" </tr>\n",
" <tr>\n",
" <th>86</th>\n",
" <td>DDCP promo spectateurs prog 21-22 (spectacles, ciné, ateliers)</td>\n",
" <td>0.008554</td>\n",
" <td>0.937420</td>\n",
" </tr>\n",
" <tr>\n",
" <th>128</th>\n",
" <td>Inscrits NL jeune public site web</td>\n",
" <td>0.008263</td>\n",
" <td>0.939046</td>\n",
" </tr>\n",
" <tr>\n",
" <th>260</th>\n",
" <td>rappel po barvalo</td>\n",
" <td>0.008256</td>\n",
" <td>0.940671</td>\n",
" </tr>\n",
" <tr>\n",
" <th>104</th>\n",
" <td>DDCP_marseille_jazz_2023</td>\n",
" <td>0.006900</td>\n",
" <td>0.942029</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32</th>\n",
" <td>DDCP Newsletter centres de loisirs</td>\n",
" <td>0.006827</td>\n",
" <td>0.943373</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>Autres_interet_exposition</td>\n",
" <td>0.006754</td>\n",
" <td>0.944702</td>\n",
" </tr>\n",
" <tr>\n",
" <th>228</th>\n",
" <td>import_arenametrix_contactstousecardouv_expo</td>\n",
" <td>0.006212</td>\n",
" <td>0.945925</td>\n",
" </tr>\n",
" <tr>\n",
" <th>117</th>\n",
" <td>Formation clients fidèles</td>\n",
" <td>0.006047</td>\n",
" <td>0.947115</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>DDCP Cine 2023</td>\n",
" <td>0.005656</td>\n",
" <td>0.948228</td>\n",
" </tr>\n",
" <tr>\n",
" <th>40</th>\n",
" <td>DDCP OLBJ! 2023</td>\n",
" <td>0.005464</td>\n",
" <td>0.949304</td>\n",
" </tr>\n",
" <tr>\n",
" <th>240</th>\n",
" <td>journee-de-l-inclusion_20230601_21h25</td>\n",
" <td>0.005326</td>\n",
" <td>0.950352</td>\n",
" </tr>\n",
" <tr>\n",
" <th>137</th>\n",
" <td>Questionnaire 2 satisfaction scolaire</td>\n",
" <td>0.005259</td>\n",
" <td>0.951387</td>\n",
" </tr>\n",
" <tr>\n",
" <th>93</th>\n",
" <td>DDCP rendez-vous de septembre offre spéciale</td>\n",
" <td>0.005253</td>\n",
" <td>0.952421</td>\n",
" </tr>\n",
" <tr>\n",
" <th>135</th>\n",
" <td>Plan B 2018 (électro)</td>\n",
" <td>0.005081</td>\n",
" <td>0.953421</td>\n",
" </tr>\n",
" <tr>\n",
" <th>270</th>\n",
" <td>save_the_date_populaire</td>\n",
" <td>0.004948</td>\n",
" <td>0.954395</td>\n",
" </tr>\n",
" <tr>\n",
" <th>132</th>\n",
" <td>Newsletter CCR (passerelle)</td>\n",
" <td>0.004783</td>\n",
" <td>0.955336</td>\n",
" </tr>\n",
" <tr>\n",
" <th>116</th>\n",
" <td>Fichier institutionnel (ne pas utiliser sans autorisation)</td>\n",
" <td>0.004538</td>\n",
" <td>0.956229</td>\n",
" </tr>\n",
" <tr>\n",
" <th>222</th>\n",
" <td>fichier institutionnel_ne_pas_toucher</td>\n",
" <td>0.004532</td>\n",
" <td>0.957121</td>\n",
" </tr>\n",
" <tr>\n",
" <th>266</th>\n",
" <td>reservations_payees_pass_culture_190422_au_310123</td>\n",
" <td>0.004492</td>\n",
" <td>0.958005</td>\n",
" </tr>\n",
" <tr>\n",
" <th>102</th>\n",
" <td>DDCP spectateurs Marseille Jazz 18-19-21</td>\n",
" <td>0.004432</td>\n",
" <td>0.958878</td>\n",
" </tr>\n",
" <tr>\n",
" <th>147</th>\n",
" <td>acid arab</td>\n",
" <td>0.004413</td>\n",
" <td>0.959746</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" target_name \\\n",
"161 consentement optin mediation specialisee \n",
"160 consentement optin jeune public \n",
"158 consentement optin b2c \n",
"5 Arenametrix_bascule tel vers sib \n",
"165 consentement optout b2c \n",
"19 COM Inscrits NL générale (historique) \n",
"162 consentement optin newsletter generale \n",
"169 consentement optout newsletter generale \n",
"170 consentement optout scolaires \n",
"166 consentement optout dre \n",
"164 consentement optout b2b \n",
"126 Inscrits NL générale (export_291019 + operation_videomaton) \n",
"157 consentement optin b2b \n",
"216 ddcp_visiteurs dps 010622 \n",
"20 Contacts_prenomsdoubles \n",
"115 FORMATION _ acheteurs optin last year \n",
"214 ddcp_promo_visiteurs occasionnels_musee_8mois \n",
"189 ddcp_promo_md_musée_dps 011019 \n",
"188 ddcp_promo_MD_billet_musée_oct_2019_agarder2 \n",
"163 consentement optin scolaires \n",
"159 consentement optin dre \n",
"34 DDCP Newsletter enseignants \n",
"36 DDCP Newsletter jeune public \n",
"127 Inscrits NL générale site web \n",
"145 Votre première liste \n",
"61 DDCP billets famille \n",
"106 DRE MucemLab \n",
"39 DDCP Newsletter relais champ social \n",
"110 DRE institutionnels \n",
"48 DDCP PROMO Participants ateliers (adultes et enfants) \n",
"74 DDCP promo Plan B 2019 (concerts) \n",
"72 DDCP promo MD pass musées dps oct 2018 \n",
"94 DDCP rentrée culturelle 2023 \n",
"23 DDCP MD Procès du Siècle \n",
"186 ddcp_md_scene_ouverte_au_talent \n",
"108 DRE chercheurs \n",
"220 festival_jean_rouch \n",
"105 DRE Festival Jean Rouch \n",
"275 structures_etiquette champ social \n",
"86 DDCP promo spectateurs prog 21-22 (spectacles, ciné, ateliers) \n",
"128 Inscrits NL jeune public site web \n",
"260 rappel po barvalo \n",
"104 DDCP_marseille_jazz_2023 \n",
"32 DDCP Newsletter centres de loisirs \n",
"13 Autres_interet_exposition \n",
"228 import_arenametrix_contactstousecardouv_expo \n",
"117 Formation clients fidèles \n",
"22 DDCP Cine 2023 \n",
"40 DDCP OLBJ! 2023 \n",
"240 journee-de-l-inclusion_20230601_21h25 \n",
"137 Questionnaire 2 satisfaction scolaire \n",
"93 DDCP rendez-vous de septembre offre spéciale \n",
"135 Plan B 2018 (électro) \n",
"270 save_the_date_populaire \n",
"132 Newsletter CCR (passerelle) \n",
"116 Fichier institutionnel (ne pas utiliser sans autorisation) \n",
"222 fichier institutionnel_ne_pas_toucher \n",
"266 reservations_payees_pass_culture_190422_au_310123 \n",
"102 DDCP spectateurs Marseille Jazz 18-19-21 \n",
"147 acid arab \n",
"\n",
" customer_id cumulative_customers \n",
"161 0.992333 0.195306 \n",
"160 0.992194 0.390585 \n",
"158 0.720493 0.532390 \n",
"5 0.232973 0.578242 \n",
"165 0.228389 0.623193 \n",
"19 0.152191 0.653146 \n",
"162 0.146171 0.681915 \n",
"169 0.124736 0.706465 \n",
"170 0.104155 0.726964 \n",
"166 0.094788 0.745620 \n",
"164 0.094067 0.764134 \n",
"126 0.093187 0.782474 \n",
"157 0.084249 0.799056 \n",
"216 0.081735 0.815142 \n",
"20 0.077025 0.830302 \n",
"115 0.069364 0.843954 \n",
"214 0.043927 0.852600 \n",
"189 0.039759 0.860425 \n",
"188 0.036266 0.867563 \n",
"163 0.032079 0.873876 \n",
"159 0.029949 0.879771 \n",
"34 0.029836 0.885643 \n",
"36 0.025549 0.890671 \n",
"127 0.024689 0.895531 \n",
"145 0.024577 0.900368 \n",
"61 0.023876 0.905067 \n",
"106 0.015229 0.908064 \n",
"39 0.015017 0.911020 \n",
"110 0.014746 0.913922 \n",
"48 0.012927 0.916466 \n",
"74 0.012887 0.919003 \n",
"72 0.011809 0.921327 \n",
"94 0.011624 0.923614 \n",
"23 0.011141 0.925807 \n",
"186 0.010433 0.927860 \n",
"108 0.010300 0.929888 \n",
"220 0.009937 0.931843 \n",
"105 0.009937 0.933799 \n",
"275 0.009844 0.935736 \n",
"86 0.008554 0.937420 \n",
"128 0.008263 0.939046 \n",
"260 0.008256 0.940671 \n",
"104 0.006900 0.942029 \n",
"32 0.006827 0.943373 \n",
"13 0.006754 0.944702 \n",
"228 0.006212 0.945925 \n",
"117 0.006047 0.947115 \n",
"22 0.005656 0.948228 \n",
"40 0.005464 0.949304 \n",
"240 0.005326 0.950352 \n",
"137 0.005259 0.951387 \n",
"93 0.005253 0.952421 \n",
"135 0.005081 0.953421 \n",
"270 0.004948 0.954395 \n",
"132 0.004783 0.955336 \n",
"116 0.004538 0.956229 \n",
"222 0.004532 0.957121 \n",
"266 0.004492 0.958005 \n",
"102 0.004432 0.958878 \n",
"147 0.004413 0.959746 "
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.set_option(\"max_colwidth\", None)\n",
"print_main_target('1', 60)"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "b57a28ac",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : projet-bdc2324-team1/0_Input/Company_2/target_information.csv\n",
"Nombre de ciblage : 260283\n",
"Nombre de client avec étiquette target : 233320\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>target_name</th>\n",
" <th>customer_id</th>\n",
" <th>cumulative_customers</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>Schokoladentour &amp; Führungen Individuals</td>\n",
" <td>0.927906</td>\n",
" <td>0.831783</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Chocolateria Kurse 2023</td>\n",
" <td>0.073963</td>\n",
" <td>0.898084</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>mailxpert_contacts_2023-07-18_12-04-00 langue</td>\n",
" <td>0.025519</td>\n",
" <td>0.920959</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>mailxpert_contacts_2023-07-18_12-04-00</td>\n",
" <td>0.025519</td>\n",
" <td>0.943834</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Newsletter opt-in Allgemein</td>\n",
" <td>0.022836</td>\n",
" <td>0.964304</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>Schokoladentour &amp; Führungen Gruppen / Schulen</td>\n",
" <td>0.011555</td>\n",
" <td>0.974662</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Newsletter DE</td>\n",
" <td>0.010749</td>\n",
" <td>0.984298</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Newsletter FR</td>\n",
" <td>0.008520</td>\n",
" <td>0.991936</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Newsletter EN</td>\n",
" <td>0.004286</td>\n",
" <td>0.995778</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Frauen in Zürich - Schulung</td>\n",
" <td>0.001003</td>\n",
" <td>0.996677</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>mailxpert_contacts_2023-07-18_13-25-45_inaktiv</td>\n",
" <td>0.000471</td>\n",
" <td>0.997099</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>Opt-in-Website DE</td>\n",
" <td>0.000030</td>\n",
" <td>0.997126</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Opt-in Website EN</td>\n",
" <td>0.000009</td>\n",
" <td>0.997134</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>Opt-in Website FR</td>\n",
" <td>0.000004</td>\n",
" <td>0.997138</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>Votre première liste</td>\n",
" <td>0.000004</td>\n",
" <td>0.997142</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Activated contact EN</td>\n",
" <td>0.000004</td>\n",
" <td>0.997145</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Activated contact FR</td>\n",
" <td>0.000004</td>\n",
" <td>0.997149</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Activated contact DE</td>\n",
" <td>0.000004</td>\n",
" <td>0.997153</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" target_name customer_id \\\n",
"13 Schokoladentour & Führungen Individuals 0.927906 \n",
"3 Chocolateria Kurse 2023 0.073963 \n",
"16 mailxpert_contacts_2023-07-18_12-04-00 langue 0.025519 \n",
"15 mailxpert_contacts_2023-07-18_12-04-00 0.025519 \n",
"8 Newsletter opt-in Allgemein 0.022836 \n",
"12 Schokoladentour & Führungen Gruppen / Schulen 0.011555 \n",
"5 Newsletter DE 0.010749 \n",
"7 Newsletter FR 0.008520 \n",
"6 Newsletter EN 0.004286 \n",
"4 Frauen in Zürich - Schulung 0.001003 \n",
"17 mailxpert_contacts_2023-07-18_13-25-45_inaktiv 0.000471 \n",
"11 Opt-in-Website DE 0.000030 \n",
"9 Opt-in Website EN 0.000009 \n",
"10 Opt-in Website FR 0.000004 \n",
"14 Votre première liste 0.000004 \n",
"1 Activated contact EN 0.000004 \n",
"2 Activated contact FR 0.000004 \n",
"0 Activated contact DE 0.000004 \n",
"\n",
" cumulative_customers \n",
"13 0.831783 \n",
"3 0.898084 \n",
"16 0.920959 \n",
"15 0.943834 \n",
"8 0.964304 \n",
"12 0.974662 \n",
"5 0.984298 \n",
"7 0.991936 \n",
"6 0.995778 \n",
"4 0.996677 \n",
"17 0.997099 \n",
"11 0.997126 \n",
"9 0.997134 \n",
"10 0.997138 \n",
"14 0.997142 \n",
"1 0.997145 \n",
"2 0.997149 \n",
"0 0.997153 "
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print_main_target('2', 25)"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "9a65991f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : projet-bdc2324-team1/0_Input/Company_3/target_information.csv\n",
"Nombre de ciblage : 1617362\n",
"Nombre de client avec étiquette target : 257018\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>target_name</th>\n",
" <th>customer_id</th>\n",
" <th>cumulative_customers</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>67</th>\n",
" <td>MKG_NLmensuelle_2021_OK</td>\n",
" <td>0.972348</td>\n",
" <td>0.154518</td>\n",
" </tr>\n",
" <tr>\n",
" <th>66</th>\n",
" <td>MKG_NLmensuelle_2021</td>\n",
" <td>0.956007</td>\n",
" <td>0.306439</td>\n",
" </tr>\n",
" <tr>\n",
" <th>119</th>\n",
" <td>consent_optin_nl</td>\n",
" <td>0.636648</td>\n",
" <td>0.407609</td>\n",
" </tr>\n",
" <tr>\n",
" <th>115</th>\n",
" <td>consent_optin_general</td>\n",
" <td>0.602506</td>\n",
" <td>0.503355</td>\n",
" </tr>\n",
" <tr>\n",
" <th>78</th>\n",
" <td>Mkg_NL_mensuelle3</td>\n",
" <td>0.404162</td>\n",
" <td>0.567581</td>\n",
" </tr>\n",
" <tr>\n",
" <th>125</th>\n",
" <td>consent_optout_general</td>\n",
" <td>0.368126</td>\n",
" <td>0.626081</td>\n",
" </tr>\n",
" <tr>\n",
" <th>104</th>\n",
" <td>TEST LOCBASE</td>\n",
" <td>0.350532</td>\n",
" <td>0.681784</td>\n",
" </tr>\n",
" <tr>\n",
" <th>68</th>\n",
" <td>MKG_Non_inscrit_liste_08-22</td>\n",
" <td>0.310605</td>\n",
" <td>0.731143</td>\n",
" </tr>\n",
" <tr>\n",
" <th>116</th>\n",
" <td>consent_optin_general_HISTORIQUE</td>\n",
" <td>0.301345</td>\n",
" <td>0.779030</td>\n",
" </tr>\n",
" <tr>\n",
" <th>80</th>\n",
" <td>Mkg_Zone_C</td>\n",
" <td>0.135298</td>\n",
" <td>0.800531</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Acheteurs_100km_visite_depuismax5ans</td>\n",
" <td>0.091149</td>\n",
" <td>0.815015</td>\n",
" </tr>\n",
" <tr>\n",
" <th>54</th>\n",
" <td>Inscription NL ancien site web</td>\n",
" <td>0.083477</td>\n",
" <td>0.828281</td>\n",
" </tr>\n",
" <tr>\n",
" <th>112</th>\n",
" <td>consent_optin_equestre</td>\n",
" <td>0.083216</td>\n",
" <td>0.841505</td>\n",
" </tr>\n",
" <tr>\n",
" <th>79</th>\n",
" <td>Mkg_Zone_B</td>\n",
" <td>0.079889</td>\n",
" <td>0.854200</td>\n",
" </tr>\n",
" <tr>\n",
" <th>65</th>\n",
" <td>MKG_2022_ZoneB&amp;ZoneC_Famille</td>\n",
" <td>0.072536</td>\n",
" <td>0.865727</td>\n",
" </tr>\n",
" <tr>\n",
" <th>111</th>\n",
" <td>consent_optin_b2b</td>\n",
" <td>0.064388</td>\n",
" <td>0.875959</td>\n",
" </tr>\n",
" <tr>\n",
" <th>102</th>\n",
" <td>Soft_Bounce_yahoo</td>\n",
" <td>0.064182</td>\n",
" <td>0.886158</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100</th>\n",
" <td>Scénario Anniversaire</td>\n",
" <td>0.051249</td>\n",
" <td>0.894303</td>\n",
" </tr>\n",
" <tr>\n",
" <th>37</th>\n",
" <td>B2B_scolaire_et_centres_de_loisirs_2023</td>\n",
" <td>0.046732</td>\n",
" <td>0.901729</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>B2B_Sans étiquette</td>\n",
" <td>0.040472</td>\n",
" <td>0.908160</td>\n",
" </tr>\n",
" <tr>\n",
" <th>122</th>\n",
" <td>consent_optout_equestre</td>\n",
" <td>0.038865</td>\n",
" <td>0.914336</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>B2B_Inscrits newsletter Scolaires</td>\n",
" <td>0.038075</td>\n",
" <td>0.920387</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>B2B_historique_newsletter_SCOLAIRES</td>\n",
" <td>0.038040</td>\n",
" <td>0.926432</td>\n",
" </tr>\n",
" <tr>\n",
" <th>118</th>\n",
" <td>consent_optin_jdp</td>\n",
" <td>0.036110</td>\n",
" <td>0.932170</td>\n",
" </tr>\n",
" <tr>\n",
" <th>76</th>\n",
" <td>MKG_aire cantilienne</td>\n",
" <td>0.031908</td>\n",
" <td>0.937241</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>B2B_Sans étiquette FR+BE</td>\n",
" <td>0.029165</td>\n",
" <td>0.941876</td>\n",
" </tr>\n",
" <tr>\n",
" <th>108</th>\n",
" <td>b2b - écoles mai 2021</td>\n",
" <td>0.028574</td>\n",
" <td>0.946416</td>\n",
" </tr>\n",
" <tr>\n",
" <th>86</th>\n",
" <td>Ouvreur_NL_juin_2021</td>\n",
" <td>0.018193</td>\n",
" <td>0.949308</td>\n",
" </tr>\n",
" <tr>\n",
" <th>126</th>\n",
" <td>consent_optout_jdp</td>\n",
" <td>0.016816</td>\n",
" <td>0.951980</td>\n",
" </tr>\n",
" <tr>\n",
" <th>127</th>\n",
" <td>consent_optout_nl</td>\n",
" <td>0.016633</td>\n",
" <td>0.954623</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>B2B_CE_2023</td>\n",
" <td>0.016489</td>\n",
" <td>0.957243</td>\n",
" </tr>\n",
" <tr>\n",
" <th>106</th>\n",
" <td>Visiteurs Aout-Sept sans questionnaire</td>\n",
" <td>0.016275</td>\n",
" <td>0.959830</td>\n",
" </tr>\n",
" <tr>\n",
" <th>89</th>\n",
" <td>Pass Annuel en cours de validité</td>\n",
" <td>0.011540</td>\n",
" <td>0.961663</td>\n",
" </tr>\n",
" <tr>\n",
" <th>114</th>\n",
" <td>consent_optin_expositions</td>\n",
" <td>0.011388</td>\n",
" <td>0.963473</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>B2B_Sans étiquette hors FR+BE</td>\n",
" <td>0.011307</td>\n",
" <td>0.965270</td>\n",
" </tr>\n",
" <tr>\n",
" <th>113</th>\n",
" <td>consent_optin_evenements</td>\n",
" <td>0.011240</td>\n",
" <td>0.967056</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32</th>\n",
" <td>B2B_liste_à_requalifier_CE</td>\n",
" <td>0.010742</td>\n",
" <td>0.968763</td>\n",
" </tr>\n",
" <tr>\n",
" <th>110</th>\n",
" <td>consent_optin_abonnes_passannuels</td>\n",
" <td>0.009665</td>\n",
" <td>0.970299</td>\n",
" </tr>\n",
" <tr>\n",
" <th>152</th>\n",
" <td>liste mécènes donateurs 01012023-31102023</td>\n",
" <td>0.008746</td>\n",
" <td>0.971689</td>\n",
" </tr>\n",
" <tr>\n",
" <th>34</th>\n",
" <td>B2B_liste_à_requalifier_SCOLAIRES</td>\n",
" <td>0.008688</td>\n",
" <td>0.973070</td>\n",
" </tr>\n",
" <tr>\n",
" <th>153</th>\n",
" <td>liste newsletter mécénat</td>\n",
" <td>0.008606</td>\n",
" <td>0.974437</td>\n",
" </tr>\n",
" <tr>\n",
" <th>144</th>\n",
" <td>liste des donateurs iraiser don ponctuel 10122020-20012022</td>\n",
" <td>0.007591</td>\n",
" <td>0.975644</td>\n",
" </tr>\n",
" <tr>\n",
" <th>134</th>\n",
" <td>don ponctuel iRaiser 250122</td>\n",
" <td>0.007587</td>\n",
" <td>0.976849</td>\n",
" </tr>\n",
" <tr>\n",
" <th>77</th>\n",
" <td>MKG_visiteurs_juin_ES</td>\n",
" <td>0.007498</td>\n",
" <td>0.978041</td>\n",
" </tr>\n",
" <tr>\n",
" <th>72</th>\n",
" <td>MKG_acheteurs_domaine_noel_2021</td>\n",
" <td>0.006758</td>\n",
" <td>0.979115</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Acheteurs Journées des Plantes oct. 2022</td>\n",
" <td>0.006038</td>\n",
" <td>0.980074</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96</th>\n",
" <td>Réponse au formulaire de satisfaction</td>\n",
" <td>0.005871</td>\n",
" <td>0.981007</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>B2B_Inscrits newsletter Collectivités et CSE</td>\n",
" <td>0.005786</td>\n",
" <td>0.981927</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>B2B_historique_newsletter_CE</td>\n",
" <td>0.005735</td>\n",
" <td>0.982838</td>\n",
" </tr>\n",
" <tr>\n",
" <th>70</th>\n",
" <td>MKG_acheteurs_1mois_pass_sanitaire</td>\n",
" <td>0.005731</td>\n",
" <td>0.983749</td>\n",
" </tr>\n",
" <tr>\n",
" <th>157</th>\n",
" <td>mec_expos_automne_2023</td>\n",
" <td>0.004972</td>\n",
" <td>0.984539</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Abonnés Pass Annuel - dynamique</td>\n",
" <td>0.004856</td>\n",
" <td>0.985311</td>\n",
" </tr>\n",
" <tr>\n",
" <th>146</th>\n",
" <td>liste diffusion invitation Ingres</td>\n",
" <td>0.004564</td>\n",
" <td>0.986036</td>\n",
" </tr>\n",
" <tr>\n",
" <th>161</th>\n",
" <td>rattrpostvisite_nov21</td>\n",
" <td>0.004070</td>\n",
" <td>0.986683</td>\n",
" </tr>\n",
" <tr>\n",
" <th>166</th>\n",
" <td>réunion publique forêt 2023</td>\n",
" <td>0.003362</td>\n",
" <td>0.987217</td>\n",
" </tr>\n",
" <tr>\n",
" <th>94</th>\n",
" <td>Rattrapage_postvisite_novdec21v0</td>\n",
" <td>0.003358</td>\n",
" <td>0.987750</td>\n",
" </tr>\n",
" <tr>\n",
" <th>71</th>\n",
" <td>MKG_acheteurs_domaine_et_noel_2021</td>\n",
" <td>0.003054</td>\n",
" <td>0.988236</td>\n",
" </tr>\n",
" <tr>\n",
" <th>93</th>\n",
" <td>Rattrapage_postvisite_novdec21_VF</td>\n",
" <td>0.003015</td>\n",
" <td>0.988715</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>B2B_GUIDES_2023</td>\n",
" <td>0.002650</td>\n",
" <td>0.989136</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>B2B_TOANGLOPHONE_2023</td>\n",
" <td>0.002521</td>\n",
" <td>0.989537</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>B2B_liste_à_requalifier_GUIDES</td>\n",
" <td>0.002405</td>\n",
" <td>0.989919</td>\n",
" </tr>\n",
" <tr>\n",
" <th>64</th>\n",
" <td>MKG_2021_Acheteurs_JDP_Octobre_rattrap</td>\n",
" <td>0.002268</td>\n",
" <td>0.990279</td>\n",
" </tr>\n",
" <tr>\n",
" <th>159</th>\n",
" <td>rattrapage1211_logs-071021_121121</td>\n",
" <td>0.002120</td>\n",
" <td>0.990616</td>\n",
" </tr>\n",
" <tr>\n",
" <th>141</th>\n",
" <td>jdp_invités_2_entrées_ oct2023</td>\n",
" <td>0.002120</td>\n",
" <td>0.990953</td>\n",
" </tr>\n",
" <tr>\n",
" <th>109</th>\n",
" <td>château de chantilly questionnaire</td>\n",
" <td>0.002004</td>\n",
" <td>0.991272</td>\n",
" </tr>\n",
" <tr>\n",
" <th>150</th>\n",
" <td>liste invités avant-première jdp mai 2023</td>\n",
" <td>0.002000</td>\n",
" <td>0.991589</td>\n",
" </tr>\n",
" <tr>\n",
" <th>136</th>\n",
" <td>gece</td>\n",
" <td>0.001980</td>\n",
" <td>0.991904</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>B2B_Autocariste_2023</td>\n",
" <td>0.001692</td>\n",
" <td>0.992173</td>\n",
" </tr>\n",
" <tr>\n",
" <th>158</th>\n",
" <td>mkg_2021_acheteurs_jdp_octobre_rattr_exclure_new</td>\n",
" <td>0.001607</td>\n",
" <td>0.992428</td>\n",
" </tr>\n",
" <tr>\n",
" <th>43</th>\n",
" <td>Formation_journéedesplantes75</td>\n",
" <td>0.001541</td>\n",
" <td>0.992673</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" target_name customer_id \\\n",
"67 MKG_NLmensuelle_2021_OK 0.972348 \n",
"66 MKG_NLmensuelle_2021 0.956007 \n",
"119 consent_optin_nl 0.636648 \n",
"115 consent_optin_general 0.602506 \n",
"78 Mkg_NL_mensuelle3 0.404162 \n",
"125 consent_optout_general 0.368126 \n",
"104 TEST LOCBASE 0.350532 \n",
"68 MKG_Non_inscrit_liste_08-22 0.310605 \n",
"116 consent_optin_general_HISTORIQUE 0.301345 \n",
"80 Mkg_Zone_C 0.135298 \n",
"7 Acheteurs_100km_visite_depuismax5ans 0.091149 \n",
"54 Inscription NL ancien site web 0.083477 \n",
"112 consent_optin_equestre 0.083216 \n",
"79 Mkg_Zone_B 0.079889 \n",
"65 MKG_2022_ZoneB&ZoneC_Famille 0.072536 \n",
"111 consent_optin_b2b 0.064388 \n",
"102 Soft_Bounce_yahoo 0.064182 \n",
"100 Scénario Anniversaire 0.051249 \n",
"37 B2B_scolaire_et_centres_de_loisirs_2023 0.046732 \n",
"20 B2B_Sans étiquette 0.040472 \n",
"122 consent_optout_equestre 0.038865 \n",
"17 B2B_Inscrits newsletter Scolaires 0.038075 \n",
"28 B2B_historique_newsletter_SCOLAIRES 0.038040 \n",
"118 consent_optin_jdp 0.036110 \n",
"76 MKG_aire cantilienne 0.031908 \n",
"21 B2B_Sans étiquette FR+BE 0.029165 \n",
"108 b2b - écoles mai 2021 0.028574 \n",
"86 Ouvreur_NL_juin_2021 0.018193 \n",
"126 consent_optout_jdp 0.016816 \n",
"127 consent_optout_nl 0.016633 \n",
"13 B2B_CE_2023 0.016489 \n",
"106 Visiteurs Aout-Sept sans questionnaire 0.016275 \n",
"89 Pass Annuel en cours de validité 0.011540 \n",
"114 consent_optin_expositions 0.011388 \n",
"22 B2B_Sans étiquette hors FR+BE 0.011307 \n",
"113 consent_optin_evenements 0.011240 \n",
"32 B2B_liste_à_requalifier_CE 0.010742 \n",
"110 consent_optin_abonnes_passannuels 0.009665 \n",
"152 liste mécènes donateurs 01012023-31102023 0.008746 \n",
"34 B2B_liste_à_requalifier_SCOLAIRES 0.008688 \n",
"153 liste newsletter mécénat 0.008606 \n",
"144 liste des donateurs iraiser don ponctuel 10122020-20012022 0.007591 \n",
"134 don ponctuel iRaiser 250122 0.007587 \n",
"77 MKG_visiteurs_juin_ES 0.007498 \n",
"72 MKG_acheteurs_domaine_noel_2021 0.006758 \n",
"6 Acheteurs Journées des Plantes oct. 2022 0.006038 \n",
"96 Réponse au formulaire de satisfaction 0.005871 \n",
"15 B2B_Inscrits newsletter Collectivités et CSE 0.005786 \n",
"26 B2B_historique_newsletter_CE 0.005735 \n",
"70 MKG_acheteurs_1mois_pass_sanitaire 0.005731 \n",
"157 mec_expos_automne_2023 0.004972 \n",
"2 Abonnés Pass Annuel - dynamique 0.004856 \n",
"146 liste diffusion invitation Ingres 0.004564 \n",
"161 rattrpostvisite_nov21 0.004070 \n",
"166 réunion publique forêt 2023 0.003362 \n",
"94 Rattrapage_postvisite_novdec21v0 0.003358 \n",
"71 MKG_acheteurs_domaine_et_noel_2021 0.003054 \n",
"93 Rattrapage_postvisite_novdec21_VF 0.003015 \n",
"14 B2B_GUIDES_2023 0.002650 \n",
"23 B2B_TOANGLOPHONE_2023 0.002521 \n",
"33 B2B_liste_à_requalifier_GUIDES 0.002405 \n",
"64 MKG_2021_Acheteurs_JDP_Octobre_rattrap 0.002268 \n",
"159 rattrapage1211_logs-071021_121121 0.002120 \n",
"141 jdp_invités_2_entrées_ oct2023 0.002120 \n",
"109 château de chantilly questionnaire 0.002004 \n",
"150 liste invités avant-première jdp mai 2023 0.002000 \n",
"136 gece 0.001980 \n",
"12 B2B_Autocariste_2023 0.001692 \n",
"158 mkg_2021_acheteurs_jdp_octobre_rattr_exclure_new 0.001607 \n",
"43 Formation_journéedesplantes75 0.001541 \n",
"\n",
" cumulative_customers \n",
"67 0.154518 \n",
"66 0.306439 \n",
"119 0.407609 \n",
"115 0.503355 \n",
"78 0.567581 \n",
"125 0.626081 \n",
"104 0.681784 \n",
"68 0.731143 \n",
"116 0.779030 \n",
"80 0.800531 \n",
"7 0.815015 \n",
"54 0.828281 \n",
"112 0.841505 \n",
"79 0.854200 \n",
"65 0.865727 \n",
"111 0.875959 \n",
"102 0.886158 \n",
"100 0.894303 \n",
"37 0.901729 \n",
"20 0.908160 \n",
"122 0.914336 \n",
"17 0.920387 \n",
"28 0.926432 \n",
"118 0.932170 \n",
"76 0.937241 \n",
"21 0.941876 \n",
"108 0.946416 \n",
"86 0.949308 \n",
"126 0.951980 \n",
"127 0.954623 \n",
"13 0.957243 \n",
"106 0.959830 \n",
"89 0.961663 \n",
"114 0.963473 \n",
"22 0.965270 \n",
"113 0.967056 \n",
"32 0.968763 \n",
"110 0.970299 \n",
"152 0.971689 \n",
"34 0.973070 \n",
"153 0.974437 \n",
"144 0.975644 \n",
"134 0.976849 \n",
"77 0.978041 \n",
"72 0.979115 \n",
"6 0.980074 \n",
"96 0.981007 \n",
"15 0.981927 \n",
"26 0.982838 \n",
"70 0.983749 \n",
"157 0.984539 \n",
"2 0.985311 \n",
"146 0.986036 \n",
"161 0.986683 \n",
"166 0.987217 \n",
"94 0.987750 \n",
"71 0.988236 \n",
"93 0.988715 \n",
"14 0.989136 \n",
"23 0.989537 \n",
"33 0.989919 \n",
"64 0.990279 \n",
"159 0.990616 \n",
"141 0.990953 \n",
"109 0.991272 \n",
"150 0.991589 \n",
"136 0.991904 \n",
"12 0.992173 \n",
"158 0.992428 \n",
"43 0.992673 "
]
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print_main_target('3', 70)"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "c66a4dc1",
"metadata": {},
"outputs": [],
"source": [
"pd.set_option('display.max_rows', None)\n"
]
},
{
"cell_type": "code",
"execution_count": 57,
"id": "5f34b8bf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : projet-bdc2324-team1/0_Input/Company_4/target_information.csv\n",
"Nombre de ciblage : 4627640\n",
"Nombre de client avec étiquette target : 320813\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>target_name</th>\n",
" <th>customer_id</th>\n",
" <th>cumulative_customers</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>232</th>\n",
" <td>Tous les contacts mis à jour</td>\n",
" <td>0.999991</td>\n",
" <td>0.069325</td>\n",
" </tr>\n",
" <tr>\n",
" <th>76</th>\n",
" <td>Base données</td>\n",
" <td>0.999991</td>\n",
" <td>0.138650</td>\n",
" </tr>\n",
" <tr>\n",
" <th>191</th>\n",
" <td>Office de Tourisme</td>\n",
" <td>0.999991</td>\n",
" <td>0.207974</td>\n",
" </tr>\n",
" <tr>\n",
" <th>128</th>\n",
" <td>Globale sans VIP</td>\n",
" <td>0.955488</td>\n",
" <td>0.274214</td>\n",
" </tr>\n",
" <tr>\n",
" <th>112</th>\n",
" <td>Contacts structures</td>\n",
" <td>0.929969</td>\n",
" <td>0.338684</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98</th>\n",
" <td>Cible gratuité IMA COMEDY</td>\n",
" <td>0.636246</td>\n",
" <td>0.382792</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>2 IEME ENVOI IMA COMEDY CLUB</td>\n",
" <td>0.630389</td>\n",
" <td>0.426494</td>\n",
" </tr>\n",
" <tr>\n",
" <th>64</th>\n",
" <td>Actions Marketing ARABOFOLIES</td>\n",
" <td>0.627917</td>\n",
" <td>0.470025</td>\n",
" </tr>\n",
" <tr>\n",
" <th>171</th>\n",
" <td>Liste globale sans VIP</td>\n",
" <td>0.582183</td>\n",
" <td>0.510385</td>\n",
" </tr>\n",
" <tr>\n",
" <th>126</th>\n",
" <td>Formulaire inscription mallette \"Cultures en partage\"</td>\n",
" <td>0.532831</td>\n",
" <td>0.547324</td>\n",
" </tr>\n",
" <tr>\n",
" <th>78</th>\n",
" <td>CAMPAGNE ADHESION 2023</td>\n",
" <td>0.449371</td>\n",
" <td>0.578477</td>\n",
" </tr>\n",
" <tr>\n",
" <th>234</th>\n",
" <td>Tous les optins</td>\n",
" <td>0.412546</td>\n",
" <td>0.607076</td>\n",
" </tr>\n",
" <tr>\n",
" <th>192</th>\n",
" <td>Optin 2023</td>\n",
" <td>0.365057</td>\n",
" <td>0.632384</td>\n",
" </tr>\n",
" <tr>\n",
" <th>170</th>\n",
" <td>Liste globale optin-15-01-2021</td>\n",
" <td>0.325482</td>\n",
" <td>0.654948</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>10-03-sb-dolist</td>\n",
" <td>0.193833</td>\n",
" <td>0.668386</td>\n",
" </tr>\n",
" <tr>\n",
" <th>195</th>\n",
" <td>Origine - Nouba</td>\n",
" <td>0.192452</td>\n",
" <td>0.681728</td>\n",
" </tr>\n",
" <tr>\n",
" <th>158</th>\n",
" <td>LIVE2022_Intérêt Expositions musée</td>\n",
" <td>0.173550</td>\n",
" <td>0.693759</td>\n",
" </tr>\n",
" <tr>\n",
" <th>414</th>\n",
" <td>old_Intéressés par la Musique</td>\n",
" <td>0.166505</td>\n",
" <td>0.705302</td>\n",
" </tr>\n",
" <tr>\n",
" <th>415</th>\n",
" <td>old_Intérêt Danse</td>\n",
" <td>0.163572</td>\n",
" <td>0.716642</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100</th>\n",
" <td>Cible offre DAOUD DEPARDON</td>\n",
" <td>0.130372</td>\n",
" <td>0.725680</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>13-04-2022-vente 2021</td>\n",
" <td>0.128804</td>\n",
" <td>0.734609</td>\n",
" </tr>\n",
" <tr>\n",
" <th>73</th>\n",
" <td>Arabofolies Juillet 2022</td>\n",
" <td>0.109123</td>\n",
" <td>0.742174</td>\n",
" </tr>\n",
" <tr>\n",
" <th>137</th>\n",
" <td>Intérêt LGBTQ+</td>\n",
" <td>0.108917</td>\n",
" <td>0.749725</td>\n",
" </tr>\n",
" <tr>\n",
" <th>274</th>\n",
" <td>blacklistés ima</td>\n",
" <td>0.105407</td>\n",
" <td>0.757032</td>\n",
" </tr>\n",
" <tr>\n",
" <th>208</th>\n",
" <td>Public traditionnel</td>\n",
" <td>0.083821</td>\n",
" <td>0.762843</td>\n",
" </tr>\n",
" <tr>\n",
" <th>159</th>\n",
" <td>LIVE2022_Intérêt Humour</td>\n",
" <td>0.082858</td>\n",
" <td>0.768587</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99</th>\n",
" <td>Cible jeunes humour</td>\n",
" <td>0.080312</td>\n",
" <td>0.774155</td>\n",
" </tr>\n",
" <tr>\n",
" <th>213</th>\n",
" <td>Relance gratuité IMA COMEDY CLUB</td>\n",
" <td>0.079205</td>\n",
" <td>0.779646</td>\n",
" </tr>\n",
" <tr>\n",
" <th>101</th>\n",
" <td>Cible rencontres et débats</td>\n",
" <td>0.076743</td>\n",
" <td>0.784966</td>\n",
" </tr>\n",
" <tr>\n",
" <th>189</th>\n",
" <td>Nouveaux inscrits newsletter</td>\n",
" <td>0.076200</td>\n",
" <td>0.790249</td>\n",
" </tr>\n",
" <tr>\n",
" <th>349</th>\n",
" <td>interet nuit du cinéma</td>\n",
" <td>0.072260</td>\n",
" <td>0.795258</td>\n",
" </tr>\n",
" <tr>\n",
" <th>141</th>\n",
" <td>Intérêt prononcé pour la nuit du ramadan</td>\n",
" <td>0.072254</td>\n",
" <td>0.800267</td>\n",
" </tr>\n",
" <tr>\n",
" <th>87</th>\n",
" <td>Cible Algérie</td>\n",
" <td>0.070337</td>\n",
" <td>0.805143</td>\n",
" </tr>\n",
" <tr>\n",
" <th>93</th>\n",
" <td>Cible News offre spéciale humour</td>\n",
" <td>0.069296</td>\n",
" <td>0.809947</td>\n",
" </tr>\n",
" <tr>\n",
" <th>140</th>\n",
" <td>Intérêt musique électro</td>\n",
" <td>0.068716</td>\n",
" <td>0.814711</td>\n",
" </tr>\n",
" <tr>\n",
" <th>280</th>\n",
" <td>cible Histoire et feminisme</td>\n",
" <td>0.068585</td>\n",
" <td>0.819466</td>\n",
" </tr>\n",
" <tr>\n",
" <th>174</th>\n",
" <td>Liste relais pour présentation 2023</td>\n",
" <td>0.067404</td>\n",
" <td>0.824139</td>\n",
" </tr>\n",
" <tr>\n",
" <th>138</th>\n",
" <td>Intérêt musique Orientale</td>\n",
" <td>0.066082</td>\n",
" <td>0.828720</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>16-07-21-nuit-du-cinema</td>\n",
" <td>0.065166</td>\n",
" <td>0.833237</td>\n",
" </tr>\n",
" <tr>\n",
" <th>204</th>\n",
" <td>Profil Client Expos Divas (Geo)</td>\n",
" <td>0.063401</td>\n",
" <td>0.837633</td>\n",
" </tr>\n",
" <tr>\n",
" <th>265</th>\n",
" <td>araborolies/Divas/relance</td>\n",
" <td>0.061647</td>\n",
" <td>0.841906</td>\n",
" </tr>\n",
" <tr>\n",
" <th>203</th>\n",
" <td>Profil Client Expo Divas</td>\n",
" <td>0.061138</td>\n",
" <td>0.846145</td>\n",
" </tr>\n",
" <tr>\n",
" <th>233</th>\n",
" <td>Tous les inscrits aux newsletters via le formulaire du site web</td>\n",
" <td>0.057407</td>\n",
" <td>0.850125</td>\n",
" </tr>\n",
" <tr>\n",
" <th>243</th>\n",
" <td>VIP Générale</td>\n",
" <td>0.053682</td>\n",
" <td>0.853846</td>\n",
" </tr>\n",
" <tr>\n",
" <th>226</th>\n",
" <td>Strcutures sans VIP</td>\n",
" <td>0.053396</td>\n",
" <td>0.857548</td>\n",
" </tr>\n",
" <tr>\n",
" <th>67</th>\n",
" <td>Agi pour buren</td>\n",
" <td>0.051575</td>\n",
" <td>0.861123</td>\n",
" </tr>\n",
" <tr>\n",
" <th>144</th>\n",
" <td>Invitation à l'exposition Palestine LANG</td>\n",
" <td>0.051092</td>\n",
" <td>0.864665</td>\n",
" </tr>\n",
" <tr>\n",
" <th>62</th>\n",
" <td>Acheteurs individuels de l'expo Juifs d'orient statique</td>\n",
" <td>0.046526</td>\n",
" <td>0.867891</td>\n",
" </tr>\n",
" <tr>\n",
" <th>61</th>\n",
" <td>Acheteurs individuels de l'expo Juifs d'orient</td>\n",
" <td>0.046513</td>\n",
" <td>0.871115</td>\n",
" </tr>\n",
" <tr>\n",
" <th>95</th>\n",
" <td>Cible arabic Sound system</td>\n",
" <td>0.046164</td>\n",
" <td>0.874316</td>\n",
" </tr>\n",
" <tr>\n",
" <th>244</th>\n",
" <td>VIP STATIQUE</td>\n",
" <td>0.041158</td>\n",
" <td>0.877169</td>\n",
" </tr>\n",
" <tr>\n",
" <th>245</th>\n",
" <td>VIP Téléchargement</td>\n",
" <td>0.040737</td>\n",
" <td>0.879993</td>\n",
" </tr>\n",
" <tr>\n",
" <th>102</th>\n",
" <td>Cible scolaire 2022</td>\n",
" <td>0.040313</td>\n",
" <td>0.882788</td>\n",
" </tr>\n",
" <tr>\n",
" <th>90</th>\n",
" <td>Cible Maroc</td>\n",
" <td>0.039827</td>\n",
" <td>0.885549</td>\n",
" </tr>\n",
" <tr>\n",
" <th>91</th>\n",
" <td>Cible Maroc</td>\n",
" <td>0.039827</td>\n",
" <td>0.888310</td>\n",
" </tr>\n",
" <tr>\n",
" <th>41</th>\n",
" <td>26mai-2023-Structures-invit-palestine</td>\n",
" <td>0.039188</td>\n",
" <td>0.891027</td>\n",
" </tr>\n",
" <tr>\n",
" <th>393</th>\n",
" <td>liste_contacts_agi_2021_02_16_</td>\n",
" <td>0.033618</td>\n",
" <td>0.893357</td>\n",
" </tr>\n",
" <tr>\n",
" <th>450</th>\n",
" <td>sb-fichier-eudonet-ok-18-05-21</td>\n",
" <td>0.032056</td>\n",
" <td>0.895579</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>04_11_22_eudonet</td>\n",
" <td>0.031857</td>\n",
" <td>0.897788</td>\n",
" </tr>\n",
" <tr>\n",
" <th>215</th>\n",
" <td>SB-18-05-VIP-eudonet</td>\n",
" <td>0.031857</td>\n",
" <td>0.899996</td>\n",
" </tr>\n",
" <tr>\n",
" <th>175</th>\n",
" <td>Liste vernissage</td>\n",
" <td>0.031364</td>\n",
" <td>0.902171</td>\n",
" </tr>\n",
" <tr>\n",
" <th>235</th>\n",
" <td>Tous les relais</td>\n",
" <td>0.031090</td>\n",
" <td>0.904326</td>\n",
" </tr>\n",
" <tr>\n",
" <th>252</th>\n",
" <td>Visiteurs expo pour questionnaires</td>\n",
" <td>0.029930</td>\n",
" <td>0.906401</td>\n",
" </tr>\n",
" <tr>\n",
" <th>223</th>\n",
" <td>Scolaires - Actions Educatives 24/11/2021</td>\n",
" <td>0.029871</td>\n",
" <td>0.908472</td>\n",
" </tr>\n",
" <tr>\n",
" <th>92</th>\n",
" <td>Cible Musique Judeo-arabe</td>\n",
" <td>0.029266</td>\n",
" <td>0.910501</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>15-09-2023-Cible-Palestine</td>\n",
" <td>0.028531</td>\n",
" <td>0.912478</td>\n",
" </tr>\n",
" <tr>\n",
" <th>162</th>\n",
" <td>LIVE2022_Intérêts Rencontres, débats et conférences</td>\n",
" <td>0.026928</td>\n",
" <td>0.914345</td>\n",
" </tr>\n",
" <tr>\n",
" <th>282</th>\n",
" <td>cible photo</td>\n",
" <td>0.026056</td>\n",
" <td>0.916152</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38</th>\n",
" <td>26-MAI_STRUCTURE-2023-OK</td>\n",
" <td>0.025495</td>\n",
" <td>0.917919</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>07-12-20-Relais-invitatation-divas</td>\n",
" <td>0.024909</td>\n",
" <td>0.919646</td>\n",
" </tr>\n",
" <tr>\n",
" <th>410</th>\n",
" <td>old_Amis de l'IMA</td>\n",
" <td>0.023160</td>\n",
" <td>0.921251</td>\n",
" </tr>\n",
" <tr>\n",
" <th>222</th>\n",
" <td>Scolaires - Actions Educatives 24/01/2023</td>\n",
" <td>0.022724</td>\n",
" <td>0.922827</td>\n",
" </tr>\n",
" <tr>\n",
" <th>198</th>\n",
" <td>PALESTINE</td>\n",
" <td>0.020903</td>\n",
" <td>0.924276</td>\n",
" </tr>\n",
" <tr>\n",
" <th>249</th>\n",
" <td>Vignes et tilleuls</td>\n",
" <td>0.020439</td>\n",
" <td>0.925693</td>\n",
" </tr>\n",
" <tr>\n",
" <th>39</th>\n",
" <td>26-mai-11H10-relais</td>\n",
" <td>0.019578</td>\n",
" <td>0.927050</td>\n",
" </tr>\n",
" <tr>\n",
" <th>110</th>\n",
" <td>Contacts Librairie</td>\n",
" <td>0.019114</td>\n",
" <td>0.928375</td>\n",
" </tr>\n",
" <tr>\n",
" <th>194</th>\n",
" <td>Origine - Inscription manuelle</td>\n",
" <td>0.018307</td>\n",
" <td>0.929644</td>\n",
" </tr>\n",
" <tr>\n",
" <th>196</th>\n",
" <td>Origine - QR code</td>\n",
" <td>0.018294</td>\n",
" <td>0.930913</td>\n",
" </tr>\n",
" <tr>\n",
" <th>59</th>\n",
" <td>Acheteurs Daoud Depardon</td>\n",
" <td>0.018232</td>\n",
" <td>0.932176</td>\n",
" </tr>\n",
" <tr>\n",
" <th>473</th>\n",
" <td>événements autour de Habibi</td>\n",
" <td>0.017755</td>\n",
" <td>0.933407</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>17-04-21-nuits-ducinema</td>\n",
" <td>0.017484</td>\n",
" <td>0.934619</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>07-12-20-liste-invites-presentation-divas</td>\n",
" <td>0.017337</td>\n",
" <td>0.935821</td>\n",
" </tr>\n",
" <tr>\n",
" <th>278</th>\n",
" <td>catégorie Cinéma</td>\n",
" <td>0.016748</td>\n",
" <td>0.936982</td>\n",
" </tr>\n",
" <tr>\n",
" <th>250</th>\n",
" <td>Visiteurs Palestine</td>\n",
" <td>0.016302</td>\n",
" <td>0.938113</td>\n",
" </tr>\n",
" <tr>\n",
" <th>248</th>\n",
" <td>Vernissages Algérie</td>\n",
" <td>0.014703</td>\n",
" <td>0.939132</td>\n",
" </tr>\n",
" <tr>\n",
" <th>242</th>\n",
" <td>VIP Algérie</td>\n",
" <td>0.014666</td>\n",
" <td>0.940149</td>\n",
" </tr>\n",
" <tr>\n",
" <th>445</th>\n",
" <td>save the date invités vernissage</td>\n",
" <td>0.014432</td>\n",
" <td>0.941149</td>\n",
" </tr>\n",
" <tr>\n",
" <th>205</th>\n",
" <td>Profil Contact Expo Divas - Juillet</td>\n",
" <td>0.014282</td>\n",
" <td>0.942139</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>17-04-2021-autres-liste-statique-cinema</td>\n",
" <td>0.013862</td>\n",
" <td>0.943100</td>\n",
" </tr>\n",
" <tr>\n",
" <th>279</th>\n",
" <td>catégorie rencontres et débats</td>\n",
" <td>0.013285</td>\n",
" <td>0.944021</td>\n",
" </tr>\n",
" <tr>\n",
" <th>325</th>\n",
" <td>fichier-dolist-05-12-20-relais</td>\n",
" <td>0.013163</td>\n",
" <td>0.944934</td>\n",
" </tr>\n",
" <tr>\n",
" <th>177</th>\n",
" <td>MAILING SAVE THE DATE PARFUMS</td>\n",
" <td>0.013154</td>\n",
" <td>0.945846</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96</th>\n",
" <td>Cible enfants/ famille janvier 23</td>\n",
" <td>0.012515</td>\n",
" <td>0.946713</td>\n",
" </tr>\n",
" <tr>\n",
" <th>129</th>\n",
" <td>Globale vernissage Samarcande</td>\n",
" <td>0.011078</td>\n",
" <td>0.947481</td>\n",
" </tr>\n",
" <tr>\n",
" <th>251</th>\n",
" <td>Visiteurs Palestine mi-expo questionnaire</td>\n",
" <td>0.010735</td>\n",
" <td>0.948225</td>\n",
" </tr>\n",
" <tr>\n",
" <th>88</th>\n",
" <td>Cible LIBAN</td>\n",
" <td>0.010003</td>\n",
" <td>0.948919</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>18-11-2021-liste i,augurationdu23-18h3à</td>\n",
" <td>0.009975</td>\n",
" <td>0.949610</td>\n",
" </tr>\n",
" <tr>\n",
" <th>431</th>\n",
" <td>professionnels de l'écologie</td>\n",
" <td>0.009769</td>\n",
" <td>0.950288</td>\n",
" </tr>\n",
" <tr>\n",
" <th>142</th>\n",
" <td>Intérêt écologie (pro)</td>\n",
" <td>0.009769</td>\n",
" <td>0.950965</td>\n",
" </tr>\n",
" <tr>\n",
" <th>323</th>\n",
" <td>fichier pro écologie</td>\n",
" <td>0.009769</td>\n",
" <td>0.951642</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" target_name \\\n",
"232 Tous les contacts mis à jour \n",
"76 Base données \n",
"191 Office de Tourisme \n",
"128 Globale sans VIP \n",
"112 Contacts structures \n",
"98 Cible gratuité IMA COMEDY \n",
"23 2 IEME ENVOI IMA COMEDY CLUB \n",
"64 Actions Marketing ARABOFOLIES \n",
"171 Liste globale sans VIP \n",
"126 Formulaire inscription mallette \"Cultures en partage\" \n",
"78 CAMPAGNE ADHESION 2023 \n",
"234 Tous les optins \n",
"192 Optin 2023 \n",
"170 Liste globale optin-15-01-2021 \n",
"10 10-03-sb-dolist \n",
"195 Origine - Nouba \n",
"158 LIVE2022_Intérêt Expositions musée \n",
"414 old_Intéressés par la Musique \n",
"415 old_Intérêt Danse \n",
"100 Cible offre DAOUD DEPARDON \n",
"12 13-04-2022-vente 2021 \n",
"73 Arabofolies Juillet 2022 \n",
"137 Intérêt LGBTQ+ \n",
"274 blacklistés ima \n",
"208 Public traditionnel \n",
"159 LIVE2022_Intérêt Humour \n",
"99 Cible jeunes humour \n",
"213 Relance gratuité IMA COMEDY CLUB \n",
"101 Cible rencontres et débats \n",
"189 Nouveaux inscrits newsletter \n",
"349 interet nuit du cinéma \n",
"141 Intérêt prononcé pour la nuit du ramadan \n",
"87 Cible Algérie \n",
"93 Cible News offre spéciale humour \n",
"140 Intérêt musique électro \n",
"280 cible Histoire et feminisme \n",
"174 Liste relais pour présentation 2023 \n",
"138 Intérêt musique Orientale \n",
"18 16-07-21-nuit-du-cinema \n",
"204 Profil Client Expos Divas (Geo) \n",
"265 araborolies/Divas/relance \n",
"203 Profil Client Expo Divas \n",
"233 Tous les inscrits aux newsletters via le formulaire du site web \n",
"243 VIP Générale \n",
"226 Strcutures sans VIP \n",
"67 Agi pour buren \n",
"144 Invitation à l'exposition Palestine LANG \n",
"62 Acheteurs individuels de l'expo Juifs d'orient statique \n",
"61 Acheteurs individuels de l'expo Juifs d'orient \n",
"95 Cible arabic Sound system \n",
"244 VIP STATIQUE \n",
"245 VIP Téléchargement \n",
"102 Cible scolaire 2022 \n",
"90 Cible Maroc \n",
"91 Cible Maroc \n",
"41 26mai-2023-Structures-invit-palestine \n",
"393 liste_contacts_agi_2021_02_16_ \n",
"450 sb-fichier-eudonet-ok-18-05-21 \n",
"4 04_11_22_eudonet \n",
"215 SB-18-05-VIP-eudonet \n",
"175 Liste vernissage \n",
"235 Tous les relais \n",
"252 Visiteurs expo pour questionnaires \n",
"223 Scolaires - Actions Educatives 24/11/2021 \n",
"92 Cible Musique Judeo-arabe \n",
"13 15-09-2023-Cible-Palestine \n",
"162 LIVE2022_Intérêts Rencontres, débats et conférences \n",
"282 cible photo \n",
"38 26-MAI_STRUCTURE-2023-OK \n",
"5 07-12-20-Relais-invitatation-divas \n",
"410 old_Amis de l'IMA \n",
"222 Scolaires - Actions Educatives 24/01/2023 \n",
"198 PALESTINE \n",
"249 Vignes et tilleuls \n",
"39 26-mai-11H10-relais \n",
"110 Contacts Librairie \n",
"194 Origine - Inscription manuelle \n",
"196 Origine - QR code \n",
"59 Acheteurs Daoud Depardon \n",
"473 événements autour de Habibi \n",
"20 17-04-21-nuits-ducinema \n",
"6 07-12-20-liste-invites-presentation-divas \n",
"278 catégorie Cinéma \n",
"250 Visiteurs Palestine \n",
"248 Vernissages Algérie \n",
"242 VIP Algérie \n",
"445 save the date invités vernissage \n",
"205 Profil Contact Expo Divas - Juillet \n",
"19 17-04-2021-autres-liste-statique-cinema \n",
"279 catégorie rencontres et débats \n",
"325 fichier-dolist-05-12-20-relais \n",
"177 MAILING SAVE THE DATE PARFUMS \n",
"96 Cible enfants/ famille janvier 23 \n",
"129 Globale vernissage Samarcande \n",
"251 Visiteurs Palestine mi-expo questionnaire \n",
"88 Cible LIBAN \n",
"21 18-11-2021-liste i,augurationdu23-18h3à \n",
"431 professionnels de l'écologie \n",
"142 Intérêt écologie (pro) \n",
"323 fichier pro écologie \n",
"\n",
" customer_id cumulative_customers \n",
"232 0.999991 0.069325 \n",
"76 0.999991 0.138650 \n",
"191 0.999991 0.207974 \n",
"128 0.955488 0.274214 \n",
"112 0.929969 0.338684 \n",
"98 0.636246 0.382792 \n",
"23 0.630389 0.426494 \n",
"64 0.627917 0.470025 \n",
"171 0.582183 0.510385 \n",
"126 0.532831 0.547324 \n",
"78 0.449371 0.578477 \n",
"234 0.412546 0.607076 \n",
"192 0.365057 0.632384 \n",
"170 0.325482 0.654948 \n",
"10 0.193833 0.668386 \n",
"195 0.192452 0.681728 \n",
"158 0.173550 0.693759 \n",
"414 0.166505 0.705302 \n",
"415 0.163572 0.716642 \n",
"100 0.130372 0.725680 \n",
"12 0.128804 0.734609 \n",
"73 0.109123 0.742174 \n",
"137 0.108917 0.749725 \n",
"274 0.105407 0.757032 \n",
"208 0.083821 0.762843 \n",
"159 0.082858 0.768587 \n",
"99 0.080312 0.774155 \n",
"213 0.079205 0.779646 \n",
"101 0.076743 0.784966 \n",
"189 0.076200 0.790249 \n",
"349 0.072260 0.795258 \n",
"141 0.072254 0.800267 \n",
"87 0.070337 0.805143 \n",
"93 0.069296 0.809947 \n",
"140 0.068716 0.814711 \n",
"280 0.068585 0.819466 \n",
"174 0.067404 0.824139 \n",
"138 0.066082 0.828720 \n",
"18 0.065166 0.833237 \n",
"204 0.063401 0.837633 \n",
"265 0.061647 0.841906 \n",
"203 0.061138 0.846145 \n",
"233 0.057407 0.850125 \n",
"243 0.053682 0.853846 \n",
"226 0.053396 0.857548 \n",
"67 0.051575 0.861123 \n",
"144 0.051092 0.864665 \n",
"62 0.046526 0.867891 \n",
"61 0.046513 0.871115 \n",
"95 0.046164 0.874316 \n",
"244 0.041158 0.877169 \n",
"245 0.040737 0.879993 \n",
"102 0.040313 0.882788 \n",
"90 0.039827 0.885549 \n",
"91 0.039827 0.888310 \n",
"41 0.039188 0.891027 \n",
"393 0.033618 0.893357 \n",
"450 0.032056 0.895579 \n",
"4 0.031857 0.897788 \n",
"215 0.031857 0.899996 \n",
"175 0.031364 0.902171 \n",
"235 0.031090 0.904326 \n",
"252 0.029930 0.906401 \n",
"223 0.029871 0.908472 \n",
"92 0.029266 0.910501 \n",
"13 0.028531 0.912478 \n",
"162 0.026928 0.914345 \n",
"282 0.026056 0.916152 \n",
"38 0.025495 0.917919 \n",
"5 0.024909 0.919646 \n",
"410 0.023160 0.921251 \n",
"222 0.022724 0.922827 \n",
"198 0.020903 0.924276 \n",
"249 0.020439 0.925693 \n",
"39 0.019578 0.927050 \n",
"110 0.019114 0.928375 \n",
"194 0.018307 0.929644 \n",
"196 0.018294 0.930913 \n",
"59 0.018232 0.932176 \n",
"473 0.017755 0.933407 \n",
"20 0.017484 0.934619 \n",
"6 0.017337 0.935821 \n",
"278 0.016748 0.936982 \n",
"250 0.016302 0.938113 \n",
"248 0.014703 0.939132 \n",
"242 0.014666 0.940149 \n",
"445 0.014432 0.941149 \n",
"205 0.014282 0.942139 \n",
"19 0.013862 0.943100 \n",
"279 0.013285 0.944021 \n",
"325 0.013163 0.944934 \n",
"177 0.013154 0.945846 \n",
"96 0.012515 0.946713 \n",
"129 0.011078 0.947481 \n",
"251 0.010735 0.948225 \n",
"88 0.010003 0.948919 \n",
"21 0.009975 0.949610 \n",
"431 0.009769 0.950288 \n",
"142 0.009769 0.950965 \n",
"323 0.009769 0.951642 "
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print_main_target('4', 100)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "40fe3676",
"metadata": {},
"outputs": [],
"source": [
"print_main_target('101', 100)"
]
},
{
"cell_type": "markdown",
"id": "605cced5-052f-4a99-ac26-020c5d2ab633",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## KPI sur tags"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "916c3e2b-04d3-4877-b894-8f26f10d926e",
"metadata": {},
"outputs": [],
"source": [
"customersplus = load_dataset_2(\"4\", \"customersplus\")[['id', 'structure_id']]"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "46847b24-15a4-464e-969f-f16ed3653f1f",
"metadata": {},
"outputs": [],
"source": [
"structure_tag_mappings = load_dataset_2('4', \"structure_tag_mappings\")[['structure_id', 'tag_id']]"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "3c10c69d-735f-453e-96bf-750697d965d0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"19427"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Series.nunique() ignores NaN by default, so no explicit notna() filter is needed\n",
"customersplus['structure_id'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "9b0e77b3-5f16-4484-9564-7d3826583418",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"33645"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Count the rows of customersplus whose structure_id is not null\n",
"int(customersplus['structure_id'].notna().sum())"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "dfa27722-37f9-435a-8221-8aa6f9a4a107",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3431"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"structure_tag_mappings['structure_id'].nunique()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "2daabdd5-31e3-4918-9856-9bbc30cde602",
"metadata": {},
"outputs": [],
"source": [
"def tags_information(tenant_id, first_tags):\n",
"    \"\"\"\n",
"    Summarise how tags are spread over the customers of one tenant.\n",
"\n",
"    Customers are linked to tags through their structure: customersplus\n",
"    (customer -> structure) is joined with structure_tag_mappings\n",
"    (structure -> tag) and then with tags (tag id -> tag name).\n",
"\n",
"    Prints the number of distinct customers having at least one tag, their\n",
"    share among the rows of customersplus and the mean number of tag rows per\n",
"    tagged customer. A ratio whose denominator is zero is printed as nan\n",
"    instead of raising ZeroDivisionError.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    tenant_id : value forwarded as first argument to load_dataset_2\n",
"    first_tags : int, number of rows (most frequent tags) to keep\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame with columns tag_id, tag_name and customer_id, where\n",
"    customer_id holds the number of customer/tag rows of the tag, sorted in\n",
"    decreasing order and truncated to the first_tags first rows.\n",
"    \"\"\"\n",
"    # rename() returns a new frame, so the freshly sliced columns are never mutated in place\n",
"    customersplus = load_dataset_2(tenant_id, \"customersplus\")[['id', 'structure_id']].rename(columns = {'id' : 'customer_id'})\n",
"    tags = load_dataset_2(tenant_id, \"tags\")[['id', 'name']].rename(columns = {'id' : 'tag_id', 'name' : 'tag_name'})\n",
"    structure_tag_mappings = load_dataset_2(tenant_id, \"structure_tag_mappings\")[['structure_id', 'tag_id']]\n",
"\n",
"    customer_tags = pd.merge(customersplus, structure_tag_mappings, on = 'structure_id', how = 'left')\n",
"    # Inner join: only the rows whose tag_id is found in tags are kept\n",
"    customer_tags = pd.merge(customer_tags, tags, on = 'tag_id', how = 'inner')\n",
"\n",
"    nb_customers = len(customersplus)\n",
"    nb_customers_with_tag = customer_tags['customer_id'].nunique()\n",
"\n",
"    # Guard both ratios: a tenant without customers or without tagged customers\n",
"    # would otherwise raise ZeroDivisionError\n",
"    share_with_tag = nb_customers_with_tag/nb_customers if nb_customers else float('nan')\n",
"    tags_per_customer = len(customer_tags)/nb_customers_with_tag if nb_customers_with_tag else float('nan')\n",
"\n",
"    print('Nombre de client avec tag : ', nb_customers_with_tag)\n",
"    print('Proportion de clients avec tags : ', share_with_tag)\n",
"    print('Moyenne de tags par client : ', tags_per_customer)\n",
"\n",
"    # Number of customer/tag rows per tag, most frequent tags first\n",
"    info = (\n",
"        customer_tags.groupby(['tag_id', 'tag_name'])['customer_id']\n",
"        .count()\n",
"        .reset_index()\n",
"        .sort_values('customer_id', ascending = False)\n",
"        .head(first_tags)\n",
"    )\n",
"\n",
"    return info"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "0b9f5f71-a927-4cc8-bb0c-9538e28d3553",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Nombre de client avec tag : 13320\n",
"Proportion de clients avec tags : 0.0877089012682233\n",
"Moyenne de tags par client : 2.1725975975975977\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>tag_id</th>\n",
" <th>tag_name</th>\n",
" <th>customer_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>11029.0</td>\n",
" <td>individuels</td>\n",
" <td>3270</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>11047.0</td>\n",
" <td>groupes scolaires</td>\n",
" <td>2417</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>11033.0</td>\n",
" <td>association</td>\n",
" <td>2308</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>11028.0</td>\n",
" <td>structures culturelles</td>\n",
" <td>2011</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>11051.0</td>\n",
" <td>etablissement ens scolaire</td>\n",
" <td>1732</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>11036.0</td>\n",
" <td>champ social</td>\n",
" <td>1603</td>\n",
" </tr>\n",
" <tr>\n",
" <th>43</th>\n",
" <td>11072.0</td>\n",
" <td>etab d'enseignement</td>\n",
" <td>1036</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>11043.0</td>\n",
" <td>etablissement public</td>\n",
" <td>935</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>11035.0</td>\n",
" <td>organisme de tourisme</td>\n",
" <td>892</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>11045.0</td>\n",
" <td>centre de loisirs</td>\n",
" <td>864</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44</th>\n",
" <td>11073.0</td>\n",
" <td>musée, site &amp; fondation</td>\n",
" <td>786</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>11053.0</td>\n",
" <td>groupes etudiants</td>\n",
" <td>758</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>11032.0</td>\n",
" <td>entreprise</td>\n",
" <td>750</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>11039.0</td>\n",
" <td>etablissement d'enseignement</td>\n",
" <td>741</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>11034.0</td>\n",
" <td>asso. culturelle</td>\n",
" <td>692</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>11044.0</td>\n",
" <td>administration et collectivité</td>\n",
" <td>676</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>11046.0</td>\n",
" <td>tour opérateur</td>\n",
" <td>642</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>11048.0</td>\n",
" <td>entreprises</td>\n",
" <td>515</td>\n",
" </tr>\n",
" <tr>\n",
" <th>72</th>\n",
" <td>11619.0</td>\n",
" <td>structures culturelles;musée, site &amp; fondation</td>\n",
" <td>427</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>11037.0</td>\n",
" <td>handicap</td>\n",
" <td>426</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" tag_id tag_name customer_id\n",
"1 11029.0 individuels 3270\n",
"18 11047.0 groupes scolaires 2417\n",
"4 11033.0 association 2308\n",
"0 11028.0 structures culturelles 2011\n",
"22 11051.0 etablissement ens scolaire 1732\n",
"7 11036.0 champ social 1603\n",
"43 11072.0 etab d'enseignement 1036\n",
"14 11043.0 etablissement public 935\n",
"6 11035.0 organisme de tourisme 892\n",
"16 11045.0 centre de loisirs 864\n",
"44 11073.0 musée, site & fondation 786\n",
"24 11053.0 groupes etudiants 758\n",
"3 11032.0 entreprise 750\n",
"10 11039.0 etablissement d'enseignement 741\n",
"5 11034.0 asso. culturelle 692\n",
"15 11044.0 administration et collectivité 676\n",
"17 11046.0 tour opérateur 642\n",
"19 11048.0 entreprises 515\n",
"72 11619.0 structures culturelles;musée, site & fondation 427\n",
"8 11037.0 handicap 426"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tags_information(\"1\", 20)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "bd5bef41-1774-4601-86b5-b7c1aea8f1d2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Nombre de client avec tag : 5953\n",
"Proportion de clients avec tags : 0.021598421025897787\n",
"Moyenne de tags par client : 1.0\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>tag_id</th>\n",
" <th>tag_name</th>\n",
" <th>customer_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1.0</td>\n",
" <td>training-sb-ax</td>\n",
" <td>5</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" tag_id tag_name customer_id\n",
"0 1.0 training-sb-ax 5"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tags_information(\"2\", 20)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "7c2dc3e6-1418-44db-a8c0-4a9d59ec5232",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>training-sb-ax</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id name\n",
"0 1 training-sb-ax\n",
"1 2 NaN"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"load_dataset_2(\"2\", \"tags\")[['id', 'name']]"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "c7b2c670-7122-4f67-b1aa-8c80a10f16d8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Nombre de client avec tag : 23659\n",
"Proportion de clients avec tags : 0.09207484608139978\n",
"Moyenne de tags par client : 3.0620482691576143\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>tag_id</th>\n",
" <th>tag_name</th>\n",
" <th>customer_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>164</th>\n",
" <td>44539.0</td>\n",
" <td>*individuel/particulier</td>\n",
" <td>13148</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>26926.0</td>\n",
" <td>ce</td>\n",
" <td>3216</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>6995.0</td>\n",
" <td>college</td>\n",
" <td>2126</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>7028.0</td>\n",
" <td>lycee</td>\n",
" <td>1577</td>\n",
" </tr>\n",
" <tr>\n",
" <th>154</th>\n",
" <td>44524.0</td>\n",
" <td>iraiser</td>\n",
" <td>1453</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>6714.0</td>\n",
" <td>ecole primaire</td>\n",
" <td>1200</td>\n",
" </tr>\n",
" <tr>\n",
" <th>155</th>\n",
" <td>44525.0</td>\n",
" <td>bp</td>\n",
" <td>1094</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>7024.0</td>\n",
" <td>centre de loisirs</td>\n",
" <td>1080</td>\n",
" </tr>\n",
" <tr>\n",
" <th>153</th>\n",
" <td>44515.0</td>\n",
" <td>entreprise</td>\n",
" <td>998</td>\n",
" </tr>\n",
" <tr>\n",
" <th>126</th>\n",
" <td>44039.0</td>\n",
" <td>ca fondation d'aumale</td>\n",
" <td>891</td>\n",
" </tr>\n",
" <tr>\n",
" <th>152</th>\n",
" <td>44514.0</td>\n",
" <td>particulier</td>\n",
" <td>838</td>\n",
" </tr>\n",
" <tr>\n",
" <th>36</th>\n",
" <td>43663.0</td>\n",
" <td>président</td>\n",
" <td>816</td>\n",
" </tr>\n",
" <tr>\n",
" <th>76</th>\n",
" <td>43703.0</td>\n",
" <td>directeur</td>\n",
" <td>812</td>\n",
" </tr>\n",
" <tr>\n",
" <th>158</th>\n",
" <td>44528.0</td>\n",
" <td>dc</td>\n",
" <td>807</td>\n",
" </tr>\n",
" <tr>\n",
" <th>54</th>\n",
" <td>43681.0</td>\n",
" <td>présidente</td>\n",
" <td>805</td>\n",
" </tr>\n",
" <tr>\n",
" <th>149</th>\n",
" <td>44511.0</td>\n",
" <td>entreprise (financier)</td>\n",
" <td>805</td>\n",
" </tr>\n",
" <tr>\n",
" <th>90</th>\n",
" <td>43718.0</td>\n",
" <td>conseillère régionale déléguée titulaire</td>\n",
" <td>804</td>\n",
" </tr>\n",
" <tr>\n",
" <th>40</th>\n",
" <td>43667.0</td>\n",
" <td>directeur de l'agence</td>\n",
" <td>801</td>\n",
" </tr>\n",
" <tr>\n",
" <th>78</th>\n",
" <td>43705.0</td>\n",
" <td>sous-préfet</td>\n",
" <td>798</td>\n",
" </tr>\n",
" <tr>\n",
" <th>100</th>\n",
" <td>43728.0</td>\n",
" <td>chargée de mission paysage</td>\n",
" <td>797</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" tag_id tag_name customer_id\n",
"164 44539.0 *individuel/particulier 13148\n",
"30 26926.0 ce 3216\n",
"14 6995.0 college 2126\n",
"16 7028.0 lycee 1577\n",
"154 44524.0 iraiser 1453\n",
"2 6714.0 ecole primaire 1200\n",
"155 44525.0 bp 1094\n",
"15 7024.0 centre de loisirs 1080\n",
"153 44515.0 entreprise 998\n",
"126 44039.0 ca fondation d'aumale 891\n",
"152 44514.0 particulier 838\n",
"36 43663.0 président 816\n",
"76 43703.0 directeur 812\n",
"158 44528.0 dc 807\n",
"54 43681.0 présidente 805\n",
"149 44511.0 entreprise (financier) 805\n",
"90 43718.0 conseillère régionale déléguée titulaire 804\n",
"40 43667.0 directeur de l'agence 801\n",
"78 43705.0 sous-préfet 798\n",
"100 43728.0 chargée de mission paysage 797"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tags_information(\"3\", 20)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "76639995-252d-4a58-83d8-c0c00900c3a9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Nombre de client avec tag : 10495\n",
"Proportion de clients avec tags : 0.03271416949025744\n",
"Moyenne de tags par client : 5.298427822772749\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>tag_id</th>\n",
" <th>tag_name</th>\n",
" <th>customer_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>147</th>\n",
" <td>298.0</td>\n",
" <td>jhima</td>\n",
" <td>4219</td>\n",
" </tr>\n",
" <tr>\n",
" <th>146</th>\n",
" <td>297.0</td>\n",
" <td>colloque algérie</td>\n",
" <td>3851</td>\n",
" </tr>\n",
" <tr>\n",
" <th>142</th>\n",
" <td>292.0</td>\n",
" <td>i&amp;ma</td>\n",
" <td>3826</td>\n",
" </tr>\n",
" <tr>\n",
" <th>154</th>\n",
" <td>305.0</td>\n",
" <td>mardis de la philo</td>\n",
" <td>3674</td>\n",
" </tr>\n",
" <tr>\n",
" <th>150</th>\n",
" <td>301.0</td>\n",
" <td>le grand continant</td>\n",
" <td>3670</td>\n",
" </tr>\n",
" <tr>\n",
" <th>144</th>\n",
" <td>295.0</td>\n",
" <td>araborama</td>\n",
" <td>3669</td>\n",
" </tr>\n",
" <tr>\n",
" <th>155</th>\n",
" <td>306.0</td>\n",
" <td>marie descourtieux</td>\n",
" <td>3669</td>\n",
" </tr>\n",
" <tr>\n",
" <th>145</th>\n",
" <td>296.0</td>\n",
" <td>c'était la guerre d'algérie</td>\n",
" <td>3669</td>\n",
" </tr>\n",
" <tr>\n",
" <th>141</th>\n",
" <td>291.0</td>\n",
" <td>araborama 3</td>\n",
" <td>3669</td>\n",
" </tr>\n",
" <tr>\n",
" <th>102</th>\n",
" <td>198.0</td>\n",
" <td>association de collectivités territoriales spé...</td>\n",
" <td>3669</td>\n",
" </tr>\n",
" <tr>\n",
" <th>143</th>\n",
" <td>294.0</td>\n",
" <td>arabofolies</td>\n",
" <td>3669</td>\n",
" </tr>\n",
" <tr>\n",
" <th>103</th>\n",
" <td>199.0</td>\n",
" <td>rassemble les 11 000 élus de toute la france a...</td>\n",
" <td>3669</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>50.0</td>\n",
" <td>association</td>\n",
" <td>463</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>54.0</td>\n",
" <td>collège</td>\n",
" <td>446</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>49.0</td>\n",
" <td>ecole</td>\n",
" <td>374</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>55.0</td>\n",
" <td>lycée</td>\n",
" <td>275</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>53.0</td>\n",
" <td>centre social</td>\n",
" <td>200</td>\n",
" </tr>\n",
" <tr>\n",
" <th>53</th>\n",
" <td>130.0</td>\n",
" <td>cultures et arts</td>\n",
" <td>141</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>51.0</td>\n",
" <td>mairie</td>\n",
" <td>136</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>64.0</td>\n",
" <td>formation_ima_ax</td>\n",
" <td>87</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" tag_id tag_name customer_id\n",
"147 298.0 jhima 4219\n",
"146 297.0 colloque algérie 3851\n",
"142 292.0 i&ma 3826\n",
"154 305.0 mardis de la philo 3674\n",
"150 301.0 le grand continant 3670\n",
"144 295.0 araborama 3669\n",
"155 306.0 marie descourtieux 3669\n",
"145 296.0 c'était la guerre d'algérie 3669\n",
"141 291.0 araborama 3 3669\n",
"102 198.0 association de collectivités territoriales spé... 3669\n",
"143 294.0 arabofolies 3669\n",
"103 199.0 rassemble les 11 000 élus de toute la france a... 3669\n",
"2 50.0 association 463\n",
"6 54.0 collège 446\n",
"1 49.0 ecole 374\n",
"7 55.0 lycée 275\n",
"5 53.0 centre social 200\n",
"53 130.0 cultures et arts 141\n",
"3 51.0 mairie 136\n",
"13 64.0 formation_ima_ax 87"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tags_information(\"4\", 20)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "07e91791-d4d4-42b1-ac18-22d3b0b9f7bd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Nombre de client avec tag : 532342\n",
"Proportion de clients avec tags : 0.18660686931118298\n",
"Moyenne de tags par client : 24.114082676174338\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>tag_id</th>\n",
" <th>tag_name</th>\n",
" <th>customer_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>349.0</td>\n",
" <td>clients internet</td>\n",
" <td>517491</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>356.0</td>\n",
" <td>associations / clubs</td>\n",
" <td>495520</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>10.0</td>\n",
" <td>agence de voyages</td>\n",
" <td>493774</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32</th>\n",
" <td>410.0</td>\n",
" <td>guides conférenciers</td>\n",
" <td>493378</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>360.0</td>\n",
" <td>groupe amis ou famille</td>\n",
" <td>493021</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>354.0</td>\n",
" <td>ce / entreprises</td>\n",
" <td>493016</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>17.0</td>\n",
" <td>association/club</td>\n",
" <td>493008</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3.0</td>\n",
" <td>c.e. / entreprise</td>\n",
" <td>492656</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>11.0</td>\n",
" <td>college</td>\n",
" <td>492552</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>69.0</td>\n",
" <td>tour operator</td>\n",
" <td>492549</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>9.0</td>\n",
" <td>ecole primaire</td>\n",
" <td>492540</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>379.0</td>\n",
" <td>parent goûter anniversaire</td>\n",
" <td>492468</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>364.0</td>\n",
" <td>institutions</td>\n",
" <td>492364</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>6.0</td>\n",
" <td>institution</td>\n",
" <td>492321</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>186.0</td>\n",
" <td>autocaristes</td>\n",
" <td>492153</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>13.0</td>\n",
" <td>enseignement superieur</td>\n",
" <td>492131</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>359.0</td>\n",
" <td>hotels / campings</td>\n",
" <td>492078</td>\n",
" </tr>\n",
" <tr>\n",
" <th>42</th>\n",
" <td>7186.0</td>\n",
" <td>individuel</td>\n",
" <td>491913</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>7.0</td>\n",
" <td>groupe amis / famille</td>\n",
" <td>491900</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2.0</td>\n",
" <td>client internet</td>\n",
" <td>491896</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" tag_id tag_name customer_id\n",
"20 349.0 clients internet 517491\n",
"24 356.0 associations / clubs 495520\n",
"5 10.0 agence de voyages 493774\n",
"32 410.0 guides conférenciers 493378\n",
"26 360.0 groupe amis ou famille 493021\n",
"23 354.0 ce / entreprises 493016\n",
"8 17.0 association/club 493008\n",
"1 3.0 c.e. / entreprise 492656\n",
"6 11.0 college 492552\n",
"13 69.0 tour operator 492549\n",
"4 9.0 ecole primaire 492540\n",
"31 379.0 parent goûter anniversaire 492468\n",
"30 364.0 institutions 492364\n",
"2 6.0 institution 492321\n",
"18 186.0 autocaristes 492153\n",
"7 13.0 enseignement superieur 492131\n",
"25 359.0 hotels / campings 492078\n",
"42 7186.0 individuel 491913\n",
"3 7.0 groupe amis / famille 491900\n",
"0 2.0 client internet 491896"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tags_information(\"101\", 20)"
]
},
{
"cell_type": "markdown",
"id": "87d131cd-ead0-4ef4-a8ee-b09022d08ffa",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## KPI product"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "26582be9-cfd1-48ea-a0a7-31101fdeb9d1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : projet-bdc2324-team1/0_Input/Company_1/products_purchased_reduced.csv\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ticket_id</th>\n",
" <th>customer_id</th>\n",
" <th>purchase_id</th>\n",
" <th>event_type_id</th>\n",
" <th>supplier_name</th>\n",
" <th>purchase_date</th>\n",
" <th>amount</th>\n",
" <th>is_full_price</th>\n",
" <th>name_event_types</th>\n",
" <th>name_facilities</th>\n",
" <th>name_categories</th>\n",
" <th>name_events</th>\n",
" <th>name_seasons</th>\n",
" <th>start_date_time</th>\n",
" <th>end_date_time</th>\n",
" <th>open</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>13070859</td>\n",
" <td>48187</td>\n",
" <td>5107462</td>\n",
" <td>4</td>\n",
" <td>vente en ligne</td>\n",
" <td>2018-12-28 14:47:50+00:00</td>\n",
" <td>8.0</td>\n",
" <td>False</td>\n",
" <td>spectacle vivant</td>\n",
" <td>mucem</td>\n",
" <td>indiv prog enfant</td>\n",
" <td>l'école des magiciens</td>\n",
" <td>2018</td>\n",
" <td>2018-12-31 14:15:00+01:00</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>13070860</td>\n",
" <td>48187</td>\n",
" <td>5107462</td>\n",
" <td>4</td>\n",
" <td>vente en ligne</td>\n",
" <td>2018-12-28 14:47:50+00:00</td>\n",
" <td>4.0</td>\n",
" <td>False</td>\n",
" <td>spectacle vivant</td>\n",
" <td>mucem</td>\n",
" <td>indiv prog enfant</td>\n",
" <td>l'école des magiciens</td>\n",
" <td>2018</td>\n",
" <td>2018-12-31 14:15:00+01:00</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>13070861</td>\n",
" <td>48187</td>\n",
" <td>5107462</td>\n",
" <td>4</td>\n",
" <td>vente en ligne</td>\n",
" <td>2018-12-28 14:47:50+00:00</td>\n",
" <td>4.0</td>\n",
" <td>False</td>\n",
" <td>spectacle vivant</td>\n",
" <td>mucem</td>\n",
" <td>indiv prog enfant</td>\n",
" <td>l'école des magiciens</td>\n",
" <td>2018</td>\n",
" <td>2018-12-31 14:15:00+01:00</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>13070862</td>\n",
" <td>48187</td>\n",
" <td>5107462</td>\n",
" <td>4</td>\n",
" <td>vente en ligne</td>\n",
" <td>2018-12-28 14:47:50+00:00</td>\n",
" <td>4.0</td>\n",
" <td>False</td>\n",
" <td>spectacle vivant</td>\n",
" <td>mucem</td>\n",
" <td>indiv prog enfant</td>\n",
" <td>l'école des magiciens</td>\n",
" <td>2018</td>\n",
" <td>2018-12-31 14:15:00+01:00</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>13070863</td>\n",
" <td>48187</td>\n",
" <td>5107462</td>\n",
" <td>4</td>\n",
" <td>vente en ligne</td>\n",
" <td>2018-12-28 14:47:50+00:00</td>\n",
" <td>4.0</td>\n",
" <td>False</td>\n",
" <td>spectacle vivant</td>\n",
" <td>mucem</td>\n",
" <td>indiv prog enfant</td>\n",
" <td>l'école des magiciens</td>\n",
" <td>2018</td>\n",
" <td>2018-12-31 14:15:00+01:00</td>\n",
" <td>1901-01-01 00:09:21+00:09</td>\n",
" <td>True</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ticket_id customer_id purchase_id event_type_id supplier_name \\\n",
"0 13070859 48187 5107462 4 vente en ligne \n",
"1 13070860 48187 5107462 4 vente en ligne \n",
"2 13070861 48187 5107462 4 vente en ligne \n",
"3 13070862 48187 5107462 4 vente en ligne \n",
"4 13070863 48187 5107462 4 vente en ligne \n",
"\n",
" purchase_date amount is_full_price name_event_types \\\n",
"0 2018-12-28 14:47:50+00:00 8.0 False spectacle vivant \n",
"1 2018-12-28 14:47:50+00:00 4.0 False spectacle vivant \n",
"2 2018-12-28 14:47:50+00:00 4.0 False spectacle vivant \n",
"3 2018-12-28 14:47:50+00:00 4.0 False spectacle vivant \n",
"4 2018-12-28 14:47:50+00:00 4.0 False spectacle vivant \n",
"\n",
" name_facilities name_categories name_events name_seasons \\\n",
"0 mucem indiv prog enfant l'école des magiciens 2018 \n",
"1 mucem indiv prog enfant l'école des magiciens 2018 \n",
"2 mucem indiv prog enfant l'école des magiciens 2018 \n",
"3 mucem indiv prog enfant l'école des magiciens 2018 \n",
"4 mucem indiv prog enfant l'école des magiciens 2018 \n",
"\n",
" start_date_time end_date_time open \n",
"0 2018-12-31 14:15:00+01:00 1901-01-01 00:09:21+00:09 True \n",
"1 2018-12-31 14:15:00+01:00 1901-01-01 00:09:21+00:09 True \n",
"2 2018-12-31 14:15:00+01:00 1901-01-01 00:09:21+00:09 True \n",
"3 2018-12-31 14:15:00+01:00 1901-01-01 00:09:21+00:09 True \n",
"4 2018-12-31 14:15:00+01:00 1901-01-01 00:09:21+00:09 True "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the purchased-products table of tenant 1 through the shared display_databases helper\n",
"tenant_id = \"1\"\n",
"\n",
"df_product = display_databases(\n",
"    tenant_id,\n",
"    file_name = \"products_purchased_reduced\",\n",
"    datetime_col = ['purchase_date'],\n",
")\n",
"\n",
"df_product.head()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "533bf499-dd56-4d29-b261-ca1e4928c9c7",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name_event_types</th>\n",
" <th>name_events</th>\n",
" <th>ticket_id</th>\n",
" <th>prop_tickets</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>118</th>\n",
" <td>offre muséale groupe</td>\n",
" <td>visite générale du mucem (1h30)</td>\n",
" <td>43814</td>\n",
" <td>0.024</td>\n",
" </tr>\n",
" <tr>\n",
" <th>212</th>\n",
" <td>offre muséale individuel</td>\n",
" <td>visite autonome scolaires (2h00)</td>\n",
" <td>34423</td>\n",
" <td>0.019</td>\n",
" </tr>\n",
" <tr>\n",
" <th>68</th>\n",
" <td>offre muséale groupe</td>\n",
" <td>visite autonome exposition (1h30)</td>\n",
" <td>26489</td>\n",
" <td>0.015</td>\n",
" </tr>\n",
" <tr>\n",
" <th>210</th>\n",
" <td>offre muséale individuel</td>\n",
" <td>visite autonome adultes (2h00)</td>\n",
" <td>22065</td>\n",
" <td>0.012</td>\n",
" </tr>\n",
" <tr>\n",
" <th>160</th>\n",
" <td>offre muséale groupe</td>\n",
" <td>visites des exterieurs scolaires</td>\n",
" <td>15595</td>\n",
" <td>0.009</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>364</th>\n",
" <td>spectacle vivant</td>\n",
" <td>kay ! lettres à un poète disparu</td>\n",
" <td>1</td>\n",
" <td>0.000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>443</th>\n",
" <td>spectacle vivant</td>\n",
" <td>mauvais genre</td>\n",
" <td>1</td>\n",
" <td>0.000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>375</th>\n",
" <td>spectacle vivant</td>\n",
" <td>la madre que parió a la música</td>\n",
" <td>1</td>\n",
" <td>0.000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>260</th>\n",
" <td>spectacle vivant</td>\n",
" <td>ali a les yeux bleus (dès 12 ans)</td>\n",
" <td>1</td>\n",
" <td>0.000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>484</th>\n",
" <td>spectacle vivant</td>\n",
" <td>rengaine (dès 12 ans)</td>\n",
" <td>1</td>\n",
" <td>0.000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>544 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" name_event_types name_events ticket_id \\\n",
"118 offre muséale groupe visite générale du mucem (1h30) 43814 \n",
"212 offre muséale individuel visite autonome scolaires (2h00) 34423 \n",
"68 offre muséale groupe visite autonome exposition (1h30) 26489 \n",
"210 offre muséale individuel visite autonome adultes (2h00) 22065 \n",
"160 offre muséale groupe visites des exterieurs scolaires 15595 \n",
".. ... ... ... \n",
"364 spectacle vivant kay ! lettres à un poète disparu 1 \n",
"443 spectacle vivant mauvais genre 1 \n",
"375 spectacle vivant la madre que parió a la música 1 \n",
"260 spectacle vivant ali a les yeux bleus (dès 12 ans) 1 \n",
"484 spectacle vivant rengaine (dès 12 ans) 1 \n",
"\n",
" prop_tickets \n",
"118 0.024 \n",
"212 0.019 \n",
"68 0.015 \n",
"210 0.012 \n",
"160 0.009 \n",
".. ... \n",
"364 0.000 \n",
"443 0.000 \n",
"375 0.000 \n",
"260 0.000 \n",
"484 0.000 \n",
"\n",
"[544 rows x 4 columns]"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of tickets per (event type, event), largest first\n",
"nb_tickets_per_events = (\n",
"    df_product\n",
"    .groupby(['name_event_types', 'name_events'])['ticket_id']\n",
"    .count()\n",
"    .reset_index()\n",
"    .sort_values('ticket_id', ascending = False)\n",
")\n",
"# Ticket count of each event divided by the number of rows of df_product, rounded to 3 decimals\n",
"nb_tickets_per_events['prop_tickets'] = (nb_tickets_per_events['ticket_id'] / len(df_product)).round(3)\n",
"nb_tickets_per_events"
]
},
{
"cell_type": "markdown",
"id": "1ede9eaa-7f0a-4856-9349-b2747d6a4901",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"# Fin travail 25/02"
]
},
{
"cell_type": "markdown",
"id": "c437eaec",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"# Exemple sur Company 1"
]
},
{
"cell_type": "markdown",
"id": "a1c1fc39",
"metadata": {},
"source": [
"## Chargement données"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "66f8c17b",
"metadata": {},
"outputs": [],
"source": [
"BUCKET = \"bdc2324-data/1\"\n",
"liste_database = fs.ls(BUCKET)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "c08e6798",
"metadata": {},
"outputs": [],
"source": [
"liste_database_select = ['suppliers', 'ticket', 'purchase', 'consumption', 'type_ofs']\n",
"\n",
"# Keep the listed paths that contain at least one of the selected table names (substring match)\n",
"liste_database_filtered = [\n",
"    path\n",
"    for path in liste_database\n",
"    if any(keyword in path for keyword in liste_database_select)\n",
"]\n",
"\n",
"# Show the result\n",
"print(liste_database_filtered)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "675f518d",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_445/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" df = pd.read_csv(file_in)\n"
]
}
],
"source": [
"# Load every CSV listed in liste_database into a global dataframe named df<client>_<table>\n",
"files_path = liste_database\n",
"\n",
"client_number = files_path[0].split(\"/\")[1]\n",
"df_prefix = \"df\" + str(client_number) + \"_\"\n",
"\n",
"# Paths are expected to end with /<digits>/<digits><table>.csv ; group(3) is the table name\n",
"table_name_pattern = re.compile(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$')\n",
"\n",
"for current_path in files_path:\n",
"    match = table_name_pattern.search(current_path)\n",
"    if match is None:\n",
"        # The path does not follow the expected naming, so there is no table name to use: skip it\n",
"        continue\n",
"    with fs.open(current_path, mode=\"rb\") as file_in:\n",
"        # low_memory=False parses the file in one pass so that a column gets a single inferred dtype\n",
"        # (the chunked default is what raises DtypeWarning on mixed-type columns)\n",
"        df = pd.read_csv(file_in, low_memory=False)\n",
"    # the pattern of the name is df1xxx\n",
"    nom_dataframe = df_prefix + match.group(3)\n",
"    globals()[nom_dataframe] = df"
]
},
{
"cell_type": "markdown",
"id": "e855f403",
"metadata": {},
"source": [
"## customersplus.csv"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "91a8f8c4",
"metadata": {},
"outputs": [],
"source": [
"# DataFrame.info() prints its summary and returns None, so wrapping it in pd.DataFrame only built an empty frame\n",
"df1_customersplus.info()"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "2fda171d",
"metadata": {},
"outputs": [],
"source": [
"def info_colonnes_dataframe(df):\n",
"    \"\"\"\n",
"    Summarise every column of a DataFrame.\n",
"\n",
"    Parameters:\n",
"    - df: DataFrame\n",
"        The DataFrame to describe.\n",
"\n",
"    Returns:\n",
"    - DataFrame\n",
"        One row per column of df, holding its name ('Nom_colonne'), its dtype as a\n",
"        string ('Type_colonne') and its percentage of missing values ('Taux_NA').\n",
"    \"\"\"\n",
"    # items() yields (column name, Series) pairs; isna().mean() is the share of missing values\n",
"    records = [\n",
"        {\n",
"            'Nom_colonne': column_name,\n",
"            'Type_colonne': str(column.dtype),\n",
"            'Taux_NA': column.isna().mean() * 100\n",
"        }\n",
"        for column_name, column in df.items()\n",
"    ]\n",
"\n",
"    return pd.DataFrame(records)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "205eeeab",
"metadata": {},
"outputs": [],
"source": [
"def cleaning_date(df, column_name):\n",
"    \"\"\"\n",
"    Convert one column of a DataFrame to UTC datetimes, parsing its values as ISO8601.\n",
"\n",
"    The column is overwritten in the DataFrame that is passed in, and that same\n",
"    DataFrame is returned.\n",
"\n",
"    Parameters:\n",
"    - df: DataFrame\n",
"        The DataFrame holding the column to convert.\n",
"    - column_name: str\n",
"        The name of the column to convert.\n",
"\n",
"    Returns:\n",
"    - DataFrame\n",
"        The DataFrame given as input, with the converted column.\n",
"    \"\"\"\n",
"    parsed_dates = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
"    df[column_name] = parsed_dates\n",
"    return df"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "634282c5",
"metadata": {},
"outputs": [],
"source": [
"a = info_colonnes_dataframe(df1_customersplus)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "0e8d4133",
"metadata": {},
"outputs": [],
"source": [
"a"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "1268ad5a",
"metadata": {},
"outputs": [],
"source": [
"a = pd.DataFrame(df1_customersplus.isna().sum()/len(df1_customersplus)*100)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "bd41dc80",
"metadata": {},
"outputs": [],
"source": [
"# Selection des variables\n",
"df1_customersplus_clean = df1_customersplus.copy()\n",
"\n",
"cleaning_date(df1_customersplus_clean, 'first_buying_date')\n",
"cleaning_date(df1_customersplus_clean, 'last_visiting_date')\n",
"\n",
"df1_customersplus_clean.drop(['lastname', 'firstname', 'email', 'civility', 'note', 'created_at', 'updated_at', 'deleted_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode', 'last_visiting_date'], axis = 1, inplace=True)\n",
"df1_customersplus_clean.rename(columns = {'id' : 'customer_id'}, inplace = True)\n",
"\n"
]
},
{
"cell_type": "markdown",
"id": "64d0f76b",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## tickets.csv"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "7e683711",
"metadata": {},
"outputs": [],
"source": [
"df1_tickets"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "e7b9a52e",
"metadata": {},
"outputs": [],
"source": [
"df1_tickets.info()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "568280e8",
"metadata": {},
"outputs": [],
"source": [
"df1_tickets.isna().sum()/len(df1_tickets)*100"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "29ecec90",
"metadata": {},
"outputs": [],
"source": [
"# Selection des variables\n",
"df1_tickets_clean = df1_tickets.drop(['lastname', 'firstname', 'email', 'created_at', 'updated_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode'], axis = 1, inplace=True)\n",
"df1_tickets_clean.rename(columns = {'id' : 'customer_id'}, inplace = True)"
]
},
{
"cell_type": "markdown",
"id": "22bb5de4",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## suppliers.csv"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "6a9a91f4",
"metadata": {},
"outputs": [],
"source": [
"df1_suppliers"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "bab4758a",
"metadata": {},
"outputs": [],
"source": [
"df1_suppliers.info()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "b5fff251",
"metadata": {},
"outputs": [],
"source": [
"df1_suppliers.isna().sum()/len(df1_suppliers)*100"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "8b09e2a3",
"metadata": {},
"outputs": [],
"source": [
"# Selection des variables\n",
"df1_suppliers_clean = df1_suppliers[['id', 'name']]\n",
"df1_suppliers_clean.rename(columns = {'name' : 'supplier_name'}, inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "ecee7cdc",
"metadata": {},
"outputs": [],
"source": [
"df1_suppliers_clean"
]
},
{
"cell_type": "markdown",
"id": "c8e6e69b",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## type_ofs.csv"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "1a6cff1f",
"metadata": {},
"outputs": [],
"source": [
"df1_type_ofs"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "93630b41",
"metadata": {},
"outputs": [],
"source": [
"df1_type_ofs.info()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "4f94481a",
"metadata": {},
"outputs": [],
"source": [
"# Selection des variables\n",
"df1_type_ofs_clean = df1_type_ofs[['id', 'name', 'children']]\n",
"df1_type_ofs_clean.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)"
]
},
{
"cell_type": "markdown",
"id": "1b2811e2",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## purchases.csv"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "2455d2e1",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"df1_purchases"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "5f9a159d",
"metadata": {},
"outputs": [],
"source": [
"df1_purchases.info()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "db201bf7",
"metadata": {},
"outputs": [],
"source": [
"# Nettoyage purchase_date\n",
"df1_purchases['purchase_date'] = pd.to_datetime(df1_purchases['purchase_date'], utc = True)\n",
"df1_purchases['purchase_date'] = pd.to_datetime(df1_purchases['purchase_date'], format = 'ISO8601')"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "bd436fca",
"metadata": {},
"outputs": [],
"source": [
"df1_purchases.info()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "83435862",
"metadata": {},
"outputs": [],
"source": [
"# Selection des variables\n",
"df1_purchases_clean = df1_purchases[['id', 'purchase_date', 'customer_id']]"
]
},
{
"cell_type": "markdown",
"id": "f210e730",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## Fusion de l'ensemble des données billétiques"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "1f8b3aa7",
"metadata": {},
"outputs": [],
"source": [
"# Fusion avec fournisseurs\n",
"df1_ticket_information = pd.merge(df1_tickets_clean, df1_suppliers_clean, left_on = 'supplier_id', right_on = 'id', how = 'inner')\n",
"df1_ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)\n",
"\n",
"# Fusion avec type de tickets\n",
"df1_ticket_information = pd.merge(df1_ticket_information, df1_type_ofs_clean, left_on = 'type_of', right_on = 'id', how = 'inner')\n",
"df1_ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)\n",
"\n",
"# Fusion avec achats\n",
"df1_ticket_information = pd.merge(df1_ticket_information, df1_purchases_clean, left_on = 'purchase_id', right_on = 'id', how = 'inner')\n",
"df1_ticket_information.drop(['purchase_id', 'id'], axis = 1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "83a4d021",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"df1_ticket_information"
]
},
{
"cell_type": "markdown",
"id": "56e6ebd1",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"# Utilisation de fonctions"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "88fcde4b",
"metadata": {},
"outputs": [],
"source": [
"# Créer un DataFrame exemple\n",
"df_not_clean = df1_campaign_stats[['opened_at']].head(20)\n",
"\n",
"# Appliquer la fonction pour nettoyer la colonne 'purchase_date' de manière vectorisée\n",
"df_clean = cleaning_date(df_not_clean, 'opened_at')\n",
"df_clean.rename(columns = {'opened_at' : 'opened_at_clean'}, inplace = True)\n",
"\n",
"test = pd.concat([df1_campaign_stats[['opened_at']].head(20), df_clean], axis=1)\n",
"\n",
"test.info()"
]
},
{
"cell_type": "markdown",
"id": "818f69db",
"metadata": {},
"source": [
"## Nettoyage, selection et fusion"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "c9654eda",
"metadata": {},
"outputs": [],
"source": [
"df1_ticket_information"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "7f2b620c",
"metadata": {},
"outputs": [],
"source": [
"df1_ticket_information.info()"
]
},
{
"cell_type": "markdown",
"id": "637bdb72",
"metadata": {},
"source": [
"# Customer information"
]
},
{
"cell_type": "markdown",
"id": "14c52894",
"metadata": {},
"source": [
"## Target area"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "d83abfbf",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_15285/2625134041.py:3: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" df1_targets_clean.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n"
]
}
],
"source": [
"# Target.csv cleaning\n",
"df1_targets_clean = df1_targets[[\"id\", \"target_type_id\", \"name\"]]\n",
"df1_targets_clean.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n",
"\n",
"# target_type cleaning\n",
"df1_target_types_clean = df1_target_types[[\"id\",\"is_import\",\"name\"]].add_prefix(\"target_type_\")\n",
"\n",
"#customer_target_mappings cleaning\n",
"df1_customer_target_mappings_clean = df1_customer_target_mappings[[\"id\", \"customer_id\", \"target_id\"]]\n",
"\n",
"# Merge target et target_type\n",
"df1_targets_full = pd.merge(df1_targets_clean, df1_target_types_clean, left_on='target_type_id', right_on='target_type_id', how='inner')\n",
"df1_targets_full.drop(['target_type_id'], axis = 1, inplace=True)\n",
"\n",
"# Merge\n",
"df1_targets_full = pd.merge(df1_customer_target_mappings_clean, df1_targets_full, left_on='target_id', right_on='target_id', how='inner')\n",
"df1_targets_full.drop(['target_id'], axis = 1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "90d71b2c",
"metadata": {},
"outputs": [],
"source": [
"df1_targets_test = df1_targets_full[['id', 'customer_id']].groupby(['customer_id']).count()\n",
"len(df1_targets_test[df1_targets_test['id'] > 1]) / len(df1_targets_test)\n",
"\n",
"# 99,6% des 151 000 client visés sont catégorisés plusieurs fois et en moyenne 5 fois... \n",
"df1_targets_test.mean()\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "2301de1e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>customer_id</th>\n",
" <th>target_name</th>\n",
" <th>target_type_is_import</th>\n",
" <th>target_type_name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1184824</td>\n",
" <td>645400</td>\n",
" <td>DDCP PROMO Réseau livres</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>210571</td>\n",
" <td>2412</td>\n",
" <td>DDCP PROMO Réseau livres</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>210572</td>\n",
" <td>4536</td>\n",
" <td>DDCP PROMO Réseau livres</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>210573</td>\n",
" <td>6736</td>\n",
" <td>DDCP PROMO Réseau livres</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>210574</td>\n",
" <td>38210</td>\n",
" <td>DDCP PROMO Réseau livres</td>\n",
" <td>False</td>\n",
" <td>manual_static_filter</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id customer_id target_name target_type_is_import \\\n",
"0 1184824 645400 DDCP PROMO Réseau livres False \n",
"1 210571 2412 DDCP PROMO Réseau livres False \n",
"2 210572 4536 DDCP PROMO Réseau livres False \n",
"3 210573 6736 DDCP PROMO Réseau livres False \n",
"4 210574 38210 DDCP PROMO Réseau livres False \n",
"\n",
" target_type_name \n",
"0 manual_static_filter \n",
"1 manual_static_filter \n",
"2 manual_static_filter \n",
"3 manual_static_filter \n",
"4 manual_static_filter "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1_targets_full.head()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "75fbc2f7",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n",
"[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...\n",
"[nltk_data] Package wordnet is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Catégorisation des target_name\n",
"import pandas as pd\n",
"import nltk\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.corpus import stopwords\n",
"from nltk.stem import WordNetLemmatizer\n",
"from nltk.probability import FreqDist\n",
"\n",
"# Téléchargement des ressources nécessaires\n",
"nltk.download('punkt')\n",
"nltk.download('stopwords')\n",
"nltk.download('wordnet')\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "55cddf92",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mots les plus fréquents:\n",
"consentement: 550777\n",
"optin: 463579\n",
"jeune: 155103\n",
"public: 155103\n",
"mediation: 150001\n"
]
}
],
"source": [
"# Définition des fonctions de tokenisation, suppression des mots vides et lemmatisation\n",
"def preprocess_text(texte):\n",
" # Concaténation des éléments de la liste en une seule chaîne de caractères\n",
" texte_concat = ' '.join(texte)\n",
" \n",
" # Tokenisation des mots\n",
" tokens = word_tokenize(texte_concat.lower())\n",
" \n",
" # Suppression des mots vides (stopwords)\n",
" stop_words = set(stopwords.words('french'))\n",
" filtered_tokens = [word for word in tokens if word not in stop_words]\n",
" \n",
" # Lemmatisation des mots\n",
" lemmatizer = WordNetLemmatizer()\n",
" lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]\n",
" \n",
" return lemmatized_tokens\n",
"\n",
"\n",
"# Appliquer le prétraitement à la colonne de texte\n",
"df1_targets_full['target_name_tokened'] = df1_targets_full['target_name'].apply(preprocess_text)\n",
"\n",
"# Concaténer les listes de mots pour obtenir une liste de tous les mots dans le corpus\n",
"all_words = [word for tokens in df1_targets_full['target_name_tokened'] for word in tokens]\n",
"\n",
"# Calculer la fréquence des mots\n",
"freq_dist = FreqDist(all_words)\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "7fd98a85",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mots les plus fréquents:\n",
"consentement: 550777\n",
"optin: 463579\n",
"jeune: 155103\n",
"public: 155103\n",
"mediation: 150001\n",
"specialisee: 150001\n",
"b2c: 143432\n",
"optout: 97683\n",
"newsletter: 56022\n",
"(: 46084\n",
"): 46084\n",
"inscrits: 42296\n",
"nl: 42294\n",
"générale: 41037\n",
"generale: 40950\n"
]
}
],
"source": [
"# Affichage des mots les plus fréquents\n",
"print(\"Mots les plus fréquents:\")\n",
"for mot, freq in freq_dist.most_common(15):\n",
" print(f\"{mot}: {freq}\")"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "cf94bb1d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" texte \\\n",
"0 Le chat noir mange une souris. \n",
"1 Le chien blanc aboie. \n",
"\n",
" texte_preprocessed \n",
"0 [e, h, a, o, i, r, a, g, e, u, e, o, u, r, i, .] \n",
"1 [e, h, i, e, b, a, a, b, o, i, e, .] \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n",
"[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...\n",
"[nltk_data] Package wordnet is already up-to-date!\n"
]
}
],
"source": [
"import pandas as pd\n",
"import nltk\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.corpus import stopwords\n",
"from nltk.stem import WordNetLemmatizer\n",
"\n",
"# Téléchargement des ressources nécessaires\n",
"nltk.download('punkt')\n",
"nltk.download('stopwords')\n",
"nltk.download('wordnet')\n",
"\n",
"# Création de la DataFrame d'exemple\n",
"data = {'texte': [\"Le chat noir mange une souris.\", \"Le chien blanc aboie.\"]}\n",
"df = pd.DataFrame(data)\n",
"\n",
"# Fonction pour prétraiter le texte\n",
"def preprocess_text(texte):\n",
" # Concaténation des éléments de la liste en une seule chaîne de caractères\n",
" texte_concat = ' '.join(texte)\n",
" \n",
" # Tokenisation des mots\n",
" tokens = word_tokenize(texte_concat.lower())\n",
" \n",
" # Suppression des mots vides (stopwords)\n",
" stop_words = set(stopwords.words('french'))\n",
" filtered_tokens = [word for word in tokens if word not in stop_words]\n",
" \n",
" # Lemmatisation des mots\n",
" lemmatizer = WordNetLemmatizer()\n",
" lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]\n",
" \n",
" return lemmatized_tokens\n",
"\n",
"# Appliquer la fonction de prétraitement à la colonne de texte\n",
"df['texte_preprocessed'] = df['texte'].apply(preprocess_text)\n",
"\n",
"# Afficher le résultat\n",
"print(df)\n"
]
},
{
"cell_type": "markdown",
"id": "711d3884",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## Campaign area"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "c25b5295",
"metadata": {},
"outputs": [],
"source": [
"# campaign_stats cleaning \n",
"df1_campaign_stats_clean = df1_campaign_stats[[\"id\", \"campaign_id\", \"customer_id\", \"opened_at\", \"sent_at\", \"delivered_at\"]]\n",
"cleaning_date(df1_campaign_stats_clean, 'opened_at')\n",
"cleaning_date(df1_campaign_stats_clean, 'sent_at')\n",
"cleaning_date(df1_campaign_stats_clean, 'delivered_at')\n",
"\n",
"# campaigns cleaning\n",
"df1_campaigns_clean = df1_campaigns[[\"id\", \"name\", \"service_id\", \"sent_at\"]].add_prefix(\"campaign_\")\n",
"cleaning_date(df1_campaigns_clean, 'campaign_sent_at')\n",
"\n",
"# Merge \n",
"df1_campaigns_full = pd.merge(df1_campaign_stats_clean, df1_campaigns_clean, on = \"campaign_id\", how = \"left\")\n",
"df1_campaigns_full.drop(['campaign_id'], axis = 1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "2a3de6a5",
"metadata": {},
"outputs": [],
"source": [
"df1_campaigns_full.info()"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "3fc1f446",
"metadata": {},
"outputs": [],
"source": [
"df1_campaigns_information"
]
},
{
"cell_type": "markdown",
"id": "20e69ee3",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## Link area"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "d9cbdbce",
"metadata": {},
"outputs": [],
"source": [
"df1_campaigns"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "c07459f0",
"metadata": {},
"outputs": [],
"source": [
"df1_link_stats"
]
},
{
"cell_type": "markdown",
"id": "80ae4c42",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"## Supplier"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "b50b8f95",
"metadata": {},
"outputs": [],
"source": [
"# Fonction d'exploration pour suppliers.csv = label itr et commission inconnues\n",
"def suppliers_exploration(suppliers = None) : \n",
" \n",
" # Taux de NaN pour ces colonnes\n",
" label_na = suppliers['label'].isna().sum()/len(suppliers)*100\n",
" itr_na = suppliers['itr'].isna().sum()/len(suppliers)*100\n",
" commission_na = suppliers['commission'].isna().sum()/len(suppliers)*100\n",
"\n",
" suppliers_desc = pd.DataFrame({'nb_suppliers' : [suppliers['name'].nunique()],\n",
" 'label_na' : [label_na],\n",
" 'itr_na' : [itr_na],\n",
" 'commission_na' : [commission_na]})\n",
"\n",
" return suppliers_desc"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "7e292935",
"metadata": {},
"outputs": [],
"source": [
"df1_suppliers_desc = suppliers_exploration(suppliers = df1_suppliers)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "05b6f2b0",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>nb_suppliers</th>\n",
" <th>label_na</th>\n",
" <th>itr_na</th>\n",
" <th>commission_na</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>9</td>\n",
" <td>100.0</td>\n",
" <td>100.0</td>\n",
" <td>100.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" nb_suppliers label_na itr_na commission_na\n",
"0 9 100.0 100.0 100.0"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1_suppliers_desc"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "c9324d80",
"metadata": {},
"outputs": [],
"source": [
"BUCKET = \"bdc2324-data\"\n",
"liste_folders = fs.ls(BUCKET)\n",
"\n",
"liste_files = []\n",
"for company_folder in liste_folders : \n",
" liste_files.extend(fs.ls(company_folder))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "10304058",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['bdc2324-data/1/1suppliers.csv', 'bdc2324-data/10/10suppliers.csv', 'bdc2324-data/101/101suppliers.csv', 'bdc2324-data/11/11suppliers.csv', 'bdc2324-data/12/12suppliers.csv', 'bdc2324-data/13/13suppliers.csv', 'bdc2324-data/14/14suppliers.csv', 'bdc2324-data/2/2suppliers.csv', 'bdc2324-data/3/3suppliers.csv', 'bdc2324-data/4/4suppliers.csv', 'bdc2324-data/5/5suppliers.csv', 'bdc2324-data/6/6suppliers.csv', 'bdc2324-data/7/7suppliers.csv', 'bdc2324-data/8/8suppliers.csv', 'bdc2324-data/9/9suppliers.csv']\n"
]
}
],
"source": [
"liste_database_select = ['suppliers']\n",
"\n",
"# Filtrer la liste pour les éléments contenant au moins un élément de la liste à tester\n",
"liste_suppliers = [element for element in liste_files if any(element_part in element for element_part in liste_database_select)]\n",
"\n",
"# Afficher le résultat\n",
"print(liste_suppliers)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "ffa423e5",
"metadata": {},
"outputs": [],
"source": [
"# loop to create dataframes from file 2\n",
"def database_loading(database_name = None):\n",
" files_path = database_name\n",
" \n",
" client_number = files_path.split(\"/\")[1]\n",
" df_prefix = \"df\" + str(client_number) + \"_\"\n",
" \n",
" current_path = files_path\n",
" with fs.open(current_path, mode=\"rb\") as file_in:\n",
" df = pd.read_csv(file_in)\n",
"\n",
" return df, client_number"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "70bdc88d",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 45,
"id": "6a0f567d",
"metadata": {},
"outputs": [],
"source": [
"df_all = pd.DataFrame()\n",
"\n",
"for link in liste_suppliers:\n",
" \n",
" df_supplier, tenant_id = database_loading(link)\n",
" \n",
" df_supplier['tenant_id'] = int(tenant_id)\n",
"\n",
" df_all = pd.concat([df_all, df_supplier], axis = 0)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "1522d8cd",
"metadata": {},
"outputs": [],
"source": [
"# df_all[df_all['tenant_id'] == 101]['name'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 66,
"id": "b0e42a61",
"metadata": {},
"outputs": [],
"source": [
"liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online'] \n",
"# vad = vente à distance\n",
"df_all['name'] = df_all['name'].fillna('')\n",
"\n",
"df_all['canal_vente_internet'] = df_all['name'].str.contains('|'.join(liste_mots), case=False).astype(int)\n"
]
},
{
"cell_type": "code",
"execution_count": 68,
"id": "d299ae91",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tenant_id\n",
"1 1\n",
"2 1\n",
"3 1\n",
"4 1\n",
"5 1\n",
"6 1\n",
"7 1\n",
"8 1\n",
"9 1\n",
"10 1\n",
"11 1\n",
"12 1\n",
"13 1\n",
"14 1\n",
"101 1\n",
"Name: canal_vente_internet, dtype: int64"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_all.groupby('tenant_id')['canal_vente_internet'].max()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}