{ "cells": [ { "cell_type": "markdown", "id": "5bf5c226", "metadata": {}, "source": [ "# Business Data Challenge - Team 1" ] }, { "cell_type": "code", "execution_count": 1, "id": "b1a5b9d3", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import os\n", "import s3fs\n", "import re\n", "import warnings" ] }, { "cell_type": "markdown", "id": "ecfa2219", "metadata": {}, "source": [ "Configuration de l'accès aux données" ] }, { "cell_type": "code", "execution_count": 2, "id": "1a094277", "metadata": {}, "outputs": [], "source": [ "# Create filesystem object\n", "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})" ] }, { "cell_type": "code", "execution_count": 41, "id": "30d77451-2df6-4c07-8b15-66e0e990ff03", "metadata": {}, "outputs": [], "source": [ "# Create filesystem object\n", "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n", "\n", "\n", "# Import cleaning and merge functions\n", "\n", "exec(open('0_Cleaning_and_merge_functions.py').read())\n", "\n", "exec(open('0_KPI_functions.py').read())\n", "\n", "# Ignore warning\n", "warnings.filterwarnings('ignore')\n" ] }, { "cell_type": "code", "execution_count": 4, "id": "f1b44d3e-76bb-4860-b9db-a2840db7cf39", "metadata": {}, "outputs": [], "source": [ "def load_dataset_2(directory_path, file_name):\n", " \"\"\"\n", " This function loads csv file\n", " \"\"\"\n", " file_path = \"bdc2324-data\" + \"/\" + directory_path + \"/\" + directory_path + file_name + \".csv\"\n", " with fs.open(file_path, mode=\"rb\") as file_in:\n", " df = pd.read_csv(file_in, sep=\",\")\n", "\n", " # drop na :\n", " #df = df.dropna(axis=1, thresh=len(df))\n", " # if identifier in table : delete it\n", " if 'identifier' in df.columns:\n", " df = df.drop(columns = 'identifier')\n", " return df" ] }, { "cell_type": "code", "execution_count": 36, "id": "31ab76f0-fbb1-46f6-b359-97228620c207", "metadata": {}, "outputs": [], "source": [ "def export_in_temporary(df, output_name):\n", " print('Export of dataset :', output_name)\n", " FILE_PATH_OUT_S3 = \"projet-bdc2324-team1/Temporary\" + \"/\" + output_name + '.csv'\n", " with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:\n", " df.to_csv(file_out, index = False)" ] }, { "cell_type": "markdown", "id": "ccf597b0-b459-4ea5-baf0-5ba8c90915e4", "metadata": {}, "source": [ "# Cleaning target area and tags" ] }, { "cell_type": "code", "execution_count": 23, "id": "28316e1d-7892-4506-9d53-0695e71aa7bc", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "File path : bdc2324-data/1/1targets.csv\n", "Shape : (287, 5)\n", "Number of columns : 3\n", "Columns : Index(['id', 'target_type_id', 'name'], dtype='object')\n", "File path : bdc2324-data/1/1target_types.csv\n", "Shape : (4, 6)\n", "Number of columns : 4\n", "Columns : Index(['id', 'identifier', 'is_import', 'name'], dtype='object')\n", "File path : bdc2324-data/1/1customer_target_mappings.csv\n", "Shape : (768024, 7)\n", "Number of columns : 5\n", "Columns : Index(['id', 'customer_id', 'target_id', 'name', 'extra_field'], dtype='object')\n" ] } ], "source": [ "target_example = preprocessing_target_area('1')" ] }, { "cell_type": "code", "execution_count": 24, "id": "da467695-ce37-485d-94ab-f1499d56c3a3", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idcustomer_idtarget_nametarget_type_is_importtarget_type_name
01184824645400DDCP PROMO Réseau livresFalsemanual_static_filter
11184825645400Inscrits NL générale site webFalsemanual_static_filter
21184828645402DDCP PROMO Art contemporainFalsemanual_static_filter
31184829645403DDCP PROMO Art contemporainFalsemanual_static_filter
41295770647301Votre première listeFalsemanual_static_filter
..................
7680192737545666983Inscrits NL générale site webFalsemanual_static_filter
7680202737546666983Votre première listeFalsemanual_static_filter
7680212737575666986Votre première listeFalsemanual_static_filter
7680222737576666987Inscrits NL générale site webFalsemanual_static_filter
7680232737577666987Votre première listeFalsemanual_static_filter
\n", "

768024 rows × 5 columns

\n", "
" ], "text/plain": [ " id customer_id target_name \\\n", "0 1184824 645400 DDCP PROMO Réseau livres \n", "1 1184825 645400 Inscrits NL générale site web \n", "2 1184828 645402 DDCP PROMO Art contemporain \n", "3 1184829 645403 DDCP PROMO Art contemporain \n", "4 1295770 647301 Votre première liste \n", "... ... ... ... \n", "768019 2737545 666983 Inscrits NL générale site web \n", "768020 2737546 666983 Votre première liste \n", "768021 2737575 666986 Votre première liste \n", "768022 2737576 666987 Inscrits NL générale site web \n", "768023 2737577 666987 Votre première liste \n", "\n", " target_type_is_import target_type_name \n", "0 False manual_static_filter \n", "1 False manual_static_filter \n", "2 False manual_static_filter \n", "3 False manual_static_filter \n", "4 False manual_static_filter \n", "... ... ... \n", "768019 False manual_static_filter \n", "768020 False manual_static_filter \n", "768021 False manual_static_filter \n", "768022 False manual_static_filter \n", "768023 False manual_static_filter \n", "\n", "[768024 rows x 5 columns]" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "target_example" ] }, { "cell_type": "code", "execution_count": 44, "id": "fd88e294-e038-4cec-ad94-2bbbc10a4059", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "File path : projet-bdc2324-team1/0_Input/Company_1/target_information.csv\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
customer_idnb_targetsall_targetsall_target_types
0128consentement optin jeune public, DDCP rentrée ...manual_static_filter, manual_static_filter, ma...
127consentement optin jeune public, consentement ...manual_static_filter, manual_static_filter, ma...
2311traversee du port de commerce (gagnant et perd...manual_static_filter, manual_static_filter, ma...
346Arenametrix_bascule tel vers sib, consentement...manual_static_filter, manual_static_filter, ma...
454Arenametrix_bascule tel vers sib, consentement...manual_static_filter, manual_static_filter, ma...
...............
15115412561361consentement optin b2cmanual_static_filter
15115512561371consentement optin b2cmanual_static_filter
15115612561383Inscrits NL jeune public site web, Inscrits NL...manual_static_filter, manual_static_filter, ma...
15115712561393Inscrits NL jeune public site web, Inscrits NL...manual_static_filter, manual_static_filter, ma...
15115812561402DRE MucemLab, consentement optin dremanual_static_filter, manual_static_filter
\n", "

151159 rows × 4 columns

\n", "
" ], "text/plain": [ " customer_id nb_targets \\\n", "0 1 28 \n", "1 2 7 \n", "2 3 11 \n", "3 4 6 \n", "4 5 4 \n", "... ... ... \n", "151154 1256136 1 \n", "151155 1256137 1 \n", "151156 1256138 3 \n", "151157 1256139 3 \n", "151158 1256140 2 \n", "\n", " all_targets \\\n", "0 consentement optin jeune public, DDCP rentrée ... \n", "1 consentement optin jeune public, consentement ... \n", "2 traversee du port de commerce (gagnant et perd... \n", "3 Arenametrix_bascule tel vers sib, consentement... \n", "4 Arenametrix_bascule tel vers sib, consentement... \n", "... ... \n", "151154 consentement optin b2c \n", "151155 consentement optin b2c \n", "151156 Inscrits NL jeune public site web, Inscrits NL... \n", "151157 Inscrits NL jeune public site web, Inscrits NL... \n", "151158 DRE MucemLab, consentement optin dre \n", "\n", " all_target_types \n", "0 manual_static_filter, manual_static_filter, ma... \n", "1 manual_static_filter, manual_static_filter, ma... \n", "2 manual_static_filter, manual_static_filter, ma... \n", "3 manual_static_filter, manual_static_filter, ma... \n", "4 manual_static_filter, manual_static_filter, ma... \n", "... ... \n", "151154 manual_static_filter \n", "151155 manual_static_filter \n", "151156 manual_static_filter, manual_static_filter, ma... \n", "151157 manual_static_filter, manual_static_filter, ma... \n", "151158 manual_static_filter, manual_static_filter \n", "\n", "[151159 rows x 4 columns]" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tenant_id = '1'\n", "\n", "def concatenate_names(names):\n", " return ', '.join(names)\n", " \n", "target_example =display_databases(tenant_id, \"target_information\")\n", "\n", "target_example['target_name'] = target_example['target_name'].fillna('').str.lower()\n", "\n", "\n", "target_example['jeune'] = target_example['target_name'].str.contains('|'.join(['jeune', 'pass_culture']), case=False).astype(int)\n", "target_example['optin'] = target_example['target_name'].str.contains('|'.join(['optin' ,'opt-in']), case=False).astype(int)\n", "target_example['optout'] = target_example['target_name'].str.contains('|'.join(['optout']), case=False).astype(int)\n", "target_example['scolaire'] = target_example['target_name'].str.contains('|'.join(['scolaire' , 'enseignant', 'chercheur', 'schulen', 'école']), case=False).astype(int)\n", "target_example['entreprise'] = target_example['target_name'].str.contains('|'.join(['b2b']), case=False).astype(int)\n", "target_example['famille'] = target_example['target_name'].str.contains('|'.join(['famille', 'enfants']), case=False).astype(int)\n", "target_example['newsletter'] = target_example['target_name'].str.contains('|'.join(['nl', 'newsletter']), case=False).astype(int)\n", "\n", "\n", "\n", "target_agg = target_example.groupby('customer_id').agg(\n", " nb_targets=('target_name', 'nunique'), # Utilisation de tuples pour spécifier les noms de colonnes\n", " all_targets=('target_name', concatenate_names),\n", " all_target_types=('target_type_name', concatenate_names)\n", ").reset_index()\n", "target_agg" ] }, { "cell_type": "code", "execution_count": 37, "id": "c75efea3-b5e8-4a7a-bed4-dd64ae9ff9f2", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Export of dataset : Target_kpi_concatenate\n" ] } ], "source": [ "export_in_temporary(target_agg, 'Target_kpi_concatenate')" ] }, { "cell_type": "code", "execution_count": null, "id": "cb6f06e6-78de-4b8d-a103-8366eff0493a", "metadata": {}, "outputs": [], "source": [ "v" ] }, { "cell_type": "code", "execution_count": null, "id": "c5e864b1-adad-4267-b956-3f7ef371d677", "metadata": {}, "outputs": [], "source": [ "\n", "def display_covering_time(df, company, datecover):\n", " \"\"\"\n", " This function draws the time coverage of each company\n", " \"\"\"\n", " min_date = df['purchase_date'].min().strftime(\"%Y-%m-%d\")\n", " max_date = df['purchase_date'].max().strftime(\"%Y-%m-%d\")\n", " datecover[company] = [datetime.strptime(min_date, \"%Y-%m-%d\") + timedelta(days=x) for x in range((datetime.strptime(max_date, \"%Y-%m-%d\") - datetime.strptime(min_date, \"%Y-%m-%d\")).days)]\n", " print(f'Couverture Company {company} : {min_date} - {max_date}')\n", " return datecover\n", "\n", "\n", "def compute_time_intersection(datecover):\n", " \"\"\"\n", " This function returns the time coverage for all companies\n", " \"\"\"\n", " timestamps_sets = [set(timestamps) for timestamps in datecover.values()]\n", " intersection = set.intersection(*timestamps_sets)\n", " intersection_list = list(intersection)\n", " formated_dates = [dt.strftime(\"%Y-%m-%d\") for dt in intersection_list]\n", " return sorted(formated_dates)\n", "\n", "\n", "def df_coverage_modelization(sport, coverage_features = 0.7):\n", " \"\"\"\n", " This function returns start_date, end_of_features and final dates\n", " that help to construct train and test datasets\n", " \"\"\"\n", " datecover = {}\n", " for company in sport:\n", " df_products_purchased_reduced = display_input_databases(company, file_name = \"products_purchased_reduced\",\n", " datetime_col = ['purchase_date'])\n", " datecover = display_covering_time(df_products_purchased_reduced, company, datecover)\n", " #print(datecover.keys())\n", " dt_coverage = compute_time_intersection(datecover)\n", " start_date = dt_coverage[0]\n", " end_of_features = dt_coverage[int(0.7 * len(dt_coverage))]\n", " final_date = dt_coverage[-1]\n", " return start_date, end_of_features, final_date\n", " " ] }, { "cell_type": "markdown", "id": "2435097a-95a5-43e1-84d0-7f6b701441ba", "metadata": { "jp-MarkdownHeadingCollapsed": true }, "source": [ "# Bases non communes : mise à plat" ] }, { "cell_type": "code", "execution_count": 11, "id": "f8f988fb-5aab-4b57-80d1-e242f7e5b384", "metadata": {}, "outputs": [], "source": [ "companies = {'musee' : ['1', '2', '3', '4'],\n", " 'sport': ['5', '6', '7', '8', '9'],\n", " 'musique' : ['10', '11', '12', '13', '14']}\n", "\n", "all_companies = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14']" ] }, { "cell_type": "code", "execution_count": 15, "id": "35ac004f-c191-4f45-a4b1-6d993d9ec38c", "metadata": {}, "outputs": [], "source": [ "companies_databases = pd.DataFrame()\n", "\n", "for i in all_companies:\n", " company_databases = pd.DataFrame({'company_number' : [i]})\n", "\n", " BUCKET = \"bdc2324-data/\"+i\n", " for base in fs.ls(BUCKET):\n", " match = re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', base)\n", " if match:\n", " nom_base = match.group(3)\n", " company_databases[nom_base] = 1\n", "\n", " companies_databases = pd.concat([companies_databases, company_databases])" ] }, { "cell_type": "code", "execution_count": 20, "id": "8986e477-e6c5-4d6c-83b2-2c90c134b599", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
company_numbercampaign_statscampaignscategoriescountriescurrenciescustomer_target_mappingscustomersplusevent_typeseventsfacilitieslink_statspricing_formulasproduct_packsproductsproducts_groupspurchasesrepresentation_category_capacitiesrepresentationsseasonsstructure_tag_mappingssupplierstagstarget_typestargetsticketstype_of_categoriestype_of_pricing_formulastype_ofscontribution_sitescontributionsconsumptionsrepresentation_types
0111111111111111111111.011.01111.01.01.0NaNNaNNaNNaN
0211111111111111111111.011.0111NaNNaNNaN1.01.0NaNNaN
0311111111111111111111.011.0111NaNNaNNaN1.01.01.0NaN
0411111111111111111111.011.0111NaN1.01.01.01.0NaNNaN
051111111111111111111NaN1NaN111NaNNaNNaNNaNNaN1.0NaN
0611111111111111111111.011.0111NaN1.01.0NaNNaN1.0NaN
0711111111111111111111.011.01111.01.01.0NaNNaN1.01.0
081111111111111111111NaN1NaN1111.01.01.0NaNNaNNaNNaN
091111111111111111111NaN1NaN111NaNNaNNaNNaNNaNNaNNaN
0101111111111111111111NaN11.0111NaN1.01.0NaNNaNNaN1.0
01111111111111111111111.011.0111NaNNaNNaNNaNNaNNaNNaN
0121111111111111111111NaN1NaN111NaNNaN1.0NaNNaN1.0NaN
01311111111111111111111.011.0111NaNNaNNaNNaNNaNNaN1.0
0141111111111111111111NaN1NaN1111.01.01.0NaNNaNNaN1.0
\n", "
" ], "text/plain": [ " company_number campaign_stats campaigns categories countries \\\n", "0 1 1 1 1 1 \n", "0 2 1 1 1 1 \n", "0 3 1 1 1 1 \n", "0 4 1 1 1 1 \n", "0 5 1 1 1 1 \n", "0 6 1 1 1 1 \n", "0 7 1 1 1 1 \n", "0 8 1 1 1 1 \n", "0 9 1 1 1 1 \n", "0 10 1 1 1 1 \n", "0 11 1 1 1 1 \n", "0 12 1 1 1 1 \n", "0 13 1 1 1 1 \n", "0 14 1 1 1 1 \n", "\n", " currencies customer_target_mappings customersplus event_types events \\\n", "0 1 1 1 1 1 \n", "0 1 1 1 1 1 \n", "0 1 1 1 1 1 \n", "0 1 1 1 1 1 \n", "0 1 1 1 1 1 \n", "0 1 1 1 1 1 \n", "0 1 1 1 1 1 \n", "0 1 1 1 1 1 \n", "0 1 1 1 1 1 \n", "0 1 1 1 1 1 \n", "0 1 1 1 1 1 \n", "0 1 1 1 1 1 \n", "0 1 1 1 1 1 \n", "0 1 1 1 1 1 \n", "\n", " facilities link_stats pricing_formulas product_packs products \\\n", "0 1 1 1 1 1 \n", "0 1 1 1 1 1 \n", "0 1 1 1 1 1 \n", "0 1 1 1 1 1 \n", "0 1 1 1 1 1 \n", "0 1 1 1 1 1 \n", "0 1 1 1 1 1 \n", "0 1 1 1 1 1 \n", "0 1 1 1 1 1 \n", "0 1 1 1 1 1 \n", "0 1 1 1 1 1 \n", "0 1 1 1 1 1 \n", "0 1 1 1 1 1 \n", "0 1 1 1 1 1 \n", "\n", " products_groups purchases representation_category_capacities \\\n", "0 1 1 1 \n", "0 1 1 1 \n", "0 1 1 1 \n", "0 1 1 1 \n", "0 1 1 1 \n", "0 1 1 1 \n", "0 1 1 1 \n", "0 1 1 1 \n", "0 1 1 1 \n", "0 1 1 1 \n", "0 1 1 1 \n", "0 1 1 1 \n", "0 1 1 1 \n", "0 1 1 1 \n", "\n", " representations seasons structure_tag_mappings suppliers tags \\\n", "0 1 1 1.0 1 1.0 \n", "0 1 1 1.0 1 1.0 \n", "0 1 1 1.0 1 1.0 \n", "0 1 1 1.0 1 1.0 \n", "0 1 1 NaN 1 NaN \n", "0 1 1 1.0 1 1.0 \n", "0 1 1 1.0 1 1.0 \n", "0 1 1 NaN 1 NaN \n", "0 1 1 NaN 1 NaN \n", "0 1 1 NaN 1 1.0 \n", "0 1 1 1.0 1 1.0 \n", "0 1 1 NaN 1 NaN \n", "0 1 1 1.0 1 1.0 \n", "0 1 1 NaN 1 NaN \n", "\n", " target_types targets tickets type_of_categories \\\n", "0 1 1 1 1.0 \n", "0 1 1 1 NaN \n", "0 1 1 1 NaN \n", "0 1 1 1 NaN \n", "0 1 1 1 NaN \n", "0 1 1 1 NaN \n", "0 1 1 1 1.0 \n", "0 1 1 1 1.0 \n", "0 1 1 1 NaN \n", "0 1 1 1 NaN \n", "0 1 1 1 NaN \n", "0 1 1 1 NaN \n", "0 1 1 1 NaN \n", "0 1 1 1 1.0 \n", "\n", " type_of_pricing_formulas type_ofs contribution_sites contributions \\\n", "0 1.0 1.0 NaN NaN \n", "0 NaN NaN 1.0 1.0 \n", "0 NaN NaN 1.0 1.0 \n", "0 1.0 1.0 1.0 1.0 \n", "0 NaN NaN NaN NaN \n", "0 1.0 1.0 NaN NaN \n", "0 1.0 1.0 NaN NaN \n", "0 1.0 1.0 NaN NaN \n", "0 NaN NaN NaN NaN \n", "0 1.0 1.0 NaN NaN \n", "0 NaN NaN NaN NaN \n", "0 NaN 1.0 NaN NaN \n", "0 NaN NaN NaN NaN \n", "0 1.0 1.0 NaN NaN \n", "\n", " consumptions representation_types \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 1.0 NaN \n", "0 NaN NaN \n", "0 1.0 NaN \n", "0 1.0 NaN \n", "0 1.0 1.0 \n", "0 NaN NaN \n", "0 NaN NaN \n", "0 NaN 1.0 \n", "0 NaN NaN \n", "0 1.0 NaN \n", "0 NaN 1.0 \n", "0 NaN 1.0 " ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.set_option(\"display.max_columns\", None)\n", "companies_databases\n" ] }, { "cell_type": "code", "execution_count": 21, "id": "8fecc3bb-4c03-4144-97c5-615224d9729e", "metadata": {}, "outputs": [], "source": [ "pd.reset_option(\"display.max_columns\")" ] }, { "cell_type": "markdown", "id": "0294ce71-840e-458b-8ffa-cadabbc6da21", "metadata": {}, "source": [ "# Debut Travail 25/02" ] }, { "cell_type": "markdown", "id": "ca2c8b6a-4965-422e-ba7c-66423a464fc1", "metadata": { "jp-MarkdownHeadingCollapsed": true }, "source": [ "## Base communes au types Musée" ] }, { "cell_type": "code", "execution_count": null, "id": "dbce1124-9a22-4502-a47a-fc3d0e2db70b", "metadata": {}, "outputs": [], "source": [ "companies['musee']" ] }, { "cell_type": "code", "execution_count": null, "id": "5080f66e-f779-410a-876d-b4fe2795e17e", "metadata": {}, "outputs": [], "source": [ "for i in companies['musique']:\n", " BUCKET = \"bdc2324-data/\"+i\n", " liste_base = []\n", " for base in fs.ls(BUCKET):\n", " match = re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', base)\n", " if match:\n", " nom_base = match.group(3)\n", " liste_base.append(nom_base)\n", " globals()['base_'+i] = liste_base\n" ] }, { "cell_type": "code", "execution_count": null, "id": "abd477e1-7479-4c88-a5aa-f987af3f5b79", "metadata": {}, "outputs": [], "source": [ "# Trouver l'intersection entre les cinq listes\n", "intersection = set(base_1).intersection(base_2, base_3, base_4, base_101)\n", "\n", "# Convertir le résultat en liste si nécessaire\n", "intersection_liste = list(intersection)\n", "\n", "print(intersection_liste)" ] }, { "cell_type": "code", "execution_count": null, "id": "8d93888f-a511-4ee5-8bc3-d5173a7f119e", "metadata": {}, "outputs": [], "source": [ "# Trouver l'intersection entre les cinq listes\n", "intersection = set(base_10).intersection(base_12, base_13, base_14, base_11)\n", "\n", "# Convertir le résultat en liste si nécessaire\n", "intersection_liste = list(intersection)\n", "\n", "print(intersection_liste)" ] }, { "cell_type": "code", "execution_count": null, "id": "10e89669-42bb-4652-a4bc-1a3d1caf4d1a", "metadata": {}, "outputs": [], "source": [ "len(intersection_liste)" ] }, { "cell_type": "code", "execution_count": null, "id": "7d058b21-a538-4f59-aefb-ef7966f73fdc", "metadata": {}, "outputs": [], "source": [ "df1_tags = load_dataset_2(\"1\", \"tags\")" ] }, { "cell_type": "code", "execution_count": null, "id": "aa441f99-733c-4675-8676-bed4682d3324", "metadata": {}, "outputs": [], "source": [ "df1_structure_tag_mappings = load_dataset_2(\"1\", 'structure_tag_mappings')" ] }, { "cell_type": "code", "execution_count": null, "id": "6767a750-14a4-4c05-903e-d2f07170825b", "metadata": {}, "outputs": [], "source": [ "df1_customersplus = load_dataset_2(\"1\", \"customersplus\")" ] }, { "cell_type": "code", "execution_count": null, "id": "125e9145-a815-46fd-bdf4-07589508b259", "metadata": {}, "outputs": [], "source": [ "df1_customersplus.groupby('structure_id')['id'].count().reset_index().sort_values('id', ascending=False).head(20)" ] }, { "cell_type": "code", "execution_count": null, "id": "c17a6976-792f-474d-bcff-c89396eddb3f", "metadata": {}, "outputs": [], "source": [ "df1_customersplus['structure_id'].isna().sum() / len(df1_customersplus['structure_id'])" ] }, { "cell_type": "code", "execution_count": null, "id": "ecfc155a-cb42-46ec-8da5-33fdcd087355", "metadata": {}, "outputs": [], "source": [ "len(df1_structure_tag_mappings)" ] }, { "cell_type": "code", "execution_count": null, "id": "071410b8-950d-4fcc-b2b9-57415253c286", "metadata": {}, "outputs": [], "source": [ "df1_structure_tag_mappings.groupby('tag_id')['structure_id'].count().reset_index().sort_values('structure_id', ascending=False).head(20)" ] }, { "cell_type": "code", "execution_count": null, "id": "f48d27a9-14e4-4bb9-a60a-73e9438b58fc", "metadata": {}, "outputs": [], "source": [ "?np.sort_values()" ] }, { "cell_type": "code", "execution_count": null, "id": "14eaa0ea-02cc-430b-ab9b-38e6637810c3", "metadata": {}, "outputs": [], "source": [ "def info_colonnes_dataframe(df):\n", " # Créer une liste pour stocker les informations sur chaque colonne\n", " infos_colonnes = []\n", "\n", " # Parcourir les colonnes du DataFrame\n", " for nom_colonne, serie in df.items(): # Utiliser items() au lieu de iteritems()\n", " # Calculer le taux de valeurs manquantes\n", " taux_na = serie.isna().mean() * 100\n", "\n", " # Ajouter les informations à la liste\n", " infos_colonnes.append({\n", " 'Nom_colonne': nom_colonne,\n", " 'Type_colonne': str(serie.dtype),\n", " 'Taux_NA': taux_na\n", " })\n", "\n", " # Créer une nouvelle DataFrame à partir de la liste d'informations\n", " df_infos_colonnes = pd.DataFrame(infos_colonnes)\n", "\n", " return df_infos_colonnes" ] }, { "cell_type": "code", "execution_count": null, "id": "6b031c32-d4c8-42a5-9a71-a7810f9bf8d8", "metadata": { "scrolled": true }, "outputs": [], "source": [ "info_colonnes_dataframe(df1_tags)" ] }, { "cell_type": "code", "execution_count": null, "id": "e1a87f27-c4d4-4832-ac20-0c3c54aa4980", "metadata": {}, "outputs": [], "source": [ "info_colonnes_dataframe(df1_structure_tag_mappings)" ] }, { "cell_type": "code", "execution_count": null, "id": "fa5c65a8-2f74-4f3f-85fc-9ac91e0bb361", "metadata": {}, "outputs": [], "source": [ "pd.set_option('display.max_colwidth', None)\n", "\n", "print(df1_tags['name'])" ] }, { "cell_type": "code", "execution_count": null, "id": "a59bf932-5b54-4600-81f5-c55ac93ae510", "metadata": {}, "outputs": [], "source": [ "pd.set_option('display.max_rows', None)" ] }, { "cell_type": "code", "execution_count": null, "id": "a4ab298e-2cae-4865-9f00-4caff5f75ea1", "metadata": { "scrolled": true }, "outputs": [], "source": [ "print(df1_tags['name'])" ] }, { "cell_type": "markdown", "id": "76bffba1-5f7e-4308-9224-437ca66148f8", "metadata": {}, "source": [ "## KPI sur target_type" ] }, { "cell_type": "code", "execution_count": null, "id": "f6daf22e-6583-4431-a467-660a1dd4e5a4", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "d91d5895", "metadata": {}, "outputs": [], "source": [ "pd.set_option('display.max_colwidth', None)\n" ] }, { "cell_type": "markdown", "id": "c58b17d3", "metadata": {}, "source": [ "Raisonnement : on prends les target_type qui représente 90% des clients et on fait des catégories dessus." ] }, { "cell_type": "code", "execution_count": null, "id": "d74426b3", "metadata": {}, "outputs": [], "source": [ "targets = load_dataset_2(\"3\", \"targets\")\n", "target_types = load_dataset_2(\"3\", \"target_types\")\n", "\n", "# target_all = pd.merge(targets, target_types, left_on= 'target_type_id', right_on= 'id' ,how = 'inner')\n" ] }, { "cell_type": "code", "execution_count": 42, "id": "6930bff5", "metadata": {}, "outputs": [], "source": [ "def print_main_target(tenant_id, nb_print = 40):\n", " df_target = display_databases(tenant_id, \"target_information\")\n", "\n", " print('Nombre de ciblage : ', len(df_target))\n", " nb_customers = df_target['customer_id'].nunique()\n", " print('Nombre de client avec étiquette target : ', nb_customers) \n", "\n", " nb_custumers_per_target = df_target.groupby(\"target_name\")['customer_id'].count().reset_index().sort_values('customer_id', ascending=False)\n", " nb_custumers_per_target['cumulative_customers'] = nb_custumers_per_target['customer_id'].cumsum()/len(df_target)\n", " nb_custumers_per_target['customer_id'] = nb_custumers_per_target['customer_id']/nb_customers\n", "\n", " return nb_custumers_per_target.head(nb_print)" ] }, { "cell_type": "code", "execution_count": 47, "id": "1e7ee1a0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "File path : projet-bdc2324-team1/0_Input/Company_1/target_information.csv\n", "Nombre de ciblage : 768024\n", "Nombre de client avec étiquette target : 151159\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
target_namecustomer_idcumulative_customers
161consentement optin mediation specialisee0.9923330.195306
160consentement optin jeune public0.9921940.390585
158consentement optin b2c0.7204930.532390
5Arenametrix_bascule tel vers sib0.2329730.578242
165consentement optout b2c0.2283890.623193
19COM Inscrits NL générale (historique)0.1521910.653146
162consentement optin newsletter generale0.1461710.681915
169consentement optout newsletter generale0.1247360.706465
170consentement optout scolaires0.1041550.726964
166consentement optout dre0.0947880.745620
164consentement optout b2b0.0940670.764134
126Inscrits NL générale (export_291019 + operation_videomaton)0.0931870.782474
157consentement optin b2b0.0842490.799056
216ddcp_visiteurs dps 0106220.0817350.815142
20Contacts_prenomsdoubles0.0770250.830302
115FORMATION _ acheteurs optin last year0.0693640.843954
214ddcp_promo_visiteurs occasionnels_musee_8mois0.0439270.852600
189ddcp_promo_md_musée_dps 0110190.0397590.860425
188ddcp_promo_MD_billet_musée_oct_2019_agarder20.0362660.867563
163consentement optin scolaires0.0320790.873876
159consentement optin dre0.0299490.879771
34DDCP Newsletter enseignants0.0298360.885643
36DDCP Newsletter jeune public0.0255490.890671
127Inscrits NL générale site web0.0246890.895531
145Votre première liste0.0245770.900368
61DDCP billets famille0.0238760.905067
106DRE MucemLab0.0152290.908064
39DDCP Newsletter relais champ social0.0150170.911020
110DRE institutionnels0.0147460.913922
48DDCP PROMO Participants ateliers (adultes et enfants)0.0129270.916466
74DDCP promo Plan B 2019 (concerts)0.0128870.919003
72DDCP promo MD pass musées dps oct 20180.0118090.921327
94DDCP rentrée culturelle 20230.0116240.923614
23DDCP MD Procès du Siècle0.0111410.925807
186ddcp_md_scene_ouverte_au_talent0.0104330.927860
108DRE chercheurs0.0103000.929888
220festival_jean_rouch0.0099370.931843
105DRE Festival Jean Rouch0.0099370.933799
275structures_etiquette champ social0.0098440.935736
86DDCP promo spectateurs prog 21-22 (spectacles, ciné, ateliers)0.0085540.937420
128Inscrits NL jeune public site web0.0082630.939046
260rappel po barvalo0.0082560.940671
104DDCP_marseille_jazz_20230.0069000.942029
32DDCP Newsletter centres de loisirs0.0068270.943373
13Autres_interet_exposition0.0067540.944702
228import_arenametrix_contactstousecardouv_expo0.0062120.945925
117Formation clients fidèles0.0060470.947115
22DDCP Cine 20230.0056560.948228
40DDCP OLBJ! 20230.0054640.949304
240journee-de-l-inclusion_20230601_21h250.0053260.950352
137Questionnaire 2 satisfaction scolaire0.0052590.951387
93DDCP rendez-vous de septembre offre spéciale0.0052530.952421
135Plan B 2018 (électro)0.0050810.953421
270save_the_date_populaire0.0049480.954395
132Newsletter CCR (passerelle)0.0047830.955336
116Fichier institutionnel (ne pas utiliser sans autorisation)0.0045380.956229
222fichier institutionnel_ne_pas_toucher0.0045320.957121
266reservations_payees_pass_culture_190422_au_3101230.0044920.958005
102DDCP spectateurs Marseille Jazz 18-19-210.0044320.958878
147acid arab0.0044130.959746
\n", "
" ], "text/plain": [ " target_name \\\n", "161 consentement optin mediation specialisee \n", "160 consentement optin jeune public \n", "158 consentement optin b2c \n", "5 Arenametrix_bascule tel vers sib \n", "165 consentement optout b2c \n", "19 COM Inscrits NL générale (historique) \n", "162 consentement optin newsletter generale \n", "169 consentement optout newsletter generale \n", "170 consentement optout scolaires \n", "166 consentement optout dre \n", "164 consentement optout b2b \n", "126 Inscrits NL générale (export_291019 + operation_videomaton) \n", "157 consentement optin b2b \n", "216 ddcp_visiteurs dps 010622 \n", "20 Contacts_prenomsdoubles \n", "115 FORMATION _ acheteurs optin last year \n", "214 ddcp_promo_visiteurs occasionnels_musee_8mois \n", "189 ddcp_promo_md_musée_dps 011019 \n", "188 ddcp_promo_MD_billet_musée_oct_2019_agarder2 \n", "163 consentement optin scolaires \n", "159 consentement optin dre \n", "34 DDCP Newsletter enseignants \n", "36 DDCP Newsletter jeune public \n", "127 Inscrits NL générale site web \n", "145 Votre première liste \n", "61 DDCP billets famille \n", "106 DRE MucemLab \n", "39 DDCP Newsletter relais champ social \n", "110 DRE institutionnels \n", "48 DDCP PROMO Participants ateliers (adultes et enfants) \n", "74 DDCP promo Plan B 2019 (concerts) \n", "72 DDCP promo MD pass musées dps oct 2018 \n", "94 DDCP rentrée culturelle 2023 \n", "23 DDCP MD Procès du Siècle \n", "186 ddcp_md_scene_ouverte_au_talent \n", "108 DRE chercheurs \n", "220 festival_jean_rouch \n", "105 DRE Festival Jean Rouch \n", "275 structures_etiquette champ social \n", "86 DDCP promo spectateurs prog 21-22 (spectacles, ciné, ateliers) \n", "128 Inscrits NL jeune public site web \n", "260 rappel po barvalo \n", "104 DDCP_marseille_jazz_2023 \n", "32 DDCP Newsletter centres de loisirs \n", "13 Autres_interet_exposition \n", "228 import_arenametrix_contactstousecardouv_expo \n", "117 Formation clients fidèles \n", "22 DDCP Cine 2023 \n", "40 DDCP OLBJ! 2023 \n", "240 journee-de-l-inclusion_20230601_21h25 \n", "137 Questionnaire 2 satisfaction scolaire \n", "93 DDCP rendez-vous de septembre offre spéciale \n", "135 Plan B 2018 (électro) \n", "270 save_the_date_populaire \n", "132 Newsletter CCR (passerelle) \n", "116 Fichier institutionnel (ne pas utiliser sans autorisation) \n", "222 fichier institutionnel_ne_pas_toucher \n", "266 reservations_payees_pass_culture_190422_au_310123 \n", "102 DDCP spectateurs Marseille Jazz 18-19-21 \n", "147 acid arab \n", "\n", " customer_id cumulative_customers \n", "161 0.992333 0.195306 \n", "160 0.992194 0.390585 \n", "158 0.720493 0.532390 \n", "5 0.232973 0.578242 \n", "165 0.228389 0.623193 \n", "19 0.152191 0.653146 \n", "162 0.146171 0.681915 \n", "169 0.124736 0.706465 \n", "170 0.104155 0.726964 \n", "166 0.094788 0.745620 \n", "164 0.094067 0.764134 \n", "126 0.093187 0.782474 \n", "157 0.084249 0.799056 \n", "216 0.081735 0.815142 \n", "20 0.077025 0.830302 \n", "115 0.069364 0.843954 \n", "214 0.043927 0.852600 \n", "189 0.039759 0.860425 \n", "188 0.036266 0.867563 \n", "163 0.032079 0.873876 \n", "159 0.029949 0.879771 \n", "34 0.029836 0.885643 \n", "36 0.025549 0.890671 \n", "127 0.024689 0.895531 \n", "145 0.024577 0.900368 \n", "61 0.023876 0.905067 \n", "106 0.015229 0.908064 \n", "39 0.015017 0.911020 \n", "110 0.014746 0.913922 \n", "48 0.012927 0.916466 \n", "74 0.012887 0.919003 \n", "72 0.011809 0.921327 \n", "94 0.011624 0.923614 \n", "23 0.011141 0.925807 \n", "186 0.010433 0.927860 \n", "108 0.010300 0.929888 \n", "220 0.009937 0.931843 \n", "105 0.009937 0.933799 \n", "275 0.009844 0.935736 \n", "86 0.008554 0.937420 \n", "128 0.008263 0.939046 \n", "260 0.008256 0.940671 \n", "104 0.006900 0.942029 \n", "32 0.006827 0.943373 \n", "13 0.006754 0.944702 \n", "228 0.006212 0.945925 \n", "117 0.006047 0.947115 \n", "22 0.005656 0.948228 \n", "40 0.005464 0.949304 \n", "240 0.005326 0.950352 \n", "137 0.005259 0.951387 \n", "93 0.005253 0.952421 \n", "135 0.005081 0.953421 \n", "270 0.004948 0.954395 \n", "132 0.004783 0.955336 \n", "116 0.004538 0.956229 \n", "222 0.004532 0.957121 \n", "266 0.004492 0.958005 \n", "102 0.004432 0.958878 \n", "147 0.004413 0.959746 " ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.set_option(\"max_colwidth\", None)\n", "print_main_target('1', 60)" ] }, { "cell_type": "code", "execution_count": 48, "id": "b57a28ac", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "File path : projet-bdc2324-team1/0_Input/Company_2/target_information.csv\n", "Nombre de ciblage : 260283\n", "Nombre de client avec étiquette target : 233320\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
target_namecustomer_idcumulative_customers
13Schokoladentour & Führungen Individuals0.9279060.831783
3Chocolateria Kurse 20230.0739630.898084
16mailxpert_contacts_2023-07-18_12-04-00 langue0.0255190.920959
15mailxpert_contacts_2023-07-18_12-04-000.0255190.943834
8Newsletter opt-in Allgemein0.0228360.964304
12Schokoladentour & Führungen Gruppen / Schulen0.0115550.974662
5Newsletter DE0.0107490.984298
7Newsletter FR0.0085200.991936
6Newsletter EN0.0042860.995778
4Frauen in Zürich - Schulung0.0010030.996677
17mailxpert_contacts_2023-07-18_13-25-45_inaktiv0.0004710.997099
11Opt-in-Website DE0.0000300.997126
9Opt-in Website EN0.0000090.997134
10Opt-in Website FR0.0000040.997138
14Votre première liste0.0000040.997142
1Activated contact EN0.0000040.997145
2Activated contact FR0.0000040.997149
0Activated contact DE0.0000040.997153
\n", "
" ], "text/plain": [ " target_name customer_id \\\n", "13 Schokoladentour & Führungen Individuals 0.927906 \n", "3 Chocolateria Kurse 2023 0.073963 \n", "16 mailxpert_contacts_2023-07-18_12-04-00 langue 0.025519 \n", "15 mailxpert_contacts_2023-07-18_12-04-00 0.025519 \n", "8 Newsletter opt-in Allgemein 0.022836 \n", "12 Schokoladentour & Führungen Gruppen / Schulen 0.011555 \n", "5 Newsletter DE 0.010749 \n", "7 Newsletter FR 0.008520 \n", "6 Newsletter EN 0.004286 \n", "4 Frauen in Zürich - Schulung 0.001003 \n", "17 mailxpert_contacts_2023-07-18_13-25-45_inaktiv 0.000471 \n", "11 Opt-in-Website DE 0.000030 \n", "9 Opt-in Website EN 0.000009 \n", "10 Opt-in Website FR 0.000004 \n", "14 Votre première liste 0.000004 \n", "1 Activated contact EN 0.000004 \n", "2 Activated contact FR 0.000004 \n", "0 Activated contact DE 0.000004 \n", "\n", " cumulative_customers \n", "13 0.831783 \n", "3 0.898084 \n", "16 0.920959 \n", "15 0.943834 \n", "8 0.964304 \n", "12 0.974662 \n", "5 0.984298 \n", "7 0.991936 \n", "6 0.995778 \n", "4 0.996677 \n", "17 0.997099 \n", "11 0.997126 \n", "9 0.997134 \n", "10 0.997138 \n", "14 0.997142 \n", "1 0.997145 \n", "2 0.997149 \n", "0 0.997153 " ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print_main_target('2', 25)" ] }, { "cell_type": "code", "execution_count": 56, "id": "9a65991f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "File path : projet-bdc2324-team1/0_Input/Company_3/target_information.csv\n", "Nombre de ciblage : 1617362\n", "Nombre de client avec étiquette target : 257018\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
target_namecustomer_idcumulative_customers
67MKG_NLmensuelle_2021_OK0.9723480.154518
66MKG_NLmensuelle_20210.9560070.306439
119consent_optin_nl0.6366480.407609
115consent_optin_general0.6025060.503355
78Mkg_NL_mensuelle30.4041620.567581
125consent_optout_general0.3681260.626081
104TEST LOCBASE0.3505320.681784
68MKG_Non_inscrit_liste_08-220.3106050.731143
116consent_optin_general_HISTORIQUE0.3013450.779030
80Mkg_Zone_C0.1352980.800531
7Acheteurs_100km_visite_depuismax5ans0.0911490.815015
54Inscription NL ancien site web0.0834770.828281
112consent_optin_equestre0.0832160.841505
79Mkg_Zone_B0.0798890.854200
65MKG_2022_ZoneB&ZoneC_Famille0.0725360.865727
111consent_optin_b2b0.0643880.875959
102Soft_Bounce_yahoo0.0641820.886158
100Scénario Anniversaire0.0512490.894303
37B2B_scolaire_et_centres_de_loisirs_20230.0467320.901729
20B2B_Sans étiquette0.0404720.908160
122consent_optout_equestre0.0388650.914336
17B2B_Inscrits newsletter Scolaires0.0380750.920387
28B2B_historique_newsletter_SCOLAIRES0.0380400.926432
118consent_optin_jdp0.0361100.932170
76MKG_aire cantilienne0.0319080.937241
21B2B_Sans étiquette FR+BE0.0291650.941876
108b2b - écoles mai 20210.0285740.946416
86Ouvreur_NL_juin_20210.0181930.949308
126consent_optout_jdp0.0168160.951980
127consent_optout_nl0.0166330.954623
13B2B_CE_20230.0164890.957243
106Visiteurs Aout-Sept sans questionnaire0.0162750.959830
89Pass Annuel en cours de validité0.0115400.961663
114consent_optin_expositions0.0113880.963473
22B2B_Sans étiquette hors FR+BE0.0113070.965270
113consent_optin_evenements0.0112400.967056
32B2B_liste_à_requalifier_CE0.0107420.968763
110consent_optin_abonnes_passannuels0.0096650.970299
152liste mécènes donateurs 01012023-311020230.0087460.971689
34B2B_liste_à_requalifier_SCOLAIRES0.0086880.973070
153liste newsletter mécénat0.0086060.974437
144liste des donateurs iraiser don ponctuel 10122020-200120220.0075910.975644
134don ponctuel iRaiser 2501220.0075870.976849
77MKG_visiteurs_juin_ES0.0074980.978041
72MKG_acheteurs_domaine_noel_20210.0067580.979115
6Acheteurs Journées des Plantes oct. 20220.0060380.980074
96Réponse au formulaire de satisfaction0.0058710.981007
15B2B_Inscrits newsletter Collectivités et CSE0.0057860.981927
26B2B_historique_newsletter_CE0.0057350.982838
70MKG_acheteurs_1mois_pass_sanitaire0.0057310.983749
157mec_expos_automne_20230.0049720.984539
2Abonnés Pass Annuel - dynamique0.0048560.985311
146liste diffusion invitation Ingres0.0045640.986036
161rattrpostvisite_nov210.0040700.986683
166réunion publique forêt 20230.0033620.987217
94Rattrapage_postvisite_novdec21v00.0033580.987750
71MKG_acheteurs_domaine_et_noel_20210.0030540.988236
93Rattrapage_postvisite_novdec21_VF0.0030150.988715
14B2B_GUIDES_20230.0026500.989136
23B2B_TOANGLOPHONE_20230.0025210.989537
33B2B_liste_à_requalifier_GUIDES0.0024050.989919
64MKG_2021_Acheteurs_JDP_Octobre_rattrap0.0022680.990279
159rattrapage1211_logs-071021_1211210.0021200.990616
141jdp_invités_2_entrées_ oct20230.0021200.990953
109château de chantilly questionnaire0.0020040.991272
150liste invités avant-première jdp mai 20230.0020000.991589
136gece0.0019800.991904
12B2B_Autocariste_20230.0016920.992173
158mkg_2021_acheteurs_jdp_octobre_rattr_exclure_new0.0016070.992428
43Formation_journéedesplantes750.0015410.992673
\n", "
" ], "text/plain": [ " target_name customer_id \\\n", "67 MKG_NLmensuelle_2021_OK 0.972348 \n", "66 MKG_NLmensuelle_2021 0.956007 \n", "119 consent_optin_nl 0.636648 \n", "115 consent_optin_general 0.602506 \n", "78 Mkg_NL_mensuelle3 0.404162 \n", "125 consent_optout_general 0.368126 \n", "104 TEST LOCBASE 0.350532 \n", "68 MKG_Non_inscrit_liste_08-22 0.310605 \n", "116 consent_optin_general_HISTORIQUE 0.301345 \n", "80 Mkg_Zone_C 0.135298 \n", "7 Acheteurs_100km_visite_depuismax5ans 0.091149 \n", "54 Inscription NL ancien site web 0.083477 \n", "112 consent_optin_equestre 0.083216 \n", "79 Mkg_Zone_B 0.079889 \n", "65 MKG_2022_ZoneB&ZoneC_Famille 0.072536 \n", "111 consent_optin_b2b 0.064388 \n", "102 Soft_Bounce_yahoo 0.064182 \n", "100 Scénario Anniversaire 0.051249 \n", "37 B2B_scolaire_et_centres_de_loisirs_2023 0.046732 \n", "20 B2B_Sans étiquette 0.040472 \n", "122 consent_optout_equestre 0.038865 \n", "17 B2B_Inscrits newsletter Scolaires 0.038075 \n", "28 B2B_historique_newsletter_SCOLAIRES 0.038040 \n", "118 consent_optin_jdp 0.036110 \n", "76 MKG_aire cantilienne 0.031908 \n", "21 B2B_Sans étiquette FR+BE 0.029165 \n", "108 b2b - écoles mai 2021 0.028574 \n", "86 Ouvreur_NL_juin_2021 0.018193 \n", "126 consent_optout_jdp 0.016816 \n", "127 consent_optout_nl 0.016633 \n", "13 B2B_CE_2023 0.016489 \n", "106 Visiteurs Aout-Sept sans questionnaire 0.016275 \n", "89 Pass Annuel en cours de validité 0.011540 \n", "114 consent_optin_expositions 0.011388 \n", "22 B2B_Sans étiquette hors FR+BE 0.011307 \n", "113 consent_optin_evenements 0.011240 \n", "32 B2B_liste_à_requalifier_CE 0.010742 \n", "110 consent_optin_abonnes_passannuels 0.009665 \n", "152 liste mécènes donateurs 01012023-31102023 0.008746 \n", "34 B2B_liste_à_requalifier_SCOLAIRES 0.008688 \n", "153 liste newsletter mécénat 0.008606 \n", "144 liste des donateurs iraiser don ponctuel 10122020-20012022 0.007591 \n", "134 don ponctuel iRaiser 250122 0.007587 \n", "77 MKG_visiteurs_juin_ES 0.007498 \n", "72 MKG_acheteurs_domaine_noel_2021 0.006758 \n", "6 Acheteurs Journées des Plantes oct. 2022 0.006038 \n", "96 Réponse au formulaire de satisfaction 0.005871 \n", "15 B2B_Inscrits newsletter Collectivités et CSE 0.005786 \n", "26 B2B_historique_newsletter_CE 0.005735 \n", "70 MKG_acheteurs_1mois_pass_sanitaire 0.005731 \n", "157 mec_expos_automne_2023 0.004972 \n", "2 Abonnés Pass Annuel - dynamique 0.004856 \n", "146 liste diffusion invitation Ingres 0.004564 \n", "161 rattrpostvisite_nov21 0.004070 \n", "166 réunion publique forêt 2023 0.003362 \n", "94 Rattrapage_postvisite_novdec21v0 0.003358 \n", "71 MKG_acheteurs_domaine_et_noel_2021 0.003054 \n", "93 Rattrapage_postvisite_novdec21_VF 0.003015 \n", "14 B2B_GUIDES_2023 0.002650 \n", "23 B2B_TOANGLOPHONE_2023 0.002521 \n", "33 B2B_liste_à_requalifier_GUIDES 0.002405 \n", "64 MKG_2021_Acheteurs_JDP_Octobre_rattrap 0.002268 \n", "159 rattrapage1211_logs-071021_121121 0.002120 \n", "141 jdp_invités_2_entrées_ oct2023 0.002120 \n", "109 château de chantilly questionnaire 0.002004 \n", "150 liste invités avant-première jdp mai 2023 0.002000 \n", "136 gece 0.001980 \n", "12 B2B_Autocariste_2023 0.001692 \n", "158 mkg_2021_acheteurs_jdp_octobre_rattr_exclure_new 0.001607 \n", "43 Formation_journéedesplantes75 0.001541 \n", "\n", " cumulative_customers \n", "67 0.154518 \n", "66 0.306439 \n", "119 0.407609 \n", "115 0.503355 \n", "78 0.567581 \n", "125 0.626081 \n", "104 0.681784 \n", "68 0.731143 \n", "116 0.779030 \n", "80 0.800531 \n", "7 0.815015 \n", "54 0.828281 \n", "112 0.841505 \n", "79 0.854200 \n", "65 0.865727 \n", "111 0.875959 \n", "102 0.886158 \n", "100 0.894303 \n", "37 0.901729 \n", "20 0.908160 \n", "122 0.914336 \n", "17 0.920387 \n", "28 0.926432 \n", "118 0.932170 \n", "76 0.937241 \n", "21 0.941876 \n", "108 0.946416 \n", "86 0.949308 \n", "126 0.951980 \n", "127 0.954623 \n", "13 0.957243 \n", "106 0.959830 \n", "89 0.961663 \n", "114 0.963473 \n", "22 0.965270 \n", "113 0.967056 \n", "32 0.968763 \n", "110 0.970299 \n", "152 0.971689 \n", "34 0.973070 \n", "153 0.974437 \n", "144 0.975644 \n", "134 0.976849 \n", "77 0.978041 \n", "72 0.979115 \n", "6 0.980074 \n", "96 0.981007 \n", "15 0.981927 \n", "26 0.982838 \n", "70 0.983749 \n", "157 0.984539 \n", "2 0.985311 \n", "146 0.986036 \n", "161 0.986683 \n", "166 0.987217 \n", "94 0.987750 \n", "71 0.988236 \n", "93 0.988715 \n", "14 0.989136 \n", "23 0.989537 \n", "33 0.989919 \n", "64 0.990279 \n", "159 0.990616 \n", "141 0.990953 \n", "109 0.991272 \n", "150 0.991589 \n", "136 0.991904 \n", "12 0.992173 \n", "158 0.992428 \n", "43 0.992673 " ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print_main_target('3', 70)" ] }, { "cell_type": "code", "execution_count": 54, "id": "c66a4dc1", "metadata": {}, "outputs": [], "source": [ "pd.set_option('display.max_rows', None)\n" ] }, { "cell_type": "code", "execution_count": 57, "id": "5f34b8bf", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "File path : projet-bdc2324-team1/0_Input/Company_4/target_information.csv\n", "Nombre de ciblage : 4627640\n", "Nombre de client avec étiquette target : 320813\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
target_namecustomer_idcumulative_customers
232Tous les contacts mis à jour0.9999910.069325
76Base données0.9999910.138650
191Office de Tourisme0.9999910.207974
128Globale sans VIP0.9554880.274214
112Contacts structures0.9299690.338684
98Cible gratuité IMA COMEDY0.6362460.382792
232 IEME ENVOI IMA COMEDY CLUB0.6303890.426494
64Actions Marketing ARABOFOLIES0.6279170.470025
171Liste globale sans VIP0.5821830.510385
126Formulaire inscription mallette \"Cultures en partage\"0.5328310.547324
78CAMPAGNE ADHESION 20230.4493710.578477
234Tous les optins0.4125460.607076
192Optin 20230.3650570.632384
170Liste globale optin-15-01-20210.3254820.654948
1010-03-sb-dolist0.1938330.668386
195Origine - Nouba0.1924520.681728
158LIVE2022_Intérêt Expositions musée0.1735500.693759
414old_Intéressés par la Musique0.1665050.705302
415old_Intérêt Danse0.1635720.716642
100Cible offre DAOUD DEPARDON0.1303720.725680
1213-04-2022-vente 20210.1288040.734609
73Arabofolies Juillet 20220.1091230.742174
137Intérêt LGBTQ+0.1089170.749725
274blacklistés ima0.1054070.757032
208Public traditionnel0.0838210.762843
159LIVE2022_Intérêt Humour0.0828580.768587
99Cible jeunes humour0.0803120.774155
213Relance gratuité IMA COMEDY CLUB0.0792050.779646
101Cible rencontres et débats0.0767430.784966
189Nouveaux inscrits newsletter0.0762000.790249
349interet nuit du cinéma0.0722600.795258
141Intérêt prononcé pour la nuit du ramadan0.0722540.800267
87Cible Algérie0.0703370.805143
93Cible News offre spéciale humour0.0692960.809947
140Intérêt musique électro0.0687160.814711
280cible Histoire et feminisme0.0685850.819466
174Liste relais pour présentation 20230.0674040.824139
138Intérêt musique Orientale0.0660820.828720
1816-07-21-nuit-du-cinema0.0651660.833237
204Profil Client Expos Divas (Geo)0.0634010.837633
265araborolies/Divas/relance0.0616470.841906
203Profil Client Expo Divas0.0611380.846145
233Tous les inscrits aux newsletters via le formulaire du site web0.0574070.850125
243VIP Générale0.0536820.853846
226Strcutures sans VIP0.0533960.857548
67Agi pour buren0.0515750.861123
144Invitation à l'exposition Palestine LANG0.0510920.864665
62Acheteurs individuels de l'expo Juifs d'orient statique0.0465260.867891
61Acheteurs individuels de l'expo Juifs d'orient0.0465130.871115
95Cible arabic Sound system0.0461640.874316
244VIP STATIQUE0.0411580.877169
245VIP Téléchargement0.0407370.879993
102Cible scolaire 20220.0403130.882788
90Cible Maroc0.0398270.885549
91Cible Maroc0.0398270.888310
4126mai-2023-Structures-invit-palestine0.0391880.891027
393liste_contacts_agi_2021_02_16_0.0336180.893357
450sb-fichier-eudonet-ok-18-05-210.0320560.895579
404_11_22_eudonet0.0318570.897788
215SB-18-05-VIP-eudonet0.0318570.899996
175Liste vernissage0.0313640.902171
235Tous les relais0.0310900.904326
252Visiteurs expo pour questionnaires0.0299300.906401
223Scolaires - Actions Educatives 24/11/20210.0298710.908472
92Cible Musique Judeo-arabe0.0292660.910501
1315-09-2023-Cible-Palestine0.0285310.912478
162LIVE2022_Intérêts Rencontres, débats et conférences0.0269280.914345
282cible photo0.0260560.916152
3826-MAI_STRUCTURE-2023-OK0.0254950.917919
507-12-20-Relais-invitatation-divas0.0249090.919646
410old_Amis de l'IMA0.0231600.921251
222Scolaires - Actions Educatives 24/01/20230.0227240.922827
198PALESTINE0.0209030.924276
249Vignes et tilleuls0.0204390.925693
3926-mai-11H10-relais0.0195780.927050
110Contacts Librairie0.0191140.928375
194Origine - Inscription manuelle0.0183070.929644
196Origine - QR code0.0182940.930913
59Acheteurs Daoud Depardon0.0182320.932176
473événements autour de Habibi0.0177550.933407
2017-04-21-nuits-ducinema0.0174840.934619
607-12-20-liste-invites-presentation-divas0.0173370.935821
278catégorie Cinéma0.0167480.936982
250Visiteurs Palestine0.0163020.938113
248Vernissages Algérie0.0147030.939132
242VIP Algérie0.0146660.940149
445save the date invités vernissage0.0144320.941149
205Profil Contact Expo Divas - Juillet0.0142820.942139
1917-04-2021-autres-liste-statique-cinema0.0138620.943100
279catégorie rencontres et débats0.0132850.944021
325fichier-dolist-05-12-20-relais0.0131630.944934
177MAILING SAVE THE DATE PARFUMS0.0131540.945846
96Cible enfants/ famille janvier 230.0125150.946713
129Globale vernissage Samarcande0.0110780.947481
251Visiteurs Palestine mi-expo questionnaire0.0107350.948225
88Cible LIBAN0.0100030.948919
2118-11-2021-liste i,augurationdu23-18h3à0.0099750.949610
431professionnels de l'écologie0.0097690.950288
142Intérêt écologie (pro)0.0097690.950965
323fichier pro écologie0.0097690.951642
\n", "
" ], "text/plain": [ " target_name \\\n", "232 Tous les contacts mis à jour \n", "76 Base données \n", "191 Office de Tourisme \n", "128 Globale sans VIP \n", "112 Contacts structures \n", "98 Cible gratuité IMA COMEDY \n", "23 2 IEME ENVOI IMA COMEDY CLUB \n", "64 Actions Marketing ARABOFOLIES \n", "171 Liste globale sans VIP \n", "126 Formulaire inscription mallette \"Cultures en partage\" \n", "78 CAMPAGNE ADHESION 2023 \n", "234 Tous les optins \n", "192 Optin 2023 \n", "170 Liste globale optin-15-01-2021 \n", "10 10-03-sb-dolist \n", "195 Origine - Nouba \n", "158 LIVE2022_Intérêt Expositions musée \n", "414 old_Intéressés par la Musique \n", "415 old_Intérêt Danse \n", "100 Cible offre DAOUD DEPARDON \n", "12 13-04-2022-vente 2021 \n", "73 Arabofolies Juillet 2022 \n", "137 Intérêt LGBTQ+ \n", "274 blacklistés ima \n", "208 Public traditionnel \n", "159 LIVE2022_Intérêt Humour \n", "99 Cible jeunes humour \n", "213 Relance gratuité IMA COMEDY CLUB \n", "101 Cible rencontres et débats \n", "189 Nouveaux inscrits newsletter \n", "349 interet nuit du cinéma \n", "141 Intérêt prononcé pour la nuit du ramadan \n", "87 Cible Algérie \n", "93 Cible News offre spéciale humour \n", "140 Intérêt musique électro \n", "280 cible Histoire et feminisme \n", "174 Liste relais pour présentation 2023 \n", "138 Intérêt musique Orientale \n", "18 16-07-21-nuit-du-cinema \n", "204 Profil Client Expos Divas (Geo) \n", "265 araborolies/Divas/relance \n", "203 Profil Client Expo Divas \n", "233 Tous les inscrits aux newsletters via le formulaire du site web \n", "243 VIP Générale \n", "226 Strcutures sans VIP \n", "67 Agi pour buren \n", "144 Invitation à l'exposition Palestine LANG \n", "62 Acheteurs individuels de l'expo Juifs d'orient statique \n", "61 Acheteurs individuels de l'expo Juifs d'orient \n", "95 Cible arabic Sound system \n", "244 VIP STATIQUE \n", "245 VIP Téléchargement \n", "102 Cible scolaire 2022 \n", "90 Cible Maroc \n", "91 Cible Maroc \n", "41 26mai-2023-Structures-invit-palestine \n", "393 liste_contacts_agi_2021_02_16_ \n", "450 sb-fichier-eudonet-ok-18-05-21 \n", "4 04_11_22_eudonet \n", "215 SB-18-05-VIP-eudonet \n", "175 Liste vernissage \n", "235 Tous les relais \n", "252 Visiteurs expo pour questionnaires \n", "223 Scolaires - Actions Educatives 24/11/2021 \n", "92 Cible Musique Judeo-arabe \n", "13 15-09-2023-Cible-Palestine \n", "162 LIVE2022_Intérêts Rencontres, débats et conférences \n", "282 cible photo \n", "38 26-MAI_STRUCTURE-2023-OK \n", "5 07-12-20-Relais-invitatation-divas \n", "410 old_Amis de l'IMA \n", "222 Scolaires - Actions Educatives 24/01/2023 \n", "198 PALESTINE \n", "249 Vignes et tilleuls \n", "39 26-mai-11H10-relais \n", "110 Contacts Librairie \n", "194 Origine - Inscription manuelle \n", "196 Origine - QR code \n", "59 Acheteurs Daoud Depardon \n", "473 événements autour de Habibi \n", "20 17-04-21-nuits-ducinema \n", "6 07-12-20-liste-invites-presentation-divas \n", "278 catégorie Cinéma \n", "250 Visiteurs Palestine \n", "248 Vernissages Algérie \n", "242 VIP Algérie \n", "445 save the date invités vernissage \n", "205 Profil Contact Expo Divas - Juillet \n", "19 17-04-2021-autres-liste-statique-cinema \n", "279 catégorie rencontres et débats \n", "325 fichier-dolist-05-12-20-relais \n", "177 MAILING SAVE THE DATE PARFUMS \n", "96 Cible enfants/ famille janvier 23 \n", "129 Globale vernissage Samarcande \n", "251 Visiteurs Palestine mi-expo questionnaire \n", "88 Cible LIBAN \n", "21 18-11-2021-liste i,augurationdu23-18h3à \n", "431 professionnels de l'écologie \n", "142 Intérêt écologie (pro) \n", "323 fichier pro écologie \n", "\n", " customer_id cumulative_customers \n", "232 0.999991 0.069325 \n", "76 0.999991 0.138650 \n", "191 0.999991 0.207974 \n", "128 0.955488 0.274214 \n", "112 0.929969 0.338684 \n", "98 0.636246 0.382792 \n", "23 0.630389 0.426494 \n", "64 0.627917 0.470025 \n", "171 0.582183 0.510385 \n", "126 0.532831 0.547324 \n", "78 0.449371 0.578477 \n", "234 0.412546 0.607076 \n", "192 0.365057 0.632384 \n", "170 0.325482 0.654948 \n", "10 0.193833 0.668386 \n", "195 0.192452 0.681728 \n", "158 0.173550 0.693759 \n", "414 0.166505 0.705302 \n", "415 0.163572 0.716642 \n", "100 0.130372 0.725680 \n", "12 0.128804 0.734609 \n", "73 0.109123 0.742174 \n", "137 0.108917 0.749725 \n", "274 0.105407 0.757032 \n", "208 0.083821 0.762843 \n", "159 0.082858 0.768587 \n", "99 0.080312 0.774155 \n", "213 0.079205 0.779646 \n", "101 0.076743 0.784966 \n", "189 0.076200 0.790249 \n", "349 0.072260 0.795258 \n", "141 0.072254 0.800267 \n", "87 0.070337 0.805143 \n", "93 0.069296 0.809947 \n", "140 0.068716 0.814711 \n", "280 0.068585 0.819466 \n", "174 0.067404 0.824139 \n", "138 0.066082 0.828720 \n", "18 0.065166 0.833237 \n", "204 0.063401 0.837633 \n", "265 0.061647 0.841906 \n", "203 0.061138 0.846145 \n", "233 0.057407 0.850125 \n", "243 0.053682 0.853846 \n", "226 0.053396 0.857548 \n", "67 0.051575 0.861123 \n", "144 0.051092 0.864665 \n", "62 0.046526 0.867891 \n", "61 0.046513 0.871115 \n", "95 0.046164 0.874316 \n", "244 0.041158 0.877169 \n", "245 0.040737 0.879993 \n", "102 0.040313 0.882788 \n", "90 0.039827 0.885549 \n", "91 0.039827 0.888310 \n", "41 0.039188 0.891027 \n", "393 0.033618 0.893357 \n", "450 0.032056 0.895579 \n", "4 0.031857 0.897788 \n", "215 0.031857 0.899996 \n", "175 0.031364 0.902171 \n", "235 0.031090 0.904326 \n", "252 0.029930 0.906401 \n", "223 0.029871 0.908472 \n", "92 0.029266 0.910501 \n", "13 0.028531 0.912478 \n", "162 0.026928 0.914345 \n", "282 0.026056 0.916152 \n", "38 0.025495 0.917919 \n", "5 0.024909 0.919646 \n", "410 0.023160 0.921251 \n", "222 0.022724 0.922827 \n", "198 0.020903 0.924276 \n", "249 0.020439 0.925693 \n", "39 0.019578 0.927050 \n", "110 0.019114 0.928375 \n", "194 0.018307 0.929644 \n", "196 0.018294 0.930913 \n", "59 0.018232 0.932176 \n", "473 0.017755 0.933407 \n", "20 0.017484 0.934619 \n", "6 0.017337 0.935821 \n", "278 0.016748 0.936982 \n", "250 0.016302 0.938113 \n", "248 0.014703 0.939132 \n", "242 0.014666 0.940149 \n", "445 0.014432 0.941149 \n", "205 0.014282 0.942139 \n", "19 0.013862 0.943100 \n", "279 0.013285 0.944021 \n", "325 0.013163 0.944934 \n", "177 0.013154 0.945846 \n", "96 0.012515 0.946713 \n", "129 0.011078 0.947481 \n", "251 0.010735 0.948225 \n", "88 0.010003 0.948919 \n", "21 0.009975 0.949610 \n", "431 0.009769 0.950288 \n", "142 0.009769 0.950965 \n", "323 0.009769 0.951642 " ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print_main_target('4', 100)" ] }, { "cell_type": "code", "execution_count": null, "id": "40fe3676", "metadata": {}, "outputs": [], "source": [ "print_main_target('101', 100)" ] }, { "cell_type": "markdown", "id": "605cced5-052f-4a99-ac26-020c5d2ab633", "metadata": { "jp-MarkdownHeadingCollapsed": true }, "source": [ "## KPI sur tags" ] }, { "cell_type": "code", "execution_count": 28, "id": "916c3e2b-04d3-4877-b894-8f26f10d926e", "metadata": {}, "outputs": [], "source": [ "customersplus = load_dataset_2(\"4\", \"customersplus\")[['id', 'structure_id']]" ] }, { "cell_type": "code", "execution_count": 35, "id": "46847b24-15a4-464e-969f-f16ed3653f1f", "metadata": {}, "outputs": [], "source": [ "structure_tag_mappings = load_dataset_2('4', \"structure_tag_mappings\")[['structure_id', 'tag_id']]" ] }, { "cell_type": "code", "execution_count": 34, "id": "3c10c69d-735f-453e-96bf-750697d965d0", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "19427" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "customersplus[customersplus['structure_id'].notna()]['structure_id'].nunique()" ] }, { "cell_type": "code", "execution_count": 32, "id": "9b0e77b3-5f16-4484-9564-7d3826583418", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "33645" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(customersplus[customersplus['structure_id'].notna()])" ] }, { "cell_type": "code", "execution_count": 36, "id": "dfa27722-37f9-435a-8221-8aa6f9a4a107", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "3431" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "structure_tag_mappings['structure_id'].nunique()" ] }, { "cell_type": "code", "execution_count": 26, "id": "2daabdd5-31e3-4918-9856-9bbc30cde602", "metadata": {}, "outputs": [], "source": [ "def tags_information(tenant_id, first_tags):\n", "\n", " customersplus = load_dataset_2(tenant_id, \"customersplus\")[['id', 'structure_id']]\n", " customersplus.rename(columns = {'id' : 'customer_id'}, inplace = True)\n", " tags = load_dataset_2(tenant_id, \"tags\")[['id', 'name']]\n", " tags.rename(columns = {'id' : 'tag_id', 'name' : 'tag_name'}, inplace = True)\n", " structure_tag_mappings = load_dataset_2(tenant_id, \"structure_tag_mappings\")[['structure_id', 'tag_id']]\n", " \n", " customer_tags = pd.merge(customersplus, structure_tag_mappings, on = 'structure_id', how = 'left')\n", " customer_tags = pd.merge(customer_tags, tags, on = 'tag_id', how = 'inner')\n", " \n", " nb_customers_with_tag = customer_tags['customer_id'].nunique()\n", " \n", " print('Nombre de client avec tag : ', nb_customers_with_tag)\n", " print('Proportion de clients avec tags : ', nb_customers_with_tag/len(customersplus))\n", " print('Moyenne de tags par client : ', len(customer_tags)/nb_customers_with_tag)\n", " \n", " info = customer_tags.groupby(['tag_id', 'tag_name'])['customer_id'].count().reset_index().sort_values('customer_id', ascending = False).head(first_tags)\n", "\n", " return info" ] }, { "cell_type": "code", "execution_count": 37, "id": "0b9f5f71-a927-4cc8-bb0c-9538e28d3553", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Nombre de client avec tag : 13320\n", "Proportion de clients avec tags : 0.0877089012682233\n", "Moyenne de tags par client : 2.1725975975975977\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tag_idtag_namecustomer_id
111029.0individuels3270
1811047.0groupes scolaires2417
411033.0association2308
011028.0structures culturelles2011
2211051.0etablissement ens scolaire1732
711036.0champ social1603
4311072.0etab d'enseignement1036
1411043.0etablissement public935
611035.0organisme de tourisme892
1611045.0centre de loisirs864
4411073.0musée, site & fondation786
2411053.0groupes etudiants758
311032.0entreprise750
1011039.0etablissement d'enseignement741
511034.0asso. culturelle692
1511044.0administration et collectivité676
1711046.0tour opérateur642
1911048.0entreprises515
7211619.0structures culturelles;musée, site & fondation427
811037.0handicap426
\n", "
" ], "text/plain": [ " tag_id tag_name customer_id\n", "1 11029.0 individuels 3270\n", "18 11047.0 groupes scolaires 2417\n", "4 11033.0 association 2308\n", "0 11028.0 structures culturelles 2011\n", "22 11051.0 etablissement ens scolaire 1732\n", "7 11036.0 champ social 1603\n", "43 11072.0 etab d'enseignement 1036\n", "14 11043.0 etablissement public 935\n", "6 11035.0 organisme de tourisme 892\n", "16 11045.0 centre de loisirs 864\n", "44 11073.0 musée, site & fondation 786\n", "24 11053.0 groupes etudiants 758\n", "3 11032.0 entreprise 750\n", "10 11039.0 etablissement d'enseignement 741\n", "5 11034.0 asso. culturelle 692\n", "15 11044.0 administration et collectivité 676\n", "17 11046.0 tour opérateur 642\n", "19 11048.0 entreprises 515\n", "72 11619.0 structures culturelles;musée, site & fondation 427\n", "8 11037.0 handicap 426" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tags_information(\"1\", 20)" ] }, { "cell_type": "code", "execution_count": 43, "id": "bd5bef41-1774-4601-86b5-b7c1aea8f1d2", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Nombre de client avec tag : 5953\n", "Proportion de clients avec tags : 0.021598421025897787\n", "Moyenne de tags par client : 1.0\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tag_idtag_namecustomer_id
01.0training-sb-ax5
\n", "
" ], "text/plain": [ " tag_id tag_name customer_id\n", "0 1.0 training-sb-ax 5" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tags_information(\"2\", 20)" ] }, { "cell_type": "code", "execution_count": 39, "id": "7c2dc3e6-1418-44db-a8c0-4a9d59ec5232", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idname
01training-sb-ax
12NaN
\n", "
" ], "text/plain": [ " id name\n", "0 1 training-sb-ax\n", "1 2 NaN" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "load_dataset_2(\"2\", \"tags\")[['id', 'name']]" ] }, { "cell_type": "code", "execution_count": 42, "id": "c7b2c670-7122-4f67-b1aa-8c80a10f16d8", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Nombre de client avec tag : 23659\n", "Proportion de clients avec tags : 0.09207484608139978\n", "Moyenne de tags par client : 3.0620482691576143\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tag_idtag_namecustomer_id
16444539.0*individuel/particulier13148
3026926.0ce3216
146995.0college2126
167028.0lycee1577
15444524.0iraiser1453
26714.0ecole primaire1200
15544525.0bp1094
157024.0centre de loisirs1080
15344515.0entreprise998
12644039.0ca fondation d'aumale891
15244514.0particulier838
3643663.0président816
7643703.0directeur812
15844528.0dc807
5443681.0présidente805
14944511.0entreprise (financier)805
9043718.0conseillère régionale déléguée titulaire804
4043667.0directeur de l'agence801
7843705.0sous-préfet798
10043728.0chargée de mission paysage797
\n", "
" ], "text/plain": [ " tag_id tag_name customer_id\n", "164 44539.0 *individuel/particulier 13148\n", "30 26926.0 ce 3216\n", "14 6995.0 college 2126\n", "16 7028.0 lycee 1577\n", "154 44524.0 iraiser 1453\n", "2 6714.0 ecole primaire 1200\n", "155 44525.0 bp 1094\n", "15 7024.0 centre de loisirs 1080\n", "153 44515.0 entreprise 998\n", "126 44039.0 ca fondation d'aumale 891\n", "152 44514.0 particulier 838\n", "36 43663.0 président 816\n", "76 43703.0 directeur 812\n", "158 44528.0 dc 807\n", "54 43681.0 présidente 805\n", "149 44511.0 entreprise (financier) 805\n", "90 43718.0 conseillère régionale déléguée titulaire 804\n", "40 43667.0 directeur de l'agence 801\n", "78 43705.0 sous-préfet 798\n", "100 43728.0 chargée de mission paysage 797" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tags_information(\"3\", 20)" ] }, { "cell_type": "code", "execution_count": 40, "id": "76639995-252d-4a58-83d8-c0c00900c3a9", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Nombre de client avec tag : 10495\n", "Proportion de clients avec tags : 0.03271416949025744\n", "Moyenne de tags par client : 5.298427822772749\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tag_idtag_namecustomer_id
147298.0jhima4219
146297.0colloque algérie3851
142292.0i&ma3826
154305.0mardis de la philo3674
150301.0le grand continant3670
144295.0araborama3669
155306.0marie descourtieux3669
145296.0c'était la guerre d'algérie3669
141291.0araborama 33669
102198.0association de collectivités territoriales spé...3669
143294.0arabofolies3669
103199.0rassemble les 11 000 élus de toute la france a...3669
250.0association463
654.0collège446
149.0ecole374
755.0lycée275
553.0centre social200
53130.0cultures et arts141
351.0mairie136
1364.0formation_ima_ax87
\n", "
" ], "text/plain": [ " tag_id tag_name customer_id\n", "147 298.0 jhima 4219\n", "146 297.0 colloque algérie 3851\n", "142 292.0 i&ma 3826\n", "154 305.0 mardis de la philo 3674\n", "150 301.0 le grand continant 3670\n", "144 295.0 araborama 3669\n", "155 306.0 marie descourtieux 3669\n", "145 296.0 c'était la guerre d'algérie 3669\n", "141 291.0 araborama 3 3669\n", "102 198.0 association de collectivités territoriales spé... 3669\n", "143 294.0 arabofolies 3669\n", "103 199.0 rassemble les 11 000 élus de toute la france a... 3669\n", "2 50.0 association 463\n", "6 54.0 collège 446\n", "1 49.0 ecole 374\n", "7 55.0 lycée 275\n", "5 53.0 centre social 200\n", "53 130.0 cultures et arts 141\n", "3 51.0 mairie 136\n", "13 64.0 formation_ima_ax 87" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tags_information(\"4\", 20)" ] }, { "cell_type": "code", "execution_count": 41, "id": "07e91791-d4d4-42b1-ac18-22d3b0b9f7bd", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Nombre de client avec tag : 532342\n", "Proportion de clients avec tags : 0.18660686931118298\n", "Moyenne de tags par client : 24.114082676174338\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tag_idtag_namecustomer_id
20349.0clients internet517491
24356.0associations / clubs495520
510.0agence de voyages493774
32410.0guides conférenciers493378
26360.0groupe amis ou famille493021
23354.0ce / entreprises493016
817.0association/club493008
13.0c.e. / entreprise492656
611.0college492552
1369.0tour operator492549
49.0ecole primaire492540
31379.0parent goûter anniversaire492468
30364.0institutions492364
26.0institution492321
18186.0autocaristes492153
713.0enseignement superieur492131
25359.0hotels / campings492078
427186.0individuel491913
37.0groupe amis / famille491900
02.0client internet491896
\n", "
" ], "text/plain": [ " tag_id tag_name customer_id\n", "20 349.0 clients internet 517491\n", "24 356.0 associations / clubs 495520\n", "5 10.0 agence de voyages 493774\n", "32 410.0 guides conférenciers 493378\n", "26 360.0 groupe amis ou famille 493021\n", "23 354.0 ce / entreprises 493016\n", "8 17.0 association/club 493008\n", "1 3.0 c.e. / entreprise 492656\n", "6 11.0 college 492552\n", "13 69.0 tour operator 492549\n", "4 9.0 ecole primaire 492540\n", "31 379.0 parent goûter anniversaire 492468\n", "30 364.0 institutions 492364\n", "2 6.0 institution 492321\n", "18 186.0 autocaristes 492153\n", "7 13.0 enseignement superieur 492131\n", "25 359.0 hotels / campings 492078\n", "42 7186.0 individuel 491913\n", "3 7.0 groupe amis / famille 491900\n", "0 2.0 client internet 491896" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tags_information(\"101\", 20)" ] }, { "cell_type": "markdown", "id": "87d131cd-ead0-4ef4-a8ee-b09022d08ffa", "metadata": { "jp-MarkdownHeadingCollapsed": true }, "source": [ "## KPI product" ] }, { "cell_type": "code", "execution_count": 12, "id": "26582be9-cfd1-48ea-a0a7-31101fdeb9d1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "File path : projet-bdc2324-team1/0_Input/Company_1/products_purchased_reduced.csv\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ticket_idcustomer_idpurchase_idevent_type_idsupplier_namepurchase_dateamountis_full_pricename_event_typesname_facilitiesname_categoriesname_eventsname_seasonsstart_date_timeend_date_timeopen
0130708594818751074624vente en ligne2018-12-28 14:47:50+00:008.0Falsespectacle vivantmucemindiv prog enfantl'école des magiciens20182018-12-31 14:15:00+01:001901-01-01 00:09:21+00:09True
1130708604818751074624vente en ligne2018-12-28 14:47:50+00:004.0Falsespectacle vivantmucemindiv prog enfantl'école des magiciens20182018-12-31 14:15:00+01:001901-01-01 00:09:21+00:09True
2130708614818751074624vente en ligne2018-12-28 14:47:50+00:004.0Falsespectacle vivantmucemindiv prog enfantl'école des magiciens20182018-12-31 14:15:00+01:001901-01-01 00:09:21+00:09True
3130708624818751074624vente en ligne2018-12-28 14:47:50+00:004.0Falsespectacle vivantmucemindiv prog enfantl'école des magiciens20182018-12-31 14:15:00+01:001901-01-01 00:09:21+00:09True
4130708634818751074624vente en ligne2018-12-28 14:47:50+00:004.0Falsespectacle vivantmucemindiv prog enfantl'école des magiciens20182018-12-31 14:15:00+01:001901-01-01 00:09:21+00:09True
\n", "
" ], "text/plain": [ " ticket_id customer_id purchase_id event_type_id supplier_name \\\n", "0 13070859 48187 5107462 4 vente en ligne \n", "1 13070860 48187 5107462 4 vente en ligne \n", "2 13070861 48187 5107462 4 vente en ligne \n", "3 13070862 48187 5107462 4 vente en ligne \n", "4 13070863 48187 5107462 4 vente en ligne \n", "\n", " purchase_date amount is_full_price name_event_types \\\n", "0 2018-12-28 14:47:50+00:00 8.0 False spectacle vivant \n", "1 2018-12-28 14:47:50+00:00 4.0 False spectacle vivant \n", "2 2018-12-28 14:47:50+00:00 4.0 False spectacle vivant \n", "3 2018-12-28 14:47:50+00:00 4.0 False spectacle vivant \n", "4 2018-12-28 14:47:50+00:00 4.0 False spectacle vivant \n", "\n", " name_facilities name_categories name_events name_seasons \\\n", "0 mucem indiv prog enfant l'école des magiciens 2018 \n", "1 mucem indiv prog enfant l'école des magiciens 2018 \n", "2 mucem indiv prog enfant l'école des magiciens 2018 \n", "3 mucem indiv prog enfant l'école des magiciens 2018 \n", "4 mucem indiv prog enfant l'école des magiciens 2018 \n", "\n", " start_date_time end_date_time open \n", "0 2018-12-31 14:15:00+01:00 1901-01-01 00:09:21+00:09 True \n", "1 2018-12-31 14:15:00+01:00 1901-01-01 00:09:21+00:09 True \n", "2 2018-12-31 14:15:00+01:00 1901-01-01 00:09:21+00:09 True \n", "3 2018-12-31 14:15:00+01:00 1901-01-01 00:09:21+00:09 True \n", "4 2018-12-31 14:15:00+01:00 1901-01-01 00:09:21+00:09 True " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tenant_id = \"1\"\n", "\n", "df_product = display_databases(tenant_id, file_name = \"products_purchased_reduced\", datetime_col = ['purchase_date'])\n", "\n", "df_product.head()" ] }, { "cell_type": "code", "execution_count": 24, "id": "533bf499-dd56-4d29-b261-ca1e4928c9c7", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
name_event_typesname_eventsticket_idprop_tickets
118offre muséale groupevisite générale du mucem (1h30)438140.024
212offre muséale individuelvisite autonome scolaires (2h00)344230.019
68offre muséale groupevisite autonome exposition (1h30)264890.015
210offre muséale individuelvisite autonome adultes (2h00)220650.012
160offre muséale groupevisites des exterieurs scolaires155950.009
...............
364spectacle vivantkay ! lettres à un poète disparu10.000
443spectacle vivantmauvais genre10.000
375spectacle vivantla madre que parió a la música10.000
260spectacle vivantali a les yeux bleus (dès 12 ans)10.000
484spectacle vivantrengaine (dès 12 ans)10.000
\n", "

544 rows × 4 columns

\n", "
" ], "text/plain": [ " name_event_types name_events ticket_id \\\n", "118 offre muséale groupe visite générale du mucem (1h30) 43814 \n", "212 offre muséale individuel visite autonome scolaires (2h00) 34423 \n", "68 offre muséale groupe visite autonome exposition (1h30) 26489 \n", "210 offre muséale individuel visite autonome adultes (2h00) 22065 \n", "160 offre muséale groupe visites des exterieurs scolaires 15595 \n", ".. ... ... ... \n", "364 spectacle vivant kay ! lettres à un poète disparu 1 \n", "443 spectacle vivant mauvais genre 1 \n", "375 spectacle vivant la madre que parió a la música 1 \n", "260 spectacle vivant ali a les yeux bleus (dès 12 ans) 1 \n", "484 spectacle vivant rengaine (dès 12 ans) 1 \n", "\n", " prop_tickets \n", "118 0.024 \n", "212 0.019 \n", "68 0.015 \n", "210 0.012 \n", "160 0.009 \n", ".. ... \n", "364 0.000 \n", "443 0.000 \n", "375 0.000 \n", "260 0.000 \n", "484 0.000 \n", "\n", "[544 rows x 4 columns]" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nb_tickets_per_events = df_product.groupby(['name_event_types', 'name_events'])['ticket_id'].count().reset_index().sort_values('ticket_id', ascending = False)\n", "nb_tickets_per_events['prop_tickets'] = round(nb_tickets_per_events['ticket_id']/len(df_product), 3)\n", "nb_tickets_per_events" ] }, { "cell_type": "markdown", "id": "1ede9eaa-7f0a-4856-9349-b2747d6a4901", "metadata": { "jp-MarkdownHeadingCollapsed": true }, "source": [ "# Fin travail 25/02" ] }, { "cell_type": "markdown", "id": "c437eaec", "metadata": { "jp-MarkdownHeadingCollapsed": true }, "source": [ "# Exemple sur Company 1" ] }, { "cell_type": "markdown", "id": "a1c1fc39", "metadata": {}, "source": [ "## Chargement données" ] }, { "cell_type": "code", "execution_count": 3, "id": "66f8c17b", "metadata": {}, "outputs": [], "source": [ "BUCKET = \"bdc2324-data/1\"\n", "liste_database = fs.ls(BUCKET)" ] }, { "cell_type": "code", "execution_count": 5, "id": "c08e6798", "metadata": {}, "outputs": [], "source": [ "liste_database_select = ['suppliers', 'ticket', 'purchase', 'consumption', 'type_ofs']\n", "\n", "# Filtrer la liste pour les éléments contenant au moins un élément de la liste à tester\n", "liste_database_filtered = [element for element in liste_database if any(element_part in element for element_part in liste_database_select)]\n", "\n", "# Afficher le résultat\n", "print(liste_database_filtered)" ] }, { "cell_type": "code", "execution_count": 6, "id": "675f518d", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "<<<<<<< local \n", "/tmp/ipykernel_445/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n", " df = pd.read_csv(file_in)\n", "=======\n", "/tmp/ipykernel_15285/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n", " df = pd.read_csv(file_in)\n", ">>>>>>> remote \n" ] } ], "source": [ "# loop to create dataframes from liste\n", "files_path = liste_database\n", "\n", "client_number = files_path[0].split(\"/\")[1]\n", "df_prefix = \"df\" + str(client_number) + \"_\"\n", "\n", "for i in range(len(files_path)) :\n", " current_path = files_path[i]\n", " with fs.open(current_path, mode=\"rb\") as file_in:\n", " df = pd.read_csv(file_in)\n", " # the pattern of the name is df1xxx\n", " nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n", " globals()[nom_dataframe] = df" ] }, { "cell_type": "markdown", "id": "e855f403", "metadata": {}, "source": [ "## customersplus.csv" ] }, { "cell_type": "code", "execution_count": 22, "id": "91a8f8c4", "metadata": {}, "outputs": [], "source": [ "a = pd.DataFrame(df1_customersplus.info())" ] }, { "cell_type": "code", "execution_count": 31, "id": "2fda171d", "metadata": {}, "outputs": [], "source": [ "def info_colonnes_dataframe(df):\n", " # Créer une liste pour stocker les informations sur chaque colonne\n", " infos_colonnes = []\n", "\n", " # Parcourir les colonnes du DataFrame\n", " for nom_colonne, serie in df.items(): # Utiliser items() au lieu de iteritems()\n", " # Calculer le taux de valeurs manquantes\n", " taux_na = serie.isna().mean() * 100\n", "\n", " # Ajouter les informations à la liste\n", " infos_colonnes.append({\n", " 'Nom_colonne': nom_colonne,\n", " 'Type_colonne': str(serie.dtype),\n", " 'Taux_NA': taux_na\n", " })\n", "\n", " # Créer une nouvelle DataFrame à partir de la liste d'informations\n", " df_infos_colonnes = pd.DataFrame(infos_colonnes)\n", "\n", " return df_infos_colonnes" ] }, { "cell_type": "code", "execution_count": 35, "id": "205eeeab", "metadata": {}, "outputs": [], "source": [ "def cleaning_date(df, column_name):\n", " \"\"\"\n", " Nettoie la colonne spécifiée du DataFrame en convertissant les valeurs en datetime avec le format ISO8601.\n", "\n", " Parameters:\n", " - df: DataFrame\n", " Le DataFrame contenant la colonne à nettoyer.\n", " - column_name: str\n", " Le nom de la colonne à nettoyer.\n", "\n", " Returns:\n", " - DataFrame\n", " Le DataFrame modifié avec la colonne nettoyée.\n", " \"\"\"\n", " df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n", " return df" ] }, { "cell_type": "code", "execution_count": 32, "id": "634282c5", "metadata": {}, "outputs": [], "source": [ "a = info_colonnes_dataframe(df1_customersplus)" ] }, { "cell_type": "code", "execution_count": 33, "id": "0e8d4133", "metadata": {}, "outputs": [], "source": [ "a" ] }, { "cell_type": "code", "execution_count": 16, "id": "1268ad5a", "metadata": {}, "outputs": [], "source": [ "a = pd.DataFrame(df1_customersplus.isna().sum()/len(df1_customersplus)*100)" ] }, { "cell_type": "code", "execution_count": 40, "id": "bd41dc80", "metadata": {}, "outputs": [], "source": [ "# Selection des variables\n", "df1_customersplus_clean = df1_customersplus.copy()\n", "\n", "cleaning_date(df1_customersplus_clean, 'first_buying_date')\n", "cleaning_date(df1_customersplus_clean, 'last_visiting_date')\n", "\n", "df1_customersplus_clean.drop(['lastname', 'firstname', 'email', 'civility', 'note', 'created_at', 'updated_at', 'deleted_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode', 'last_visiting_date'], axis = 1, inplace=True)\n", "df1_customersplus_clean.rename(columns = {'id' : 'customer_id'}, inplace = True)\n", "\n" ] }, { "cell_type": "markdown", "id": "64d0f76b", "metadata": { "jp-MarkdownHeadingCollapsed": true }, "source": [ "## tickets.csv" ] }, { "cell_type": "code", "execution_count": 6, "id": "7e683711", "metadata": {}, "outputs": [], "source": [ "df1_tickets" ] }, { "cell_type": "code", "execution_count": 7, "id": "e7b9a52e", "metadata": {}, "outputs": [], "source": [ "df1_tickets.info()" ] }, { "cell_type": "code", "execution_count": 8, "id": "568280e8", "metadata": {}, "outputs": [], "source": [ "df1_tickets.isna().sum()/len(df1_tickets)*100" ] }, { "cell_type": "code", "execution_count": 9, "id": "29ecec90", "metadata": {}, "outputs": [], "source": [ "# Selection des variables\n", "df1_tickets_clean = df1_tickets.drop(['lastname', 'firstname', 'email', 'created_at', 'updated_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode'], axis = 1, inplace=True)\n", "df1_tickets_clean.rename(columns = {'id' : 'customer_id'}, inplace = True)" ] }, { "cell_type": "markdown", "id": "22bb5de4", "metadata": { "jp-MarkdownHeadingCollapsed": true }, "source": [ "## suppliers.csv" ] }, { "cell_type": "code", "execution_count": 10, "id": "6a9a91f4", "metadata": {}, "outputs": [], "source": [ "df1_suppliers" ] }, { "cell_type": "code", "execution_count": 11, "id": "bab4758a", "metadata": {}, "outputs": [], "source": [ "df1_suppliers.info()" ] }, { "cell_type": "code", "execution_count": 12, "id": "b5fff251", "metadata": {}, "outputs": [], "source": [ "df1_suppliers.isna().sum()/len(df1_suppliers)*100" ] }, { "cell_type": "code", "execution_count": 13, "id": "8b09e2a3", "metadata": {}, "outputs": [], "source": [ "# Selection des variables\n", "df1_suppliers_clean = df1_suppliers[['id', 'name']]\n", "df1_suppliers_clean.rename(columns = {'name' : 'supplier_name'}, inplace = True)" ] }, { "cell_type": "code", "execution_count": 14, "id": "ecee7cdc", "metadata": {}, "outputs": [], "source": [ "df1_suppliers_clean" ] }, { "cell_type": "markdown", "id": "c8e6e69b", "metadata": { "jp-MarkdownHeadingCollapsed": true }, "source": [ "## type_ofs.csv" ] }, { "cell_type": "code", "execution_count": 15, "id": "1a6cff1f", "metadata": {}, "outputs": [], "source": [ "df1_type_ofs" ] }, { "cell_type": "code", "execution_count": 16, "id": "93630b41", "metadata": {}, "outputs": [], "source": [ "df1_type_ofs.info()" ] }, { "cell_type": "code", "execution_count": 17, "id": "4f94481a", "metadata": {}, "outputs": [], "source": [ "# Selection des variables\n", "df1_type_ofs_clean = df1_type_ofs[['id', 'name', 'children']]\n", "df1_type_ofs_clean.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)" ] }, { "cell_type": "markdown", "id": "1b2811e2", "metadata": { "jp-MarkdownHeadingCollapsed": true }, "source": [ "## purchases.csv" ] }, { "cell_type": "code", "execution_count": 18, "id": "2455d2e1", "metadata": { "scrolled": true }, "outputs": [], "source": [ "df1_purchases" ] }, { "cell_type": "code", "execution_count": 19, "id": "5f9a159d", "metadata": {}, "outputs": [], "source": [ "df1_purchases.info()" ] }, { "cell_type": "code", "execution_count": 20, "id": "db201bf7", "metadata": {}, "outputs": [], "source": [ "# Nettoyage purchase_date\n", "df1_purchases['purchase_date'] = pd.to_datetime(df1_purchases['purchase_date'], utc = True)\n", "df1_purchases['purchase_date'] = pd.to_datetime(df1_purchases['purchase_date'], format = 'ISO8601')" ] }, { "cell_type": "code", "execution_count": 21, "id": "bd436fca", "metadata": {}, "outputs": [], "source": [ "df1_purchases.info()" ] }, { "cell_type": "code", "execution_count": 22, "id": "83435862", "metadata": {}, "outputs": [], "source": [ "# Selection des variables\n", "df1_purchases_clean = df1_purchases[['id', 'purchase_date', 'customer_id']]" ] }, { "cell_type": "markdown", "id": "f210e730", "metadata": { "jp-MarkdownHeadingCollapsed": true }, "source": [ "## Fusion de l'ensemble des données billétiques" ] }, { "cell_type": "code", "execution_count": 23, "id": "1f8b3aa7", "metadata": {}, "outputs": [], "source": [ "# Fusion avec fournisseurs\n", "df1_ticket_information = pd.merge(df1_tickets_clean, df1_suppliers_clean, left_on = 'supplier_id', right_on = 'id', how = 'inner')\n", "df1_ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)\n", "\n", "# Fusion avec type de tickets\n", "df1_ticket_information = pd.merge(df1_ticket_information, df1_type_ofs_clean, left_on = 'type_of', right_on = 'id', how = 'inner')\n", "df1_ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)\n", "\n", "# Fusion avec achats\n", "df1_ticket_information = pd.merge(df1_ticket_information, df1_purchases_clean, left_on = 'purchase_id', right_on = 'id', how = 'inner')\n", "df1_ticket_information.drop(['purchase_id', 'id'], axis = 1, inplace=True)" ] }, { "cell_type": "code", "execution_count": 24, "id": "83a4d021", "metadata": { "scrolled": true }, "outputs": [], "source": [ "df1_ticket_information" ] }, { "cell_type": "markdown", "id": "56e6ebd1", "metadata": { "jp-MarkdownHeadingCollapsed": true }, "source": [ "# Utilisation de fonctions" ] }, { "cell_type": "code", "execution_count": 51, "id": "88fcde4b", "metadata": {}, "outputs": [], "source": [ "# Créer un DataFrame exemple\n", "df_not_clean = df1_campaign_stats[['opened_at']].head(20)\n", "\n", "# Appliquer la fonction pour nettoyer la colonne 'purchase_date' de manière vectorisée\n", "df_clean = cleaning_date(df_not_clean, 'opened_at')\n", "df_clean.rename(columns = {'opened_at' : 'opened_at_clean'}, inplace = True)\n", "\n", "test = pd.concat([df1_campaign_stats[['opened_at']].head(20), df_clean], axis=1)\n", "\n", "test.info()" ] }, { "cell_type": "markdown", "id": "818f69db", "metadata": {}, "source": [ "## Nettoyage, selection et fusion" ] }, { "cell_type": "code", "execution_count": 23, "id": "c9654eda", "metadata": {}, "outputs": [], "source": [ "df1_ticket_information" ] }, { "cell_type": "code", "execution_count": 14, "id": "7f2b620c", "metadata": {}, "outputs": [], "source": [ "df1_ticket_information.info()" ] }, { "cell_type": "markdown", "id": "637bdb72", "metadata": {}, "source": [ "# Customer information" ] }, { "cell_type": "markdown", "id": "14c52894", "metadata": {}, "source": [ "## Target area" ] }, { "cell_type": "code", "execution_count": 8, "id": "d83abfbf", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_15285/2625134041.py:3: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df1_targets_clean.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n" ] } ], "source": [ "# Target.csv cleaning\n", "df1_targets_clean = df1_targets[[\"id\", \"target_type_id\", \"name\"]]\n", "df1_targets_clean.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n", "\n", "# target_type cleaning\n", "df1_target_types_clean = df1_target_types[[\"id\",\"is_import\",\"name\"]].add_prefix(\"target_type_\")\n", "\n", "#customer_target_mappings cleaning\n", "df1_customer_target_mappings_clean = df1_customer_target_mappings[[\"id\", \"customer_id\", \"target_id\"]]\n", "\n", "# Merge target et target_type\n", "df1_targets_full = pd.merge(df1_targets_clean, df1_target_types_clean, left_on='target_type_id', right_on='target_type_id', how='inner')\n", "df1_targets_full.drop(['target_type_id'], axis = 1, inplace=True)\n", "\n", "# Merge\n", "df1_targets_full = pd.merge(df1_customer_target_mappings_clean, df1_targets_full, left_on='target_id', right_on='target_id', how='inner')\n", "df1_targets_full.drop(['target_id'], axis = 1, inplace=True)" ] }, { "cell_type": "code", "execution_count": 62, "id": "90d71b2c", "metadata": {}, "outputs": [], "source": [ "df1_targets_test = df1_targets_full[['id', 'customer_id']].groupby(['customer_id']).count()\n", "len(df1_targets_test[df1_targets_test['id'] > 1]) / len(df1_targets_test)\n", "\n", "# 99,6% des 151 000 client visés sont catégorisés plusieurs fois et en moyenne 5 fois... \n", "df1_targets_test.mean()\n" ] }, { "cell_type": "code", "execution_count": 10, "id": "2301de1e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idcustomer_idtarget_nametarget_type_is_importtarget_type_name
01184824645400DDCP PROMO Réseau livresFalsemanual_static_filter
12105712412DDCP PROMO Réseau livresFalsemanual_static_filter
22105724536DDCP PROMO Réseau livresFalsemanual_static_filter
32105736736DDCP PROMO Réseau livresFalsemanual_static_filter
421057438210DDCP PROMO Réseau livresFalsemanual_static_filter
\n", "
" ], "text/plain": [ " id customer_id target_name target_type_is_import \\\n", "0 1184824 645400 DDCP PROMO Réseau livres False \n", "1 210571 2412 DDCP PROMO Réseau livres False \n", "2 210572 4536 DDCP PROMO Réseau livres False \n", "3 210573 6736 DDCP PROMO Réseau livres False \n", "4 210574 38210 DDCP PROMO Réseau livres False \n", "\n", " target_type_name \n", "0 manual_static_filter \n", "1 manual_static_filter \n", "2 manual_static_filter \n", "3 manual_static_filter \n", "4 manual_static_filter " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1_targets_full.head()" ] }, { "cell_type": "code", "execution_count": 14, "id": "75fbc2f7", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n", "[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n", "[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...\n", "[nltk_data] Package wordnet is already up-to-date!\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Catégorisation des target_name\n", "import pandas as pd\n", "import nltk\n", "from nltk.tokenize import word_tokenize\n", "from nltk.corpus import stopwords\n", "from nltk.stem import WordNetLemmatizer\n", "from nltk.probability import FreqDist\n", "\n", "# Téléchargement des ressources nécessaires\n", "nltk.download('punkt')\n", "nltk.download('stopwords')\n", "nltk.download('wordnet')\n", "\n" ] }, { "cell_type": "code", "execution_count": 19, "id": "55cddf92", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mots les plus fréquents:\n", "consentement: 550777\n", "optin: 463579\n", "jeune: 155103\n", "public: 155103\n", "mediation: 150001\n" ] } ], "source": [ "# Définition des fonctions de tokenisation, suppression des mots vides et lemmatisation\n", "def preprocess_text(texte):\n", " # Concaténation des éléments de la liste en une seule chaîne de caractères\n", " texte_concat = ' '.join(texte)\n", " \n", " # Tokenisation des mots\n", " tokens = word_tokenize(texte_concat.lower())\n", " \n", " # Suppression des mots vides (stopwords)\n", " stop_words = set(stopwords.words('french'))\n", " filtered_tokens = [word for word in tokens if word not in stop_words]\n", " \n", " # Lemmatisation des mots\n", " lemmatizer = WordNetLemmatizer()\n", " lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]\n", " \n", " return lemmatized_tokens\n", "\n", "\n", "# Appliquer le prétraitement à la colonne de texte\n", "df1_targets_full['target_name_tokened'] = df1_targets_full['target_name'].apply(preprocess_text)\n", "\n", "# Concaténer les listes de mots pour obtenir une liste de tous les mots dans le corpus\n", "all_words = [word for tokens in df1_targets_full['target_name_tokened'] for word in tokens]\n", "\n", "# Calculer la fréquence des mots\n", "freq_dist = FreqDist(all_words)\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": 22, "id": "7fd98a85", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mots les plus fréquents:\n", "consentement: 550777\n", "optin: 463579\n", "jeune: 155103\n", "public: 155103\n", "mediation: 150001\n", "specialisee: 150001\n", "b2c: 143432\n", "optout: 97683\n", "newsletter: 56022\n", "(: 46084\n", "): 46084\n", "inscrits: 42296\n", "nl: 42294\n", "générale: 41037\n", "generale: 40950\n" ] } ], "source": [ "# Affichage des mots les plus fréquents\n", "print(\"Mots les plus fréquents:\")\n", "for mot, freq in freq_dist.most_common(15):\n", " print(f\"{mot}: {freq}\")" ] }, { "cell_type": "code", "execution_count": 18, "id": "cf94bb1d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " texte \\\n", "0 Le chat noir mange une souris. \n", "1 Le chien blanc aboie. \n", "\n", " texte_preprocessed \n", "0 [e, h, a, o, i, r, a, g, e, u, e, o, u, r, i, .] \n", "1 [e, h, i, e, b, a, a, b, o, i, e, .] \n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n", "[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n", "[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...\n", "[nltk_data] Package wordnet is already up-to-date!\n" ] } ], "source": [ "import pandas as pd\n", "import nltk\n", "from nltk.tokenize import word_tokenize\n", "from nltk.corpus import stopwords\n", "from nltk.stem import WordNetLemmatizer\n", "\n", "# Téléchargement des ressources nécessaires\n", "nltk.download('punkt')\n", "nltk.download('stopwords')\n", "nltk.download('wordnet')\n", "\n", "# Création de la DataFrame d'exemple\n", "data = {'texte': [\"Le chat noir mange une souris.\", \"Le chien blanc aboie.\"]}\n", "df = pd.DataFrame(data)\n", "\n", "# Fonction pour prétraiter le texte\n", "def preprocess_text(texte):\n", " # Concaténation des éléments de la liste en une seule chaîne de caractères\n", " texte_concat = ' '.join(texte)\n", " \n", " # Tokenisation des mots\n", " tokens = word_tokenize(texte_concat.lower())\n", " \n", " # Suppression des mots vides (stopwords)\n", " stop_words = set(stopwords.words('french'))\n", " filtered_tokens = [word for word in tokens if word not in stop_words]\n", " \n", " # Lemmatisation des mots\n", " lemmatizer = WordNetLemmatizer()\n", " lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]\n", " \n", " return lemmatized_tokens\n", "\n", "# Appliquer la fonction de prétraitement à la colonne de texte\n", "df['texte_preprocessed'] = df['texte'].apply(preprocess_text)\n", "\n", "# Afficher le résultat\n", "print(df)\n" ] }, { "cell_type": "markdown", "id": "711d3884", "metadata": { "jp-MarkdownHeadingCollapsed": true }, "source": [ "## Campaign area" ] }, { "cell_type": "code", "execution_count": 52, "id": "c25b5295", "metadata": {}, "outputs": [], "source": [ "# campaign_stats cleaning \n", "df1_campaign_stats_clean = df1_campaign_stats[[\"id\", \"campaign_id\", \"customer_id\", \"opened_at\", \"sent_at\", \"delivered_at\"]]\n", "cleaning_date(df1_campaign_stats_clean, 'opened_at')\n", "cleaning_date(df1_campaign_stats_clean, 'sent_at')\n", "cleaning_date(df1_campaign_stats_clean, 'delivered_at')\n", "\n", "# campaigns cleaning\n", "df1_campaigns_clean = df1_campaigns[[\"id\", \"name\", \"service_id\", \"sent_at\"]].add_prefix(\"campaign_\")\n", "cleaning_date(df1_campaigns_clean, 'campaign_sent_at')\n", "\n", "# Merge \n", "df1_campaigns_full = pd.merge(df1_campaign_stats_clean, df1_campaigns_clean, on = \"campaign_id\", how = \"left\")\n", "df1_campaigns_full.drop(['campaign_id'], axis = 1, inplace=True)" ] }, { "cell_type": "code", "execution_count": 53, "id": "2a3de6a5", "metadata": {}, "outputs": [], "source": [ "df1_campaigns_full.info()" ] }, { "cell_type": "code", "execution_count": 56, "id": "3fc1f446", "metadata": {}, "outputs": [], "source": [ "df1_campaigns_information" ] }, { "cell_type": "markdown", "id": "20e69ee3", "metadata": { "jp-MarkdownHeadingCollapsed": true }, "source": [ "## Link area" ] }, { "cell_type": "code", "execution_count": 37, "id": "d9cbdbce", "metadata": {}, "outputs": [], "source": [ "df1_campaigns" ] }, { "cell_type": "code", "execution_count": 38, "id": "c07459f0", "metadata": {}, "outputs": [], "source": [ "df1_link_stats" ] }, { "cell_type": "markdown", "id": "80ae4c42", "metadata": { "jp-MarkdownHeadingCollapsed": true }, "source": [ "## Supplier" ] }, { "cell_type": "code", "execution_count": 7, "id": "b50b8f95", "metadata": {}, "outputs": [], "source": [ "# Fonction d'exploration pour suppliers.csv = label itr et commission inconnues\n", "def suppliers_exploration(suppliers = None) : \n", " \n", " # Taux de NaN pour ces colonnes\n", " label_na = suppliers['label'].isna().sum()/len(suppliers)*100\n", " itr_na = suppliers['itr'].isna().sum()/len(suppliers)*100\n", " commission_na = suppliers['commission'].isna().sum()/len(suppliers)*100\n", "\n", " suppliers_desc = pd.DataFrame({'nb_suppliers' : [suppliers['name'].nunique()],\n", " 'label_na' : [label_na],\n", " 'itr_na' : [itr_na],\n", " 'commission_na' : [commission_na]})\n", "\n", " return suppliers_desc" ] }, { "cell_type": "code", "execution_count": 8, "id": "7e292935", "metadata": {}, "outputs": [], "source": [ "df1_suppliers_desc = suppliers_exploration(suppliers = df1_suppliers)" ] }, { "cell_type": "code", "execution_count": 9, "id": "05b6f2b0", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nb_supplierslabel_naitr_nacommission_na
09100.0100.0100.0
\n", "
" ], "text/plain": [ " nb_suppliers label_na itr_na commission_na\n", "0 9 100.0 100.0 100.0" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1_suppliers_desc" ] }, { "cell_type": "code", "execution_count": 10, "id": "c9324d80", "metadata": {}, "outputs": [], "source": [ "BUCKET = \"bdc2324-data\"\n", "liste_folders = fs.ls(BUCKET)\n", "\n", "liste_files = []\n", "for company_folder in liste_folders : \n", " liste_files.extend(fs.ls(company_folder))" ] }, { "cell_type": "code", "execution_count": 11, "id": "10304058", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['bdc2324-data/1/1suppliers.csv', 'bdc2324-data/10/10suppliers.csv', 'bdc2324-data/101/101suppliers.csv', 'bdc2324-data/11/11suppliers.csv', 'bdc2324-data/12/12suppliers.csv', 'bdc2324-data/13/13suppliers.csv', 'bdc2324-data/14/14suppliers.csv', 'bdc2324-data/2/2suppliers.csv', 'bdc2324-data/3/3suppliers.csv', 'bdc2324-data/4/4suppliers.csv', 'bdc2324-data/5/5suppliers.csv', 'bdc2324-data/6/6suppliers.csv', 'bdc2324-data/7/7suppliers.csv', 'bdc2324-data/8/8suppliers.csv', 'bdc2324-data/9/9suppliers.csv']\n" ] } ], "source": [ "liste_database_select = ['suppliers']\n", "\n", "# Filtrer la liste pour les éléments contenant au moins un élément de la liste à tester\n", "liste_suppliers = [element for element in liste_files if any(element_part in element for element_part in liste_database_select)]\n", "\n", "# Afficher le résultat\n", "print(liste_suppliers)" ] }, { "cell_type": "code", "execution_count": 32, "id": "ffa423e5", "metadata": {}, "outputs": [], "source": [ "# loop to create dataframes from file 2\n", "def database_loading(database_name = None):\n", " files_path = database_name\n", " \n", " client_number = files_path.split(\"/\")[1]\n", " df_prefix = \"df\" + str(client_number) + \"_\"\n", " \n", " current_path = files_path\n", " with fs.open(current_path, mode=\"rb\") as file_in:\n", " df = pd.read_csv(file_in)\n", "\n", " return df, client_number" ] }, { "cell_type": "code", "execution_count": null, "id": "70bdc88d", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 45, "id": "6a0f567d", "metadata": {}, "outputs": [], "source": [ "df_all = pd.DataFrame()\n", "\n", "for link in liste_suppliers:\n", " \n", " df_supplier, tenant_id = database_loading(link)\n", " \n", " df_supplier['tenant_id'] = int(tenant_id)\n", "\n", " df_all = pd.concat([df_all, df_supplier], axis = 0)\n", " " ] }, { "cell_type": "code", "execution_count": 63, "id": "1522d8cd", "metadata": {}, "outputs": [], "source": [ "# df_all[df_all['tenant_id'] == 101]['name'].unique()" ] }, { "cell_type": "code", "execution_count": 66, "id": "b0e42a61", "metadata": {}, "outputs": [], "source": [ "liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online'] \n", "# vad = vente à distance\n", "df_all['name'] = df_all['name'].fillna('')\n", "\n", "df_all['canal_vente_internet'] = df_all['name'].str.contains('|'.join(liste_mots), case=False).astype(int)\n" ] }, { "cell_type": "code", "execution_count": 68, "id": "d299ae91", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tenant_id\n", "1 1\n", "2 1\n", "3 1\n", "4 1\n", "5 1\n", "6 1\n", "7 1\n", "8 1\n", "9 1\n", "10 1\n", "11 1\n", "12 1\n", "13 1\n", "14 1\n", "101 1\n", "Name: canal_vente_internet, dtype: int64" ] }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_all.groupby('tenant_id')['canal_vente_internet'].max()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.6" } }, "nbformat": 4, "nbformat_minor": 5 }