From e1f6f1ba68feda2f5f276f3a855433bf7724f9e8 Mon Sep 17 00:00:00 2001 From: ajoubrel-ensae Date: Tue, 26 Mar 2024 10:49:09 +0000 Subject: [PATCH] Ajout graph pour targets --- Exploration_billet_AJ.ipynb | 602 +----------------------------------- 1 file changed, 14 insertions(+), 588 deletions(-) diff --git a/Exploration_billet_AJ.ipynb b/Exploration_billet_AJ.ipynb index b8c82e7..64402cd 100644 --- a/Exploration_billet_AJ.ipynb +++ b/Exploration_billet_AJ.ipynb @@ -100,7 +100,9 @@ { "cell_type": "markdown", "id": "ccf597b0-b459-4ea5-baf0-5ba8c90915e4", - "metadata": {}, + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, "source": [ "# Cleaning target area and tags" ] @@ -551,16 +553,6 @@ "## Base communes au types Musée" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "dbce1124-9a22-4502-a47a-fc3d0e2db70b", - "metadata": {}, - "outputs": [], - "source": [ - "companies['musee']" - ] - }, { "cell_type": "code", "execution_count": null, @@ -789,7 +781,9 @@ { "cell_type": "markdown", "id": "76bffba1-5f7e-4308-9224-437ca66148f8", - "metadata": {}, + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, "source": [ "## KPI sur target_type" ] @@ -1230,74 +1224,17 @@ { "cell_type": "markdown", "id": "c437eaec", - "metadata": { - "jp-MarkdownHeadingCollapsed": true - }, + "metadata": {}, "source": [ "# Exemple sur Company 1" ] }, - { - "cell_type": "markdown", - "id": "a1c1fc39", - "metadata": {}, - "source": [ - "## Chargement données" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "66f8c17b", - "metadata": {}, - "outputs": [], - "source": [ - "BUCKET = \"bdc2324-data/1\"\n", - "liste_database = fs.ls(BUCKET)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c08e6798", - "metadata": {}, - "outputs": [], - "source": [ - "liste_database_select = ['suppliers', 'ticket', 'purchase', 'consumption', 'type_ofs']\n", - "\n", - "# Filtrer la liste pour les éléments contenant au moins un élément de la liste à tester\n", - "liste_database_filtered = [element for element in liste_database if any(element_part in element for element_part in liste_database_select)]\n", - "\n", - "# Afficher le résultat\n", - "print(liste_database_filtered)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "675f518d", - "metadata": {}, - "outputs": [], - "source": [ - "# loop to create dataframes from liste\n", - "files_path = liste_database\n", - "\n", - "client_number = files_path[0].split(\"/\")[1]\n", - "df_prefix = \"df\" + str(client_number) + \"_\"\n", - "\n", - "for i in range(len(files_path)) :\n", - " current_path = files_path[i]\n", - " with fs.open(current_path, mode=\"rb\") as file_in:\n", - " df = pd.read_csv(file_in)\n", - " # the pattern of the name is df1xxx\n", - " nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n", - " globals()[nom_dataframe] = df" - ] - }, { "cell_type": "markdown", "id": "e855f403", - "metadata": {}, + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, "source": [ "## customersplus.csv" ] @@ -1414,172 +1351,6 @@ "\n" ] }, - { - "cell_type": "markdown", - "id": "64d0f76b", - "metadata": { - "jp-MarkdownHeadingCollapsed": true - }, - "source": [ - "## tickets.csv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7e683711", - "metadata": {}, - "outputs": [], - "source": [ - "df1_tickets" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e7b9a52e", - "metadata": {}, - "outputs": [], - "source": [ - "df1_tickets.info()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "568280e8", - "metadata": {}, - "outputs": [], - "source": [ - "df1_tickets.isna().sum()/len(df1_tickets)*100" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "29ecec90", - "metadata": {}, - "outputs": [], - "source": [ - "# Selection des variables\n", - "df1_tickets_clean = df1_tickets.drop(['lastname', 'firstname', 'email', 'created_at', 'updated_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode'], axis = 1, inplace=True)\n", - "df1_tickets_clean.rename(columns = {'id' : 'customer_id'}, inplace = True)" - ] - }, - { - "cell_type": "markdown", - "id": "22bb5de4", - "metadata": { - "jp-MarkdownHeadingCollapsed": true - }, - "source": [ - "## suppliers.csv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6a9a91f4", - "metadata": {}, - "outputs": [], - "source": [ - "df1_suppliers" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bab4758a", - "metadata": {}, - "outputs": [], - "source": [ - "df1_suppliers.info()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b5fff251", - "metadata": {}, - "outputs": [], - "source": [ - "df1_suppliers.isna().sum()/len(df1_suppliers)*100" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8b09e2a3", - "metadata": {}, - "outputs": [], - "source": [ - "# Selection des variables\n", - "df1_suppliers_clean = df1_suppliers[['id', 'name']]\n", - "df1_suppliers_clean.rename(columns = {'name' : 'supplier_name'}, inplace = True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ecee7cdc", - "metadata": {}, - "outputs": [], - "source": [ - "df1_suppliers_clean" - ] - }, - { - "cell_type": "markdown", - "id": "c8e6e69b", - "metadata": { - "jp-MarkdownHeadingCollapsed": true - }, - "source": [ - "## type_ofs.csv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1a6cff1f", - "metadata": {}, - "outputs": [], - "source": [ - "df1_type_ofs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "93630b41", - "metadata": {}, - "outputs": [], - "source": [ - "df1_type_ofs.info()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4f94481a", - "metadata": {}, - "outputs": [], - "source": [ - "# Selection des variables\n", - "df1_type_ofs_clean = df1_type_ofs[['id', 'name', 'children']]\n", - "df1_type_ofs_clean.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)" - ] - }, - { - "cell_type": "markdown", - "id": "1b2811e2", - "metadata": { - "jp-MarkdownHeadingCollapsed": true - }, - "source": [ - "## purchases.csv" - ] - }, { "cell_type": "code", "execution_count": null, @@ -1635,105 +1406,6 @@ "df1_purchases_clean = df1_purchases[['id', 'purchase_date', 'customer_id']]" ] }, - { - "cell_type": "markdown", - "id": "f210e730", - "metadata": { - "jp-MarkdownHeadingCollapsed": true - }, - "source": [ - "## Fusion de l'ensemble des données billétiques" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1f8b3aa7", - "metadata": {}, - "outputs": [], - "source": [ - "# Fusion avec fournisseurs\n", - "df1_ticket_information = pd.merge(df1_tickets_clean, df1_suppliers_clean, left_on = 'supplier_id', right_on = 'id', how = 'inner')\n", - "df1_ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)\n", - "\n", - "# Fusion avec type de tickets\n", - "df1_ticket_information = pd.merge(df1_ticket_information, df1_type_ofs_clean, left_on = 'type_of', right_on = 'id', how = 'inner')\n", - "df1_ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)\n", - "\n", - "# Fusion avec achats\n", - "df1_ticket_information = pd.merge(df1_ticket_information, df1_purchases_clean, left_on = 'purchase_id', right_on = 'id', how = 'inner')\n", - "df1_ticket_information.drop(['purchase_id', 'id'], axis = 1, inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "83a4d021", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "df1_ticket_information" - ] - }, - { - "cell_type": "markdown", - "id": "56e6ebd1", - "metadata": { - "jp-MarkdownHeadingCollapsed": true - }, - "source": [ - "# Utilisation de fonctions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "88fcde4b", - "metadata": {}, - "outputs": [], - "source": [ - "# Créer un DataFrame exemple\n", - "df_not_clean = df1_campaign_stats[['opened_at']].head(20)\n", - "\n", - "# Appliquer la fonction pour nettoyer la colonne 'purchase_date' de manière vectorisée\n", - "df_clean = cleaning_date(df_not_clean, 'opened_at')\n", - "df_clean.rename(columns = {'opened_at' : 'opened_at_clean'}, inplace = True)\n", - "\n", - "test = pd.concat([df1_campaign_stats[['opened_at']].head(20), df_clean], axis=1)\n", - "\n", - "test.info()" - ] - }, - { - "cell_type": "markdown", - "id": "818f69db", - "metadata": {}, - "source": [ - "## Nettoyage, selection et fusion" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c9654eda", - "metadata": {}, - "outputs": [], - "source": [ - "df1_ticket_information" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7f2b620c", - "metadata": {}, - "outputs": [], - "source": [ - "df1_ticket_information.info()" - ] - }, { "cell_type": "markdown", "id": "637bdb72", @@ -1745,9 +1417,11 @@ { "cell_type": "markdown", "id": "14c52894", - "metadata": {}, + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, "source": [ - "## Target area" + "## Target area - NLP" ] }, { @@ -1919,254 +1593,6 @@ "# Afficher le résultat\n", "print(df)\n" ] - }, - { - "cell_type": "markdown", - "id": "711d3884", - "metadata": { - "jp-MarkdownHeadingCollapsed": true - }, - "source": [ - "## Campaign area" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c25b5295", - "metadata": {}, - "outputs": [], - "source": [ - "# campaign_stats cleaning \n", - "df1_campaign_stats_clean = df1_campaign_stats[[\"id\", \"campaign_id\", \"customer_id\", \"opened_at\", \"sent_at\", \"delivered_at\"]]\n", - "cleaning_date(df1_campaign_stats_clean, 'opened_at')\n", - "cleaning_date(df1_campaign_stats_clean, 'sent_at')\n", - "cleaning_date(df1_campaign_stats_clean, 'delivered_at')\n", - "\n", - "# campaigns cleaning\n", - "df1_campaigns_clean = df1_campaigns[[\"id\", \"name\", \"service_id\", \"sent_at\"]].add_prefix(\"campaign_\")\n", - "cleaning_date(df1_campaigns_clean, 'campaign_sent_at')\n", - "\n", - "# Merge \n", - "df1_campaigns_full = pd.merge(df1_campaign_stats_clean, df1_campaigns_clean, on = \"campaign_id\", how = \"left\")\n", - "df1_campaigns_full.drop(['campaign_id'], axis = 1, inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2a3de6a5", - "metadata": {}, - "outputs": [], - "source": [ - "df1_campaigns_full.info()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3fc1f446", - "metadata": {}, - "outputs": [], - "source": [ - "df1_campaigns_information" - ] - }, - { - "cell_type": "markdown", - "id": "20e69ee3", - "metadata": { - "jp-MarkdownHeadingCollapsed": true - }, - "source": [ - "## Link area" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d9cbdbce", - "metadata": {}, - "outputs": [], - "source": [ - "df1_campaigns" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c07459f0", - "metadata": {}, - "outputs": [], - "source": [ - "df1_link_stats" - ] - }, - { - "cell_type": "markdown", - "id": "80ae4c42", - "metadata": { - "jp-MarkdownHeadingCollapsed": true - }, - "source": [ - "## Supplier" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b50b8f95", - "metadata": {}, - "outputs": [], - "source": [ - "# Fonction d'exploration pour suppliers.csv = label itr et commission inconnues\n", - "def suppliers_exploration(suppliers = None) : \n", - " \n", - " # Taux de NaN pour ces colonnes\n", - " label_na = suppliers['label'].isna().sum()/len(suppliers)*100\n", - " itr_na = suppliers['itr'].isna().sum()/len(suppliers)*100\n", - " commission_na = suppliers['commission'].isna().sum()/len(suppliers)*100\n", - "\n", - " suppliers_desc = pd.DataFrame({'nb_suppliers' : [suppliers['name'].nunique()],\n", - " 'label_na' : [label_na],\n", - " 'itr_na' : [itr_na],\n", - " 'commission_na' : [commission_na]})\n", - "\n", - " return suppliers_desc" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7e292935", - "metadata": {}, - "outputs": [], - "source": [ - "df1_suppliers_desc = suppliers_exploration(suppliers = df1_suppliers)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "05b6f2b0", - "metadata": {}, - "outputs": [], - "source": [ - "df1_suppliers_desc" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c9324d80", - "metadata": {}, - "outputs": [], - "source": [ - "BUCKET = \"bdc2324-data\"\n", - "liste_folders = fs.ls(BUCKET)\n", - "\n", - "liste_files = []\n", - "for company_folder in liste_folders : \n", - " liste_files.extend(fs.ls(company_folder))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "10304058", - "metadata": {}, - "outputs": [], - "source": [ - "liste_database_select = ['suppliers']\n", - "\n", - "# Filtrer la liste pour les éléments contenant au moins un élément de la liste à tester\n", - "liste_suppliers = [element for element in liste_files if any(element_part in element for element_part in liste_database_select)]\n", - "\n", - "# Afficher le résultat\n", - "print(liste_suppliers)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ffa423e5", - "metadata": {}, - "outputs": [], - "source": [ - "# loop to create dataframes from file 2\n", - "def database_loading(database_name = None):\n", - " files_path = database_name\n", - " \n", - " client_number = files_path.split(\"/\")[1]\n", - " df_prefix = \"df\" + str(client_number) + \"_\"\n", - " \n", - " current_path = files_path\n", - " with fs.open(current_path, mode=\"rb\") as file_in:\n", - " df = pd.read_csv(file_in)\n", - "\n", - " return df, client_number" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "70bdc88d", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6a0f567d", - "metadata": {}, - "outputs": [], - "source": [ - "df_all = pd.DataFrame()\n", - "\n", - "for link in liste_suppliers:\n", - " \n", - " df_supplier, tenant_id = database_loading(link)\n", - " \n", - " df_supplier['tenant_id'] = int(tenant_id)\n", - "\n", - " df_all = pd.concat([df_all, df_supplier], axis = 0)\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1522d8cd", - "metadata": {}, - "outputs": [], - "source": [ - "# df_all[df_all['tenant_id'] == 101]['name'].unique()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b0e42a61", - "metadata": {}, - "outputs": [], - "source": [ - "liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online'] \n", - "# vad = vente à distance\n", - "df_all['name'] = df_all['name'].fillna('')\n", - "\n", - "df_all['canal_vente_internet'] = df_all['name'].str.contains('|'.join(liste_mots), case=False).astype(int)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d299ae91", - "metadata": {}, - "outputs": [], - "source": [ - "df_all.groupby('tenant_id')['canal_vente_internet'].max()" - ] } ], "metadata": {