BDC-team-1/Exploration_billet_AJ.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "5bf5c226",
   "metadata": {},
   "source": [
    "# Business Data Challenge - Team 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b1a5b9d3",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "import s3fs\n",
    "import re\n",
    "import warnings"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ecfa2219",
   "metadata": {},
   "source": [
    "Configuration de l'accès aux données"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "1a094277",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the S3 filesystem client from the endpoint exposed in the environment\n",
    "S3_ENDPOINT_URL = f\"https://{os.environ['AWS_S3_ENDPOINT']}\"\n",
    "fs = s3fs.S3FileSystem(client_kwargs=dict(endpoint_url=S3_ENDPOINT_URL))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0294ce71-840e-458b-8ffa-cadabbc6da21",
   "metadata": {},
   "source": [
    "# Début du travail 25/02"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "30d77451-2df6-4c07-8b15-66e0e990ff03",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create filesystem object\n",
    "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
    "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n",
    "\n",
    "\n",
    "# Import cleaning and merge functions.\n",
    "# The script is exec'd in the notebook's global namespace, so every name it defines\n",
    "# becomes available to the cells below. The context manager closes the file handle\n",
    "# that a bare open(...).read() would leave open.\n",
    "# NOTE(review): exec runs whatever the file contains - only use it with a trusted script.\n",
    "with open('0_KPI_functions.py') as kpi_script:\n",
    "    exec(kpi_script.read())\n",
    "\n",
    "# Silence all warnings for the rest of the session\n",
    "warnings.filterwarnings('ignore')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "f1b44d3e-76bb-4860-b9db-a2840db7cf39",
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_dataset_2(directory_path, file_name):\n",
    "    \"\"\"\n",
    "    Read one CSV table from the `bdc2324-data` bucket into a DataFrame.\n",
    "\n",
    "    The file opened is `bdc2324-data/<directory_path>/<directory_path><file_name>.csv`,\n",
    "    read through the notebook-level `fs` filesystem object.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    directory_path : str\n",
    "        Folder name inside the bucket; also used as the file-name prefix.\n",
    "    file_name : str\n",
    "        Table name, without the prefix and without the `.csv` extension.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        The table content, without the `identifier` column when the file has one.\n",
    "        Columns containing missing values are kept (no NA filtering is applied).\n",
    "    \"\"\"\n",
    "    csv_name = directory_path + file_name + \".csv\"\n",
    "    file_path = \"/\".join([\"bdc2324-data\", directory_path, csv_name])\n",
    "\n",
    "    with fs.open(file_path, mode=\"rb\") as file_in:\n",
    "        table = pd.read_csv(file_in, sep=\",\")\n",
    "\n",
    "    # Drop the identifier column when the table has one\n",
    "    if 'identifier' not in table.columns:\n",
    "        return table\n",
    "    return table.drop(columns='identifier')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "35da2e15-1e23-4653-a214-c6ff8f186e85",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "File path :  projet-bdc2324-team1/0_Input/Company_4/customerplus_cleaned.csv\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>customer_id</th>\n",
       "      <th>street_id</th>\n",
       "      <th>structure_id</th>\n",
       "      <th>mcp_contact_id</th>\n",
       "      <th>fidelity</th>\n",
       "      <th>tenant_id</th>\n",
       "      <th>is_partner</th>\n",
       "      <th>deleted_at</th>\n",
       "      <th>gender</th>\n",
       "      <th>is_email_true</th>\n",
       "      <th>...</th>\n",
       "      <th>max_price</th>\n",
       "      <th>ticket_sum</th>\n",
       "      <th>average_price</th>\n",
       "      <th>average_purchase_delay</th>\n",
       "      <th>average_price_basket</th>\n",
       "      <th>average_ticket_basket</th>\n",
       "      <th>total_price</th>\n",
       "      <th>purchase_count</th>\n",
       "      <th>first_buying_date</th>\n",
       "      <th>country</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>479734</td>\n",
       "      <td>3587</td>\n",
       "      <td>NaN</td>\n",
       "      <td>184801.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1342</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>fr</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1537</td>\n",
       "      <td>1352</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>1342</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>fr</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>504615</td>\n",
       "      <td>3587</td>\n",
       "      <td>NaN</td>\n",
       "      <td>152176.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1342</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>fr</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3832780</td>\n",
       "      <td>3587</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>1342</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>fr</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3096540</td>\n",
       "      <td>3587</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>1342</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>fr</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>320804</th>\n",
       "      <td>2637745</td>\n",
       "      <td>406842</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>1342</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>2021-12-08 20:30:11+00:00</td>\n",
       "      <td>fr</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>320805</th>\n",
       "      <td>23334</td>\n",
       "      <td>22677</td>\n",
       "      <td>NaN</td>\n",
       "      <td>185203.0</td>\n",
       "      <td>4</td>\n",
       "      <td>1342</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "      <td>...</td>\n",
       "      <td>13.0</td>\n",
       "      <td>13</td>\n",
       "      <td>11.692308</td>\n",
       "      <td>0.0</td>\n",
       "      <td>25.333333</td>\n",
       "      <td>2.166667</td>\n",
       "      <td>152.0</td>\n",
       "      <td>6</td>\n",
       "      <td>2018-05-02 07:47:40+00:00</td>\n",
       "      <td>fr</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>320806</th>\n",
       "      <td>2641373</td>\n",
       "      <td>408068</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>1342</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "      <td>...</td>\n",
       "      <td>12.0</td>\n",
       "      <td>4</td>\n",
       "      <td>12.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>48.000000</td>\n",
       "      <td>4.000000</td>\n",
       "      <td>48.0</td>\n",
       "      <td>1</td>\n",
       "      <td>2021-12-09 11:46:23+00:00</td>\n",
       "      <td>fr</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>320807</th>\n",
       "      <td>2641469</td>\n",
       "      <td>408160</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>1342</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "      <td>...</td>\n",
       "      <td>12.0</td>\n",
       "      <td>1</td>\n",
       "      <td>12.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>12.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>12.0</td>\n",
       "      <td>1</td>\n",
       "      <td>2021-12-09 18:50:55+00:00</td>\n",
       "      <td>fr</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>320808</th>\n",
       "      <td>2641474</td>\n",
       "      <td>408165</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>1342</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "      <td>...</td>\n",
       "      <td>12.0</td>\n",
       "      <td>1</td>\n",
       "      <td>12.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>12.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>12.0</td>\n",
       "      <td>1</td>\n",
       "      <td>2021-12-09 19:02:42+00:00</td>\n",
       "      <td>fr</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>320809 rows × 22 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        customer_id  street_id  structure_id  mcp_contact_id  fidelity  \\\n",
       "0            479734       3587           NaN        184801.0         0   \n",
       "1              1537       1352           NaN             NaN         0   \n",
       "2            504615       3587           NaN        152176.0         0   \n",
       "3           3832780       3587           NaN             NaN         0   \n",
       "4           3096540       3587           NaN             NaN         0   \n",
       "...             ...        ...           ...             ...       ...   \n",
       "320804      2637745     406842           NaN             NaN         1   \n",
       "320805        23334      22677           NaN        185203.0         4   \n",
       "320806      2641373     408068           NaN             NaN         1   \n",
       "320807      2641469     408160           NaN             NaN         1   \n",
       "320808      2641474     408165           NaN             NaN         1   \n",
       "\n",
       "        tenant_id  is_partner  deleted_at  gender  is_email_true  ...  \\\n",
       "0            1342       False         NaN       0           True  ...   \n",
       "1            1342       False         NaN       0           True  ...   \n",
       "2            1342       False         NaN       0           True  ...   \n",
       "3            1342       False         NaN       2           True  ...   \n",
       "4            1342       False         NaN       2           True  ...   \n",
       "...           ...         ...         ...     ...            ...  ...   \n",
       "320804       1342       False         NaN       0           True  ...   \n",
       "320805       1342       False         NaN       0           True  ...   \n",
       "320806       1342       False         NaN       0           True  ...   \n",
       "320807       1342       False         NaN       0           True  ...   \n",
       "320808       1342       False         NaN       0           True  ...   \n",
       "\n",
       "        max_price ticket_sum  average_price  average_purchase_delay  \\\n",
       "0             NaN          0            NaN                     NaN   \n",
       "1             NaN          0            NaN                     NaN   \n",
       "2             NaN          0            NaN                     NaN   \n",
       "3             NaN          0            NaN                     NaN   \n",
       "4             NaN          0            NaN                     NaN   \n",
       "...           ...        ...            ...                     ...   \n",
       "320804        0.0          2       0.000000                     2.0   \n",
       "320805       13.0         13      11.692308                     0.0   \n",
       "320806       12.0          4      12.000000                     0.0   \n",
       "320807       12.0          1      12.000000                     0.0   \n",
       "320808       12.0          1      12.000000                     0.0   \n",
       "\n",
       "        average_price_basket  average_ticket_basket  total_price  \\\n",
       "0                        NaN                    NaN          0.0   \n",
       "1                        NaN                    NaN          0.0   \n",
       "2                        NaN                    NaN          0.0   \n",
       "3                        NaN                    NaN          0.0   \n",
       "4                        NaN                    NaN          0.0   \n",
       "...                      ...                    ...          ...   \n",
       "320804              0.000000               2.000000          0.0   \n",
       "320805             25.333333               2.166667        152.0   \n",
       "320806             48.000000               4.000000         48.0   \n",
       "320807             12.000000               1.000000         12.0   \n",
       "320808             12.000000               1.000000         12.0   \n",
       "\n",
       "        purchase_count          first_buying_date  country  \n",
       "0                    0                        NaN       fr  \n",
       "1                    0                        NaN       fr  \n",
       "2                    0                        NaN       fr  \n",
       "3                    0                        NaN       fr  \n",
       "4                    0                        NaN       fr  \n",
       "...                ...                        ...      ...  \n",
       "320804               1  2021-12-08 20:30:11+00:00       fr  \n",
       "320805               6  2018-05-02 07:47:40+00:00       fr  \n",
       "320806               1  2021-12-09 11:46:23+00:00       fr  \n",
       "320807               1  2021-12-09 18:50:55+00:00       fr  \n",
       "320808               1  2021-12-09 19:02:42+00:00       fr  \n",
       "\n",
       "[320809 rows x 22 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "display_databases(\"4\", \"customerplus_cleaned\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "6c8ad8c3-25df-4fe4-9ad0-ee5f9498bc14",
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.reset_option('display.max_rows')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "c897916c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>name</th>\n",
       "      <th>code</th>\n",
       "      <th>created_at</th>\n",
       "      <th>updated_at</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>101</td>\n",
       "      <td>hongrie</td>\n",
       "      <td>hu</td>\n",
       "      <td>2023-06-13 11:17:40.600622+02:00</td>\n",
       "      <td>2023-06-13 11:17:40.600622+02:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>albanie</td>\n",
       "      <td>al</td>\n",
       "      <td>2023-06-13 11:17:40.540652+02:00</td>\n",
       "      <td>2023-06-13 11:17:40.540652+02:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>antarctique</td>\n",
       "      <td>aq</td>\n",
       "      <td>2023-06-13 11:17:40.541315+02:00</td>\n",
       "      <td>2023-06-13 11:17:40.541315+02:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>12</td>\n",
       "      <td>autriche</td>\n",
       "      <td>at</td>\n",
       "      <td>2023-06-13 11:17:40.546711+02:00</td>\n",
       "      <td>2023-06-13 11:17:40.546711+02:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>samoa américaines</td>\n",
       "      <td>as</td>\n",
       "      <td>2023-06-13 11:17:40.542569+02:00</td>\n",
       "      <td>2023-06-13 11:17:40.542569+02:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>238</th>\n",
       "      <td>228</td>\n",
       "      <td>royaume-uni</td>\n",
       "      <td>gb</td>\n",
       "      <td>2023-06-13 11:17:40.678023+02:00</td>\n",
       "      <td>2023-06-13 11:17:40.678023+02:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>239</th>\n",
       "      <td>25</td>\n",
       "      <td>brésil</td>\n",
       "      <td>br</td>\n",
       "      <td>2023-06-13 11:17:40.554209+02:00</td>\n",
       "      <td>2023-06-13 11:17:40.554209+02:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>240</th>\n",
       "      <td>10</td>\n",
       "      <td>argentine</td>\n",
       "      <td>ar</td>\n",
       "      <td>2023-06-13 11:17:40.545489+02:00</td>\n",
       "      <td>2023-06-13 11:17:40.545489+02:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>241</th>\n",
       "      <td>203</td>\n",
       "      <td>espagne</td>\n",
       "      <td>es</td>\n",
       "      <td>2023-06-13 11:17:40.662472+02:00</td>\n",
       "      <td>2023-06-13 11:17:40.662472+02:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>242</th>\n",
       "      <td>192</td>\n",
       "      <td>arabie saoudite</td>\n",
       "      <td>sa</td>\n",
       "      <td>2023-06-13 11:17:40.656154+02:00</td>\n",
       "      <td>2023-06-13 11:17:40.656154+02:00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>243 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      id               name code                        created_at  \\\n",
       "0    101            hongrie   hu  2023-06-13 11:17:40.600622+02:00   \n",
       "1      2            albanie   al  2023-06-13 11:17:40.540652+02:00   \n",
       "2      3        antarctique   aq  2023-06-13 11:17:40.541315+02:00   \n",
       "3     12           autriche   at  2023-06-13 11:17:40.546711+02:00   \n",
       "4      5  samoa américaines   as  2023-06-13 11:17:40.542569+02:00   \n",
       "..   ...                ...  ...                               ...   \n",
       "238  228        royaume-uni   gb  2023-06-13 11:17:40.678023+02:00   \n",
       "239   25             brésil   br  2023-06-13 11:17:40.554209+02:00   \n",
       "240   10          argentine   ar  2023-06-13 11:17:40.545489+02:00   \n",
       "241  203            espagne   es  2023-06-13 11:17:40.662472+02:00   \n",
       "242  192    arabie saoudite   sa  2023-06-13 11:17:40.656154+02:00   \n",
       "\n",
       "                           updated_at  \n",
       "0    2023-06-13 11:17:40.600622+02:00  \n",
       "1    2023-06-13 11:17:40.540652+02:00  \n",
       "2    2023-06-13 11:17:40.541315+02:00  \n",
       "3    2023-06-13 11:17:40.546711+02:00  \n",
       "4    2023-06-13 11:17:40.542569+02:00  \n",
       "..                                ...  \n",
       "238  2023-06-13 11:17:40.678023+02:00  \n",
       "239  2023-06-13 11:17:40.554209+02:00  \n",
       "240  2023-06-13 11:17:40.545489+02:00  \n",
       "241  2023-06-13 11:17:40.662472+02:00  \n",
       "242  2023-06-13 11:17:40.656154+02:00  \n",
       "\n",
       "[243 rows x 5 columns]"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "load_dataset_2(\"7\", \"countries\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ca2c8b6a-4965-422e-ba7c-66423a464fc1",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true
   },
   "source": [
    "## Bases communes aux types Musée"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f8f988fb-5aab-4b57-80d1-e242f7e5b384",
   "metadata": {},
   "outputs": [],
   "source": [
    "companies = {'musee' : ['1', '2', '3', '4', '101'],\n",
    "            'sport': ['5', '6', '7', '8', '9'],\n",
    "            'musique' : ['10', '11', '12', '13', '14']}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dbce1124-9a22-4502-a47a-fc3d0e2db70b",
   "metadata": {},
   "outputs": [],
   "source": [
    "companies['musee']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5080f66e-f779-410a-876d-b4fe2795e17e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# List the table names available for each company and expose them as notebook-level\n",
    "# variables named base_<company_id>.\n",
    "# Every company type is scanned: the intersection cells below read the 'musee' lists\n",
    "# (base_1 ... base_101) as well as the 'musique' ones, so scanning a single type\n",
    "# leaves some of those names undefined on a fresh kernel.\n",
    "table_name_pattern = re.compile(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$')\n",
    "\n",
    "for company_ids in companies.values():\n",
    "    for i in company_ids:\n",
    "        BUCKET = \"bdc2324-data/\"+i\n",
    "        liste_base = []\n",
    "        for base in fs.ls(BUCKET):\n",
    "            # Paths look like .../<id>/<id><table_name>.csv ; group 3 is the table name\n",
    "            match = table_name_pattern.search(base)\n",
    "            if match:\n",
    "                liste_base.append(match.group(3))\n",
    "        globals()['base_'+i] = liste_base\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "abd477e1-7479-4c88-a5aa-f987af3f5b79",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Trouver l'intersection entre les cinq listes\n",
    "intersection = set(base_1).intersection(base_2, base_3, base_4, base_101)\n",
    "\n",
    "# Convertir le résultat en liste si nécessaire\n",
    "intersection_liste = list(intersection)\n",
    "\n",
    "print(intersection_liste)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8d93888f-a511-4ee5-8bc3-d5173a7f119e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Trouver l'intersection entre les cinq listes\n",
    "intersection = set(base_10).intersection(base_12, base_13, base_14, base_11)\n",
    "\n",
    "# Convertir le résultat en liste si nécessaire\n",
    "intersection_liste = list(intersection)\n",
    "\n",
    "print(intersection_liste)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "10e89669-42bb-4652-a4bc-1a3d1caf4d1a",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(intersection_liste)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7d058b21-a538-4f59-aefb-ef7966f73fdc",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_tags = load_dataset_2(\"1\", \"tags\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aa441f99-733c-4675-8676-bed4682d3324",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_structure_tag_mappings = load_dataset_2(\"1\", 'structure_tag_mappings')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6767a750-14a4-4c05-903e-d2f07170825b",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_customersplus = load_dataset_2(\"1\", \"customersplus\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "125e9145-a815-46fd-bdf4-07589508b259",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_customersplus.groupby('structure_id')['id'].count().reset_index().sort_values('id', ascending=False).head(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c17a6976-792f-474d-bcff-c89396eddb3f",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_customersplus['structure_id'].isna().sum() / len(df1_customersplus['structure_id'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ecfc155a-cb42-46ec-8da5-33fdcd087355",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(df1_structure_tag_mappings)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "071410b8-950d-4fcc-b2b9-57415253c286",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_structure_tag_mappings.groupby('tag_id')['structure_id'].count().reset_index().sort_values('structure_id', ascending=False).head(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f48d27a9-14e4-4bb9-a60a-73e9438b58fc",
   "metadata": {},
   "outputs": [],
   "source": [
    "?pd.DataFrame.sort_values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "14eaa0ea-02cc-430b-ab9b-38e6637810c3",
   "metadata": {},
   "outputs": [],
   "source": [
    "def info_colonnes_dataframe(df):\n",
    "    # Créer une liste pour stocker les informations sur chaque colonne\n",
    "    infos_colonnes = []\n",
    "\n",
    "    # Parcourir les colonnes du DataFrame\n",
    "    for nom_colonne, serie in df.items():  # Utiliser items() au lieu de iteritems()\n",
    "        # Calculer le taux de valeurs manquantes\n",
    "        taux_na = serie.isna().mean() * 100\n",
    "\n",
    "        # Ajouter les informations à la liste\n",
    "        infos_colonnes.append({\n",
    "            'Nom_colonne': nom_colonne,\n",
    "            'Type_colonne': str(serie.dtype),\n",
    "            'Taux_NA': taux_na\n",
    "        })\n",
    "\n",
    "    # Créer une nouvelle DataFrame à partir de la liste d'informations\n",
    "    df_infos_colonnes = pd.DataFrame(infos_colonnes)\n",
    "\n",
    "    return df_infos_colonnes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6b031c32-d4c8-42a5-9a71-a7810f9bf8d8",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "info_colonnes_dataframe(df1_tags)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e1a87f27-c4d4-4832-ac20-0c3c54aa4980",
   "metadata": {},
   "outputs": [],
   "source": [
    "info_colonnes_dataframe(df1_structure_tag_mappings)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fa5c65a8-2f74-4f3f-85fc-9ac91e0bb361",
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.set_option('display.max_colwidth', None)\n",
    "\n",
    "print(df1_tags['name'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a59bf932-5b54-4600-81f5-c55ac93ae510",
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.set_option('display.max_rows', None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a4ab298e-2cae-4865-9f00-4caff5f75ea1",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "print(df1_tags['name'])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "76bffba1-5f7e-4308-9224-437ca66148f8",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true
   },
   "source": [
    "## KPI sur target_type"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d91d5895",
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.set_option('display.max_colwidth', None)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c58b17d3",
   "metadata": {},
   "source": [
    "Raisonnement : on prend les target_type qui représentent 90 % des clients et on en fait des catégories."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d74426b3",
   "metadata": {},
   "outputs": [],
   "source": [
    "targets = load_dataset_2(\"3\", \"targets\")\n",
    "target_types = load_dataset_2(\"3\", \"target_types\")\n",
    "\n",
    "# target_all = pd.merge(targets, target_types, left_on= 'target_type_id', right_on= 'id' ,how = 'inner')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6930bff5",
   "metadata": {},
   "outputs": [],
   "source": [
    "def print_main_target(tenant_id, nb_print = 40):\n",
    "    \"\"\"\n",
    "    Rank the targets of one tenant by number of rows and return the top ones.\n",
    "\n",
    "    The table is obtained from `display_databases(tenant_id, \"target_information\")`,\n",
    "    a helper defined outside this notebook (presumably it loads the tenant's\n",
    "    `target_information` table - verify in 0_KPI_functions.py). Two counters are\n",
    "    printed: the number of rows and the number of distinct `customer_id` values.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    tenant_id : str\n",
    "        Company identifier, forwarded to `display_databases`.\n",
    "    nb_print : int, default 40\n",
    "        Number of rows of the ranking to return.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        One row per `target_name`, sorted by decreasing count, with\n",
    "        `customer_id` = rows of that target / number of distinct customers and\n",
    "        `cumulative_customers` = running row count / total number of rows.\n",
    "    \"\"\"\n",
    "    target_info = display_databases(tenant_id, \"target_information\")\n",
    "\n",
    "    total_rows = len(target_info)\n",
    "    print('Nombre de ciblage : ', total_rows)\n",
    "    unique_customers = target_info['customer_id'].nunique()\n",
    "    print('Nombre de client avec étiquette target : ', unique_customers) \n",
    "\n",
    "    # Non-null customer_id rows per target, largest targets first\n",
    "    per_target = (\n",
    "        target_info.groupby(\"target_name\")['customer_id']\n",
    "        .count()\n",
    "        .reset_index()\n",
    "        .sort_values('customer_id', ascending=False)\n",
    "    )\n",
    "\n",
    "    # NOTE(review): the two ratios use different denominators (total rows vs\n",
    "    # distinct customers) - confirm this is intended.\n",
    "    # The cumulative share must be computed before customer_id is overwritten.\n",
    "    per_target['cumulative_customers'] = per_target['customer_id'].cumsum() / total_rows\n",
    "    per_target['customer_id'] = per_target['customer_id'] / unique_customers\n",
    "\n",
    "    return per_target.head(nb_print)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1e7ee1a0",
   "metadata": {},
   "outputs": [],
   "source": [
    "print_main_target('1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b57a28ac",
   "metadata": {},
   "outputs": [],
   "source": [
    "print_main_target('2', 25)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9a65991f",
   "metadata": {},
   "outputs": [],
   "source": [
    "print_main_target('3', 40)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c66a4dc1",
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.set_option('display.max_rows', None)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5f34b8bf",
   "metadata": {},
   "outputs": [],
   "source": [
    "print_main_target('4', 80)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "40fe3676",
   "metadata": {},
   "outputs": [],
   "source": [
    "print_main_target('101', 100)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "605cced5-052f-4a99-ac26-020c5d2ab633",
   "metadata": {},
   "source": [
    "## KPI sur tags"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "916c3e2b-04d3-4877-b894-8f26f10d926e",
   "metadata": {},
   "outputs": [],
   "source": [
    "customersplus = load_dataset_2(\"4\", \"customersplus\")[['id', 'structure_id']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "46847b24-15a4-464e-969f-f16ed3653f1f",
   "metadata": {},
   "outputs": [],
   "source": [
    "structure_tag_mappings = load_dataset_2('4', \"structure_tag_mappings\")[['structure_id', 'tag_id']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "3c10c69d-735f-453e-96bf-750697d965d0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "19427"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "customersplus[customersplus['structure_id'].notna()]['structure_id'].nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "9b0e77b3-5f16-4484-9564-7d3826583418",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "33645"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(customersplus[customersplus['structure_id'].notna()])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "dfa27722-37f9-435a-8221-8aa6f9a4a107",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3431"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "structure_tag_mappings['structure_id'].nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "2daabdd5-31e3-4918-9856-9bbc30cde602",
   "metadata": {},
   "outputs": [],
   "source": [
    "def tags_information(tenant_id, first_tags):\n",
    "    \"\"\"\n",
    "    Describe how structure tags are distributed among the customers of a tenant.\n",
    "\n",
    "    Customers are linked to tags through their structure:\n",
    "    customersplus.structure_id -> structure_tag_mappings.structure_id -> tags.id.\n",
    "    Three figures are printed: the number of customers with at least one tag,\n",
    "    their share among all customers, and the mean number of tags per tagged customer.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    tenant_id : tenant identifier forwarded to load_dataset_2.\n",
    "    first_tags : number of most frequent tags to return.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    DataFrame with columns tag_id, tag_name and customer_id (number of\n",
    "    customer rows carrying the tag), sorted by decreasing count and limited\n",
    "    to the first `first_tags` rows.\n",
    "    \"\"\"\n",
    "    customersplus = (\n",
    "        load_dataset_2(tenant_id, \"customersplus\")[['id', 'structure_id']]\n",
    "        .rename(columns = {'id' : 'customer_id'})\n",
    "    )\n",
    "    tags = (\n",
    "        load_dataset_2(tenant_id, \"tags\")[['id', 'name']]\n",
    "        .rename(columns = {'id' : 'tag_id', 'name' : 'tag_name'})\n",
    "    )\n",
    "    structure_tag_mappings = load_dataset_2(tenant_id, \"structure_tag_mappings\")[['structure_id', 'tag_id']]\n",
    "\n",
    "    # pandas matches NaN join keys with each other: without this filtering, every\n",
    "    # customer lacking a structure would inherit the tags of all mapping rows\n",
    "    # whose structure_id is also missing.\n",
    "    customers_with_structure = customersplus.dropna(subset = ['structure_id'])\n",
    "    valid_mappings = structure_tag_mappings.dropna(subset = ['structure_id', 'tag_id'])\n",
    "\n",
    "    # Inner joins only: a customer without any mapped tag carries no information here\n",
    "    customer_tags = pd.merge(customers_with_structure, valid_mappings, on = 'structure_id', how = 'inner')\n",
    "    customer_tags = pd.merge(customer_tags, tags, on = 'tag_id', how = 'inner')\n",
    "\n",
    "    nb_customers = len(customersplus)\n",
    "    nb_customers_with_tag = customer_tags['customer_id'].nunique()\n",
    "\n",
    "    # Guard the ratios so an empty table or a tenant without tags yields NaN instead of raising\n",
    "    share_with_tag = nb_customers_with_tag/nb_customers if nb_customers else float('nan')\n",
    "    mean_tags = len(customer_tags)/nb_customers_with_tag if nb_customers_with_tag else float('nan')\n",
    "\n",
    "    print('Nombre de client avec tag : ', nb_customers_with_tag)\n",
    "    print('Proportion de clients avec tags : ', share_with_tag)\n",
    "    print('Moyenne de tags par client : ', mean_tags)\n",
    "\n",
    "    # NOTE: groupby drops groups whose key is NaN by default, so tags without a\n",
    "    # name are counted in the figures above but do not appear in this table.\n",
    "    info = (\n",
    "        customer_tags\n",
    "        .groupby(['tag_id', 'tag_name'])['customer_id']\n",
    "        .count()\n",
    "        .reset_index()\n",
    "        .sort_values('customer_id', ascending = False)\n",
    "        .head(first_tags)\n",
    "    )\n",
    "\n",
    "    return info"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "0b9f5f71-a927-4cc8-bb0c-9538e28d3553",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Nombre de client avec tag :  13320\n",
      "Proportion de clients avec tags :  0.0877089012682233\n",
      "Moyenne de tags par client :  2.1725975975975977\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>tag_id</th>\n",
       "      <th>tag_name</th>\n",
       "      <th>customer_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>11029.0</td>\n",
       "      <td>individuels</td>\n",
       "      <td>3270</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>11047.0</td>\n",
       "      <td>groupes scolaires</td>\n",
       "      <td>2417</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>11033.0</td>\n",
       "      <td>association</td>\n",
       "      <td>2308</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>11028.0</td>\n",
       "      <td>structures culturelles</td>\n",
       "      <td>2011</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>11051.0</td>\n",
       "      <td>etablissement ens scolaire</td>\n",
       "      <td>1732</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>11036.0</td>\n",
       "      <td>champ social</td>\n",
       "      <td>1603</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43</th>\n",
       "      <td>11072.0</td>\n",
       "      <td>etab d'enseignement</td>\n",
       "      <td>1036</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>11043.0</td>\n",
       "      <td>etablissement public</td>\n",
       "      <td>935</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>11035.0</td>\n",
       "      <td>organisme de tourisme</td>\n",
       "      <td>892</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>11045.0</td>\n",
       "      <td>centre de loisirs</td>\n",
       "      <td>864</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44</th>\n",
       "      <td>11073.0</td>\n",
       "      <td>musée, site &amp; fondation</td>\n",
       "      <td>786</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>11053.0</td>\n",
       "      <td>groupes etudiants</td>\n",
       "      <td>758</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>11032.0</td>\n",
       "      <td>entreprise</td>\n",
       "      <td>750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>11039.0</td>\n",
       "      <td>etablissement d'enseignement</td>\n",
       "      <td>741</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>11034.0</td>\n",
       "      <td>asso. culturelle</td>\n",
       "      <td>692</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>11044.0</td>\n",
       "      <td>administration et collectivité</td>\n",
       "      <td>676</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>11046.0</td>\n",
       "      <td>tour opérateur</td>\n",
       "      <td>642</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>11048.0</td>\n",
       "      <td>entreprises</td>\n",
       "      <td>515</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>72</th>\n",
       "      <td>11619.0</td>\n",
       "      <td>structures culturelles;musée, site &amp; fondation</td>\n",
       "      <td>427</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>11037.0</td>\n",
       "      <td>handicap</td>\n",
       "      <td>426</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     tag_id                                        tag_name  customer_id\n",
       "1   11029.0                                     individuels         3270\n",
       "18  11047.0                               groupes scolaires         2417\n",
       "4   11033.0                                     association         2308\n",
       "0   11028.0                          structures culturelles         2011\n",
       "22  11051.0                      etablissement ens scolaire         1732\n",
       "7   11036.0                                    champ social         1603\n",
       "43  11072.0                             etab d'enseignement         1036\n",
       "14  11043.0                            etablissement public          935\n",
       "6   11035.0                           organisme de tourisme          892\n",
       "16  11045.0                               centre de loisirs          864\n",
       "44  11073.0                         musée, site & fondation          786\n",
       "24  11053.0                               groupes etudiants          758\n",
       "3   11032.0                                      entreprise          750\n",
       "10  11039.0                    etablissement d'enseignement          741\n",
       "5   11034.0                                asso. culturelle          692\n",
       "15  11044.0                  administration et collectivité          676\n",
       "17  11046.0                                  tour opérateur          642\n",
       "19  11048.0                                     entreprises          515\n",
       "72  11619.0  structures culturelles;musée, site & fondation          427\n",
       "8   11037.0                                        handicap          426"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tags_information(\"1\", 20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "bd5bef41-1774-4601-86b5-b7c1aea8f1d2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Nombre de client avec tag :  5953\n",
      "Proportion de clients avec tags :  0.021598421025897787\n",
      "Moyenne de tags par client :  1.0\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>tag_id</th>\n",
       "      <th>tag_name</th>\n",
       "      <th>customer_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.0</td>\n",
       "      <td>training-sb-ax</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   tag_id        tag_name  customer_id\n",
       "0     1.0  training-sb-ax            5"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tags_information(\"2\", 20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "7c2dc3e6-1418-44db-a8c0-4a9d59ec5232",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>training-sb-ax</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id            name\n",
       "0   1  training-sb-ax\n",
       "1   2             NaN"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "load_dataset_2(\"2\", \"tags\")[['id', 'name']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "c7b2c670-7122-4f67-b1aa-8c80a10f16d8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Nombre de client avec tag :  23659\n",
      "Proportion de clients avec tags :  0.09207484608139978\n",
      "Moyenne de tags par client :  3.0620482691576143\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>tag_id</th>\n",
       "      <th>tag_name</th>\n",
       "      <th>customer_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>164</th>\n",
       "      <td>44539.0</td>\n",
       "      <td>*individuel/particulier</td>\n",
       "      <td>13148</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>26926.0</td>\n",
       "      <td>ce</td>\n",
       "      <td>3216</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>6995.0</td>\n",
       "      <td>college</td>\n",
       "      <td>2126</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>7028.0</td>\n",
       "      <td>lycee</td>\n",
       "      <td>1577</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>154</th>\n",
       "      <td>44524.0</td>\n",
       "      <td>iraiser</td>\n",
       "      <td>1453</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>6714.0</td>\n",
       "      <td>ecole primaire</td>\n",
       "      <td>1200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>155</th>\n",
       "      <td>44525.0</td>\n",
       "      <td>bp</td>\n",
       "      <td>1094</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>7024.0</td>\n",
       "      <td>centre de loisirs</td>\n",
       "      <td>1080</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>153</th>\n",
       "      <td>44515.0</td>\n",
       "      <td>entreprise</td>\n",
       "      <td>998</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>126</th>\n",
       "      <td>44039.0</td>\n",
       "      <td>ca fondation d'aumale</td>\n",
       "      <td>891</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>152</th>\n",
       "      <td>44514.0</td>\n",
       "      <td>particulier</td>\n",
       "      <td>838</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>43663.0</td>\n",
       "      <td>président</td>\n",
       "      <td>816</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>76</th>\n",
       "      <td>43703.0</td>\n",
       "      <td>directeur</td>\n",
       "      <td>812</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>158</th>\n",
       "      <td>44528.0</td>\n",
       "      <td>dc</td>\n",
       "      <td>807</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>54</th>\n",
       "      <td>43681.0</td>\n",
       "      <td>présidente</td>\n",
       "      <td>805</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149</th>\n",
       "      <td>44511.0</td>\n",
       "      <td>entreprise (financier)</td>\n",
       "      <td>805</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>90</th>\n",
       "      <td>43718.0</td>\n",
       "      <td>conseillère régionale déléguée titulaire</td>\n",
       "      <td>804</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40</th>\n",
       "      <td>43667.0</td>\n",
       "      <td>directeur de l'agence</td>\n",
       "      <td>801</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>78</th>\n",
       "      <td>43705.0</td>\n",
       "      <td>sous-préfet</td>\n",
       "      <td>798</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>100</th>\n",
       "      <td>43728.0</td>\n",
       "      <td>chargée de mission paysage</td>\n",
       "      <td>797</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      tag_id                                  tag_name  customer_id\n",
       "164  44539.0                   *individuel/particulier        13148\n",
       "30   26926.0                                        ce         3216\n",
       "14    6995.0                                   college         2126\n",
       "16    7028.0                                     lycee         1577\n",
       "154  44524.0                                   iraiser         1453\n",
       "2     6714.0                            ecole primaire         1200\n",
       "155  44525.0                                        bp         1094\n",
       "15    7024.0                         centre de loisirs         1080\n",
       "153  44515.0                                entreprise          998\n",
       "126  44039.0                     ca fondation d'aumale          891\n",
       "152  44514.0                               particulier          838\n",
       "36   43663.0                                 président          816\n",
       "76   43703.0                                 directeur          812\n",
       "158  44528.0                                        dc          807\n",
       "54   43681.0                                présidente          805\n",
       "149  44511.0                    entreprise (financier)          805\n",
       "90   43718.0  conseillère régionale déléguée titulaire          804\n",
       "40   43667.0                     directeur de l'agence          801\n",
       "78   43705.0                               sous-préfet          798\n",
       "100  43728.0                chargée de mission paysage          797"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tags_information(\"3\", 20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "76639995-252d-4a58-83d8-c0c00900c3a9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Nombre de client avec tag :  10495\n",
      "Proportion de clients avec tags :  0.03271416949025744\n",
      "Moyenne de tags par client :  5.298427822772749\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>tag_id</th>\n",
       "      <th>tag_name</th>\n",
       "      <th>customer_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>147</th>\n",
       "      <td>298.0</td>\n",
       "      <td>jhima</td>\n",
       "      <td>4219</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>146</th>\n",
       "      <td>297.0</td>\n",
       "      <td>colloque algérie</td>\n",
       "      <td>3851</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>142</th>\n",
       "      <td>292.0</td>\n",
       "      <td>i&amp;ma</td>\n",
       "      <td>3826</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>154</th>\n",
       "      <td>305.0</td>\n",
       "      <td>mardis de la philo</td>\n",
       "      <td>3674</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>150</th>\n",
       "      <td>301.0</td>\n",
       "      <td>le grand continant</td>\n",
       "      <td>3670</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>144</th>\n",
       "      <td>295.0</td>\n",
       "      <td>araborama</td>\n",
       "      <td>3669</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>155</th>\n",
       "      <td>306.0</td>\n",
       "      <td>marie descourtieux</td>\n",
       "      <td>3669</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>145</th>\n",
       "      <td>296.0</td>\n",
       "      <td>c'était la guerre d'algérie</td>\n",
       "      <td>3669</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>141</th>\n",
       "      <td>291.0</td>\n",
       "      <td>araborama 3</td>\n",
       "      <td>3669</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>102</th>\n",
       "      <td>198.0</td>\n",
       "      <td>association de collectivités territoriales spé...</td>\n",
       "      <td>3669</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>143</th>\n",
       "      <td>294.0</td>\n",
       "      <td>arabofolies</td>\n",
       "      <td>3669</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>103</th>\n",
       "      <td>199.0</td>\n",
       "      <td>rassemble les 11 000 élus de toute la france a...</td>\n",
       "      <td>3669</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>50.0</td>\n",
       "      <td>association</td>\n",
       "      <td>463</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>54.0</td>\n",
       "      <td>collège</td>\n",
       "      <td>446</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>49.0</td>\n",
       "      <td>ecole</td>\n",
       "      <td>374</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>55.0</td>\n",
       "      <td>lycée</td>\n",
       "      <td>275</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>53.0</td>\n",
       "      <td>centre social</td>\n",
       "      <td>200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>53</th>\n",
       "      <td>130.0</td>\n",
       "      <td>cultures et arts</td>\n",
       "      <td>141</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>51.0</td>\n",
       "      <td>mairie</td>\n",
       "      <td>136</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>64.0</td>\n",
       "      <td>formation_ima_ax</td>\n",
       "      <td>87</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     tag_id                                           tag_name  customer_id\n",
       "147   298.0                                              jhima         4219\n",
       "146   297.0                                   colloque algérie         3851\n",
       "142   292.0                                               i&ma         3826\n",
       "154   305.0                                 mardis de la philo         3674\n",
       "150   301.0                                 le grand continant         3670\n",
       "144   295.0                                          araborama         3669\n",
       "155   306.0                                 marie descourtieux         3669\n",
       "145   296.0                        c'était la guerre d'algérie         3669\n",
       "141   291.0                                        araborama 3         3669\n",
       "102   198.0  association de collectivités territoriales spé...         3669\n",
       "143   294.0                                        arabofolies         3669\n",
       "103   199.0  rassemble les 11 000 élus de toute la france a...         3669\n",
       "2      50.0                                        association          463\n",
       "6      54.0                                            collège          446\n",
       "1      49.0                                              ecole          374\n",
       "7      55.0                                              lycée          275\n",
       "5      53.0                                      centre social          200\n",
       "53    130.0                                   cultures et arts          141\n",
       "3      51.0                                             mairie          136\n",
       "13     64.0                                   formation_ima_ax           87"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tags_information(\"4\", 20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "07e91791-d4d4-42b1-ac18-22d3b0b9f7bd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Nombre de client avec tag :  532342\n",
      "Proportion de clients avec tags :  0.18660686931118298\n",
      "Moyenne de tags par client :  24.114082676174338\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>tag_id</th>\n",
       "      <th>tag_name</th>\n",
       "      <th>customer_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>349.0</td>\n",
       "      <td>clients internet</td>\n",
       "      <td>517491</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>356.0</td>\n",
       "      <td>associations / clubs</td>\n",
       "      <td>495520</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>10.0</td>\n",
       "      <td>agence de voyages</td>\n",
       "      <td>493774</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>410.0</td>\n",
       "      <td>guides conférenciers</td>\n",
       "      <td>493378</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>360.0</td>\n",
       "      <td>groupe amis ou famille</td>\n",
       "      <td>493021</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>354.0</td>\n",
       "      <td>ce / entreprises</td>\n",
       "      <td>493016</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>17.0</td>\n",
       "      <td>association/club</td>\n",
       "      <td>493008</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3.0</td>\n",
       "      <td>c.e. / entreprise</td>\n",
       "      <td>492656</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>11.0</td>\n",
       "      <td>college</td>\n",
       "      <td>492552</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>69.0</td>\n",
       "      <td>tour operator</td>\n",
       "      <td>492549</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>9.0</td>\n",
       "      <td>ecole primaire</td>\n",
       "      <td>492540</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>379.0</td>\n",
       "      <td>parent goûter anniversaire</td>\n",
       "      <td>492468</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>364.0</td>\n",
       "      <td>institutions</td>\n",
       "      <td>492364</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>6.0</td>\n",
       "      <td>institution</td>\n",
       "      <td>492321</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>186.0</td>\n",
       "      <td>autocaristes</td>\n",
       "      <td>492153</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>13.0</td>\n",
       "      <td>enseignement superieur</td>\n",
       "      <td>492131</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>359.0</td>\n",
       "      <td>hotels / campings</td>\n",
       "      <td>492078</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42</th>\n",
       "      <td>7186.0</td>\n",
       "      <td>individuel</td>\n",
       "      <td>491913</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>7.0</td>\n",
       "      <td>groupe amis / famille</td>\n",
       "      <td>491900</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2.0</td>\n",
       "      <td>client internet</td>\n",
       "      <td>491896</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    tag_id                    tag_name  customer_id\n",
       "20   349.0            clients internet       517491\n",
       "24   356.0        associations / clubs       495520\n",
       "5     10.0           agence de voyages       493774\n",
       "32   410.0        guides conférenciers       493378\n",
       "26   360.0      groupe amis ou famille       493021\n",
       "23   354.0            ce / entreprises       493016\n",
       "8     17.0            association/club       493008\n",
       "1      3.0           c.e. / entreprise       492656\n",
       "6     11.0                     college       492552\n",
       "13    69.0               tour operator       492549\n",
       "4      9.0              ecole primaire       492540\n",
       "31   379.0  parent goûter anniversaire       492468\n",
       "30   364.0                institutions       492364\n",
       "2      6.0                 institution       492321\n",
       "18   186.0                autocaristes       492153\n",
       "7     13.0      enseignement superieur       492131\n",
       "25   359.0           hotels / campings       492078\n",
       "42  7186.0                  individuel       491913\n",
       "3      7.0       groupe amis / famille       491900\n",
       "0      2.0             client internet       491896"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tags_information(\"101\", 20)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1ede9eaa-7f0a-4856-9349-b2747d6a4901",
   "metadata": {},
   "source": [
    "# Fin travail 25/02"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c437eaec",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true
   },
   "source": [
    "# Exemple sur Company 1"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a1c1fc39",
   "metadata": {},
   "source": [
    "## Chargement données"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "66f8c17b",
   "metadata": {},
   "outputs": [],
   "source": [
    "BUCKET = \"bdc2324-data/1\"\n",
    "liste_database = fs.ls(BUCKET)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "c08e6798",
   "metadata": {},
   "outputs": [],
   "source": [
    "liste_database_select = ['suppliers', 'ticket', 'purchase', 'consumption', 'type_ofs']\n",
    "\n",
    "# Filtrer la liste pour les éléments contenant au moins un élément de la liste à tester\n",
    "liste_database_filtered = [element for element in liste_database if any(element_part in element for element_part in liste_database_select)]\n",
    "\n",
    "# Afficher le résultat\n",
    "print(liste_database_filtered)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "675f518d",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<<<<<<< local <modified: >\n",
      "/tmp/ipykernel_445/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  df = pd.read_csv(file_in)\n",
      "=======\n",
      "/tmp/ipykernel_15285/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  df = pd.read_csv(file_in)\n",
      ">>>>>>> remote <modified: >\n"
     ]
    }
   ],
   "source": [
    "# loop to create dataframes from liste\n",
    "files_path = liste_database\n",
    "\n",
    "client_number = files_path[0].split(\"/\")[1]\n",
    "df_prefix = \"df\" + str(client_number) + \"_\"\n",
    "\n",
    "for i in range(len(files_path)) :\n",
    "    current_path = files_path[i]\n",
    "    with fs.open(current_path, mode=\"rb\") as file_in:\n",
    "        df = pd.read_csv(file_in)\n",
    "        # the pattern of the name is df1xxx\n",
    "        nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n",
    "        globals()[nom_dataframe] = df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e855f403",
   "metadata": {},
   "source": [
    "## customersplus.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "91a8f8c4",
   "metadata": {},
   "outputs": [],
   "source": [
    "a = pd.DataFrame(df1_customersplus.info())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "2fda171d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def info_colonnes_dataframe(df):\n",
    "    # Créer une liste pour stocker les informations sur chaque colonne\n",
    "    infos_colonnes = []\n",
    "\n",
    "    # Parcourir les colonnes du DataFrame\n",
    "    for nom_colonne, serie in df.items():  # Utiliser items() au lieu de iteritems()\n",
    "        # Calculer le taux de valeurs manquantes\n",
    "        taux_na = serie.isna().mean() * 100\n",
    "\n",
    "        # Ajouter les informations à la liste\n",
    "        infos_colonnes.append({\n",
    "            'Nom_colonne': nom_colonne,\n",
    "            'Type_colonne': str(serie.dtype),\n",
    "            'Taux_NA': taux_na\n",
    "        })\n",
    "\n",
    "    # Créer une nouvelle DataFrame à partir de la liste d'informations\n",
    "    df_infos_colonnes = pd.DataFrame(infos_colonnes)\n",
    "\n",
    "    return df_infos_colonnes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "205eeeab",
   "metadata": {},
   "outputs": [],
   "source": [
    "def cleaning_date(df, column_name):\n",
    "    \"\"\"\n",
    "    Nettoie la colonne spécifiée du DataFrame en convertissant les valeurs en datetime avec le format ISO8601.\n",
    "\n",
    "    Parameters:\n",
    "    - df: DataFrame\n",
    "        Le DataFrame contenant la colonne à nettoyer.\n",
    "    - column_name: str\n",
    "        Le nom de la colonne à nettoyer.\n",
    "\n",
    "    Returns:\n",
    "    - DataFrame\n",
    "        Le DataFrame modifié avec la colonne nettoyée.\n",
    "    \"\"\"\n",
    "    df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "634282c5",
   "metadata": {},
   "outputs": [],
   "source": [
    "a = info_colonnes_dataframe(df1_customersplus)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "0e8d4133",
   "metadata": {},
   "outputs": [],
   "source": [
    "a"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "1268ad5a",
   "metadata": {},
   "outputs": [],
   "source": [
    "a = pd.DataFrame(df1_customersplus.isna().sum()/len(df1_customersplus)*100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "bd41dc80",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Selection des variables\n",
    "df1_customersplus_clean = df1_customersplus.copy()\n",
    "\n",
    "cleaning_date(df1_customersplus_clean, 'first_buying_date')\n",
    "cleaning_date(df1_customersplus_clean, 'last_visiting_date')\n",
    "\n",
    "df1_customersplus_clean.drop(['lastname', 'firstname', 'email', 'civility', 'note', 'created_at', 'updated_at', 'deleted_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode', 'last_visiting_date'], axis = 1, inplace=True)\n",
    "df1_customersplus_clean.rename(columns = {'id' : 'customer_id'}, inplace = True)\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "64d0f76b",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true
   },
   "source": [
    "## tickets.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "7e683711",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_tickets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "e7b9a52e",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_tickets.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "568280e8",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_tickets.isna().sum()/len(df1_tickets)*100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "29ecec90",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Selection des variables\n",
    "df1_tickets_clean = df1_tickets.drop(['lastname', 'firstname', 'email', 'created_at', 'updated_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode'], axis = 1, inplace=True)\n",
    "df1_tickets_clean.rename(columns = {'id' : 'customer_id'}, inplace = True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "22bb5de4",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true
   },
   "source": [
    "## suppliers.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "6a9a91f4",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_suppliers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "bab4758a",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_suppliers.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "b5fff251",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_suppliers.isna().sum()/len(df1_suppliers)*100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "8b09e2a3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Selection des variables\n",
    "df1_suppliers_clean = df1_suppliers[['id', 'name']]\n",
    "df1_suppliers_clean.rename(columns = {'name' : 'supplier_name'}, inplace = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "ecee7cdc",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_suppliers_clean"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c8e6e69b",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true
   },
   "source": [
    "## type_ofs.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "1a6cff1f",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_type_ofs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "93630b41",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_type_ofs.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "4f94481a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Selection des variables\n",
    "df1_type_ofs_clean = df1_type_ofs[['id', 'name', 'children']]\n",
    "df1_type_ofs_clean.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1b2811e2",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true
   },
   "source": [
    "## purchases.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "2455d2e1",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "df1_purchases"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "5f9a159d",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_purchases.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "db201bf7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Nettoyage purchase_date\n",
    "df1_purchases['purchase_date'] = pd.to_datetime(df1_purchases['purchase_date'], utc = True)\n",
    "df1_purchases['purchase_date'] = pd.to_datetime(df1_purchases['purchase_date'], format = 'ISO8601')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "bd436fca",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_purchases.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "83435862",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Selection des variables\n",
    "df1_purchases_clean = df1_purchases[['id', 'purchase_date', 'customer_id']]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f210e730",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true
   },
   "source": [
    "## Fusion de l'ensemble des données billétiques"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "1f8b3aa7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fusion avec fournisseurs\n",
    "df1_ticket_information = pd.merge(df1_tickets_clean, df1_suppliers_clean, left_on = 'supplier_id', right_on = 'id', how = 'inner')\n",
    "df1_ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)\n",
    "\n",
    "# Fusion avec type de tickets\n",
    "df1_ticket_information = pd.merge(df1_ticket_information, df1_type_ofs_clean, left_on = 'type_of', right_on = 'id', how = 'inner')\n",
    "df1_ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)\n",
    "\n",
    "# Fusion avec achats\n",
    "df1_ticket_information = pd.merge(df1_ticket_information, df1_purchases_clean, left_on = 'purchase_id', right_on = 'id', how = 'inner')\n",
    "df1_ticket_information.drop(['purchase_id', 'id'], axis = 1, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "83a4d021",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "df1_ticket_information"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "56e6ebd1",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true
   },
   "source": [
    "# Utilisation de fonctions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "88fcde4b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Créer un DataFrame exemple\n",
    "df_not_clean = df1_campaign_stats[['opened_at']].head(20)\n",
    "\n",
    "# Appliquer la fonction pour nettoyer la colonne 'purchase_date' de manière vectorisée\n",
    "df_clean = cleaning_date(df_not_clean, 'opened_at')\n",
    "df_clean.rename(columns = {'opened_at' : 'opened_at_clean'}, inplace = True)\n",
    "\n",
    "test = pd.concat([df1_campaign_stats[['opened_at']].head(20), df_clean], axis=1)\n",
    "\n",
    "test.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "818f69db",
   "metadata": {},
   "source": [
    "## Nettoyage, selection et fusion"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "c9654eda",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_ticket_information"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "7f2b620c",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_ticket_information.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "637bdb72",
   "metadata": {},
   "source": [
    "# Customer information"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "14c52894",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true
   },
   "source": [
    "## Target area"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "d83abfbf",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_15285/2625134041.py:3: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df1_targets_clean.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n"
     ]
    }
   ],
   "source": [
    "# Target.csv cleaning\n",
    "df1_targets_clean = df1_targets[[\"id\", \"target_type_id\", \"name\"]]\n",
    "df1_targets_clean.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n",
    "\n",
    "# target_type cleaning\n",
    "df1_target_types_clean = df1_target_types[[\"id\",\"is_import\",\"name\"]].add_prefix(\"target_type_\")\n",
    "\n",
    "#customer_target_mappings cleaning\n",
    "df1_customer_target_mappings_clean = df1_customer_target_mappings[[\"id\", \"customer_id\", \"target_id\"]]\n",
    "\n",
    "# Merge target et target_type\n",
    "df1_targets_full = pd.merge(df1_targets_clean, df1_target_types_clean, left_on='target_type_id', right_on='target_type_id', how='inner')\n",
    "df1_targets_full.drop(['target_type_id'], axis = 1, inplace=True)\n",
    "\n",
    "# Merge\n",
    "df1_targets_full = pd.merge(df1_customer_target_mappings_clean, df1_targets_full, left_on='target_id', right_on='target_id', how='inner')\n",
    "df1_targets_full.drop(['target_id'], axis = 1, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "90d71b2c",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_targets_test = df1_targets_full[['id', 'customer_id']].groupby(['customer_id']).count()\n",
    "len(df1_targets_test[df1_targets_test['id'] > 1]) / len(df1_targets_test)\n",
    "\n",
    "# 99,6% des 151 000 client visés sont catégorisés plusieurs fois et en moyenne 5 fois... \n",
    "df1_targets_test.mean()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "2301de1e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>customer_id</th>\n",
       "      <th>target_name</th>\n",
       "      <th>target_type_is_import</th>\n",
       "      <th>target_type_name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1184824</td>\n",
       "      <td>645400</td>\n",
       "      <td>DDCP PROMO Réseau livres</td>\n",
       "      <td>False</td>\n",
       "      <td>manual_static_filter</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>210571</td>\n",
       "      <td>2412</td>\n",
       "      <td>DDCP PROMO Réseau livres</td>\n",
       "      <td>False</td>\n",
       "      <td>manual_static_filter</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>210572</td>\n",
       "      <td>4536</td>\n",
       "      <td>DDCP PROMO Réseau livres</td>\n",
       "      <td>False</td>\n",
       "      <td>manual_static_filter</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>210573</td>\n",
       "      <td>6736</td>\n",
       "      <td>DDCP PROMO Réseau livres</td>\n",
       "      <td>False</td>\n",
       "      <td>manual_static_filter</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>210574</td>\n",
       "      <td>38210</td>\n",
       "      <td>DDCP PROMO Réseau livres</td>\n",
       "      <td>False</td>\n",
       "      <td>manual_static_filter</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        id  customer_id               target_name  target_type_is_import  \\\n",
       "0  1184824       645400  DDCP PROMO Réseau livres                  False   \n",
       "1   210571         2412  DDCP PROMO Réseau livres                  False   \n",
       "2   210572         4536  DDCP PROMO Réseau livres                  False   \n",
       "3   210573         6736  DDCP PROMO Réseau livres                  False   \n",
       "4   210574        38210  DDCP PROMO Réseau livres                  False   \n",
       "\n",
       "       target_type_name  \n",
       "0  manual_static_filter  \n",
       "1  manual_static_filter  \n",
       "2  manual_static_filter  \n",
       "3  manual_static_filter  \n",
       "4  manual_static_filter  "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1_targets_full.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "75fbc2f7",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...\n",
      "[nltk_data]   Package punkt is already up-to-date!\n",
      "[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n",
      "[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...\n",
      "[nltk_data]   Package wordnet is already up-to-date!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Catégorisation des target_name\n",
    "import pandas as pd\n",
    "import nltk\n",
    "from nltk.tokenize import word_tokenize\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "from nltk.probability import FreqDist\n",
    "\n",
    "# Téléchargement des ressources nécessaires\n",
    "nltk.download('punkt')\n",
    "nltk.download('stopwords')\n",
    "nltk.download('wordnet')\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "55cddf92",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mots les plus fréquents:\n",
      "consentement: 550777\n",
      "optin: 463579\n",
      "jeune: 155103\n",
      "public: 155103\n",
      "mediation: 150001\n"
     ]
    }
   ],
   "source": [
    "# Définition des fonctions de tokenisation, suppression des mots vides et lemmatisation\n",
    "def preprocess_text(texte):\n",
    "    # Concaténation des éléments de la liste en une seule chaîne de caractères\n",
    "    texte_concat = ' '.join(texte)\n",
    "    \n",
    "    # Tokenisation des mots\n",
    "    tokens = word_tokenize(texte_concat.lower())\n",
    "    \n",
    "    # Suppression des mots vides (stopwords)\n",
    "    stop_words = set(stopwords.words('french'))\n",
    "    filtered_tokens = [word for word in tokens if word not in stop_words]\n",
    "    \n",
    "    # Lemmatisation des mots\n",
    "    lemmatizer = WordNetLemmatizer()\n",
    "    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]\n",
    "    \n",
    "    return lemmatized_tokens\n",
    "\n",
    "\n",
    "# Appliquer le prétraitement à la colonne de texte\n",
    "df1_targets_full['target_name_tokened'] = df1_targets_full['target_name'].apply(preprocess_text)\n",
    "\n",
    "# Concaténer les listes de mots pour obtenir une liste de tous les mots dans le corpus\n",
    "all_words = [word for tokens in df1_targets_full['target_name_tokened'] for word in tokens]\n",
    "\n",
    "# Calculer la fréquence des mots\n",
    "freq_dist = FreqDist(all_words)\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "7fd98a85",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mots les plus fréquents:\n",
      "consentement: 550777\n",
      "optin: 463579\n",
      "jeune: 155103\n",
      "public: 155103\n",
      "mediation: 150001\n",
      "specialisee: 150001\n",
      "b2c: 143432\n",
      "optout: 97683\n",
      "newsletter: 56022\n",
      "(: 46084\n",
      "): 46084\n",
      "inscrits: 42296\n",
      "nl: 42294\n",
      "générale: 41037\n",
      "generale: 40950\n"
     ]
    }
   ],
   "source": [
    "# Affichage des mots les plus fréquents\n",
    "print(\"Mots les plus fréquents:\")\n",
    "for mot, freq in freq_dist.most_common(15):\n",
    "    print(f\"{mot}: {freq}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "cf94bb1d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                            texte  \\\n",
      "0  Le chat noir mange une souris.   \n",
      "1           Le chien blanc aboie.   \n",
      "\n",
      "                                 texte_preprocessed  \n",
      "0  [e, h, a, o, i, r, a, g, e, u, e, o, u, r, i, .]  \n",
      "1              [e, h, i, e, b, a, a, b, o, i, e, .]  \n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...\n",
      "[nltk_data]   Package punkt is already up-to-date!\n",
      "[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n",
      "[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...\n",
      "[nltk_data]   Package wordnet is already up-to-date!\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import nltk\n",
    "from nltk.tokenize import word_tokenize\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "\n",
    "# Téléchargement des ressources nécessaires\n",
    "nltk.download('punkt')\n",
    "nltk.download('stopwords')\n",
    "nltk.download('wordnet')\n",
    "\n",
    "# Création de la DataFrame d'exemple\n",
    "data = {'texte': [\"Le chat noir mange une souris.\", \"Le chien blanc aboie.\"]}\n",
    "df = pd.DataFrame(data)\n",
    "\n",
    "# Fonction pour prétraiter le texte\n",
    "def preprocess_text(texte):\n",
    "    # Concaténation des éléments de la liste en une seule chaîne de caractères\n",
    "    texte_concat = ' '.join(texte)\n",
    "    \n",
    "    # Tokenisation des mots\n",
    "    tokens = word_tokenize(texte_concat.lower())\n",
    "    \n",
    "    # Suppression des mots vides (stopwords)\n",
    "    stop_words = set(stopwords.words('french'))\n",
    "    filtered_tokens = [word for word in tokens if word not in stop_words]\n",
    "    \n",
    "    # Lemmatisation des mots\n",
    "    lemmatizer = WordNetLemmatizer()\n",
    "    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]\n",
    "    \n",
    "    return lemmatized_tokens\n",
    "\n",
    "# Appliquer la fonction de prétraitement à la colonne de texte\n",
    "df['texte_preprocessed'] = df['texte'].apply(preprocess_text)\n",
    "\n",
    "# Afficher le résultat\n",
    "print(df)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "711d3884",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true
   },
   "source": [
    "## Campaign area"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "c25b5295",
   "metadata": {},
   "outputs": [],
   "source": [
    "# campaign_stats cleaning \n",
    "df1_campaign_stats_clean = df1_campaign_stats[[\"id\", \"campaign_id\", \"customer_id\", \"opened_at\", \"sent_at\", \"delivered_at\"]]\n",
    "cleaning_date(df1_campaign_stats_clean, 'opened_at')\n",
    "cleaning_date(df1_campaign_stats_clean, 'sent_at')\n",
    "cleaning_date(df1_campaign_stats_clean, 'delivered_at')\n",
    "\n",
    "# campaigns cleaning\n",
    "df1_campaigns_clean = df1_campaigns[[\"id\", \"name\", \"service_id\", \"sent_at\"]].add_prefix(\"campaign_\")\n",
    "cleaning_date(df1_campaigns_clean, 'campaign_sent_at')\n",
    "\n",
    "# Merge \n",
    "df1_campaigns_full = pd.merge(df1_campaign_stats_clean, df1_campaigns_clean, on = \"campaign_id\", how = \"left\")\n",
    "df1_campaigns_full.drop(['campaign_id'], axis = 1, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "2a3de6a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_campaigns_full.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "id": "3fc1f446",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_campaigns_information"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "20e69ee3",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true
   },
   "source": [
    "## Link area"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "d9cbdbce",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_campaigns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "c07459f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_link_stats"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "80ae4c42",
   "metadata": {},
   "source": [
    "## Exploration variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "b50b8f95",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fonction d'exploration pour suppliers.csv = label itr et commission inconnues\n",
    "def suppliers_exploration(suppliers = None) : \n",
    "    \n",
    "    # Taux de NaN pour ces colonnes\n",
    "    label_na = suppliers['label'].isna().sum()/len(suppliers)*100\n",
    "    itr_na = suppliers['itr'].isna().sum()/len(suppliers)*100\n",
    "    commission_na = suppliers['commission'].isna().sum()/len(suppliers)*100\n",
    "\n",
    "    suppliers_desc = pd.DataFrame({'nb_suppliers' : [suppliers['name'].nunique()],\n",
    "                                  'label_na' : [label_na],\n",
    "                                  'itr_na' : [itr_na],\n",
    "                                  'commission_na' : [commission_na]})\n",
    "\n",
    "    return suppliers_desc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "7e292935",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_suppliers_desc = suppliers_exploration(suppliers = df1_suppliers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "05b6f2b0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>nb_suppliers</th>\n",
       "      <th>label_na</th>\n",
       "      <th>itr_na</th>\n",
       "      <th>commission_na</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>9</td>\n",
       "      <td>100.0</td>\n",
       "      <td>100.0</td>\n",
       "      <td>100.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   nb_suppliers  label_na  itr_na  commission_na\n",
       "0             9     100.0   100.0          100.0"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the one-row suppliers summary returned by suppliers_exploration\n",
    "df1_suppliers_desc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "c9324d80",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Name of the bucket that is listed through the `fs` filesystem object\n",
    "BUCKET = \"bdc2324-data\"\n",
    "\n",
    "# Entries found directly under the bucket root\n",
    "liste_folders = fs.ls(BUCKET)\n",
    "\n",
    "# Flat list of everything found one level below each of those entries\n",
    "liste_files = []\n",
    "for company_folder in liste_folders:\n",
    "    liste_files += fs.ls(company_folder)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "10304058",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['bdc2324-data/1/1suppliers.csv', 'bdc2324-data/10/10suppliers.csv', 'bdc2324-data/101/101suppliers.csv', 'bdc2324-data/11/11suppliers.csv', 'bdc2324-data/12/12suppliers.csv', 'bdc2324-data/13/13suppliers.csv', 'bdc2324-data/14/14suppliers.csv', 'bdc2324-data/2/2suppliers.csv', 'bdc2324-data/3/3suppliers.csv', 'bdc2324-data/4/4suppliers.csv', 'bdc2324-data/5/5suppliers.csv', 'bdc2324-data/6/6suppliers.csv', 'bdc2324-data/7/7suppliers.csv', 'bdc2324-data/8/8suppliers.csv', 'bdc2324-data/9/9suppliers.csv']\n"
     ]
    }
   ],
   "source": [
    "# Keywords identifying the files to keep\n",
    "liste_database_select = ['suppliers']\n",
    "\n",
    "# Keep every path that contains at least one of the keywords\n",
    "liste_suppliers = list(filter(\n",
    "    lambda path: any(keyword in path for keyword in liste_database_select),\n",
    "    liste_files,\n",
    "))\n",
    "\n",
    "# Show the selected paths\n",
    "print(liste_suppliers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "ffa423e5",
   "metadata": {},
   "outputs": [],
   "source": [
    "def database_loading(database_name = None):\n",
    "    \"\"\"\n",
    "    Load one CSV file through the notebook-level `fs` filesystem object.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    database_name : str\n",
    "        Path of the file to read. Its second '/'-separated component is\n",
    "        returned as the client number,\n",
    "        e.g. 'bdc2324-data/101/101suppliers.csv' -> '101'.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    tuple (DataFrame, str)\n",
    "        The parsed CSV and the client number, as a string.\n",
    "    \"\"\"\n",
    "    # Second path component = client (tenant) number, kept as a string\n",
    "    client_number = database_name.split(\"/\")[1]\n",
    "\n",
    "    # The file is opened in binary mode and handed to pandas for parsing\n",
    "    with fs.open(database_name, mode=\"rb\") as file_in:\n",
    "        df = pd.read_csv(file_in)\n",
    "\n",
    "    return df, client_number"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "70bdc88d",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "6a0f567d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load every suppliers file, tag its rows with the tenant id, then concatenate once.\n",
    "# Calling pd.concat inside the loop would re-copy all previously loaded rows at each step.\n",
    "suppliers_frames = []\n",
    "\n",
    "for link in liste_suppliers:\n",
    "\n",
    "    df_supplier, tenant_id = database_loading(link)\n",
    "\n",
    "    # database_loading returns the tenant id as a string; store it as an integer\n",
    "    df_supplier['tenant_id'] = int(tenant_id)\n",
    "\n",
    "    suppliers_frames.append(df_supplier)\n",
    "\n",
    "# pd.concat raises on an empty list, so fall back to an empty frame when no file was listed\n",
    "df_all = pd.concat(suppliers_frames, axis = 0) if suppliers_frames else pd.DataFrame()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "1522d8cd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_all[df_all['tenant_id'] == 101]['name'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "id": "b0e42a61",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keywords searched for in supplier names (vad = 'vente à distance', i.e. distance selling)\n",
    "# NOTE(review): keywords are matched as substrings, so 'net' also matches any name\n",
    "# merely containing those letters - confirm this is intended.\n",
    "liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online']\n",
    "\n",
    "# Missing names become empty strings so the search below returns False instead of NaN\n",
    "df_all['name'] = df_all['name'].fillna('')\n",
    "\n",
    "# 1 when the name contains at least one keyword (case-insensitive), 0 otherwise\n",
    "df_all['canal_vente_internet'] = (\n",
    "    df_all['name']\n",
    "    .str.contains('|'.join(liste_mots), case=False)\n",
    "    .astype(int)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "id": "d299ae91",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tenant_id\n",
       "1      1\n",
       "2      1\n",
       "3      1\n",
       "4      1\n",
       "5      1\n",
       "6      1\n",
       "7      1\n",
       "8      1\n",
       "9      1\n",
       "10     1\n",
       "11     1\n",
       "12     1\n",
       "13     1\n",
       "14     1\n",
       "101    1\n",
       "Name: canal_vente_internet, dtype: int64"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Per tenant, maximum of the 0/1 flag: 1 when at least one supplier name matched a keyword\n",
    "df_all.groupby('tenant_id')['canal_vente_internet'].max()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}