BDC-team-1/Exploration_billet_AJ.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "5bf5c226",
   "metadata": {},
   "source": [
    "# Business Data Challenge - Team 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b1a5b9d3",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import warnings\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import s3fs"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ecfa2219",
   "metadata": {},
   "source": [
    "Configuration de l'accès aux données"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "1a094277",
   "metadata": {},
   "outputs": [],
   "source": [
    "# S3 client pointed at the endpoint named by the AWS_S3_ENDPOINT environment variable\n",
    "S3_ENDPOINT_URL = \"https://{}\".format(os.environ[\"AWS_S3_ENDPOINT\"])\n",
    "fs = s3fs.S3FileSystem(client_kwargs=dict(endpoint_url=S3_ENDPOINT_URL))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0294ce71-840e-458b-8ffa-cadabbc6da21",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true
   },
   "source": [
    "# Début du travail 25/02"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "30d77451-2df6-4c07-8b15-66e0e990ff03",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create filesystem object\n",
    "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
    "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n",
    "\n",
    "\n",
    "# Import cleaning and merge functions.\n",
    "# The script runs in the notebook namespace, so every name it defines becomes a global here.\n",
    "# The context manager closes the file handle as soon as the source has been read.\n",
    "with open('0_KPI_functions.py') as kpi_script:\n",
    "    exec(kpi_script.read())\n",
    "\n",
    "# Silence all warnings from this point on\n",
    "warnings.filterwarnings('ignore')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "f1b44d3e-76bb-4860-b9db-a2840db7cf39",
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_dataset_2(directory_path, file_name):\n",
    "    \"\"\"\n",
    "    Read one CSV table from the `bdc2324-data` bucket into a DataFrame.\n",
    "\n",
    "    The object key is `bdc2324-data/<directory_path>/<directory_path><file_name>.csv`\n",
    "    and is opened through the notebook-level `fs` filesystem object.\n",
    "    The `identifier` column is removed when the table has one.\n",
    "    \"\"\"\n",
    "    csv_key = \"/\".join((\"bdc2324-data\", directory_path, directory_path + file_name + \".csv\"))\n",
    "    with fs.open(csv_key, mode=\"rb\") as csv_stream:\n",
    "        table = pd.read_csv(csv_stream, sep=\",\")\n",
    "\n",
    "    # Tables without an `identifier` column are returned untouched\n",
    "    if 'identifier' not in table.columns:\n",
    "        return table\n",
    "    return table.drop(columns = 'identifier')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "35da2e15-1e23-4653-a214-c6ff8f186e85",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "File path :  projet-bdc2324-team1/0_Input/Company_5/customerplus_cleaned.csv\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>customer_id</th>\n",
       "      <th>street_id</th>\n",
       "      <th>structure_id</th>\n",
       "      <th>mcp_contact_id</th>\n",
       "      <th>fidelity</th>\n",
       "      <th>tenant_id</th>\n",
       "      <th>is_partner</th>\n",
       "      <th>deleted_at</th>\n",
       "      <th>gender</th>\n",
       "      <th>is_email_true</th>\n",
       "      <th>...</th>\n",
       "      <th>max_price</th>\n",
       "      <th>ticket_sum</th>\n",
       "      <th>average_price</th>\n",
       "      <th>average_purchase_delay</th>\n",
       "      <th>average_price_basket</th>\n",
       "      <th>average_ticket_basket</th>\n",
       "      <th>total_price</th>\n",
       "      <th>purchase_count</th>\n",
       "      <th>first_buying_date</th>\n",
       "      <th>country</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>6009745</td>\n",
       "      <td>1372685</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>1771</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>af</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>6011228</td>\n",
       "      <td>1372685</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>1771</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>af</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>6058950</td>\n",
       "      <td>1372685</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>1771</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>af</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>6062404</td>\n",
       "      <td>1372685</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>1771</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>af</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>250217</td>\n",
       "      <td>78785</td>\n",
       "      <td>NaN</td>\n",
       "      <td>11035.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1771</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>fr</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>471593</th>\n",
       "      <td>4976621</td>\n",
       "      <td>3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4732462.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1771</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>471594</th>\n",
       "      <td>4976636</td>\n",
       "      <td>3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4731717.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1771</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>471595</th>\n",
       "      <td>4976637</td>\n",
       "      <td>3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4731674.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1771</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>471596</th>\n",
       "      <td>4976645</td>\n",
       "      <td>3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4731549.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1771</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>471597</th>\n",
       "      <td>4976666</td>\n",
       "      <td>3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4731118.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1771</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>True</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>471598 rows × 22 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        customer_id  street_id  structure_id  mcp_contact_id  fidelity  \\\n",
       "0           6009745    1372685           NaN             NaN         0   \n",
       "1           6011228    1372685           NaN             NaN         0   \n",
       "2           6058950    1372685           NaN             NaN         0   \n",
       "3           6062404    1372685           NaN             NaN         0   \n",
       "4            250217      78785           NaN         11035.0         0   \n",
       "...             ...        ...           ...             ...       ...   \n",
       "471593      4976621          3           NaN       4732462.0         0   \n",
       "471594      4976636          3           NaN       4731717.0         0   \n",
       "471595      4976637          3           NaN       4731674.0         0   \n",
       "471596      4976645          3           NaN       4731549.0         0   \n",
       "471597      4976666          3           NaN       4731118.0         0   \n",
       "\n",
       "        tenant_id  is_partner  deleted_at  gender  is_email_true  ...  \\\n",
       "0            1771       False         NaN       2           True  ...   \n",
       "1            1771       False         NaN       2           True  ...   \n",
       "2            1771       False         NaN       2           True  ...   \n",
       "3            1771       False         NaN       2           True  ...   \n",
       "4            1771       False         NaN       0           True  ...   \n",
       "...           ...         ...         ...     ...            ...  ...   \n",
       "471593       1771       False         NaN       0           True  ...   \n",
       "471594       1771       False         NaN       2           True  ...   \n",
       "471595       1771       False         NaN       0           True  ...   \n",
       "471596       1771       False         NaN       2           True  ...   \n",
       "471597       1771       False         NaN       0           True  ...   \n",
       "\n",
       "        max_price ticket_sum  average_price  average_purchase_delay  \\\n",
       "0             NaN          0            NaN                     NaN   \n",
       "1             NaN          0            NaN                     NaN   \n",
       "2             NaN          0            NaN                     NaN   \n",
       "3             NaN          0            NaN                     NaN   \n",
       "4             NaN          0            0.0                     NaN   \n",
       "...           ...        ...            ...                     ...   \n",
       "471593        NaN          0            NaN                     NaN   \n",
       "471594        NaN          0            NaN                     NaN   \n",
       "471595        NaN          0            NaN                     NaN   \n",
       "471596        NaN          0            NaN                     NaN   \n",
       "471597        NaN          0            NaN                     NaN   \n",
       "\n",
       "        average_price_basket  average_ticket_basket  total_price  \\\n",
       "0                        NaN                    NaN          0.0   \n",
       "1                        NaN                    NaN          0.0   \n",
       "2                        NaN                    NaN          0.0   \n",
       "3                        NaN                    NaN          0.0   \n",
       "4                        NaN                    NaN          NaN   \n",
       "...                      ...                    ...          ...   \n",
       "471593                   NaN                    NaN          0.0   \n",
       "471594                   NaN                    NaN          0.0   \n",
       "471595                   NaN                    NaN          0.0   \n",
       "471596                   NaN                    NaN          0.0   \n",
       "471597                   NaN                    NaN          0.0   \n",
       "\n",
       "        purchase_count  first_buying_date  country  \n",
       "0                    0                NaN       af  \n",
       "1                    0                NaN       af  \n",
       "2                    0                NaN       af  \n",
       "3                    0                NaN       af  \n",
       "4                    0                NaN       fr  \n",
       "...                ...                ...      ...  \n",
       "471593               0                NaN      NaN  \n",
       "471594               0                NaN      NaN  \n",
       "471595               0                NaN      NaN  \n",
       "471596               0                NaN      NaN  \n",
       "471597               0                NaN      NaN  \n",
       "\n",
       "[471598 rows x 22 columns]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "display_databases(\"5\", \"customerplus_cleaned\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "6c8ad8c3-25df-4fe4-9ad0-ee5f9498bc14",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>name</th>\n",
       "      <th>code</th>\n",
       "      <th>created_at</th>\n",
       "      <th>updated_at</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>101</td>\n",
       "      <td>hongrie</td>\n",
       "      <td>hu</td>\n",
       "      <td>2023-06-13 11:17:40.600622+02:00</td>\n",
       "      <td>2023-06-13 11:17:40.600622+02:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>albanie</td>\n",
       "      <td>al</td>\n",
       "      <td>2023-06-13 11:17:40.540652+02:00</td>\n",
       "      <td>2023-06-13 11:17:40.540652+02:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>antarctique</td>\n",
       "      <td>aq</td>\n",
       "      <td>2023-06-13 11:17:40.541315+02:00</td>\n",
       "      <td>2023-06-13 11:17:40.541315+02:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>12</td>\n",
       "      <td>autriche</td>\n",
       "      <td>at</td>\n",
       "      <td>2023-06-13 11:17:40.546711+02:00</td>\n",
       "      <td>2023-06-13 11:17:40.546711+02:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>samoa américaines</td>\n",
       "      <td>as</td>\n",
       "      <td>2023-06-13 11:17:40.542569+02:00</td>\n",
       "      <td>2023-06-13 11:17:40.542569+02:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>238</th>\n",
       "      <td>228</td>\n",
       "      <td>royaume-uni</td>\n",
       "      <td>gb</td>\n",
       "      <td>2023-06-13 11:17:40.678023+02:00</td>\n",
       "      <td>2023-06-13 11:17:40.678023+02:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>239</th>\n",
       "      <td>25</td>\n",
       "      <td>brésil</td>\n",
       "      <td>br</td>\n",
       "      <td>2023-06-13 11:17:40.554209+02:00</td>\n",
       "      <td>2023-06-13 11:17:40.554209+02:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>240</th>\n",
       "      <td>10</td>\n",
       "      <td>argentine</td>\n",
       "      <td>ar</td>\n",
       "      <td>2023-06-13 11:17:40.545489+02:00</td>\n",
       "      <td>2023-06-13 11:17:40.545489+02:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>241</th>\n",
       "      <td>203</td>\n",
       "      <td>espagne</td>\n",
       "      <td>es</td>\n",
       "      <td>2023-06-13 11:17:40.662472+02:00</td>\n",
       "      <td>2023-06-13 11:17:40.662472+02:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>242</th>\n",
       "      <td>192</td>\n",
       "      <td>arabie saoudite</td>\n",
       "      <td>sa</td>\n",
       "      <td>2023-06-13 11:17:40.656154+02:00</td>\n",
       "      <td>2023-06-13 11:17:40.656154+02:00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>243 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      id               name code                        created_at  \\\n",
       "0    101            hongrie   hu  2023-06-13 11:17:40.600622+02:00   \n",
       "1      2            albanie   al  2023-06-13 11:17:40.540652+02:00   \n",
       "2      3        antarctique   aq  2023-06-13 11:17:40.541315+02:00   \n",
       "3     12           autriche   at  2023-06-13 11:17:40.546711+02:00   \n",
       "4      5  samoa américaines   as  2023-06-13 11:17:40.542569+02:00   \n",
       "..   ...                ...  ...                               ...   \n",
       "238  228        royaume-uni   gb  2023-06-13 11:17:40.678023+02:00   \n",
       "239   25             brésil   br  2023-06-13 11:17:40.554209+02:00   \n",
       "240   10          argentine   ar  2023-06-13 11:17:40.545489+02:00   \n",
       "241  203            espagne   es  2023-06-13 11:17:40.662472+02:00   \n",
       "242  192    arabie saoudite   sa  2023-06-13 11:17:40.656154+02:00   \n",
       "\n",
       "                           updated_at  \n",
       "0    2023-06-13 11:17:40.600622+02:00  \n",
       "1    2023-06-13 11:17:40.540652+02:00  \n",
       "2    2023-06-13 11:17:40.541315+02:00  \n",
       "3    2023-06-13 11:17:40.546711+02:00  \n",
       "4    2023-06-13 11:17:40.542569+02:00  \n",
       "..                                ...  \n",
       "238  2023-06-13 11:17:40.678023+02:00  \n",
       "239  2023-06-13 11:17:40.554209+02:00  \n",
       "240  2023-06-13 11:17:40.545489+02:00  \n",
       "241  2023-06-13 11:17:40.662472+02:00  \n",
       "242  2023-06-13 11:17:40.656154+02:00  \n",
       "\n",
       "[243 rows x 5 columns]"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "load_dataset_2(\"7\", \"countries\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ca2c8b6a-4965-422e-ba7c-66423a464fc1",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true
   },
   "source": [
    "## Bases communes aux types Musée"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f8f988fb-5aab-4b57-80d1-e242f7e5b384",
   "metadata": {},
   "outputs": [],
   "source": [
    "companies = {'musee' : ['1', '2', '3', '4', '101'],\n",
    "            'sport': ['5', '6', '7', '8', '9'],\n",
    "            'musique' : ['10', '11', '12', '13', '14']}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dbce1124-9a22-4502-a47a-fc3d0e2db70b",
   "metadata": {},
   "outputs": [],
   "source": [
    "companies['musee']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5080f66e-f779-410a-876d-b4fe2795e17e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# List the tables stored for every company of every sector.\n",
    "# All sectors are covered (not only 'musique') because later cells read\n",
    "# base_1 ... base_101 as well as base_10 ... base_14.\n",
    "motif_table = re.compile(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$')\n",
    "bases_par_entreprise = {}\n",
    "for identifiants in companies.values():\n",
    "    for i in identifiants:\n",
    "        BUCKET = \"bdc2324-data/\"+i\n",
    "        # Keep the letters/underscores found between the numeric prefix and '.csv'\n",
    "        liste_base = [\n",
    "            match.group(3)\n",
    "            for match in map(motif_table.search, fs.ls(BUCKET))\n",
    "            if match\n",
    "        ]\n",
    "        bases_par_entreprise[i] = liste_base\n",
    "        # Kept for backward compatibility: other cells read the base_<id> globals directly\n",
    "        globals()['base_'+i] = liste_base\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "abd477e1-7479-4c88-a5aa-f987af3f5b79",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tables present for all five 'musee' companies (1, 2, 3, 4 and 101)\n",
    "intersection = set(base_1).intersection(base_2, base_3, base_4, base_101)\n",
    "\n",
    "# Sort the names: iterating a set of strings gives a hash-dependent order,\n",
    "# so a plain list(...) would print differently from one kernel to the next\n",
    "intersection_liste = sorted(intersection)\n",
    "\n",
    "print(intersection_liste)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8d93888f-a511-4ee5-8bc3-d5173a7f119e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tables present for all five 'musique' companies (10 to 14)\n",
    "intersection = set(base_10).intersection(base_12, base_13, base_14, base_11)\n",
    "\n",
    "# Sort the names: iterating a set of strings gives a hash-dependent order,\n",
    "# so a plain list(...) would print differently from one kernel to the next\n",
    "intersection_liste = sorted(intersection)\n",
    "\n",
    "print(intersection_liste)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "10e89669-42bb-4652-a4bc-1a3d1caf4d1a",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(intersection_liste)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e0aa8976-1487-4ef5-898e-0d6a88183e67",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "67f02868-b16a-41d5-a0f9-b31ce09278db",
   "metadata": {},
   "outputs": [],
   "source": [
    "base_101"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7d058b21-a538-4f59-aefb-ef7966f73fdc",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_tags = load_dataset_2(\"1\", \"tags\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aa441f99-733c-4675-8676-bed4682d3324",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_structure_tag_mappings = load_dataset_2(\"1\", 'structure_tag_mappings')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6767a750-14a4-4c05-903e-d2f07170825b",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_customersplus = load_dataset_2(\"1\", \"customersplus\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "125e9145-a815-46fd-bdf4-07589508b259",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 20 structure_id values with the most non-null `id` entries, largest first\n",
    "(\n",
    "    df1_customersplus\n",
    "    .groupby('structure_id')['id']\n",
    "    .count()\n",
    "    .reset_index()\n",
    "    .sort_values('id', ascending=False)\n",
    "    .head(20)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c17a6976-792f-474d-bcff-c89396eddb3f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Share of rows whose structure_id is missing (mean of the boolean NA mask)\n",
    "df1_customersplus['structure_id'].isna().mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ecfc155a-cb42-46ec-8da5-33fdcd087355",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(df1_structure_tag_mappings)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "071410b8-950d-4fcc-b2b9-57415253c286",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 20 tag_id values with the most non-null structure_id entries, largest first\n",
    "(\n",
    "    df1_structure_tag_mappings\n",
    "    .groupby('tag_id')['structure_id']\n",
    "    .count()\n",
    "    .reset_index()\n",
    "    .sort_values('structure_id', ascending=False)\n",
    "    .head(20)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f48d27a9-14e4-4bb9-a60a-73e9438b58fc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# sort_values is a pandas DataFrame method (NumPy has no such function);\n",
    "# the help lookup takes the bare object name, without call parentheses\n",
    "?pd.DataFrame.sort_values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "14eaa0ea-02cc-430b-ab9b-38e6637810c3",
   "metadata": {},
   "outputs": [],
   "source": [
    "def info_colonnes_dataframe(df):\n",
    "    # Créer une liste pour stocker les informations sur chaque colonne\n",
    "    infos_colonnes = []\n",
    "\n",
    "    # Parcourir les colonnes du DataFrame\n",
    "    for nom_colonne, serie in df.items():  # Utiliser items() au lieu de iteritems()\n",
    "        # Calculer le taux de valeurs manquantes\n",
    "        taux_na = serie.isna().mean() * 100\n",
    "\n",
    "        # Ajouter les informations à la liste\n",
    "        infos_colonnes.append({\n",
    "            'Nom_colonne': nom_colonne,\n",
    "            'Type_colonne': str(serie.dtype),\n",
    "            'Taux_NA': taux_na\n",
    "        })\n",
    "\n",
    "    # Créer une nouvelle DataFrame à partir de la liste d'informations\n",
    "    df_infos_colonnes = pd.DataFrame(infos_colonnes)\n",
    "\n",
    "    return df_infos_colonnes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6b031c32-d4c8-42a5-9a71-a7810f9bf8d8",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "info_colonnes_dataframe(df1_tags)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e1a87f27-c4d4-4832-ac20-0c3c54aa4980",
   "metadata": {},
   "outputs": [],
   "source": [
    "info_colonnes_dataframe(df1_structure_tag_mappings)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fa5c65a8-2f74-4f3f-85fc-9ac91e0bb361",
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.set_option('display.max_colwidth', None)\n",
    "\n",
    "print(df1_tags['name'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a59bf932-5b54-4600-81f5-c55ac93ae510",
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.set_option('display.max_rows', None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a4ab298e-2cae-4865-9f00-4caff5f75ea1",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "print(df1_tags['name'])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "76bffba1-5f7e-4308-9224-437ca66148f8",
   "metadata": {},
   "source": [
    "## KPI sur target_type"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "id": "622752ed-b565-4188-86d6-38f1f333fcbe",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "File path :  projet-bdc2324-team1/0_Input/Company_1/target_information.csv\n"
     ]
    },
    {
     "ename": "PermissionError",
     "evalue": "Forbidden",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mClientError\u001b[0m                               Traceback (most recent call last)",
      "File \u001b[0;32m/opt/mamba/lib/python3.11/site-packages/s3fs/core.py:529\u001b[0m, in \u001b[0;36mS3FileSystem.info\u001b[0;34m(self, path, version_id, refresh)\u001b[0m\n\u001b[1;32m    528\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 529\u001b[0m     out \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_s3\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43ms3\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mhead_object\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mBucket\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbucket\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    530\u001b[0m \u001b[43m                        \u001b[49m\u001b[43mKey\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mversion_id_kw\u001b[49m\u001b[43m(\u001b[49m\u001b[43mversion_id\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreq_kw\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    531\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m {\n\u001b[1;32m    532\u001b[0m         \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mETag\u001b[39m\u001b[38;5;124m'\u001b[39m: out[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mETag\u001b[39m\u001b[38;5;124m'\u001b[39m],\n\u001b[1;32m    533\u001b[0m         \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mKey\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m/\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mjoin([bucket, key]),\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    540\u001b[0m         
\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mVersionId\u001b[39m\u001b[38;5;124m'\u001b[39m: out\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mVersionId\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m    541\u001b[0m     }\n",
      "File \u001b[0;32m/opt/mamba/lib/python3.11/site-packages/s3fs/core.py:200\u001b[0m, in \u001b[0;36mS3FileSystem._call_s3\u001b[0;34m(self, method, *akwarglist, **kwargs)\u001b[0m\n\u001b[1;32m    198\u001b[0m additional_kwargs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_s3_method_kwargs(method, \u001b[38;5;241m*\u001b[39makwarglist,\n\u001b[1;32m    199\u001b[0m                                                \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m--> 200\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmethod\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43madditional_kwargs\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m/opt/mamba/lib/python3.11/site-packages/botocore/client.py:553\u001b[0m, in \u001b[0;36mClientCreator._create_api_method.<locals>._api_call\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m    552\u001b[0m \u001b[38;5;66;03m# The \"self\" in this scope is referring to the BaseClient.\u001b[39;00m\n\u001b[0;32m--> 553\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_api_call\u001b[49m\u001b[43m(\u001b[49m\u001b[43moperation_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m/opt/mamba/lib/python3.11/site-packages/botocore/client.py:1009\u001b[0m, in \u001b[0;36mBaseClient._make_api_call\u001b[0;34m(self, operation_name, api_params)\u001b[0m\n\u001b[1;32m   1008\u001b[0m     error_class \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexceptions\u001b[38;5;241m.\u001b[39mfrom_code(error_code)\n\u001b[0;32m-> 1009\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m error_class(parsed_response, operation_name)\n\u001b[1;32m   1010\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n",
      "\u001b[0;31mClientError\u001b[0m: An error occurred (403) when calling the HeadObject operation: Forbidden",
      "\nDuring handling of the above exception, another exception occurred:\n",
      "\u001b[0;31mPermissionError\u001b[0m                           Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[74], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mdisplay_databases\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m1\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtarget_information\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m<string>:12\u001b[0m, in \u001b[0;36mdisplay_databases\u001b[0;34m(directory_path, file_name, datetime_col)\u001b[0m\n",
      "File \u001b[0;32m/opt/mamba/lib/python3.11/site-packages/fsspec/spec.py:1295\u001b[0m, in \u001b[0;36mAbstractFileSystem.open\u001b[0;34m(self, path, mode, block_size, cache_options, compression, **kwargs)\u001b[0m\n\u001b[1;32m   1293\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m   1294\u001b[0m     ac \u001b[38;5;241m=\u001b[39m kwargs\u001b[38;5;241m.\u001b[39mpop(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mautocommit\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_intrans)\n\u001b[0;32m-> 1295\u001b[0m     f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_open\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   1296\u001b[0m \u001b[43m        \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1297\u001b[0m \u001b[43m        \u001b[49m\u001b[43mmode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1298\u001b[0m \u001b[43m        \u001b[49m\u001b[43mblock_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mblock_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1299\u001b[0m \u001b[43m        \u001b[49m\u001b[43mautocommit\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mac\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1300\u001b[0m \u001b[43m        \u001b[49m\u001b[43mcache_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1301\u001b[0m \u001b[43m        \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1302\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1303\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m compression \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m   
1304\u001b[0m         \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfsspec\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcompression\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m compr\n",
      "File \u001b[0;32m/opt/mamba/lib/python3.11/site-packages/s3fs/core.py:375\u001b[0m, in \u001b[0;36mS3FileSystem._open\u001b[0;34m(self, path, mode, block_size, acl, version_id, fill_cache, cache_type, autocommit, requester_pays, **kwargs)\u001b[0m\n\u001b[1;32m    372\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m cache_type \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m    373\u001b[0m     cache_type \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_cache_type\n\u001b[0;32m--> 375\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mS3File\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mblock_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mblock_size\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43macl\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43macl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    376\u001b[0m \u001b[43m              \u001b[49m\u001b[43mversion_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mversion_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfill_cache\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfill_cache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    377\u001b[0m \u001b[43m              \u001b[49m\u001b[43ms3_additional_kwargs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkw\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcache_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    378\u001b[0m \u001b[43m              \u001b[49m\u001b[43mautocommit\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mautocommit\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mrequester_pays\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrequester_pays\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m/opt/mamba/lib/python3.11/site-packages/s3fs/core.py:1096\u001b[0m, in \u001b[0;36mS3File.__init__\u001b[0;34m(self, s3, path, mode, block_size, acl, version_id, fill_cache, s3_additional_kwargs, autocommit, cache_type, requester_pays)\u001b[0m\n\u001b[1;32m   1094\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39ms3_additional_kwargs \u001b[38;5;241m=\u001b[39m s3_additional_kwargs \u001b[38;5;129;01mor\u001b[39;00m {}\n\u001b[1;32m   1095\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreq_kw \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mRequestPayer\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mrequester\u001b[39m\u001b[38;5;124m'\u001b[39m} \u001b[38;5;28;01mif\u001b[39;00m requester_pays \u001b[38;5;28;01melse\u001b[39;00m {}\n\u001b[0;32m-> 1096\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43ms3\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mblock_size\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mautocommit\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mautocommit\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1097\u001b[0m \u001b[43m                 \u001b[49m\u001b[43mcache_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_type\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1098\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39ms3 \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfs  \u001b[38;5;66;03m# compatibility\u001b[39;00m\n\u001b[1;32m   1099\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mwritable():\n",
      "File \u001b[0;32m/opt/mamba/lib/python3.11/site-packages/fsspec/spec.py:1651\u001b[0m, in \u001b[0;36mAbstractBufferedFile.__init__\u001b[0;34m(self, fs, path, mode, block_size, autocommit, cache_type, cache_options, size, **kwargs)\u001b[0m\n\u001b[1;32m   1649\u001b[0m         \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msize \u001b[38;5;241m=\u001b[39m size\n\u001b[1;32m   1650\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1651\u001b[0m         \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msize \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdetails\u001b[49m[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msize\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m   1652\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcache \u001b[38;5;241m=\u001b[39m caches[cache_type](\n\u001b[1;32m   1653\u001b[0m         \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mblocksize, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fetch_range, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msize, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mcache_options\n\u001b[1;32m   1654\u001b[0m     )\n\u001b[1;32m   1655\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n",
      "File \u001b[0;32m/opt/mamba/lib/python3.11/site-packages/fsspec/spec.py:1664\u001b[0m, in \u001b[0;36mAbstractBufferedFile.details\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m   1661\u001b[0m \u001b[38;5;129m@property\u001b[39m\n\u001b[1;32m   1662\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdetails\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m   1663\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_details \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m-> 1664\u001b[0m         \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_details \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minfo\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpath\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1665\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_details\n",
      "File \u001b[0;32m/opt/mamba/lib/python3.11/site-packages/s3fs/core.py:548\u001b[0m, in \u001b[0;36mS3FileSystem.info\u001b[0;34m(self, path, version_id, refresh)\u001b[0m\n\u001b[1;32m    546\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m(S3FileSystem, \u001b[38;5;28mself\u001b[39m)\u001b[38;5;241m.\u001b[39minfo(path)\n\u001b[1;32m    547\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 548\u001b[0m         \u001b[38;5;28;01mraise\u001b[39;00m ee\n\u001b[1;32m    549\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m ParamValidationError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m    550\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mFailed to head path \u001b[39m\u001b[38;5;132;01m%r\u001b[39;00m\u001b[38;5;124m: \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m'\u001b[39m \u001b[38;5;241m%\u001b[39m (path, e))\n",
      "\u001b[0;31mPermissionError\u001b[0m: Forbidden"
     ]
    }
   ],
   "source": [
    "display_databases('1', 'target_information')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1ede9eaa-7f0a-4856-9349-b2747d6a4901",
   "metadata": {},
   "source": [
    "# Fin travail 25/02"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c437eaec",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true
   },
   "source": [
    "# Exemple sur Company 1"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a1c1fc39",
   "metadata": {},
   "source": [
    "## Chargement données"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "66f8c17b",
   "metadata": {},
   "outputs": [],
   "source": [
    "BUCKET = \"bdc2324-data/1\"\n",
    "liste_database = fs.ls(BUCKET)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "c08e6798",
   "metadata": {},
   "outputs": [],
   "source": [
    "liste_database_select = ['suppliers', 'ticket', 'purchase', 'consumption', 'type_ofs']\n",
    "\n",
    "# Filtrer la liste pour les éléments contenant au moins un élément de la liste à tester\n",
    "liste_database_filtered = [element for element in liste_database if any(element_part in element for element_part in liste_database_select)]\n",
    "\n",
    "# Afficher le résultat\n",
    "print(liste_database_filtered)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "675f518d",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<<<<<<< local <modified: >\n",
      "/tmp/ipykernel_445/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  df = pd.read_csv(file_in)\n",
      "=======\n",
      "/tmp/ipykernel_15285/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  df = pd.read_csv(file_in)\n",
      ">>>>>>> remote <modified: >\n"
     ]
    }
   ],
   "source": [
    "# loop to create dataframes from liste\n",
    "files_path = liste_database\n",
    "\n",
    "client_number = files_path[0].split(\"/\")[1]\n",
    "df_prefix = \"df\" + str(client_number) + \"_\"\n",
    "\n",
    "for i in range(len(files_path)) :\n",
    "    current_path = files_path[i]\n",
    "    with fs.open(current_path, mode=\"rb\") as file_in:\n",
    "        df = pd.read_csv(file_in)\n",
    "        # the pattern of the name is df1xxx\n",
    "        nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n",
    "        globals()[nom_dataframe] = df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e855f403",
   "metadata": {},
   "source": [
    "## customersplus.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "91a8f8c4",
   "metadata": {},
   "outputs": [],
   "source": [
    "a = pd.DataFrame(df1_customersplus.info())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "2fda171d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def info_colonnes_dataframe(df):\n",
    "    # Créer une liste pour stocker les informations sur chaque colonne\n",
    "    infos_colonnes = []\n",
    "\n",
    "    # Parcourir les colonnes du DataFrame\n",
    "    for nom_colonne, serie in df.items():  # Utiliser items() au lieu de iteritems()\n",
    "        # Calculer le taux de valeurs manquantes\n",
    "        taux_na = serie.isna().mean() * 100\n",
    "\n",
    "        # Ajouter les informations à la liste\n",
    "        infos_colonnes.append({\n",
    "            'Nom_colonne': nom_colonne,\n",
    "            'Type_colonne': str(serie.dtype),\n",
    "            'Taux_NA': taux_na\n",
    "        })\n",
    "\n",
    "    # Créer une nouvelle DataFrame à partir de la liste d'informations\n",
    "    df_infos_colonnes = pd.DataFrame(infos_colonnes)\n",
    "\n",
    "    return df_infos_colonnes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "205eeeab",
   "metadata": {},
   "outputs": [],
   "source": [
    "def cleaning_date(df, column_name):\n",
    "    \"\"\"\n",
    "    Nettoie la colonne spécifiée du DataFrame en convertissant les valeurs en datetime avec le format ISO8601.\n",
    "\n",
    "    Parameters:\n",
    "    - df: DataFrame\n",
    "        Le DataFrame contenant la colonne à nettoyer.\n",
    "    - column_name: str\n",
    "        Le nom de la colonne à nettoyer.\n",
    "\n",
    "    Returns:\n",
    "    - DataFrame\n",
    "        Le DataFrame modifié avec la colonne nettoyée.\n",
    "    \"\"\"\n",
    "    df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "634282c5",
   "metadata": {},
   "outputs": [],
   "source": [
    "a = info_colonnes_dataframe(df1_customersplus)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "0e8d4133",
   "metadata": {},
   "outputs": [],
   "source": [
    "a"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "1268ad5a",
   "metadata": {},
   "outputs": [],
   "source": [
    "a = pd.DataFrame(df1_customersplus.isna().sum()/len(df1_customersplus)*100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "bd41dc80",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Selection des variables\n",
    "df1_customersplus_clean = df1_customersplus.copy()\n",
    "\n",
    "cleaning_date(df1_customersplus_clean, 'first_buying_date')\n",
    "cleaning_date(df1_customersplus_clean, 'last_visiting_date')\n",
    "\n",
    "df1_customersplus_clean.drop(['lastname', 'firstname', 'email', 'civility', 'note', 'created_at', 'updated_at', 'deleted_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode', 'last_visiting_date'], axis = 1, inplace=True)\n",
    "df1_customersplus_clean.rename(columns = {'id' : 'customer_id'}, inplace = True)\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "64d0f76b",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true
   },
   "source": [
    "## tickets.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "7e683711",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_tickets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "e7b9a52e",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_tickets.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "568280e8",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_tickets.isna().sum()/len(df1_tickets)*100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "29ecec90",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Selection des variables\n",
    "df1_tickets_clean = df1_tickets.drop(['lastname', 'firstname', 'email', 'created_at', 'updated_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode'], axis = 1, inplace=True)\n",
    "df1_tickets_clean.rename(columns = {'id' : 'customer_id'}, inplace = True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "22bb5de4",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true
   },
   "source": [
    "## suppliers.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "6a9a91f4",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_suppliers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "bab4758a",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_suppliers.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "b5fff251",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_suppliers.isna().sum()/len(df1_suppliers)*100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "8b09e2a3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Selection des variables\n",
    "df1_suppliers_clean = df1_suppliers[['id', 'name']]\n",
    "df1_suppliers_clean.rename(columns = {'name' : 'supplier_name'}, inplace = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "ecee7cdc",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_suppliers_clean"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c8e6e69b",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true
   },
   "source": [
    "## type_ofs.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "1a6cff1f",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_type_ofs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "93630b41",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_type_ofs.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "4f94481a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Selection des variables\n",
    "df1_type_ofs_clean = df1_type_ofs[['id', 'name', 'children']]\n",
    "df1_type_ofs_clean.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1b2811e2",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true
   },
   "source": [
    "## purchases.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "2455d2e1",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "df1_purchases"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "5f9a159d",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_purchases.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "db201bf7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Nettoyage purchase_date\n",
    "df1_purchases['purchase_date'] = pd.to_datetime(df1_purchases['purchase_date'], utc = True)\n",
    "df1_purchases['purchase_date'] = pd.to_datetime(df1_purchases['purchase_date'], format = 'ISO8601')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "bd436fca",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_purchases.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "83435862",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Selection des variables\n",
    "df1_purchases_clean = df1_purchases[['id', 'purchase_date', 'customer_id']]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f210e730",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true
   },
   "source": [
    "## Fusion de l'ensemble des données billétiques"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "1f8b3aa7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fusion avec fournisseurs\n",
    "df1_ticket_information = pd.merge(df1_tickets_clean, df1_suppliers_clean, left_on = 'supplier_id', right_on = 'id', how = 'inner')\n",
    "df1_ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)\n",
    "\n",
    "# Fusion avec type de tickets\n",
    "df1_ticket_information = pd.merge(df1_ticket_information, df1_type_ofs_clean, left_on = 'type_of', right_on = 'id', how = 'inner')\n",
    "df1_ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)\n",
    "\n",
    "# Fusion avec achats\n",
    "df1_ticket_information = pd.merge(df1_ticket_information, df1_purchases_clean, left_on = 'purchase_id', right_on = 'id', how = 'inner')\n",
    "df1_ticket_information.drop(['purchase_id', 'id'], axis = 1, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "83a4d021",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "df1_ticket_information"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "56e6ebd1",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true
   },
   "source": [
    "# Utilisation de fonctions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "88fcde4b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Créer un DataFrame exemple\n",
    "df_not_clean = df1_campaign_stats[['opened_at']].head(20)\n",
    "\n",
    "# Appliquer la fonction pour nettoyer la colonne 'purchase_date' de manière vectorisée\n",
    "df_clean = cleaning_date(df_not_clean, 'opened_at')\n",
    "df_clean.rename(columns = {'opened_at' : 'opened_at_clean'}, inplace = True)\n",
    "\n",
    "test = pd.concat([df1_campaign_stats[['opened_at']].head(20), df_clean], axis=1)\n",
    "\n",
    "test.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "818f69db",
   "metadata": {},
   "source": [
    "## Nettoyage, selection et fusion"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "c9654eda",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_ticket_information"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "7f2b620c",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_ticket_information.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "637bdb72",
   "metadata": {},
   "source": [
    "# Customer information"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "14c52894",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true
   },
   "source": [
    "## Target area"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "d83abfbf",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_15285/2625134041.py:3: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df1_targets_clean.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n"
     ]
    }
   ],
   "source": [
    "# Target.csv cleaning\n",
    "df1_targets_clean = df1_targets[[\"id\", \"target_type_id\", \"name\"]]\n",
    "df1_targets_clean.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n",
    "\n",
    "# target_type cleaning\n",
    "df1_target_types_clean = df1_target_types[[\"id\",\"is_import\",\"name\"]].add_prefix(\"target_type_\")\n",
    "\n",
    "#customer_target_mappings cleaning\n",
    "df1_customer_target_mappings_clean = df1_customer_target_mappings[[\"id\", \"customer_id\", \"target_id\"]]\n",
    "\n",
    "# Merge target et target_type\n",
    "df1_targets_full = pd.merge(df1_targets_clean, df1_target_types_clean, left_on='target_type_id', right_on='target_type_id', how='inner')\n",
    "df1_targets_full.drop(['target_type_id'], axis = 1, inplace=True)\n",
    "\n",
    "# Merge\n",
    "df1_targets_full = pd.merge(df1_customer_target_mappings_clean, df1_targets_full, left_on='target_id', right_on='target_id', how='inner')\n",
    "df1_targets_full.drop(['target_id'], axis = 1, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "90d71b2c",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_targets_test = df1_targets_full[['id', 'customer_id']].groupby(['customer_id']).count()\n",
    "len(df1_targets_test[df1_targets_test['id'] > 1]) / len(df1_targets_test)\n",
    "\n",
    "# 99,6% des 151 000 client visés sont catégorisés plusieurs fois et en moyenne 5 fois... \n",
    "df1_targets_test.mean()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "2301de1e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>customer_id</th>\n",
       "      <th>target_name</th>\n",
       "      <th>target_type_is_import</th>\n",
       "      <th>target_type_name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1184824</td>\n",
       "      <td>645400</td>\n",
       "      <td>DDCP PROMO Réseau livres</td>\n",
       "      <td>False</td>\n",
       "      <td>manual_static_filter</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>210571</td>\n",
       "      <td>2412</td>\n",
       "      <td>DDCP PROMO Réseau livres</td>\n",
       "      <td>False</td>\n",
       "      <td>manual_static_filter</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>210572</td>\n",
       "      <td>4536</td>\n",
       "      <td>DDCP PROMO Réseau livres</td>\n",
       "      <td>False</td>\n",
       "      <td>manual_static_filter</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>210573</td>\n",
       "      <td>6736</td>\n",
       "      <td>DDCP PROMO Réseau livres</td>\n",
       "      <td>False</td>\n",
       "      <td>manual_static_filter</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>210574</td>\n",
       "      <td>38210</td>\n",
       "      <td>DDCP PROMO Réseau livres</td>\n",
       "      <td>False</td>\n",
       "      <td>manual_static_filter</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        id  customer_id               target_name  target_type_is_import  \\\n",
       "0  1184824       645400  DDCP PROMO Réseau livres                  False   \n",
       "1   210571         2412  DDCP PROMO Réseau livres                  False   \n",
       "2   210572         4536  DDCP PROMO Réseau livres                  False   \n",
       "3   210573         6736  DDCP PROMO Réseau livres                  False   \n",
       "4   210574        38210  DDCP PROMO Réseau livres                  False   \n",
       "\n",
       "       target_type_name  \n",
       "0  manual_static_filter  \n",
       "1  manual_static_filter  \n",
       "2  manual_static_filter  \n",
       "3  manual_static_filter  \n",
       "4  manual_static_filter  "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1_targets_full.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "75fbc2f7",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...\n",
      "[nltk_data]   Package punkt is already up-to-date!\n",
      "[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n",
      "[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...\n",
      "[nltk_data]   Package wordnet is already up-to-date!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Catégorisation des target_name\n",
    "import pandas as pd\n",
    "import nltk\n",
    "from nltk.tokenize import word_tokenize\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "from nltk.probability import FreqDist\n",
    "\n",
    "# Téléchargement des ressources nécessaires\n",
    "nltk.download('punkt')\n",
    "nltk.download('stopwords')\n",
    "nltk.download('wordnet')\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "55cddf92",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mots les plus fréquents:\n",
      "consentement: 550777\n",
      "optin: 463579\n",
      "jeune: 155103\n",
      "public: 155103\n",
      "mediation: 150001\n"
     ]
    }
   ],
   "source": [
    "# Définition des fonctions de tokenisation, suppression des mots vides et lemmatisation\n",
    "def preprocess_text(texte):\n",
    "    \"\"\"\n",
    "    Tokenise a text, drop the French stopwords and lemmatise the remaining tokens.\n",
    "\n",
    "    texte : a single string, or an iterable of strings (joined with spaces before\n",
    "    tokenisation). None / float NaN gives an empty list.\n",
    "    Returns the list of lower-cased tokens left after filtering and lemmatisation.\n",
    "    \"\"\"\n",
    "    # Missing value (None or float NaN): nothing to tokenise\n",
    "    if texte is None or (isinstance(texte, float) and texte != texte):\n",
    "        return []\n",
    "\n",
    "    # A plain string is used as is: ' '.join() on a str would put a space between every\n",
    "    # character, and the tokenizer would then return single letters instead of words\n",
    "    if isinstance(texte, str):\n",
    "        texte_concat = texte\n",
    "    else:\n",
    "        texte_concat = ' '.join(texte)\n",
    "\n",
    "    # Word tokenisation on the lower-cased text\n",
    "    tokens = word_tokenize(texte_concat.lower())\n",
    "\n",
    "    # Stopword removal\n",
    "    stop_words = set(stopwords.words('french'))\n",
    "    filtered_tokens = [word for word in tokens if word not in stop_words]\n",
    "\n",
    "    # Lemmatisation\n",
    "    # NOTE(review): WordNetLemmatizer is backed by the English WordNet, so French words are\n",
    "    # presumably returned unchanged - confirm whether a French lemmatizer is needed\n",
    "    lemmatizer = WordNetLemmatizer()\n",
    "    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]\n",
    "\n",
    "    return lemmatized_tokens\n",
    "\n",
    "\n",
    "# Run the preprocessing on every target name and store the token lists in a new column\n",
    "df1_targets_full['target_name_tokened'] = df1_targets_full['target_name'].apply(preprocess_text)\n",
    "\n",
    "# Flatten the per-row token lists into one list holding every word of the corpus\n",
    "all_words = []\n",
    "for tokens in df1_targets_full['target_name_tokened']:\n",
    "    all_words.extend(tokens)\n",
    "\n",
    "# Word frequencies over the whole corpus\n",
    "freq_dist = FreqDist(all_words)\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "7fd98a85",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mots les plus fréquents:\n",
      "consentement: 550777\n",
      "optin: 463579\n",
      "jeune: 155103\n",
      "public: 155103\n",
      "mediation: 150001\n",
      "specialisee: 150001\n",
      "b2c: 143432\n",
      "optout: 97683\n",
      "newsletter: 56022\n",
      "(: 46084\n",
      "): 46084\n",
      "inscrits: 42296\n",
      "nl: 42294\n",
      "générale: 41037\n",
      "generale: 40950\n"
     ]
    }
   ],
   "source": [
    "# Affichage des mots les plus fréquents\n",
    "print(\"Mots les plus fréquents:\")\n",
    "for mot, freq in freq_dist.most_common(15):\n",
    "    print(f\"{mot}: {freq}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "cf94bb1d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                            texte  \\\n",
      "0  Le chat noir mange une souris.   \n",
      "1           Le chien blanc aboie.   \n",
      "\n",
      "                                 texte_preprocessed  \n",
      "0  [e, h, a, o, i, r, a, g, e, u, e, o, u, r, i, .]  \n",
      "1              [e, h, i, e, b, a, a, b, o, i, e, .]  \n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...\n",
      "[nltk_data]   Package punkt is already up-to-date!\n",
      "[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n",
      "[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...\n",
      "[nltk_data]   Package wordnet is already up-to-date!\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import nltk\n",
    "from nltk.tokenize import word_tokenize\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "\n",
    "# Téléchargement des ressources nécessaires\n",
    "nltk.download('punkt')\n",
    "nltk.download('stopwords')\n",
    "nltk.download('wordnet')\n",
    "\n",
    "# Création de la DataFrame d'exemple\n",
    "data = {'texte': [\"Le chat noir mange une souris.\", \"Le chien blanc aboie.\"]}\n",
    "df = pd.DataFrame(data)\n",
    "\n",
    "# Fonction pour prétraiter le texte\n",
    "def preprocess_text(texte):\n",
    "    \"\"\"\n",
    "    Tokenise a text, drop the French stopwords and lemmatise the remaining tokens.\n",
    "\n",
    "    texte : a single string, or an iterable of strings (joined with spaces before\n",
    "    tokenisation). None / float NaN gives an empty list.\n",
    "    Returns the list of lower-cased tokens left after filtering and lemmatisation.\n",
    "    \"\"\"\n",
    "    # Missing value (None or float NaN): nothing to tokenise\n",
    "    if texte is None or (isinstance(texte, float) and texte != texte):\n",
    "        return []\n",
    "\n",
    "    # A plain string is used as is: ' '.join() on a str would put a space between every\n",
    "    # character, and the tokenizer would then return single letters instead of words\n",
    "    if isinstance(texte, str):\n",
    "        texte_concat = texte\n",
    "    else:\n",
    "        texte_concat = ' '.join(texte)\n",
    "\n",
    "    # Word tokenisation on the lower-cased text\n",
    "    tokens = word_tokenize(texte_concat.lower())\n",
    "\n",
    "    # Stopword removal\n",
    "    stop_words = set(stopwords.words('french'))\n",
    "    filtered_tokens = [word for word in tokens if word not in stop_words]\n",
    "\n",
    "    # Lemmatisation\n",
    "    # NOTE(review): WordNetLemmatizer is backed by the English WordNet, so French words are\n",
    "    # presumably returned unchanged - confirm whether a French lemmatizer is needed\n",
    "    lemmatizer = WordNetLemmatizer()\n",
    "    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]\n",
    "\n",
    "    return lemmatized_tokens\n",
    "\n",
    "# Appliquer la fonction de prétraitement à la colonne de texte\n",
    "df['texte_preprocessed'] = df['texte'].apply(preprocess_text)\n",
    "\n",
    "# Afficher le résultat\n",
    "print(df)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "711d3884",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true
   },
   "source": [
    "## Campaign area"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "c25b5295",
   "metadata": {},
   "outputs": [],
   "source": [
    "# campaign_stats cleaning\n",
    "# .copy() turns the column subset into an independent frame: cleaning_date is called for\n",
    "# its side effect (its return value is unused), so it presumably writes into the frame it\n",
    "# receives, which on a bare subset is the SettingWithCopyWarning pattern\n",
    "df1_campaign_stats_clean = df1_campaign_stats[[\"id\", \"campaign_id\", \"customer_id\", \"opened_at\", \"sent_at\", \"delivered_at\"]].copy()\n",
    "cleaning_date(df1_campaign_stats_clean, 'opened_at')\n",
    "cleaning_date(df1_campaign_stats_clean, 'sent_at')\n",
    "cleaning_date(df1_campaign_stats_clean, 'delivered_at')\n",
    "\n",
    "# campaigns cleaning (add_prefix already returns a new frame)\n",
    "df1_campaigns_clean = df1_campaigns[[\"id\", \"name\", \"service_id\", \"sent_at\"]].add_prefix(\"campaign_\")\n",
    "cleaning_date(df1_campaigns_clean, 'campaign_sent_at')\n",
    "\n",
    "# Left merge of the campaign attributes onto the stats, then drop the join key\n",
    "# (chained drop instead of inplace=True, so the merged frame is built in one expression)\n",
    "df1_campaigns_full = pd.merge(df1_campaign_stats_clean, df1_campaigns_clean, on = \"campaign_id\", how = \"left\").drop(columns = ['campaign_id'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "2a3de6a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_campaigns_full.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "id": "3fc1f446",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_campaigns_information"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "20e69ee3",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true
   },
   "source": [
    "## Link area"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "d9cbdbce",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_campaigns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "c07459f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_link_stats"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "80ae4c42",
   "metadata": {},
   "source": [
    "## Exploration variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "b50b8f95",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fonction d'exploration pour suppliers.csv = label itr et commission inconnues\n",
    "def suppliers_exploration(suppliers = None) :\n",
    "    \"\"\"\n",
    "    Describe a suppliers table in a one-row DataFrame.\n",
    "\n",
    "    suppliers : DataFrame with the columns 'name', 'label', 'itr' and 'commission'.\n",
    "    Returns a DataFrame with 'nb_suppliers' (number of distinct names) and the\n",
    "    percentage of missing values of each of the three other columns ('<column>_na').\n",
    "    \"\"\"\n",
    "    total_rows = len(suppliers)\n",
    "\n",
    "    summary = {'nb_suppliers' : [suppliers['name'].nunique()]}\n",
    "\n",
    "    # NaN rate (in %) of each column of interest\n",
    "    for column in ('label', 'itr', 'commission'):\n",
    "        summary[column + '_na'] = [suppliers[column].isna().sum() / total_rows * 100]\n",
    "\n",
    "    return pd.DataFrame(summary)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "7e292935",
   "metadata": {},
   "outputs": [],
   "source": [
    "df1_suppliers_desc = suppliers_exploration(suppliers = df1_suppliers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "05b6f2b0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>nb_suppliers</th>\n",
       "      <th>label_na</th>\n",
       "      <th>itr_na</th>\n",
       "      <th>commission_na</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>9</td>\n",
       "      <td>100.0</td>\n",
       "      <td>100.0</td>\n",
       "      <td>100.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   nb_suppliers  label_na  itr_na  commission_na\n",
       "0             9     100.0   100.0          100.0"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1_suppliers_desc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "c9324d80",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Root bucket; each entry returned by fs.ls(BUCKET) is listed in turn below\n",
    "BUCKET = \"bdc2324-data\"\n",
    "liste_folders = fs.ls(BUCKET)\n",
    "\n",
    "# Flat list of every path found inside those folders\n",
    "liste_files = [\n",
    "    file_path\n",
    "    for company_folder in liste_folders\n",
    "    for file_path in fs.ls(company_folder)\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "10304058",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['bdc2324-data/1/1suppliers.csv', 'bdc2324-data/10/10suppliers.csv', 'bdc2324-data/101/101suppliers.csv', 'bdc2324-data/11/11suppliers.csv', 'bdc2324-data/12/12suppliers.csv', 'bdc2324-data/13/13suppliers.csv', 'bdc2324-data/14/14suppliers.csv', 'bdc2324-data/2/2suppliers.csv', 'bdc2324-data/3/3suppliers.csv', 'bdc2324-data/4/4suppliers.csv', 'bdc2324-data/5/5suppliers.csv', 'bdc2324-data/6/6suppliers.csv', 'bdc2324-data/7/7suppliers.csv', 'bdc2324-data/8/8suppliers.csv', 'bdc2324-data/9/9suppliers.csv']\n"
     ]
    }
   ],
   "source": [
    "# Name fragments identifying the tables to keep\n",
    "liste_database_select = ['suppliers']\n",
    "\n",
    "# Keep the file paths that contain at least one of the fragments\n",
    "liste_suppliers = []\n",
    "for element in liste_files:\n",
    "    if any(element_part in element for element_part in liste_database_select):\n",
    "        liste_suppliers.append(element)\n",
    "\n",
    "# Display the result\n",
    "print(liste_suppliers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "ffa423e5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# loop to create dataframes from file 2\n",
    "def database_loading(database_name = None):\n",
    "    \"\"\"\n",
    "    Load one csv file from the S3 filesystem `fs` into a DataFrame.\n",
    "\n",
    "    database_name : path of the file; its second \"/\"-separated part is taken as the\n",
    "    client number, e.g. \"bdc2324-data/1/1suppliers.csv\" -> \"1\".\n",
    "    Returns the tuple (df, client_number), client_number being a string.\n",
    "    \"\"\"\n",
    "    client_number = database_name.split(\"/\")[1]\n",
    "\n",
    "    with fs.open(database_name, mode=\"rb\") as file_in:\n",
    "        df = pd.read_csv(file_in)\n",
    "\n",
    "    return df, client_number"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "70bdc88d",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "6a0f567d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load every suppliers file, tag its rows with the tenant id, then concatenate once\n",
    "# (calling pd.concat inside the loop re-copies the accumulated frame at every iteration)\n",
    "suppliers_frames = []\n",
    "\n",
    "for link in liste_suppliers:\n",
    "\n",
    "    df_supplier, tenant_id = database_loading(link)\n",
    "\n",
    "    df_supplier['tenant_id'] = int(tenant_id)\n",
    "\n",
    "    suppliers_frames.append(df_supplier)\n",
    "\n",
    "# pd.concat raises on an empty list: fall back to an empty DataFrame in that case\n",
    "df_all = pd.concat(suppliers_frames, axis = 0) if suppliers_frames else pd.DataFrame()\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "1522d8cd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_all[df_all['tenant_id'] == 101]['name'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "id": "b0e42a61",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keywords hinting at an online sales channel (vad = \"vente à distance\")\n",
    "liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online']\n",
    "\n",
    "# Missing names become empty strings so that str.contains returns a boolean for every row\n",
    "df_all['name'] = df_all['name'].fillna('')\n",
    "\n",
    "# 1 when the name contains at least one keyword (case-insensitive), 0 otherwise\n",
    "# NOTE(review): this is a substring match, so 'net' or 'vad' also match inside longer,\n",
    "# unrelated words - confirm this is acceptable for the supplier names\n",
    "motif_internet = '|'.join(liste_mots)\n",
    "est_vente_internet = df_all['name'].str.contains(motif_internet, case=False)\n",
    "df_all['canal_vente_internet'] = est_vente_internet.astype(int)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "id": "d299ae91",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tenant_id\n",
       "1      1\n",
       "2      1\n",
       "3      1\n",
       "4      1\n",
       "5      1\n",
       "6      1\n",
       "7      1\n",
       "8      1\n",
       "9      1\n",
       "10     1\n",
       "11     1\n",
       "12     1\n",
       "13     1\n",
       "14     1\n",
       "101    1\n",
       "Name: canal_vente_internet, dtype: int64"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_all.groupby('tenant_id')['canal_vente_internet'].max()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								{
 								 "cells": [
 								  {
 								   "cell_type": "markdown",
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "5bf5c226",
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								   "metadata": {},
 								   "source": [
 								    "# Business Data Challenge - Team 1"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Renommer notebook

											
										
										
											2024-01-13 14:14:11 +01:00
+								   "execution_count": 1,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "b1a5b9d3",
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "import pandas as pd\n",
-												Exploration suppliers.csv

											
										
										
											2024-01-13 10:38:10 +01:00
+								    "import numpy as np\n",
 								    "import os\n",
 								    "import s3fs\n",
 								    "import re"
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "ecfa2219",
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								   "metadata": {},
 								   "source": [
 								    "Configuration de l'accès aux données"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Renommer notebook

											
										
										
											2024-01-13 14:14:11 +01:00
+								   "execution_count": 2,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "1a094277",
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "# Create filesystem object\n",
 								    "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
-												Exploration suppliers.csv

											
										
										
											2024-01-13 10:38:10 +01:00
+								    "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})"
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Exploration tags et target

											
										
										
											2024-02-25 18:33:24 +01:00
+								   "id": "0294ce71-840e-458b-8ffa-cadabbc6da21",
 								   "metadata": {
 								    "jp-MarkdownHeadingCollapsed": true
 								   },
 								   "source": [
 								    "# Debut Travail 25/02"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 11,
 								   "id": "30d77451-2df6-4c07-8b15-66e0e990ff03",
 								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "# Create filesystem object\n",
 								    "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
 								    "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n",
 								    "\n",
 								    "\n",
 								    "# Import cleaning and merge functions\n",
 								    "exec(open('0_KPI_functions.py').read())\n",
 								    "\n",
 								    "# Ignore warning\n",
 								    "warnings.filterwarnings('ignore')\n"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 12,
 								   "id": "f1b44d3e-76bb-4860-b9db-a2840db7cf39",
 								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "def load_dataset_2(directory_path, file_name):\n",
 								    "    \"\"\"\n",
 								    "    This function loads csv file\n",
 								    "    \"\"\"\n",
 								    "    file_path = \"bdc2324-data\" + \"/\" + directory_path + \"/\" + directory_path + file_name + \".csv\"\n",
 								    "    with fs.open(file_path, mode=\"rb\") as file_in:\n",
 								    "        df = pd.read_csv(file_in, sep=\",\")\n",
 								    "\n",
 								    "    # drop na :\n",
 								    "    #df = df.dropna(axis=1, thresh=len(df))\n",
 								    "    # if identifier in table : delete it\n",
 								    "    if 'identifier' in df.columns:\n",
 								    "        df = df.drop(columns = 'identifier')\n",
 								    "    return df"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 13,
 								   "id": "35da2e15-1e23-4653-a214-c6ff8f186e85",
 								   "metadata": {},
 								   "outputs": [
 								    {
 								     "name": "stdout",
 								     "output_type": "stream",
 								     "text": [
 								      "File path :  projet-bdc2324-team1/0_Input/Company_5/customerplus_cleaned.csv\n"
 								     ]
 								    },
 								    {
 								     "data": {
 								      "text/html": [
 								       "<div>\n",
 								       "<style scoped>\n",
 								       "    .dataframe tbody tr th:only-of-type {\n",
 								       "        vertical-align: middle;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe tbody tr th {\n",
 								       "        vertical-align: top;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe thead th {\n",
 								       "        text-align: right;\n",
 								       "    }\n",
 								       "</style>\n",
 								       "<table border=\"1\" class=\"dataframe\">\n",
 								       "  <thead>\n",
 								       "    <tr style=\"text-align: right;\">\n",
 								       "      <th></th>\n",
 								       "      <th>customer_id</th>\n",
 								       "      <th>street_id</th>\n",
 								       "      <th>structure_id</th>\n",
 								       "      <th>mcp_contact_id</th>\n",
 								       "      <th>fidelity</th>\n",
 								       "      <th>tenant_id</th>\n",
 								       "      <th>is_partner</th>\n",
 								       "      <th>deleted_at</th>\n",
 								       "      <th>gender</th>\n",
 								       "      <th>is_email_true</th>\n",
 								       "      <th>...</th>\n",
 								       "      <th>max_price</th>\n",
 								       "      <th>ticket_sum</th>\n",
 								       "      <th>average_price</th>\n",
 								       "      <th>average_purchase_delay</th>\n",
 								       "      <th>average_price_basket</th>\n",
 								       "      <th>average_ticket_basket</th>\n",
 								       "      <th>total_price</th>\n",
 								       "      <th>purchase_count</th>\n",
 								       "      <th>first_buying_date</th>\n",
 								       "      <th>country</th>\n",
 								       "    </tr>\n",
 								       "  </thead>\n",
 								       "  <tbody>\n",
 								       "    <tr>\n",
 								       "      <th>0</th>\n",
 								       "      <td>6009745</td>\n",
 								       "      <td>1372685</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>0</td>\n",
 								       "      <td>1771</td>\n",
 								       "      <td>False</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>2</td>\n",
 								       "      <td>True</td>\n",
 								       "      <td>...</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>0</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>0.0</td>\n",
 								       "      <td>0</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>af</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>1</th>\n",
 								       "      <td>6011228</td>\n",
 								       "      <td>1372685</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>0</td>\n",
 								       "      <td>1771</td>\n",
 								       "      <td>False</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>2</td>\n",
 								       "      <td>True</td>\n",
 								       "      <td>...</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>0</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>0.0</td>\n",
 								       "      <td>0</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>af</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>2</th>\n",
 								       "      <td>6058950</td>\n",
 								       "      <td>1372685</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>0</td>\n",
 								       "      <td>1771</td>\n",
 								       "      <td>False</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>2</td>\n",
 								       "      <td>True</td>\n",
 								       "      <td>...</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>0</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>0.0</td>\n",
 								       "      <td>0</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>af</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>3</th>\n",
 								       "      <td>6062404</td>\n",
 								       "      <td>1372685</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>0</td>\n",
 								       "      <td>1771</td>\n",
 								       "      <td>False</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>2</td>\n",
 								       "      <td>True</td>\n",
 								       "      <td>...</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>0</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>0.0</td>\n",
 								       "      <td>0</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>af</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>4</th>\n",
 								       "      <td>250217</td>\n",
 								       "      <td>78785</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>11035.0</td>\n",
 								       "      <td>0</td>\n",
 								       "      <td>1771</td>\n",
 								       "      <td>False</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>0</td>\n",
 								       "      <td>True</td>\n",
 								       "      <td>...</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>0</td>\n",
 								       "      <td>0.0</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>0</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>fr</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>...</th>\n",
 								       "      <td>...</td>\n",
 								       "      <td>...</td>\n",
 								       "      <td>...</td>\n",
 								       "      <td>...</td>\n",
 								       "      <td>...</td>\n",
 								       "      <td>...</td>\n",
 								       "      <td>...</td>\n",
 								       "      <td>...</td>\n",
 								       "      <td>...</td>\n",
 								       "      <td>...</td>\n",
 								       "      <td>...</td>\n",
 								       "      <td>...</td>\n",
 								       "      <td>...</td>\n",
 								       "      <td>...</td>\n",
 								       "      <td>...</td>\n",
 								       "      <td>...</td>\n",
 								       "      <td>...</td>\n",
 								       "      <td>...</td>\n",
 								       "      <td>...</td>\n",
 								       "      <td>...</td>\n",
 								       "      <td>...</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>471593</th>\n",
 								       "      <td>4976621</td>\n",
 								       "      <td>3</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>4732462.0</td>\n",
 								       "      <td>0</td>\n",
 								       "      <td>1771</td>\n",
 								       "      <td>False</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>0</td>\n",
 								       "      <td>True</td>\n",
 								       "      <td>...</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>0</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>0.0</td>\n",
 								       "      <td>0</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>NaN</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>471594</th>\n",
 								       "      <td>4976636</td>\n",
 								       "      <td>3</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>4731717.0</td>\n",
 								       "      <td>0</td>\n",
 								       "      <td>1771</td>\n",
 								       "      <td>False</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>2</td>\n",
 								       "      <td>True</td>\n",
 								       "      <td>...</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>0</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>0.0</td>\n",
 								       "      <td>0</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>NaN</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>471595</th>\n",
 								       "      <td>4976637</td>\n",
 								       "      <td>3</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>4731674.0</td>\n",
 								       "      <td>0</td>\n",
 								       "      <td>1771</td>\n",
 								       "      <td>False</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>0</td>\n",
 								       "      <td>True</td>\n",
 								       "      <td>...</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>0</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>0.0</td>\n",
 								       "      <td>0</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>NaN</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>471596</th>\n",
 								       "      <td>4976645</td>\n",
 								       "      <td>3</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>4731549.0</td>\n",
 								       "      <td>0</td>\n",
 								       "      <td>1771</td>\n",
 								       "      <td>False</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>2</td>\n",
 								       "      <td>True</td>\n",
 								       "      <td>...</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>0</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>0.0</td>\n",
 								       "      <td>0</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>NaN</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>471597</th>\n",
 								       "      <td>4976666</td>\n",
 								       "      <td>3</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>4731118.0</td>\n",
 								       "      <td>0</td>\n",
 								       "      <td>1771</td>\n",
 								       "      <td>False</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>0</td>\n",
 								       "      <td>True</td>\n",
 								       "      <td>...</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>0</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>0.0</td>\n",
 								       "      <td>0</td>\n",
 								       "      <td>NaN</td>\n",
 								       "      <td>NaN</td>\n",
 								       "    </tr>\n",
 								       "  </tbody>\n",
 								       "</table>\n",
 								       "<p>471598 rows × 22 columns</p>\n",
 								       "</div>"
 								      ],
 								      "text/plain": [
 								       "        customer_id  street_id  structure_id  mcp_contact_id  fidelity  \\\n",
 								       "0           6009745    1372685           NaN             NaN         0   \n",
 								       "1           6011228    1372685           NaN             NaN         0   \n",
 								       "2           6058950    1372685           NaN             NaN         0   \n",
 								       "3           6062404    1372685           NaN             NaN         0   \n",
 								       "4            250217      78785           NaN         11035.0         0   \n",
 								       "...             ...        ...           ...             ...       ...   \n",
 								       "471593      4976621          3           NaN       4732462.0         0   \n",
 								       "471594      4976636          3           NaN       4731717.0         0   \n",
 								       "471595      4976637          3           NaN       4731674.0         0   \n",
 								       "471596      4976645          3           NaN       4731549.0         0   \n",
 								       "471597      4976666          3           NaN       4731118.0         0   \n",
 								       "\n",
 								       "        tenant_id  is_partner  deleted_at  gender  is_email_true  ...  \\\n",
 								       "0            1771       False         NaN       2           True  ...   \n",
 								       "1            1771       False         NaN       2           True  ...   \n",
 								       "2            1771       False         NaN       2           True  ...   \n",
 								       "3            1771       False         NaN       2           True  ...   \n",
 								       "4            1771       False         NaN       0           True  ...   \n",
 								       "...           ...         ...         ...     ...            ...  ...   \n",
 								       "471593       1771       False         NaN       0           True  ...   \n",
 								       "471594       1771       False         NaN       2           True  ...   \n",
 								       "471595       1771       False         NaN       0           True  ...   \n",
 								       "471596       1771       False         NaN       2           True  ...   \n",
 								       "471597       1771       False         NaN       0           True  ...   \n",
 								       "\n",
 								       "        max_price ticket_sum  average_price  average_purchase_delay  \\\n",
 								       "0             NaN          0            NaN                     NaN   \n",
 								       "1             NaN          0            NaN                     NaN   \n",
 								       "2             NaN          0            NaN                     NaN   \n",
 								       "3             NaN          0            NaN                     NaN   \n",
 								       "4             NaN          0            0.0                     NaN   \n",
 								       "...           ...        ...            ...                     ...   \n",
 								       "471593        NaN          0            NaN                     NaN   \n",
 								       "471594        NaN          0            NaN                     NaN   \n",
 								       "471595        NaN          0            NaN                     NaN   \n",
 								       "471596        NaN          0            NaN                     NaN   \n",
 								       "471597        NaN          0            NaN                     NaN   \n",
 								       "\n",
 								       "        average_price_basket  average_ticket_basket  total_price  \\\n",
 								       "0                        NaN                    NaN          0.0   \n",
 								       "1                        NaN                    NaN          0.0   \n",
 								       "2                        NaN                    NaN          0.0   \n",
 								       "3                        NaN                    NaN          0.0   \n",
 								       "4                        NaN                    NaN          NaN   \n",
 								       "...                      ...                    ...          ...   \n",
 								       "471593                   NaN                    NaN          0.0   \n",
 								       "471594                   NaN                    NaN          0.0   \n",
 								       "471595                   NaN                    NaN          0.0   \n",
 								       "471596                   NaN                    NaN          0.0   \n",
 								       "471597                   NaN                    NaN          0.0   \n",
 								       "\n",
 								       "        purchase_count  first_buying_date  country  \n",
 								       "0                    0                NaN       af  \n",
 								       "1                    0                NaN       af  \n",
 								       "2                    0                NaN       af  \n",
 								       "3                    0                NaN       af  \n",
 								       "4                    0                NaN       fr  \n",
 								       "...                ...                ...      ...  \n",
 								       "471593               0                NaN      NaN  \n",
 								       "471594               0                NaN      NaN  \n",
 								       "471595               0                NaN      NaN  \n",
 								       "471596               0                NaN      NaN  \n",
 								       "471597               0                NaN      NaN  \n",
 								       "\n",
 								       "[471598 rows x 22 columns]"
 								      ]
 								     },
 								     "execution_count": 13,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
 								   "source": [
 								    "display_databases(\"5\", \"customerplus_cleaned\")"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 28,
 								   "id": "6c8ad8c3-25df-4fe4-9ad0-ee5f9498bc14",
 								   "metadata": {},
 								   "outputs": [
 								    {
 								     "data": {
 								      "text/html": [
 								       "<div>\n",
 								       "<style scoped>\n",
 								       "    .dataframe tbody tr th:only-of-type {\n",
 								       "        vertical-align: middle;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe tbody tr th {\n",
 								       "        vertical-align: top;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe thead th {\n",
 								       "        text-align: right;\n",
 								       "    }\n",
 								       "</style>\n",
 								       "<table border=\"1\" class=\"dataframe\">\n",
 								       "  <thead>\n",
 								       "    <tr style=\"text-align: right;\">\n",
 								       "      <th></th>\n",
 								       "      <th>id</th>\n",
 								       "      <th>name</th>\n",
 								       "      <th>code</th>\n",
 								       "      <th>created_at</th>\n",
 								       "      <th>updated_at</th>\n",
 								       "    </tr>\n",
 								       "  </thead>\n",
 								       "  <tbody>\n",
 								       "    <tr>\n",
 								       "      <th>0</th>\n",
 								       "      <td>101</td>\n",
 								       "      <td>hongrie</td>\n",
 								       "      <td>hu</td>\n",
 								       "      <td>2023-06-13 11:17:40.600622+02:00</td>\n",
 								       "      <td>2023-06-13 11:17:40.600622+02:00</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>1</th>\n",
 								       "      <td>2</td>\n",
 								       "      <td>albanie</td>\n",
 								       "      <td>al</td>\n",
 								       "      <td>2023-06-13 11:17:40.540652+02:00</td>\n",
 								       "      <td>2023-06-13 11:17:40.540652+02:00</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>2</th>\n",
 								       "      <td>3</td>\n",
 								       "      <td>antarctique</td>\n",
 								       "      <td>aq</td>\n",
 								       "      <td>2023-06-13 11:17:40.541315+02:00</td>\n",
 								       "      <td>2023-06-13 11:17:40.541315+02:00</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>3</th>\n",
 								       "      <td>12</td>\n",
 								       "      <td>autriche</td>\n",
 								       "      <td>at</td>\n",
 								       "      <td>2023-06-13 11:17:40.546711+02:00</td>\n",
 								       "      <td>2023-06-13 11:17:40.546711+02:00</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>4</th>\n",
 								       "      <td>5</td>\n",
 								       "      <td>samoa américaines</td>\n",
 								       "      <td>as</td>\n",
 								       "      <td>2023-06-13 11:17:40.542569+02:00</td>\n",
 								       "      <td>2023-06-13 11:17:40.542569+02:00</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>...</th>\n",
 								       "      <td>...</td>\n",
 								       "      <td>...</td>\n",
 								       "      <td>...</td>\n",
 								       "      <td>...</td>\n",
 								       "      <td>...</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>238</th>\n",
 								       "      <td>228</td>\n",
 								       "      <td>royaume-uni</td>\n",
 								       "      <td>gb</td>\n",
 								       "      <td>2023-06-13 11:17:40.678023+02:00</td>\n",
 								       "      <td>2023-06-13 11:17:40.678023+02:00</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>239</th>\n",
 								       "      <td>25</td>\n",
 								       "      <td>brésil</td>\n",
 								       "      <td>br</td>\n",
 								       "      <td>2023-06-13 11:17:40.554209+02:00</td>\n",
 								       "      <td>2023-06-13 11:17:40.554209+02:00</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>240</th>\n",
 								       "      <td>10</td>\n",
 								       "      <td>argentine</td>\n",
 								       "      <td>ar</td>\n",
 								       "      <td>2023-06-13 11:17:40.545489+02:00</td>\n",
 								       "      <td>2023-06-13 11:17:40.545489+02:00</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>241</th>\n",
 								       "      <td>203</td>\n",
 								       "      <td>espagne</td>\n",
 								       "      <td>es</td>\n",
 								       "      <td>2023-06-13 11:17:40.662472+02:00</td>\n",
 								       "      <td>2023-06-13 11:17:40.662472+02:00</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
 								       "      <th>242</th>\n",
 								       "      <td>192</td>\n",
 								       "      <td>arabie saoudite</td>\n",
 								       "      <td>sa</td>\n",
 								       "      <td>2023-06-13 11:17:40.656154+02:00</td>\n",
 								       "      <td>2023-06-13 11:17:40.656154+02:00</td>\n",
 								       "    </tr>\n",
 								       "  </tbody>\n",
 								       "</table>\n",
 								       "<p>243 rows × 5 columns</p>\n",
 								       "</div>"
 								      ],
 								      "text/plain": [
 								       "      id               name code                        created_at  \\\n",
 								       "0    101            hongrie   hu  2023-06-13 11:17:40.600622+02:00   \n",
 								       "1      2            albanie   al  2023-06-13 11:17:40.540652+02:00   \n",
 								       "2      3        antarctique   aq  2023-06-13 11:17:40.541315+02:00   \n",
 								       "3     12           autriche   at  2023-06-13 11:17:40.546711+02:00   \n",
 								       "4      5  samoa américaines   as  2023-06-13 11:17:40.542569+02:00   \n",
 								       "..   ...                ...  ...                               ...   \n",
 								       "238  228        royaume-uni   gb  2023-06-13 11:17:40.678023+02:00   \n",
 								       "239   25             brésil   br  2023-06-13 11:17:40.554209+02:00   \n",
 								       "240   10          argentine   ar  2023-06-13 11:17:40.545489+02:00   \n",
 								       "241  203            espagne   es  2023-06-13 11:17:40.662472+02:00   \n",
 								       "242  192    arabie saoudite   sa  2023-06-13 11:17:40.656154+02:00   \n",
 								       "\n",
 								       "                           updated_at  \n",
 								       "0    2023-06-13 11:17:40.600622+02:00  \n",
 								       "1    2023-06-13 11:17:40.540652+02:00  \n",
 								       "2    2023-06-13 11:17:40.541315+02:00  \n",
 								       "3    2023-06-13 11:17:40.546711+02:00  \n",
 								       "4    2023-06-13 11:17:40.542569+02:00  \n",
 								       "..                                ...  \n",
 								       "238  2023-06-13 11:17:40.678023+02:00  \n",
 								       "239  2023-06-13 11:17:40.554209+02:00  \n",
 								       "240  2023-06-13 11:17:40.545489+02:00  \n",
 								       "241  2023-06-13 11:17:40.662472+02:00  \n",
 								       "242  2023-06-13 11:17:40.656154+02:00  \n",
 								       "\n",
 								       "[243 rows x 5 columns]"
 								      ]
 								     },
 								     "execution_count": 28,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
 								   "source": [
 								    "load_dataset_2(\"7\", \"countries\")"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "id": "ca2c8b6a-4965-422e-ba7c-66423a464fc1",
 								   "metadata": {
 								    "jp-MarkdownHeadingCollapsed": true
 								   },
 								   "source": [
 								    "## Bases communes au type Musée"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": null,
 								   "id": "f8f988fb-5aab-4b57-80d1-e242f7e5b384",
 								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "# Company ids (kept as strings) grouped by sector; each id is used as a\n",
 								    "# sub-directory name under \"bdc2324-data/\" when listing the tables.\n",
 								    "companies = dict(\n",
 								    "    musee=['1', '2', '3', '4', '101'],\n",
 								    "    sport=['5', '6', '7', '8', '9'],\n",
 								    "    musique=['10', '11', '12', '13', '14'],\n",
 								    ")"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": null,
 								   "id": "dbce1124-9a22-4502-a47a-fc3d0e2db70b",
 								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "companies['musee']"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": null,
 								   "id": "5080f66e-f779-410a-876d-b4fe2795e17e",
 								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "# List the tables available for every company of every sector, so that each\n",
 								    "# base_<id> list read by the intersection cells is defined on a fresh kernel.\n",
 								    "for i in [identifiant for groupe in companies.values() for identifiant in groupe]:\n",
 								    "    BUCKET = \"bdc2324-data/\"+i\n",
 								    "    liste_base = []\n",
 								    "    for base in fs.ls(BUCKET):\n",
 								    "        # Expected path shape: .../<digits>/<digits><table_name>.csv\n",
 								    "        match = re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', base)\n",
 								    "        if match:\n",
 								    "            # group(3) is the alphabetic table name that follows the numeric prefix\n",
 								    "            nom_base = match.group(3)\n",
 								    "            liste_base.append(nom_base)\n",
 								    "    # Publish the list as a global named base_<id>; later cells read it by name\n",
 								    "    globals()['base_'+i] = liste_base\n"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": null,
 								   "id": "abd477e1-7479-4c88-a5aa-f987af3f5b79",
 								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "# Table names present in all five museum lists\n",
 								    "intersection = set.intersection(set(base_1), base_2, base_3, base_4, base_101)\n",
 								    "\n",
 								    "# Materialise the set as a list\n",
 								    "intersection_liste = [*intersection]\n",
 								    "\n",
 								    "print(intersection_liste)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": null,
 								   "id": "8d93888f-a511-4ee5-8bc3-d5173a7f119e",
 								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "# Table names present in all five music lists\n",
 								    "intersection = set.intersection(set(base_10), base_12, base_13, base_14, base_11)\n",
 								    "\n",
 								    "# Materialise the set as a list\n",
 								    "intersection_liste = [*intersection]\n",
 								    "\n",
 								    "print(intersection_liste)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": null,
 								   "id": "10e89669-42bb-4652-a4bc-1a3d1caf4d1a",
 								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "len(intersection_liste)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": null,
 								   "id": "e0aa8976-1487-4ef5-898e-0d6a88183e67",
 								   "metadata": {},
 								   "outputs": [],
 								   "source": []
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": null,
 								   "id": "67f02868-b16a-41d5-a0f9-b31ce09278db",
 								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "base_101"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": null,
 								   "id": "7d058b21-a538-4f59-aefb-ef7966f73fdc",
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								   "metadata": {},
-												Exploration tags et target

											
										
										
											2024-02-25 18:33:24 +01:00
+								   "outputs": [],
 								   "source": [
 								    "df1_tags = load_dataset_2(\"1\", \"tags\")"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": null,
 								   "id": "aa441f99-733c-4675-8676-bed4682d3324",
 								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "df1_structure_tag_mappings = load_dataset_2(\"1\", 'structure_tag_mappings')"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": null,
 								   "id": "6767a750-14a4-4c05-903e-d2f07170825b",
 								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "df1_customersplus = load_dataset_2(\"1\", \"customersplus\")"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": null,
 								   "id": "125e9145-a815-46fd-bdf4-07589508b259",
 								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "# Count of non-null 'id' values per structure_id, 20 largest first\n",
 								    "(\n",
 								    "    df1_customersplus\n",
 								    "    .groupby('structure_id')['id']\n",
 								    "    .count()\n",
 								    "    .reset_index()\n",
 								    "    .sort_values('id', ascending=False)\n",
 								    "    .head(20)\n",
 								    ")"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": null,
 								   "id": "c17a6976-792f-474d-bcff-c89396eddb3f",
 								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "# Share (between 0 and 1) of rows whose structure_id is missing\n",
 								    "df1_customersplus['structure_id'].isna().mean()"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": null,
 								   "id": "ecfc155a-cb42-46ec-8da5-33fdcd087355",
 								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "len(df1_structure_tag_mappings)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": null,
 								   "id": "071410b8-950d-4fcc-b2b9-57415253c286",
 								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "# Count of non-null structure_id values per tag_id, 20 largest first\n",
 								    "(\n",
 								    "    df1_structure_tag_mappings\n",
 								    "    .groupby('tag_id')['structure_id']\n",
 								    "    .count()\n",
 								    "    .reset_index()\n",
 								    "    .sort_values('structure_id', ascending=False)\n",
 								    "    .head(20)\n",
 								    ")"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": null,
 								   "id": "f48d27a9-14e4-4bb9-a60a-73e9438b58fc",
 								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "# sort_values is a pandas DataFrame method (numpy has no such function);\n",
 								    "# the help lookup takes the bare name, without call parentheses.\n",
 								    "?pd.DataFrame.sort_values"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": null,
 								   "id": "14eaa0ea-02cc-430b-ab9b-38e6637810c3",
 								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "def info_colonnes_dataframe(df):\n",
 								    "    \"\"\"\n",
 								    "    Build a per-column summary of a DataFrame.\n",
 								    "\n",
 								    "    df : DataFrame to describe.\n",
 								    "    Returns a DataFrame with one row per column of df and three columns:\n",
 								    "    'Nom_colonne' (label), 'Type_colonne' (dtype as a string) and\n",
 								    "    'Taux_NA' (percentage of missing values). These three columns are\n",
 								    "    present even when df has no column.\n",
 								    "    \"\"\"\n",
 								    "    colonnes_resultat = ['Nom_colonne', 'Type_colonne', 'Taux_NA']\n",
 								    "\n",
 								    "    # items() yields (label, Series) pairs; iteritems() was removed in pandas 2.0\n",
 								    "    infos_colonnes = [\n",
 								    "        {\n",
 								    "            'Nom_colonne': nom_colonne,\n",
 								    "            'Type_colonne': str(serie.dtype),\n",
 								    "            # isna().mean() is the share of missing values; scaled to a percentage\n",
 								    "            'Taux_NA': serie.isna().mean() * 100\n",
 								    "        }\n",
 								    "        for nom_colonne, serie in df.items()\n",
 								    "    ]\n",
 								    "\n",
 								    "    # Passing columns= keeps the output schema stable when the record list is empty\n",
 								    "    return pd.DataFrame(infos_colonnes, columns=colonnes_resultat)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": null,
 								   "id": "6b031c32-d4c8-42a5-9a71-a7810f9bf8d8",
 								   "metadata": {
 								    "scrolled": true
 								   },
 								   "outputs": [],
 								   "source": [
 								    "info_colonnes_dataframe(df1_tags)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": null,
 								   "id": "e1a87f27-c4d4-4832-ac20-0c3c54aa4980",
 								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "info_colonnes_dataframe(df1_structure_tag_mappings)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": null,
 								   "id": "fa5c65a8-2f74-4f3f-85fc-9ac91e0bb361",
 								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "pd.set_option('display.max_colwidth', None)\n",
 								    "\n",
 								    "print(df1_tags['name'])"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": null,
 								   "id": "a59bf932-5b54-4600-81f5-c55ac93ae510",
 								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "pd.set_option('display.max_rows', None)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": null,
 								   "id": "a4ab298e-2cae-4865-9f00-4caff5f75ea1",
 								   "metadata": {
 								    "scrolled": true
 								   },
 								   "outputs": [],
 								   "source": [
 								    "print(df1_tags['name'])"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "id": "76bffba1-5f7e-4308-9224-437ca66148f8",
 								   "metadata": {},
 								   "source": [
 								    "## KPI sur target_type"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 74,
 								   "id": "622752ed-b565-4188-86d6-38f1f333fcbe",
 								   "metadata": {},
 								   "outputs": [
 								    {
 								     "name": "stdout",
 								     "output_type": "stream",
 								     "text": [
 								      "File path :  projet-bdc2324-team1/0_Input/Company_1/target_information.csv\n"
 								     ]
 								    },
 								    {
 								     "ename": "PermissionError",
 								     "evalue": "Forbidden",
 								     "output_type": "error",
 								     "traceback": [
 								      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
 								      "\u001b[0;31mClientError\u001b[0m                               Traceback (most recent call last)",
 								      "File \u001b[0;32m/opt/mamba/lib/python3.11/site-packages/s3fs/core.py:529\u001b[0m, in \u001b[0;36mS3FileSystem.info\u001b[0;34m(self, path, version_id, refresh)\u001b[0m\n\u001b[1;32m    528\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 529\u001b[0m     out \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_s3\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43ms3\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mhead_object\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mBucket\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbucket\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    530\u001b[0m \u001b[43m                        \u001b[49m\u001b[43mKey\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mversion_id_kw\u001b[49m\u001b[43m(\u001b[49m\u001b[43mversion_id\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreq_kw\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    531\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m {\n\u001b[1;32m    532\u001b[0m         \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mETag\u001b[39m\u001b[38;5;124m'\u001b[39m: out[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mETag\u001b[39m\u001b[38;5;124m'\u001b[39m],\n\u001b[1;32m    533\u001b[0m         \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mKey\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m/\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mjoin([bucket, key]),\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    540\u001b[0m  
       \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mVersionId\u001b[39m\u001b[38;5;124m'\u001b[39m: out\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mVersionId\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m    541\u001b[0m     }\n",
 								      "File \u001b[0;32m/opt/mamba/lib/python3.11/site-packages/s3fs/core.py:200\u001b[0m, in \u001b[0;36mS3FileSystem._call_s3\u001b[0;34m(self, method, *akwarglist, **kwargs)\u001b[0m\n\u001b[1;32m    198\u001b[0m additional_kwargs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_s3_method_kwargs(method, \u001b[38;5;241m*\u001b[39makwarglist,\n\u001b[1;32m    199\u001b[0m                                                \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m--> 200\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmethod\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43madditional_kwargs\u001b[49m\u001b[43m)\u001b[49m\n",
 								      "File \u001b[0;32m/opt/mamba/lib/python3.11/site-packages/botocore/client.py:553\u001b[0m, in \u001b[0;36mClientCreator._create_api_method.<locals>._api_call\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m    552\u001b[0m \u001b[38;5;66;03m# The \"self\" in this scope is referring to the BaseClient.\u001b[39;00m\n\u001b[0;32m--> 553\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_api_call\u001b[49m\u001b[43m(\u001b[49m\u001b[43moperation_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
 								      "File \u001b[0;32m/opt/mamba/lib/python3.11/site-packages/botocore/client.py:1009\u001b[0m, in \u001b[0;36mBaseClient._make_api_call\u001b[0;34m(self, operation_name, api_params)\u001b[0m\n\u001b[1;32m   1008\u001b[0m     error_class \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexceptions\u001b[38;5;241m.\u001b[39mfrom_code(error_code)\n\u001b[0;32m-> 1009\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m error_class(parsed_response, operation_name)\n\u001b[1;32m   1010\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n",
 								      "\u001b[0;31mClientError\u001b[0m: An error occurred (403) when calling the HeadObject operation: Forbidden",
 								      "\nDuring handling of the above exception, another exception occurred:\n",
 								      "\u001b[0;31mPermissionError\u001b[0m                           Traceback (most recent call last)",
 								      "Cell \u001b[0;32mIn[74], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mdisplay_databases\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m1\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtarget_information\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n",
 								      "File \u001b[0;32m<string>:12\u001b[0m, in \u001b[0;36mdisplay_databases\u001b[0;34m(directory_path, file_name, datetime_col)\u001b[0m\n",
 								      "File \u001b[0;32m/opt/mamba/lib/python3.11/site-packages/fsspec/spec.py:1295\u001b[0m, in \u001b[0;36mAbstractFileSystem.open\u001b[0;34m(self, path, mode, block_size, cache_options, compression, **kwargs)\u001b[0m\n\u001b[1;32m   1293\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m   1294\u001b[0m     ac \u001b[38;5;241m=\u001b[39m kwargs\u001b[38;5;241m.\u001b[39mpop(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mautocommit\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_intrans)\n\u001b[0;32m-> 1295\u001b[0m     f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_open\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   1296\u001b[0m \u001b[43m        \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1297\u001b[0m \u001b[43m        \u001b[49m\u001b[43mmode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1298\u001b[0m \u001b[43m        \u001b[49m\u001b[43mblock_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mblock_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1299\u001b[0m \u001b[43m        \u001b[49m\u001b[43mautocommit\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mac\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1300\u001b[0m \u001b[43m        \u001b[49m\u001b[43mcache_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1301\u001b[0m \u001b[43m        \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1302\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1303\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m compression \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m 
\u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m   1304\u001b[0m         \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mfsspec\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mcompression\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m compr\n",
 								      "File \u001b[0;32m/opt/mamba/lib/python3.11/site-packages/s3fs/core.py:375\u001b[0m, in \u001b[0;36mS3FileSystem._open\u001b[0;34m(self, path, mode, block_size, acl, version_id, fill_cache, cache_type, autocommit, requester_pays, **kwargs)\u001b[0m\n\u001b[1;32m    372\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m cache_type \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m    373\u001b[0m     cache_type \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefault_cache_type\n\u001b[0;32m--> 375\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mS3File\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mblock_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mblock_size\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43macl\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43macl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    376\u001b[0m \u001b[43m              \u001b[49m\u001b[43mversion_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mversion_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfill_cache\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfill_cache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    377\u001b[0m \u001b[43m              \u001b[49m\u001b[43ms3_additional_kwargs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkw\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcache_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    378\u001b[0m \u001b[43m              \u001b[49m\u001b[43mautocommit\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mautocommit\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mrequester_pays\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrequester_pays\u001b[49m\u001b[43m)\u001b[49m\n",
 								      "File \u001b[0;32m/opt/mamba/lib/python3.11/site-packages/s3fs/core.py:1096\u001b[0m, in \u001b[0;36mS3File.__init__\u001b[0;34m(self, s3, path, mode, block_size, acl, version_id, fill_cache, s3_additional_kwargs, autocommit, cache_type, requester_pays)\u001b[0m\n\u001b[1;32m   1094\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39ms3_additional_kwargs \u001b[38;5;241m=\u001b[39m s3_additional_kwargs \u001b[38;5;129;01mor\u001b[39;00m {}\n\u001b[1;32m   1095\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreq_kw \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mRequestPayer\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mrequester\u001b[39m\u001b[38;5;124m'\u001b[39m} \u001b[38;5;28;01mif\u001b[39;00m requester_pays \u001b[38;5;28;01melse\u001b[39;00m {}\n\u001b[0;32m-> 1096\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43ms3\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mblock_size\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mautocommit\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mautocommit\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   1097\u001b[0m \u001b[43m                 \u001b[49m\u001b[43mcache_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_type\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1098\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39ms3 \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfs  \u001b[38;5;66;03m# compatibility\u001b[39;00m\n\u001b[1;32m   1099\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mwritable():\n",
 								      "File \u001b[0;32m/opt/mamba/lib/python3.11/site-packages/fsspec/spec.py:1651\u001b[0m, in \u001b[0;36mAbstractBufferedFile.__init__\u001b[0;34m(self, fs, path, mode, block_size, autocommit, cache_type, cache_options, size, **kwargs)\u001b[0m\n\u001b[1;32m   1649\u001b[0m         \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msize \u001b[38;5;241m=\u001b[39m size\n\u001b[1;32m   1650\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1651\u001b[0m         \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msize \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdetails\u001b[49m[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msize\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m   1652\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcache \u001b[38;5;241m=\u001b[39m caches[cache_type](\n\u001b[1;32m   1653\u001b[0m         \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mblocksize, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fetch_range, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msize, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mcache_options\n\u001b[1;32m   1654\u001b[0m     )\n\u001b[1;32m   1655\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n",
 								      "File \u001b[0;32m/opt/mamba/lib/python3.11/site-packages/fsspec/spec.py:1664\u001b[0m, in \u001b[0;36mAbstractBufferedFile.details\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m   1661\u001b[0m \u001b[38;5;129m@property\u001b[39m\n\u001b[1;32m   1662\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdetails\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m   1663\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_details \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m-> 1664\u001b[0m         \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_details \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minfo\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpath\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1665\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_details\n",
 								      "File \u001b[0;32m/opt/mamba/lib/python3.11/site-packages/s3fs/core.py:548\u001b[0m, in \u001b[0;36mS3FileSystem.info\u001b[0;34m(self, path, version_id, refresh)\u001b[0m\n\u001b[1;32m    546\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m(S3FileSystem, \u001b[38;5;28mself\u001b[39m)\u001b[38;5;241m.\u001b[39minfo(path)\n\u001b[1;32m    547\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 548\u001b[0m         \u001b[38;5;28;01mraise\u001b[39;00m ee\n\u001b[1;32m    549\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m ParamValidationError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m    550\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mFailed to head path \u001b[39m\u001b[38;5;132;01m%r\u001b[39;00m\u001b[38;5;124m: \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m'\u001b[39m \u001b[38;5;241m%\u001b[39m (path, e))\n",
 								      "\u001b[0;31mPermissionError\u001b[0m: Forbidden"
 								     ]
 								    }
 								   ],
 								   "source": [
 								    "display_databases('1', 'target_information')"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "id": "1ede9eaa-7f0a-4856-9349-b2747d6a4901",
 								   "metadata": {},
 								   "source": [
 								    "# Fin travail 25/02"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "id": "c437eaec",
 								   "metadata": {
 								    "jp-MarkdownHeadingCollapsed": true
 								   },
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								   "source": [
-												Exploration suppliers.csv

											
										
										
											2024-01-13 10:38:10 +01:00
+								    "# Exemple sur Company 1"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "a1c1fc39",
-												Exploration suppliers.csv

											
										
										
											2024-01-13 10:38:10 +01:00
+								   "metadata": {},
 								   "source": [
 								    "## Chargement données"
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Renommer notebook

											
										
										
											2024-01-13 14:14:11 +01:00
+								   "execution_count": 3,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "66f8c17b",
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								   "metadata": {},
-												Exploration suppliers.csv

											
										
										
											2024-01-13 10:38:10 +01:00
+								   "outputs": [],
 								   "source": [
 								    "BUCKET = \"bdc2324-data/1\"\n",
 								    "liste_database = fs.ls(BUCKET)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												update

											
										
										
											2024-02-05 22:04:02 +01:00
+								   "execution_count": 5,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "c08e6798",
-												Exploration suppliers.csv

											
										
										
											2024-01-13 10:38:10 +01:00
+								   "metadata": {},
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "outputs": [],
-												Exploration suppliers.csv

											
										
										
											2024-01-13 10:38:10 +01:00
+								   "source": [
-												Renommer notebook

											
										
										
											2024-01-13 14:14:11 +01:00
+								    "liste_database_select = ['suppliers', 'ticket', 'purchase', 'consumption', 'type_ofs']\n",
-												Exploration suppliers.csv

											
										
										
											2024-01-13 10:38:10 +01:00
+								    "\n",
 								    "# Filtrer la liste pour les éléments contenant au moins un élément de la liste à tester\n",
 								    "liste_database_filtered = [element for element in liste_database if any(element_part in element for element_part in liste_database_select)]\n",
 								    "\n",
 								    "# Afficher le résultat\n",
 								    "print(liste_database_filtered)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Ajout indicatrice canal vente en ligne

											
										
										
											2024-02-10 13:23:44 +01:00
+								   "execution_count": 6,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "675f518d",
-												Exploration suppliers.csv

											
										
										
											2024-01-13 10:38:10 +01:00
+								   "metadata": {},
 								   "outputs": [
 								    {
 								     "name": "stderr",
 								     "output_type": "stream",
 								     "text": [
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								      "<<<<<<< local <modified: >\n",
-												Ajout indicatrice canal vente en ligne

											
										
										
											2024-02-10 13:23:44 +01:00
+								      "/tmp/ipykernel_445/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								      "  df = pd.read_csv(file_in)\n",
 								      "=======\n",
-												Ajout KPI + debut traitement NLP

											
										
										
											2024-02-07 23:28:55 +01:00
+								      "/tmp/ipykernel_15285/4081512283.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								      "  df = pd.read_csv(file_in)\n",
 								      ">>>>>>> remote <modified: >\n"
-												Exploration suppliers.csv

											
										
										
											2024-01-13 10:38:10 +01:00
+								     ]
 								    }
 								   ],
 								   "source": [
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								    "# loop to create dataframes from liste\n",
 								    "files_path = liste_database\n",
-												Exploration suppliers.csv

											
										
										
											2024-01-13 10:38:10 +01:00
+								    "\n",
 								    "client_number = files_path[0].split(\"/\")[1]\n",
 								    "df_prefix = \"df\" + str(client_number) + \"_\"\n",
 								    "\n",
 								    "for i in range(len(files_path)) :\n",
 								    "    current_path = files_path[i]\n",
 								    "    with fs.open(current_path, mode=\"rb\") as file_in:\n",
 								    "        df = pd.read_csv(file_in)\n",
 								    "        # the pattern of the name is df1xxx\n",
 								    "        nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n",
 								    "        globals()[nom_dataframe] = df"
 								   ]
 								  },
-												update

											
										
										
											2024-02-05 22:04:02 +01:00
+								  {
 								   "cell_type": "markdown",
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "e855f403",
-												New structure

											
										
										
											2024-02-19 23:11:28 +01:00
+								   "metadata": {},
-												update

											
										
										
											2024-02-05 22:04:02 +01:00
+								   "source": [
 								    "## customersplus.csv"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 22,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "91a8f8c4",
-												update

											
										
										
											2024-02-05 22:04:02 +01:00
+								   "metadata": {},
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "outputs": [],
-												update

											
										
										
											2024-02-05 22:04:02 +01:00
+								   "source": [
 								    "a = pd.DataFrame(df1_customersplus.info())"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 31,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "2fda171d",
-												update

											
										
										
											2024-02-05 22:04:02 +01:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "def info_colonnes_dataframe(df):\n",
 								    "    # Créer une liste pour stocker les informations sur chaque colonne\n",
 								    "    infos_colonnes = []\n",
 								    "\n",
 								    "    # Parcourir les colonnes du DataFrame\n",
 								    "    for nom_colonne, serie in df.items():  # Utiliser items() au lieu de iteritems()\n",
 								    "        # Calculer le taux de valeurs manquantes\n",
 								    "        taux_na = serie.isna().mean() * 100\n",
 								    "\n",
 								    "        # Ajouter les informations à la liste\n",
 								    "        infos_colonnes.append({\n",
 								    "            'Nom_colonne': nom_colonne,\n",
 								    "            'Type_colonne': str(serie.dtype),\n",
 								    "            'Taux_NA': taux_na\n",
 								    "        })\n",
 								    "\n",
 								    "    # Créer une nouvelle DataFrame à partir de la liste d'informations\n",
 								    "    df_infos_colonnes = pd.DataFrame(infos_colonnes)\n",
 								    "\n",
 								    "    return df_infos_colonnes"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 35,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "205eeeab",
-												update

											
										
										
											2024-02-05 22:04:02 +01:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "def cleaning_date(df, column_name):\n",
 								    "    \"\"\"\n",
 								    "    Nettoie la colonne spécifiée du DataFrame en convertissant les valeurs en datetime avec le format ISO8601.\n",
 								    "\n",
 								    "    Parameters:\n",
 								    "    - df: DataFrame\n",
 								    "        Le DataFrame contenant la colonne à nettoyer.\n",
 								    "    - column_name: str\n",
 								    "        Le nom de la colonne à nettoyer.\n",
 								    "\n",
 								    "    Returns:\n",
 								    "    - DataFrame\n",
 								    "        Le DataFrame modifié avec la colonne nettoyée.\n",
 								    "    \"\"\"\n",
 								    "    df[column_name] = pd.to_datetime(df[column_name], utc = True, format = 'ISO8601')\n",
 								    "    return df"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 32,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "634282c5",
-												update

											
										
										
											2024-02-05 22:04:02 +01:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "a = info_colonnes_dataframe(df1_customersplus)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 33,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "0e8d4133",
-												update

											
										
										
											2024-02-05 22:04:02 +01:00
+								   "metadata": {},
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "outputs": [],
-												update

											
										
										
											2024-02-05 22:04:02 +01:00
+								   "source": [
 								    "a"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 16,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "1268ad5a",
-												update

											
										
										
											2024-02-05 22:04:02 +01:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "a = pd.DataFrame(df1_customersplus.isna().sum()/len(df1_customersplus)*100)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 40,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "bd41dc80",
-												update

											
										
										
											2024-02-05 22:04:02 +01:00
+								   "metadata": {},
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "outputs": [],
-												update

											
										
										
											2024-02-05 22:04:02 +01:00
+								   "source": [
 								    "# Selection des variables\n",
 								    "df1_customersplus_clean = df1_customersplus.copy()\n",
 								    "\n",
 								    "cleaning_date(df1_customersplus_clean, 'first_buying_date')\n",
 								    "cleaning_date(df1_customersplus_clean, 'last_visiting_date')\n",
 								    "\n",
 								    "df1_customersplus_clean.drop(['lastname', 'firstname', 'email', 'civility', 'note', 'created_at', 'updated_at', 'deleted_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode', 'last_visiting_date'], axis = 1, inplace=True)\n",
 								    "df1_customersplus_clean.rename(columns = {'id' : 'customer_id'}, inplace = True)\n",
 								    "\n"
 								   ]
 								  },
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								  {
 								   "cell_type": "markdown",
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "64d0f76b",
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								   "metadata": {
 								    "jp-MarkdownHeadingCollapsed": true
 								   },
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								   "source": [
 								    "## tickets.csv"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 6,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "7e683711",
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								   "metadata": {},
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "outputs": [],
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								   "source": [
 								    "df1_tickets"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 7,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "e7b9a52e",
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								   "metadata": {},
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "outputs": [],
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								   "source": [
 								    "df1_tickets.info()"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 8,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "568280e8",
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								   "metadata": {},
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "outputs": [],
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								   "source": [
 								    "df1_tickets.isna().sum()/len(df1_tickets)*100"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 9,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "29ecec90",
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								   "metadata": {},
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "outputs": [],
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								   "source": [
 								    "# Selection des variables\n",
-												update

											
										
										
											2024-02-05 22:04:02 +01:00
+								    "# drop(..., inplace=True) returns None, so the cleaned frame is built from the\n",
 								    "# returned copies instead; this also leaves the raw df1_tickets table untouched.\n",
 								    "# NOTE(review): this column list matches the one used for the customers table - confirm these columns exist in tickets\n",
 								    "df1_tickets_clean = df1_tickets.drop(['lastname', 'firstname', 'email', 'created_at', 'updated_at', 'extra', 'reference', 'extra_field', 'identifier', 'need_reload', 'preferred_category', 'preferred_supplier', 'preferred_formula', 'zipcode'], axis = 1)\n",
 								    "df1_tickets_clean = df1_tickets_clean.rename(columns = {'id' : 'customer_id'})"
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								   ]
 								  },
-												Exploration suppliers.csv

											
										
										
											2024-01-13 10:38:10 +01:00
+								  {
 								   "cell_type": "markdown",
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "22bb5de4",
-												Ajout indicatrice canal vente en ligne

											
										
										
											2024-02-10 13:23:44 +01:00
+								   "metadata": {
 								    "jp-MarkdownHeadingCollapsed": true
 								   },
-												Exploration suppliers.csv

											
										
										
											2024-01-13 10:38:10 +01:00
+								   "source": [
 								    "## suppliers.csv"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								   "execution_count": 10,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "6a9a91f4",
-												Exploration suppliers.csv

											
										
										
											2024-01-13 10:38:10 +01:00
+								   "metadata": {},
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "outputs": [],
-												Exploration suppliers.csv

											
										
										
											2024-01-13 10:38:10 +01:00
+								   "source": [
 								    "df1_suppliers"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								   "execution_count": 11,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "bab4758a",
-												Exploration suppliers.csv

											
										
										
											2024-01-13 10:38:10 +01:00
+								   "metadata": {},
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "outputs": [],
-												Exploration suppliers.csv

											
										
										
											2024-01-13 10:38:10 +01:00
+								   "source": [
 								    "df1_suppliers.info()"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								   "execution_count": 12,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "b5fff251",
-												Exploration suppliers.csv

											
										
										
											2024-01-13 10:38:10 +01:00
+								   "metadata": {},
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "outputs": [],
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								   "source": [
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								    "df1_suppliers.isna().sum()/len(df1_suppliers)*100"
-												Exploration suppliers.csv

											
										
										
											2024-01-13 10:38:10 +01:00
+								   ]
 								  },
-												Renommer notebook

											
										
										
											2024-01-13 14:14:11 +01:00
+								  {
 								   "cell_type": "code",
-												Ajout de fonctions pour nettoyage et exploration

											
										
										
											2024-01-14 17:38:16 +01:00
+								   "execution_count": 13,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "8b09e2a3",
-												Renommer notebook

											
										
										
											2024-01-13 14:14:11 +01:00
+								   "metadata": {},
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "outputs": [],
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								   "source": [
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								    "# Selection des variables\n",
 								    "df1_suppliers_clean = df1_suppliers[['id', 'name']]\n",
 								    "df1_suppliers_clean.rename(columns = {'name' : 'supplier_name'}, inplace = True)"
-												Exploration suppliers.csv

											
										
										
											2024-01-13 10:38:10 +01:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Ajout de fonctions pour nettoyage et exploration

											
										
										
											2024-01-14 17:38:16 +01:00
+								   "execution_count": 14,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "ecee7cdc",
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								   "metadata": {},
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "outputs": [],
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								   "source": [
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								    "df1_suppliers_clean"
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								   ]
 								  },
 								  {
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								   "cell_type": "markdown",
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "c8e6e69b",
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								   "metadata": {
 								    "jp-MarkdownHeadingCollapsed": true
 								   },
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								   "source": [
 								    "## type_ofs.csv"
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								   "execution_count": 15,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "1a6cff1f",
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								   "metadata": {},
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "outputs": [],
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								   "source": [
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								    "df1_type_ofs"
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								   "execution_count": 16,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "93630b41",
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								   "metadata": {},
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "outputs": [],
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								   "source": [
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								    "df1_type_ofs.info()"
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Ajout de fonctions pour nettoyage et exploration

											
										
										
											2024-01-14 17:38:16 +01:00
+								   "execution_count": 17,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "4f94481a",
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								   "metadata": {},
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "outputs": [],
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								   "source": [
 								    "# Keep only the useful columns. The rename is chained on the selection's result rather than\n",
 								    "# applied in place, so pandas never mutates a slice of df1_type_ofs (no SettingWithCopyWarning).\n",
 								    "df1_type_ofs_clean = df1_type_ofs[['id', 'name', 'children']].rename(columns = {'name' : 'type_of_ticket_name'})"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "1b2811e2",
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								   "metadata": {
 								    "jp-MarkdownHeadingCollapsed": true
 								   },
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								   "source": [
 								    "## purchases.csv"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Ajout de fonctions pour nettoyage et exploration

											
										
										
											2024-01-14 17:38:16 +01:00
+								   "execution_count": 18,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "2455d2e1",
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								   "metadata": {
 								    "scrolled": true
 								   },
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "outputs": [],
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								   "source": [
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								    "df1_purchases"
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Ajout de fonctions pour nettoyage et exploration

											
										
										
											2024-01-14 17:38:16 +01:00
+								   "execution_count": 19,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "5f9a159d",
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								   "metadata": {},
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "outputs": [],
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								   "source": [
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								    "df1_purchases.info()"
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Ajout de fonctions pour nettoyage et exploration

											
										
										
											2024-01-14 17:38:16 +01:00
+								   "execution_count": 20,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "db201bf7",
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								   "metadata": {},
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								   "outputs": [],
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								   "source": [
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								    "# Nettoyage purchase_date\n",
 								    "# One UTC-aware parse is enough: the former second pd.to_datetime(..., format = 'ISO8601') call\n",
 								    "# received an already tz-aware datetime column and returned it unchanged.\n",
 								    "df1_purchases['purchase_date'] = pd.to_datetime(df1_purchases['purchase_date'], utc = True)"
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Ajout de fonctions pour nettoyage et exploration

											
										
										
											2024-01-14 17:38:16 +01:00
+								   "execution_count": 21,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "bd436fca",
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								   "metadata": {},
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "outputs": [],
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								   "source": [
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								    "df1_purchases.info()"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Ajout de fonctions pour nettoyage et exploration

											
										
										
											2024-01-14 17:38:16 +01:00
+								   "execution_count": 22,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "83435862",
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "# Selection des variables\n",
 								    "df1_purchases_clean = df1_purchases[['id', 'purchase_date', 'customer_id']]"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "f210e730",
-												Ajout indicatrice canal vente en ligne

											
										
										
											2024-02-10 13:23:44 +01:00
+								   "metadata": {
 								    "jp-MarkdownHeadingCollapsed": true
 								   },
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								   "source": [
 								    "## Fusion de l'ensemble des données billétiques"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Ajout de fonctions pour nettoyage et exploration

											
										
										
											2024-01-14 17:38:16 +01:00
+								   "execution_count": 23,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "1f8b3aa7",
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "# Attach supplier attributes to each ticket, then discard the join keys\n",
 								    "df1_ticket_information = (\n",
 								    "    df1_tickets_clean\n",
 								    "    .merge(df1_suppliers_clean, left_on = 'supplier_id', right_on = 'id', how = 'inner')\n",
 								    "    .drop(columns = ['supplier_id', 'id'])\n",
 								    ")\n",
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								    "\n",
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								    "# Fusion avec type de tickets\n",
 								    "df1_ticket_information = (\n",
 								    "    df1_ticket_information\n",
 								    "    .merge(df1_type_ofs_clean, left_on = 'type_of', right_on = 'id', how = 'inner')\n",
 								    "    .drop(columns = ['type_of', 'id'])  # join keys are no longer needed\n",
 								    ")\n",
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								    "\n",
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								    "# Fusion avec achats\n",
 								    "df1_ticket_information = (\n",
 								    "    df1_ticket_information\n",
 								    "    .merge(df1_purchases_clean, left_on = 'purchase_id', right_on = 'id', how = 'inner')\n",
 								    "    .drop(columns = ['purchase_id', 'id'])  # join keys are no longer needed\n",
 								    ")"
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Ajout de fonctions pour nettoyage et exploration

											
										
										
											2024-01-14 17:38:16 +01:00
+								   "execution_count": 24,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "83a4d021",
-												Ajout de fonctions pour nettoyage et exploration

											
										
										
											2024-01-14 17:38:16 +01:00
+								   "metadata": {
 								    "scrolled": true
 								   },
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "outputs": [],
-												Ajout de fonctions pour nettoyage et exploration

											
										
										
											2024-01-14 17:38:16 +01:00
+								   "source": [
 								    "df1_ticket_information"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "56e6ebd1",
-												Ajout indicatrice canal vente en ligne

											
										
										
											2024-02-10 13:23:44 +01:00
+								   "metadata": {
 								    "jp-MarkdownHeadingCollapsed": true
 								   },
-												Ajout de fonctions pour nettoyage et exploration

											
										
										
											2024-01-14 17:38:16 +01:00
+								   "source": [
 								    "# Utilisation de fonctions"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								   "execution_count": 51,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "88fcde4b",
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								   "metadata": {},
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "outputs": [],
-												Ajout de fonctions pour nettoyage et exploration

											
										
										
											2024-01-14 17:38:16 +01:00
+								   "source": [
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								    "# Créer un DataFrame exemple\n",
 								    "df_not_clean = df1_campaign_stats[['opened_at']].head(20)\n",
 								    "\n",
 								    "# Apply the vectorised cleaning function to the 'opened_at' column\n",
 								    "df_clean = cleaning_date(df_not_clean, 'opened_at')\n",
 								    "df_clean.rename(columns = {'opened_at' : 'opened_at_clean'}, inplace = True)\n",
 								    "\n",
 								    "test = pd.concat([df1_campaign_stats[['opened_at']].head(20), df_clean], axis=1)\n",
 								    "\n",
 								    "test.info()"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "818f69db",
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								   "metadata": {},
 								   "source": [
 								    "## Nettoyage, selection et fusion"
-												Ajout de fonctions pour nettoyage et exploration

											
										
										
											2024-01-14 17:38:16 +01:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								   "execution_count": 23,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "c9654eda",
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								   "metadata": {},
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "outputs": [],
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								   "source": [
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								    "df1_ticket_information"
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								   ]
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								  },
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								  {
 								   "cell_type": "code",
 								   "execution_count": 14,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "7f2b620c",
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								   "metadata": {},
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "outputs": [],
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								   "source": [
 								    "df1_ticket_information.info()"
 								   ]
 								  },
-												Ajout de fonctions pour nettoyage et exploration

											
										
										
											2024-01-14 17:38:16 +01:00
+								  {
 								   "cell_type": "markdown",
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "637bdb72",
-												Ajout de fonctions pour nettoyage et exploration

											
										
										
											2024-01-14 17:38:16 +01:00
+								   "metadata": {},
 								   "source": [
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								    "# Customer information"
-												Ajout de fonctions pour nettoyage et exploration

											
										
										
											2024-01-14 17:38:16 +01:00
+								   ]
 								  },
 								  {
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								   "cell_type": "markdown",
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "14c52894",
-												Ajout indicatrice canal vente en ligne

											
										
										
											2024-02-10 13:23:44 +01:00
+								   "metadata": {
 								    "jp-MarkdownHeadingCollapsed": true
 								   },
-												Ajout de fonctions pour nettoyage et exploration

											
										
										
											2024-01-14 17:38:16 +01:00
+								   "source": [
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								    "## Target area"
-												Ajout de fonctions pour nettoyage et exploration

											
										
										
											2024-01-14 17:38:16 +01:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Ajout KPI + debut traitement NLP

											
										
										
											2024-02-07 23:28:55 +01:00
+								   "execution_count": 8,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "d83abfbf",
-												Ajout de fonctions pour nettoyage et exploration

											
										
										
											2024-01-14 17:38:16 +01:00
+								   "metadata": {},
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								   "outputs": [
 								    {
 								     "name": "stderr",
 								     "output_type": "stream",
 								     "text": [
-												Ajout KPI + debut traitement NLP

											
										
										
											2024-02-07 23:28:55 +01:00
+								      "/tmp/ipykernel_15285/2625134041.py:3: SettingWithCopyWarning: \n",
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								      "A value is trying to be set on a copy of a slice from a DataFrame\n",
 								      "\n",
 								      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
 								      "  df1_targets_clean.rename(columns = {'id' : 'target_id' , 'name' : 'target_name'}, inplace = True)\n"
 								     ]
 								    }
 								   ],
-												Ajout de fonctions pour nettoyage et exploration

											
										
										
											2024-01-14 17:38:16 +01:00
+								   "source": [
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								    "# Target.csv cleaning\n",
 								    "# Rename on the selection's result (not in place) so no slice of df1_targets is mutated;\n",
 								    "# the in-place version raised SettingWithCopyWarning.\n",
 								    "df1_targets_clean = df1_targets[[\"id\", \"target_type_id\", \"name\"]].rename(columns = {'id' : 'target_id' , 'name' : 'target_name'})\n",
 								    "\n",
 								    "# target_type cleaning\n",
 								    "df1_target_types_clean = df1_target_types[[\"id\",\"is_import\",\"name\"]].add_prefix(\"target_type_\")\n",
 								    "\n",
 								    "#customer_target_mappings cleaning\n",
 								    "df1_customer_target_mappings_clean = df1_customer_target_mappings[[\"id\", \"customer_id\", \"target_id\"]]\n",
 								    "\n",
 								    "# Merge target et target_type\n",
 								    "df1_targets_full = pd.merge(df1_targets_clean, df1_target_types_clean, left_on='target_type_id', right_on='target_type_id', how='inner')\n",
 								    "df1_targets_full.drop(['target_type_id'], axis = 1, inplace=True)\n",
 								    "\n",
 								    "# Merge\n",
 								    "df1_targets_full = pd.merge(df1_customer_target_mappings_clean, df1_targets_full, left_on='target_id', right_on='target_id', how='inner')\n",
 								    "df1_targets_full.drop(['target_id'], axis = 1, inplace=True)"
-												Ajout de fonctions pour nettoyage et exploration

											
										
										
											2024-01-14 17:38:16 +01:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								   "execution_count": 62,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "90d71b2c",
-												Ajout de fonctions pour nettoyage et exploration

											
										
										
											2024-01-14 17:38:16 +01:00
+								   "metadata": {},
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "outputs": [],
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								   "source": [
 								    "# Number of target mappings per customer\n",
 								    "df1_targets_test = df1_targets_full[['id', 'customer_id']].groupby(['customer_id']).count()\n",
 								    "\n",
 								    "# Share of targeted customers that belong to more than one target.\n",
 								    "# A bare expression in the middle of a cell is never displayed, so name it and print it.\n",
 								    "share_multi_target = len(df1_targets_test[df1_targets_test['id'] > 1]) / len(df1_targets_test)\n",
 								    "print(f'Share of customers in several targets: {share_multi_target:.1%}')\n",
 								    "\n",
 								    "# Observed: 99.6% of the ~151,000 targeted customers are categorised several times, about 5 times on average\n",
 								    "df1_targets_test.mean()"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Ajout KPI + debut traitement NLP

											
										
										
											2024-02-07 23:28:55 +01:00
+								   "execution_count": 10,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "2301de1e",
-												Ajout KPI + debut traitement NLP

											
										
										
											2024-02-07 23:28:55 +01:00
+								   "metadata": {},
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								   "outputs": [
 								    {
 								     "data": {
 								      "text/html": [
 								       "<div>\n",
 								       "<style scoped>\n",
 								       "    .dataframe tbody tr th:only-of-type {\n",
 								       "        vertical-align: middle;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe tbody tr th {\n",
 								       "        vertical-align: top;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe thead th {\n",
 								       "        text-align: right;\n",
 								       "    }\n",
 								       "</style>\n",
 								       "<table border=\"1\" class=\"dataframe\">\n",
 								       "  <thead>\n",
 								       "    <tr style=\"text-align: right;\">\n",
 								       "      <th></th>\n",
 								       "      <th>id</th>\n",
 								       "      <th>customer_id</th>\n",
 								       "      <th>target_name</th>\n",
 								       "      <th>target_type_is_import</th>\n",
 								       "      <th>target_type_name</th>\n",
 								       "    </tr>\n",
 								       "  </thead>\n",
 								       "  <tbody>\n",
 								       "    <tr>\n",
-												Ajout KPI + debut traitement NLP

											
										
										
											2024-02-07 23:28:55 +01:00
+								       "      <th>0</th>\n",
 								       "      <td>1184824</td>\n",
 								       "      <td>645400</td>\n",
 								       "      <td>DDCP PROMO Réseau livres</td>\n",
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								       "      <td>False</td>\n",
 								       "      <td>manual_static_filter</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
-												Ajout KPI + debut traitement NLP

											
										
										
											2024-02-07 23:28:55 +01:00
+								       "      <th>1</th>\n",
 								       "      <td>210571</td>\n",
 								       "      <td>2412</td>\n",
 								       "      <td>DDCP PROMO Réseau livres</td>\n",
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								       "      <td>False</td>\n",
 								       "      <td>manual_static_filter</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
-												Ajout KPI + debut traitement NLP

											
										
										
											2024-02-07 23:28:55 +01:00
+								       "      <th>2</th>\n",
 								       "      <td>210572</td>\n",
 								       "      <td>4536</td>\n",
 								       "      <td>DDCP PROMO Réseau livres</td>\n",
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								       "      <td>False</td>\n",
 								       "      <td>manual_static_filter</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
-												Ajout KPI + debut traitement NLP

											
										
										
											2024-02-07 23:28:55 +01:00
+								       "      <th>3</th>\n",
 								       "      <td>210573</td>\n",
 								       "      <td>6736</td>\n",
 								       "      <td>DDCP PROMO Réseau livres</td>\n",
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								       "      <td>False</td>\n",
 								       "      <td>manual_static_filter</td>\n",
 								       "    </tr>\n",
 								       "    <tr>\n",
-												Ajout KPI + debut traitement NLP

											
										
										
											2024-02-07 23:28:55 +01:00
+								       "      <th>4</th>\n",
 								       "      <td>210574</td>\n",
 								       "      <td>38210</td>\n",
 								       "      <td>DDCP PROMO Réseau livres</td>\n",
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								       "      <td>False</td>\n",
 								       "      <td>manual_static_filter</td>\n",
 								       "    </tr>\n",
 								       "  </tbody>\n",
 								       "</table>\n",
 								       "</div>"
 								      ],
 								      "text/plain": [
-												Ajout KPI + debut traitement NLP

											
										
										
											2024-02-07 23:28:55 +01:00
+								       "        id  customer_id               target_name  target_type_is_import  \\\n",
 								       "0  1184824       645400  DDCP PROMO Réseau livres                  False   \n",
 								       "1   210571         2412  DDCP PROMO Réseau livres                  False   \n",
 								       "2   210572         4536  DDCP PROMO Réseau livres                  False   \n",
 								       "3   210573         6736  DDCP PROMO Réseau livres                  False   \n",
 								       "4   210574        38210  DDCP PROMO Réseau livres                  False   \n",
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								       "\n",
-												Ajout KPI + debut traitement NLP

											
										
										
											2024-02-07 23:28:55 +01:00
+								       "       target_type_name  \n",
 								       "0  manual_static_filter  \n",
 								       "1  manual_static_filter  \n",
 								       "2  manual_static_filter  \n",
 								       "3  manual_static_filter  \n",
 								       "4  manual_static_filter  "
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								      ]
 								     },
-												Ajout KPI + debut traitement NLP

											
										
										
											2024-02-07 23:28:55 +01:00
+								     "execution_count": 10,
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
 								   "source": [
-												Ajout KPI + debut traitement NLP

											
										
										
											2024-02-07 23:28:55 +01:00
+								    "df1_targets_full.head()"
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								   ]
 								  },
 								  {
-												Ajout KPI + debut traitement NLP

											
										
										
											2024-02-07 23:28:55 +01:00
+								   "cell_type": "code",
 								   "execution_count": 14,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "75fbc2f7",
-												Ajout KPI + debut traitement NLP

											
										
										
											2024-02-07 23:28:55 +01:00
+								   "metadata": {},
 								   "outputs": [
 								    {
 								     "name": "stderr",
 								     "output_type": "stream",
 								     "text": [
 								      "[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...\n",
 								      "[nltk_data]   Package punkt is already up-to-date!\n",
 								      "[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...\n",
 								      "[nltk_data]   Package stopwords is already up-to-date!\n",
 								      "[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...\n",
 								      "[nltk_data]   Package wordnet is already up-to-date!\n"
 								     ]
 								    },
 								    {
 								     "data": {
 								      "text/plain": [
 								       "True"
 								      ]
 								     },
 								     "execution_count": 14,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								   "source": [
-												Ajout KPI + debut traitement NLP

											
										
										
											2024-02-07 23:28:55 +01:00
+								    "# Catégorisation des target_name\n",
 								    "import pandas as pd\n",
 								    "import nltk\n",
 								    "from nltk.tokenize import word_tokenize\n",
 								    "from nltk.corpus import stopwords\n",
 								    "from nltk.stem import WordNetLemmatizer\n",
 								    "from nltk.probability import FreqDist\n",
 								    "\n",
 								    "# Téléchargement des ressources nécessaires\n",
 								    "nltk.download('punkt')\n",
 								    "nltk.download('stopwords')\n",
 								    "nltk.download('wordnet')\n",
 								    "\n"
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Ajout KPI + debut traitement NLP

											
										
										
											2024-02-07 23:28:55 +01:00
+								   "execution_count": 19,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "55cddf92",
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								   "metadata": {},
 								   "outputs": [
 								    {
-												Ajout KPI + debut traitement NLP

											
										
										
											2024-02-07 23:28:55 +01:00
+								     "name": "stdout",
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								     "output_type": "stream",
 								     "text": [
-												Ajout KPI + debut traitement NLP

											
										
										
											2024-02-07 23:28:55 +01:00
+								      "Mots les plus fréquents:\n",
 								      "consentement: 550777\n",
 								      "optin: 463579\n",
 								      "jeune: 155103\n",
 								      "public: 155103\n",
 								      "mediation: 150001\n"
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								     ]
 								    }
 								   ],
 								   "source": [
-												Ajout KPI + debut traitement NLP

											
										
										
											2024-02-07 23:28:55 +01:00
+								    "# Définition des fonctions de tokenisation, suppression des mots vides et lemmatisation\n",
 								    "def preprocess_text(texte):\n",
 								    "    \"\"\"Tokenize a text, drop French stopwords and lemmatize the remaining tokens.\n",
 								    "\n",
 								    "    `texte` may be a single string or an iterable of strings. A plain string is\n",
 								    "    used as-is: joining it with spaces would split it into single characters.\n",
 								    "    A missing value (None or NaN) gives an empty list.\n",
 								    "\n",
 								    "    Returns the list of lowercased, filtered and lemmatized tokens.\n",
 								    "    \"\"\"\n",
 								    "    # Missing cell (None or float NaN, NaN being the only value != itself)\n",
 								    "    if texte is None or (isinstance(texte, float) and texte != texte):\n",
 								    "        return []\n",
 								    "\n",
 								    "    # Only join collections of strings; ' '.join on a str would put a space\n",
 								    "    # between every character and the tokenizer would then return letters\n",
 								    "    texte_concat = texte if isinstance(texte, str) else ' '.join(texte)\n",
 								    "\n",
 								    "    # Word tokenization on the lowercased text\n",
 								    "    tokens = word_tokenize(texte_concat.lower())\n",
 								    "\n",
 								    "    # Stopword removal\n",
 								    "    stop_words = set(stopwords.words('french'))\n",
 								    "    filtered_tokens = [word for word in tokens if word not in stop_words]\n",
 								    "\n",
 								    "    # Lemmatization\n",
 								    "    # NOTE(review): WordNetLemmatizer relies on the English WordNet - confirm it is useful on French tokens\n",
 								    "    lemmatizer = WordNetLemmatizer()\n",
 								    "    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]\n",
 								    "\n",
 								    "    return lemmatized_tokens\n",
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								    "\n",
 								    "\n",
-												Ajout KPI + debut traitement NLP

											
										
										
											2024-02-07 23:28:55 +01:00
+								    "# Appliquer le prétraitement à la colonne de texte\n",
 								    "df1_targets_full['target_name_tokened'] = df1_targets_full['target_name'].apply(preprocess_text)\n",
 								    "\n",
 								    "# Concaténer les listes de mots pour obtenir une liste de tous les mots dans le corpus\n",
 								    "all_words = [word for tokens in df1_targets_full['target_name_tokened'] for word in tokens]\n",
 								    "\n",
 								    "# Calculer la fréquence des mots\n",
 								    "freq_dist = FreqDist(all_words)\n",
 								    "\n",
 								    "\n"
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Ajout KPI + debut traitement NLP

											
										
										
											2024-02-07 23:28:55 +01:00
+								   "execution_count": 22,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "7fd98a85",
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								   "metadata": {},
 								   "outputs": [
 								    {
 								     "name": "stdout",
 								     "output_type": "stream",
 								     "text": [
-												Ajout KPI + debut traitement NLP

											
										
										
											2024-02-07 23:28:55 +01:00
+								      "Mots les plus fréquents:\n",
 								      "consentement: 550777\n",
 								      "optin: 463579\n",
 								      "jeune: 155103\n",
 								      "public: 155103\n",
 								      "mediation: 150001\n",
 								      "specialisee: 150001\n",
 								      "b2c: 143432\n",
 								      "optout: 97683\n",
 								      "newsletter: 56022\n",
 								      "(: 46084\n",
 								      "): 46084\n",
 								      "inscrits: 42296\n",
 								      "nl: 42294\n",
 								      "générale: 41037\n",
 								      "generale: 40950\n"
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								     ]
 								    }
 								   ],
 								   "source": [
-												Ajout KPI + debut traitement NLP

											
										
										
											2024-02-07 23:28:55 +01:00
+								    "# Affichage des mots les plus fréquents\n",
 								    "print(\"Mots les plus fréquents:\")\n",
 								    "for mot, freq in freq_dist.most_common(15):\n",
 								    "    print(f\"{mot}: {freq}\")"
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Ajout KPI + debut traitement NLP

											
										
										
											2024-02-07 23:28:55 +01:00
+								   "execution_count": 18,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "cf94bb1d",
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								   "metadata": {},
 								   "outputs": [
 								    {
-												Ajout KPI + debut traitement NLP

											
										
										
											2024-02-07 23:28:55 +01:00
+								     "name": "stdout",
 								     "output_type": "stream",
 								     "text": [
 								      "                            texte  \\\n",
 								      "0  Le chat noir mange une souris.   \n",
 								      "1           Le chien blanc aboie.   \n",
 								      "\n",
 								      "                                 texte_preprocessed  \n",
 								      "0  [e, h, a, o, i, r, a, g, e, u, e, o, u, r, i, .]  \n",
 								      "1              [e, h, i, e, b, a, a, b, o, i, e, .]  \n"
 								     ]
 								    },
 								    {
 								     "name": "stderr",
 								     "output_type": "stream",
 								     "text": [
 								      "[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...\n",
 								      "[nltk_data]   Package punkt is already up-to-date!\n",
 								      "[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...\n",
 								      "[nltk_data]   Package stopwords is already up-to-date!\n",
 								      "[nltk_data] Downloading package wordnet to /home/onyxia/nltk_data...\n",
 								      "[nltk_data]   Package wordnet is already up-to-date!\n"
 								     ]
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								    }
 								   ],
-												Ajout KPI + debut traitement NLP

											
										
										
											2024-02-07 23:28:55 +01:00
+								   "source": [
 								    "import pandas as pd\n",
 								    "import nltk\n",
 								    "from nltk.tokenize import word_tokenize\n",
 								    "from nltk.corpus import stopwords\n",
 								    "from nltk.stem import WordNetLemmatizer\n",
 								    "\n",
 								    "# Téléchargement des ressources nécessaires\n",
 								    "nltk.download('punkt')\n",
 								    "nltk.download('stopwords')\n",
 								    "nltk.download('wordnet')\n",
 								    "\n",
 								    "# Création de la DataFrame d'exemple\n",
 								    "data = {'texte': [\"Le chat noir mange une souris.\", \"Le chien blanc aboie.\"]}\n",
 								    "df = pd.DataFrame(data)\n",
 								    "\n",
 								    "# Fonction pour prétraiter le texte\n",
 								    "def preprocess_text(texte):\n",
 								    "    \"\"\"Tokenize a text, drop French stopwords and lemmatize the remaining tokens.\n",
 								    "\n",
 								    "    `texte` may be a single string or an iterable of strings. A plain string is\n",
 								    "    used as-is: joining it with spaces would split it into single characters\n",
 								    "    (which is what produced outputs such as [e, h, a, o, ...] for a sentence).\n",
 								    "    A missing value (None or NaN) gives an empty list.\n",
 								    "\n",
 								    "    Returns the list of lowercased, filtered and lemmatized tokens.\n",
 								    "    \"\"\"\n",
 								    "    # Missing cell (None or float NaN, NaN being the only value != itself)\n",
 								    "    if texte is None or (isinstance(texte, float) and texte != texte):\n",
 								    "        return []\n",
 								    "\n",
 								    "    # Only join collections of strings; ' '.join on a str would put a space\n",
 								    "    # between every character and the tokenizer would then return letters\n",
 								    "    texte_concat = texte if isinstance(texte, str) else ' '.join(texte)\n",
 								    "\n",
 								    "    # Word tokenization on the lowercased text\n",
 								    "    tokens = word_tokenize(texte_concat.lower())\n",
 								    "\n",
 								    "    # Stopword removal\n",
 								    "    stop_words = set(stopwords.words('french'))\n",
 								    "    filtered_tokens = [word for word in tokens if word not in stop_words]\n",
 								    "\n",
 								    "    # Lemmatization\n",
 								    "    # NOTE(review): WordNetLemmatizer relies on the English WordNet - confirm it is useful on French tokens\n",
 								    "    lemmatizer = WordNetLemmatizer()\n",
 								    "    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]\n",
 								    "\n",
 								    "    return lemmatized_tokens\n",
 								    "\n",
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								    "# Appliquer la fonction de prétraitement à la colonne de texte\n",
 								    "df['texte_preprocessed'] = df['texte'].apply(preprocess_text)\n",
 								    "\n",
 								    "# Afficher le résultat\n",
 								    "print(df)\n"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
 								   "id": "711d3884",
 								   "metadata": {
 								    "jp-MarkdownHeadingCollapsed": true
 								   },
 								   "source": [
 								    "## Campaign area"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 52,
 								   "id": "c25b5295",
 								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "# campaign_stats cleaning\n",
 								    "# Explicit copy: cleaning_date presumably modifies the frame in place (its return\n",
 								    "# value is never used), so it must not operate on a selection of df1_campaign_stats\n",
 								    "df1_campaign_stats_clean = df1_campaign_stats[[\"id\", \"campaign_id\", \"customer_id\", \"opened_at\", \"sent_at\", \"delivered_at\"]].copy()\n",
 								    "cleaning_date(df1_campaign_stats_clean, 'opened_at')\n",
 								    "cleaning_date(df1_campaign_stats_clean, 'sent_at')\n",
 								    "cleaning_date(df1_campaign_stats_clean, 'delivered_at')\n",
 								    "\n",
 								    "# campaigns cleaning (add_prefix already returns a new frame)\n",
 								    "df1_campaigns_clean = df1_campaigns[[\"id\", \"name\", \"service_id\", \"sent_at\"]].add_prefix(\"campaign_\")\n",
 								    "cleaning_date(df1_campaigns_clean, 'campaign_sent_at')\n",
 								    "\n",
 								    "# Merge, then drop the join key without mutating in place so the cell can be re-run\n",
 								    "df1_campaigns_full = pd.merge(df1_campaign_stats_clean, df1_campaigns_clean, on = \"campaign_id\", how = \"left\").drop(columns = ['campaign_id'])"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 53,
 								   "id": "2a3de6a5",
 								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "df1_campaigns_full.info()"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 56,
 								   "id": "3fc1f446",
 								   "metadata": {},
 								   "outputs": [],
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								   "source": [
 								    "df1_campaigns_information"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "20e69ee3",
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								   "metadata": {
 								    "jp-MarkdownHeadingCollapsed": true
 								   },
 								   "source": [
 								    "## Link area"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 37,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "d9cbdbce",
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								   "metadata": {},
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "outputs": [],
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								   "source": [
 								    "df1_campaigns"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 38,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "c07459f0",
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								   "metadata": {},
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "outputs": [],
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								   "source": [
 								    "df1_link_stats"
 								   ]
 								  },
 								  {
 								   "cell_type": "markdown",
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "80ae4c42",
-												Ajout indicatrice canal vente en ligne

											
										
										
											2024-02-10 13:23:44 +01:00
+								   "metadata": {},
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								   "source": [
 								    "## Exploration variables"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Ajout indicatrice canal vente en ligne

											
										
										
											2024-02-10 13:23:44 +01:00
+								   "execution_count": 7,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "b50b8f95",
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "# Fonction d'exploration pour suppliers.csv = label itr et commission inconnues\n",
 								    "def suppliers_exploration(suppliers = None) : \n",
 								    "    \n",
 								    "    # Taux de NaN pour ces colonnes\n",
 								    "    label_na = suppliers['label'].isna().sum()/len(suppliers)*100\n",
 								    "    itr_na = suppliers['itr'].isna().sum()/len(suppliers)*100\n",
 								    "    commission_na = suppliers['commission'].isna().sum()/len(suppliers)*100\n",
 								    "\n",
 								    "    suppliers_desc = pd.DataFrame({'nb_suppliers' : [suppliers['name'].nunique()],\n",
 								    "                                  'label_na' : [label_na],\n",
 								    "                                  'itr_na' : [itr_na],\n",
 								    "                                  'commission_na' : [commission_na]})\n",
 								    "\n",
 								    "    return suppliers_desc"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Ajout indicatrice canal vente en ligne

											
										
										
											2024-02-10 13:23:44 +01:00
+								   "execution_count": 8,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "7e292935",
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "df1_suppliers_desc = suppliers_exploration(suppliers = df1_suppliers)"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Ajout indicatrice canal vente en ligne

											
										
										
											2024-02-10 13:23:44 +01:00
+								   "execution_count": 9,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "05b6f2b0",
-												Ajout partie campaigns et target

											
										
										
											2024-02-04 16:02:01 +01:00
+								   "metadata": {},
 								   "outputs": [
 								    {
 								     "data": {
 								      "text/html": [
 								       "<div>\n",
 								       "<style scoped>\n",
 								       "    .dataframe tbody tr th:only-of-type {\n",
 								       "        vertical-align: middle;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe tbody tr th {\n",
-												Ajout de fonctions pour nettoyage et exploration

											
										
										
											2024-01-14 17:38:16 +01:00
+								       "        vertical-align: top;\n",
 								       "    }\n",
 								       "\n",
 								       "    .dataframe thead th {\n",
 								       "        text-align: right;\n",
 								       "    }\n",
 								       "</style>\n",
 								       "<table border=\"1\" class=\"dataframe\">\n",
 								       "  <thead>\n",
 								       "    <tr style=\"text-align: right;\">\n",
 								       "      <th></th>\n",
 								       "      <th>nb_suppliers</th>\n",
 								       "      <th>label_na</th>\n",
 								       "      <th>itr_na</th>\n",
 								       "      <th>commission_na</th>\n",
 								       "    </tr>\n",
 								       "  </thead>\n",
 								       "  <tbody>\n",
 								       "    <tr>\n",
 								       "      <th>0</th>\n",
 								       "      <td>9</td>\n",
 								       "      <td>100.0</td>\n",
 								       "      <td>100.0</td>\n",
 								       "      <td>100.0</td>\n",
 								       "    </tr>\n",
 								       "  </tbody>\n",
 								       "</table>\n",
 								       "</div>"
 								      ],
 								      "text/plain": [
 								       "   nb_suppliers  label_na  itr_na  commission_na\n",
 								       "0             9     100.0   100.0          100.0"
 								      ]
 								     },
-												Ajout indicatrice canal vente en ligne

											
										
										
											2024-02-10 13:23:44 +01:00
+								     "execution_count": 9,
-												Ajout de fonctions pour nettoyage et exploration

											
										
										
											2024-01-14 17:38:16 +01:00
+								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
 								   "source": [
 								    "df1_suppliers_desc"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Ajout indicatrice canal vente en ligne

											
										
										
											2024-02-10 13:23:44 +01:00
+								   "execution_count": 10,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "c9324d80",
-												Ajout de fonctions pour nettoyage et exploration

											
										
										
											2024-01-14 17:38:16 +01:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "BUCKET = \"bdc2324-data\"\n",
 								    "liste_folders = fs.ls(BUCKET)\n",
 								    "\n",
 								    "liste_files = []\n",
 								    "for company_folder in liste_folders : \n",
 								    "    liste_files.extend(fs.ls(company_folder))"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
-												Ajout indicatrice canal vente en ligne

											
										
										
											2024-02-10 13:23:44 +01:00
+								   "execution_count": 11,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "10304058",
-												Ajout de fonctions pour nettoyage et exploration

											
										
										
											2024-01-14 17:38:16 +01:00
+								   "metadata": {},
 								   "outputs": [
 								    {
 								     "name": "stdout",
 								     "output_type": "stream",
 								     "text": [
 								      "['bdc2324-data/1/1suppliers.csv', 'bdc2324-data/10/10suppliers.csv', 'bdc2324-data/101/101suppliers.csv', 'bdc2324-data/11/11suppliers.csv', 'bdc2324-data/12/12suppliers.csv', 'bdc2324-data/13/13suppliers.csv', 'bdc2324-data/14/14suppliers.csv', 'bdc2324-data/2/2suppliers.csv', 'bdc2324-data/3/3suppliers.csv', 'bdc2324-data/4/4suppliers.csv', 'bdc2324-data/5/5suppliers.csv', 'bdc2324-data/6/6suppliers.csv', 'bdc2324-data/7/7suppliers.csv', 'bdc2324-data/8/8suppliers.csv', 'bdc2324-data/9/9suppliers.csv']\n"
 								     ]
 								    }
 								   ],
 								   "source": [
 								    "liste_database_select = ['suppliers']\n",
 								    "\n",
 								    "# Filtrer la liste pour les éléments contenant au moins un élément de la liste à tester\n",
 								    "liste_suppliers = [element for element in liste_files if any(element_part in element for element_part in liste_database_select)]\n",
 								    "\n",
 								    "# Afficher le résultat\n",
 								    "print(liste_suppliers)"
 								   ]
 								  },
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								  {
 								   "cell_type": "code",
-												Ajout indicatrice canal vente en ligne

											
										
										
											2024-02-10 13:23:44 +01:00
+								   "execution_count": 32,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "ffa423e5",
-												Exploration secteur billettique et fusion en une base

											
										
										
											2024-01-13 14:47:24 +01:00
+								   "metadata": {},
 								   "outputs": [],
-												Ajout de fonctions pour nettoyage et exploration

											
										
										
											2024-01-14 17:38:16 +01:00
+								   "source": [
 								    "# loop to create dataframes from file 2\n",
 								    "def database_loading(database_name = None):\n",
 								    "    \"\"\"Read one CSV file through the `fs` filesystem object.\n",
 								    "\n",
 								    "    `database_name` is the path handed to `fs.open`. The client number is the\n",
 								    "    second '/'-separated component of that path, e.g. '1' for\n",
 								    "    'bdc2324-data/1/1suppliers.csv'.\n",
 								    "\n",
 								    "    Returns a tuple (DataFrame, client_number); client_number is a string.\n",
 								    "    \"\"\"\n",
 								    "    client_number = database_name.split('/')[1]\n",
 								    "\n",
 								    "    with fs.open(database_name, mode='rb') as file_in:\n",
 								    "        df = pd.read_csv(file_in)\n",
 								    "\n",
 								    "    return df, client_number"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": null,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "70bdc88d",
-												Ajout indicatrice canal vente en ligne

											
										
										
											2024-02-10 13:23:44 +01:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": []
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 45,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "6a0f567d",
-												Ajout indicatrice canal vente en ligne

											
										
										
											2024-02-10 13:23:44 +01:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "# Load every suppliers file, tag it with its tenant, then concatenate once:\n",
 								    "# calling pd.concat inside the loop re-copies the accumulated frame at each step\n",
 								    "suppliers_frames = []\n",
 								    "\n",
 								    "for link in liste_suppliers:\n",
 								    "    df_supplier, tenant_id = database_loading(link)\n",
 								    "    df_supplier['tenant_id'] = int(tenant_id)\n",
 								    "    suppliers_frames.append(df_supplier)\n",
 								    "\n",
 								    "# pd.concat raises on an empty list, so keep an empty DataFrame in that case\n",
 								    "df_all = pd.concat(suppliers_frames, axis = 0) if suppliers_frames else pd.DataFrame()"
 								   ]
-												Ajout indicatrice canal vente en ligne

											
										
										
											2024-02-10 13:23:44 +01:00
+								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 63,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "1522d8cd",
-												Ajout indicatrice canal vente en ligne

											
										
										
											2024-02-10 13:23:44 +01:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "# df_all[df_all['tenant_id'] == 101]['name'].unique()"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 66,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "b0e42a61",
-												Ajout indicatrice canal vente en ligne

											
										
										
											2024-02-10 13:23:44 +01:00
+								   "metadata": {},
 								   "outputs": [],
 								   "source": [
 								    "liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online'] \n",
 								    "# vad = vente à distance\n",
 								    "df_all['name'] = df_all['name'].fillna('')\n",
 								    "\n",
 								    "df_all['canal_vente_internet'] = df_all['name'].str.contains('|'.join(liste_mots), case=False).astype(int)\n"
 								   ]
 								  },
 								  {
 								   "cell_type": "code",
 								   "execution_count": 68,
-												Merge

											
										
										
											2024-02-10 22:46:56 +01:00
+								   "id": "d299ae91",
-												Ajout indicatrice canal vente en ligne

											
										
										
											2024-02-10 13:23:44 +01:00
+								   "metadata": {},
 								   "outputs": [
 								    {
 								     "data": {
 								      "text/plain": [
 								       "tenant_id\n",
 								       "1      1\n",
 								       "2      1\n",
 								       "3      1\n",
 								       "4      1\n",
 								       "5      1\n",
 								       "6      1\n",
 								       "7      1\n",
 								       "8      1\n",
 								       "9      1\n",
 								       "10     1\n",
 								       "11     1\n",
 								       "12     1\n",
 								       "13     1\n",
 								       "14     1\n",
 								       "101    1\n",
 								       "Name: canal_vente_internet, dtype: int64"
 								      ]
 								     },
 								     "execution_count": 68,
 								     "metadata": {},
 								     "output_type": "execute_result"
 								    }
 								   ],
 								   "source": [
 								    "df_all.groupby('tenant_id')['canal_vente_internet'].max()"
 								   ]
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								  }
 								 ],
 								 "metadata": {
 								  "kernelspec": {
 								   "display_name": "Python 3 (ipykernel)",
 								   "language": "python",
 								   "name": "python3"
 								  },
 								  "language_info": {
 								   "codemirror_mode": {
 								    "name": "ipython",
 								    "version": 3
 								   },
 								   "file_extension": ".py",
 								   "mimetype": "text/x-python",
 								   "name": "python",
 								   "nbconvert_exporter": "python",
 								   "pygments_lexer": "ipython3",
-												New structure

											
										
										
											2024-02-19 23:11:28 +01:00
+								   "version": "3.11.6"
-												revert ea384b3db445e90cb7679dee632f65e73bb94888

revert rename

											
										
										
											2024-01-10 19:19:51 +01:00
+								  }
 								 },
 								 "nbformat": 4,
 								 "nbformat_minor": 5
 								}