diff --git a/Clean-Notebook.ipynb b/Clean-Notebook.ipynb
index 04a7d1b..1f70494 100644
--- a/Clean-Notebook.ipynb
+++ b/Clean-Notebook.ipynb
@@ -10,13 +10,16 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 12,
"id": "15103481-8d74-404c-aa09-7601fe7730da",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
- "import numpy as np\n"
+ "import numpy as np\n",
+ "import os\n",
+ "import s3fs\n",
+ "import re"
]
},
{
@@ -29,16 +32,14 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 13,
"id": "5d83bb1a-d341-446e-91f6-1c428607f6d4",
"metadata": {},
"outputs": [],
"source": [
- "import os\n",
- "import s3fs\n",
"# Create filesystem object\n",
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
- "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n"
+ "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})"
]
},
{
@@ -46,69 +47,500 @@
"id": "f99da24f-0d93-4618-92bc-3ba81dc0445c",
"metadata": {},
"source": [
- "## Exemple sur bdc2324-data/11"
+ "# Exemple sur Company 1"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9d74b68f-ba07-4a15-9a27-dae931762d70",
+ "metadata": {},
+ "source": [
+ "## Chargement des données"
]
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 14,
"id": "699664b9-eee4-4f8d-a207-e524526560c5",
"metadata": {},
+ "outputs": [],
+ "source": [
+ "BUCKET = \"bdc2324-data/1\"\n",
+ "liste_database = fs.ls(BUCKET)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "id": "0cb92854-903b-4efd-ac1b-197e29f044b4",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['bdc2324-data/1/1campaign_stats.csv', 'bdc2324-data/1/1campaigns.csv', 'bdc2324-data/1/1customer_target_mappings.csv', 'bdc2324-data/1/1customersplus.csv', 'bdc2324-data/1/1event_types.csv', 'bdc2324-data/1/1events.csv', 'bdc2324-data/1/1product_packs.csv', 'bdc2324-data/1/1products.csv', 'bdc2324-data/1/1products_groups.csv', 'bdc2324-data/1/1purchases.csv', 'bdc2324-data/1/1suppliers.csv', 'bdc2324-data/1/1target_types.csv', 'bdc2324-data/1/1targets.csv', 'bdc2324-data/1/1tickets.csv']\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Name fragments of the tables to keep; a file is selected when its path contains any of them.\n",
+ "liste_database_select = ['suppliers', 'ticket', 'purchase', 'consumption', 'customer', 'event', 'target', 'prod', 'campa']\n",
+ "liste_database_filtered = [\n",
+ "    path for path in liste_database\n",
+ "    if any(fragment in path for fragment in liste_database_select)\n",
+ "]\n",
+ "print(liste_database_filtered)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "id": "dd6a3518-b752-4a1e-b77b-9e03e853c3ed",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/tmp/ipykernel_4561/4135596479.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+ " df = pd.read_csv(file_in)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Load each selected CSV into a global DataFrame named <df_prefix><table> (e.g. df1_suppliers).\n",
+ "files_path = liste_database_filtered\n",
+ "client_number = files_path[0].split(\"/\")[1]\n",
+ "df_prefix = f\"df{client_number}_\"\n",
+ "\n",
+ "for current_path in files_path:\n",
+ "    # Validate the .../<client>/<client><table>.csv name before downloading, so a bad path fails with a clear message.\n",
+ "    table_match = re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path)\n",
+ "    if table_match is None:\n",
+ "        raise ValueError(f\"Unexpected file name: {current_path}\")\n",
+ "    with fs.open(current_path, mode=\"rb\") as file_in:\n",
+ "        df = pd.read_csv(file_in, low_memory=False)  # whole-file dtype inference, avoids the mixed-type DtypeWarning\n",
+ "    globals()[df_prefix + table_match.group(3)] = df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "7d1da9df-f423-4a9f-a2a6-6d8ceeab1c34",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "_\n",
+ "__\n",
+ "___\n",
+ "df\n",
+ "df1_purchases\n",
+ "df1_suppliers\n",
+ "df1_tickets\n",
+ "dataframe\n",
+ "_7\n",
+ "_10\n",
+ "_11\n",
+ "_18\n",
+ "_20\n",
+ "df1_customer_target_mappings\n",
+ "df1_customersplus\n",
+ "df1_event_types\n",
+ "df1_events\n",
+ "df1_target_types\n",
+ "df1_targets\n"
+ ]
+ }
+ ],
+ "source": [
+ "# List the tables created by the loading loop (global variables named <df_prefix><table>).\n",
+ "variables_globales = globals()\n",
+ "\n",
+ "# A bare isinstance() scan also returns IPython's output cache (_, __, _7, ...) and scratch frames such as df, so filter on the loader prefix.\n",
+ "dataframes = {nom: variable for nom, variable in list(variables_globales.items()) if nom.startswith(df_prefix) and isinstance(variable, pd.DataFrame)}\n",
+ "\n",
+ "# Print the names only; iterating over keys avoids leaving a loop DataFrame behind in globals().\n",
+ "for nom in sorted(dataframes):\n",
+ "    print(nom)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "78453f3c-4f89-44ed-a6c6-2a7443b72b52",
+ "metadata": {},
+ "source": [
+ "## suppliers.csv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "2e0dada0-9457-484c-aa55-77e44613ecca",
+ "metadata": {},
"outputs": [
{
"data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " name | \n",
+ " manually_added | \n",
+ " label | \n",
+ " itr | \n",
+ " updated_at | \n",
+ " created_at | \n",
+ " commission | \n",
+ " identifier | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1617 | \n",
+ " j4 administration | \n",
+ " False | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2021-07-29 09:21:37.325772+02:00 | \n",
+ " 2021-07-29 09:21:37.325772+02:00 | \n",
+ " NaN | \n",
+ " 5958b2a060ac3e31678b438892a1bd2e | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 8 | \n",
+ " non défini | \n",
+ " False | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2020-09-03 13:16:35.329062+02:00 | \n",
+ " 2020-09-03 13:16:35.329062+02:00 | \n",
+ " NaN | \n",
+ " 52ff3466787b4d538407372e5f7afe0f | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 4 | \n",
+ " vad | \n",
+ " False | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2020-09-03 13:11:23.896992+02:00 | \n",
+ " 2020-09-03 13:11:23.896992+02:00 | \n",
+ " NaN | \n",
+ " 1225483c97b36018cab2bea14ab78ea6 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1 | \n",
+ " fort saint jean | \n",
+ " False | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2020-09-03 13:11:23.833073+02:00 | \n",
+ " 2020-09-03 13:11:23.833073+02:00 | \n",
+ " NaN | \n",
+ " 001b9b4a524fe407150b8235b304d4ec | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 2 | \n",
+ " j4 | \n",
+ " False | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2020-09-03 13:11:23.888993+02:00 | \n",
+ " 2020-09-03 13:11:23.888993+02:00 | \n",
+ " NaN | \n",
+ " 6a0cf6edf20060344b465706b61719aa | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 5 | \n",
+ " revendeur | \n",
+ " False | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2020-09-03 13:11:23.900987+02:00 | \n",
+ " 2020-09-03 13:11:23.900987+02:00 | \n",
+ " NaN | \n",
+ " 931239d4acb6214d7e5c98edecfb4916 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 3 | \n",
+ " vente en ligne | \n",
+ " False | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2020-09-03 13:11:23.893097+02:00 | \n",
+ " 2020-09-03 13:11:23.893097+02:00 | \n",
+ " NaN | \n",
+ " bde8f2ccff510df8572d3214d86b837d | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 6 | \n",
+ " ccr | \n",
+ " False | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2020-09-03 13:11:23.904974+02:00 | \n",
+ " 2020-09-03 13:11:23.904974+02:00 | \n",
+ " NaN | \n",
+ " b48ec279411f7dbbb68393c61a9724d9 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 7 | \n",
+ " dab | \n",
+ " False | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2020-09-03 13:11:23.908970+02:00 | \n",
+ " 2020-09-03 13:11:23.908970+02:00 | \n",
+ " NaN | \n",
+ " 11c6d471fa4e354e62e684d293694202 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
"text/plain": [
- "['bdc2324-data/11/11campaign_stats.csv',\n",
- " 'bdc2324-data/11/11campaigns.csv',\n",
- " 'bdc2324-data/11/11categories.csv',\n",
- " 'bdc2324-data/11/11countries.csv',\n",
- " 'bdc2324-data/11/11currencies.csv',\n",
- " 'bdc2324-data/11/11customer_target_mappings.csv',\n",
- " 'bdc2324-data/11/11customersplus.csv',\n",
- " 'bdc2324-data/11/11event_types.csv',\n",
- " 'bdc2324-data/11/11events.csv',\n",
- " 'bdc2324-data/11/11facilities.csv',\n",
- " 'bdc2324-data/11/11link_stats.csv',\n",
- " 'bdc2324-data/11/11pricing_formulas.csv',\n",
- " 'bdc2324-data/11/11product_packs.csv',\n",
- " 'bdc2324-data/11/11products.csv',\n",
- " 'bdc2324-data/11/11products_groups.csv',\n",
- " 'bdc2324-data/11/11purchases.csv',\n",
- " 'bdc2324-data/11/11representation_category_capacities.csv',\n",
- " 'bdc2324-data/11/11representations.csv',\n",
- " 'bdc2324-data/11/11seasons.csv',\n",
- " 'bdc2324-data/11/11structure_tag_mappings.csv',\n",
- " 'bdc2324-data/11/11suppliers.csv',\n",
- " 'bdc2324-data/11/11tags.csv',\n",
- " 'bdc2324-data/11/11target_types.csv',\n",
- " 'bdc2324-data/11/11targets.csv',\n",
- " 'bdc2324-data/11/11tickets.csv']"
+ " id name manually_added label itr \\\n",
+ "0 1617 j4 administration False NaN NaN \n",
+ "1 8 non défini False NaN NaN \n",
+ "2 4 vad False NaN NaN \n",
+ "3 1 fort saint jean False NaN NaN \n",
+ "4 2 j4 False NaN NaN \n",
+ "5 5 revendeur False NaN NaN \n",
+ "6 3 vente en ligne False NaN NaN \n",
+ "7 6 ccr False NaN NaN \n",
+ "8 7 dab False NaN NaN \n",
+ "\n",
+ " updated_at created_at \\\n",
+ "0 2021-07-29 09:21:37.325772+02:00 2021-07-29 09:21:37.325772+02:00 \n",
+ "1 2020-09-03 13:16:35.329062+02:00 2020-09-03 13:16:35.329062+02:00 \n",
+ "2 2020-09-03 13:11:23.896992+02:00 2020-09-03 13:11:23.896992+02:00 \n",
+ "3 2020-09-03 13:11:23.833073+02:00 2020-09-03 13:11:23.833073+02:00 \n",
+ "4 2020-09-03 13:11:23.888993+02:00 2020-09-03 13:11:23.888993+02:00 \n",
+ "5 2020-09-03 13:11:23.900987+02:00 2020-09-03 13:11:23.900987+02:00 \n",
+ "6 2020-09-03 13:11:23.893097+02:00 2020-09-03 13:11:23.893097+02:00 \n",
+ "7 2020-09-03 13:11:23.904974+02:00 2020-09-03 13:11:23.904974+02:00 \n",
+ "8 2020-09-03 13:11:23.908970+02:00 2020-09-03 13:11:23.908970+02:00 \n",
+ "\n",
+ " commission identifier \n",
+ "0 NaN 5958b2a060ac3e31678b438892a1bd2e \n",
+ "1 NaN 52ff3466787b4d538407372e5f7afe0f \n",
+ "2 NaN 1225483c97b36018cab2bea14ab78ea6 \n",
+ "3 NaN 001b9b4a524fe407150b8235b304d4ec \n",
+ "4 NaN 6a0cf6edf20060344b465706b61719aa \n",
+ "5 NaN 931239d4acb6214d7e5c98edecfb4916 \n",
+ "6 NaN bde8f2ccff510df8572d3214d86b837d \n",
+ "7 NaN b48ec279411f7dbbb68393c61a9724d9 \n",
+ "8 NaN 11c6d471fa4e354e62e684d293694202 "
]
},
- "execution_count": 10,
+ "execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "BUCKET = \"bdc2324-data/11\"\n",
- "fs.ls(BUCKET)"
+ "# Restriction aux DataFrame : ticket, purchase, consumption, suppliers\n",
+ "df1_suppliers"
]
},
{
"cell_type": "code",
- "execution_count": 23,
- "id": "6d6201cd-a00b-4984-bcd8-72838717ad13",
+ "execution_count": 19,
+ "id": "b583be02-ab60-4e14-9325-0204f203a1af",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 9 entries, 0 to 8\n",
+ "Data columns (total 9 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 id 9 non-null int64 \n",
+ " 1 name 9 non-null object \n",
+ " 2 manually_added 9 non-null bool \n",
+ " 3 label 0 non-null float64\n",
+ " 4 itr 0 non-null float64\n",
+ " 5 updated_at 9 non-null object \n",
+ " 6 created_at 9 non-null object \n",
+ " 7 commission 0 non-null float64\n",
+ " 8 identifier 9 non-null object \n",
+ "dtypes: bool(1), float64(3), int64(1), object(4)\n",
+ "memory usage: 713.0+ bytes\n"
+ ]
+ }
+ ],
+ "source": [
+ "df1_suppliers.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "6d7f338e-e4d3-422b-9cdc-dec967c0b28e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " id | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " name | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " manually_added | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " label | \n",
+ " 100.0 | \n",
+ "
\n",
+ " \n",
+ " itr | \n",
+ " 100.0 | \n",
+ "
\n",
+ " \n",
+ " updated_at | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " created_at | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " commission | \n",
+ " 100.0 | \n",
+ "
\n",
+ " \n",
+ " identifier | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0\n",
+ "id 0.0\n",
+ "name 0.0\n",
+ "manually_added 0.0\n",
+ "label 100.0\n",
+ "itr 100.0\n",
+ "updated_at 0.0\n",
+ "created_at 0.0\n",
+ "commission 100.0\n",
+ "identifier 0.0"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "(df1_suppliers.isna().mean() * 100).to_frame()  # percentage of missing values per column"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "676a9869-9a8b-4cd2-8b1c-0644b5229c72",
+ "metadata": {},
+ "source": [
+ "## purchases.csv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "14f4158e-c9c0-4beb-826a-5e0f949434a4",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "id": "79c9eb43-002e-460d-acb2-206ebb2ab6dd",
+ "metadata": {},
+ "source": [
+ "## tickets.csv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f3c35394-b586-4ae4-b5ab-b03bb01bb618",
"metadata": {},
"outputs": [],
"source": [
- "# Chargement de toutes les données\n",
- "liste_base = ['customer_target_mappings', 'customersplus', 'target_types', 'tags', 'events', 'tickets', 'representations', 'purchases', 'products']\n",
- "\n",
- "for nom_base in liste_base:\n",
- " FILE_PATH_S3 = 'bdc2324-data/11/11' + nom_base + '.csv'\n",
- " with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
- " globals()[nom_base] = pd.read_csv(file_in, sep=\",\")"
+ "display(df1_purchases.head())  # only a cell's last expression is rendered, so show this one explicitly\n",
+ "df1_tickets.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "355f5489-7904-4161-a85b-6eb70b3a4c89",
+ "metadata": {
+ "jp-MarkdownHeadingCollapsed": true
+ },
+ "source": [
+ "# Fusion et exploration"
]
},
{