diff --git a/Clean-Notebook.ipynb b/Clean-Notebook.ipynb index 04a7d1b..1f70494 100644 --- a/Clean-Notebook.ipynb +++ b/Clean-Notebook.ipynb @@ -10,13 +10,16 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 12, "id": "15103481-8d74-404c-aa09-7601fe7730da", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", - "import numpy as np\n" + "import numpy as np\n", + "import os\n", + "import s3fs\n", + "import re" ] }, { @@ -29,16 +32,14 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 13, "id": "5d83bb1a-d341-446e-91f6-1c428607f6d4", "metadata": {}, "outputs": [], "source": [ - "import os\n", - "import s3fs\n", "# Create filesystem object\n", "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", - "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n" + "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})" ] }, { @@ -46,69 +47,500 @@ "id": "f99da24f-0d93-4618-92bc-3ba81dc0445c", "metadata": {}, "source": [ - "## Exemple sur bdc2324-data/11" + "# Exemple sur Company 1" + ] + }, + { + "cell_type": "markdown", + "id": "9d74b68f-ba07-4a15-9a27-dae931762d70", + "metadata": {}, + "source": [ + "## Chargement données" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 14, "id": "699664b9-eee4-4f8d-a207-e524526560c5", "metadata": {}, + "outputs": [], + "source": [ + "BUCKET = \"bdc2324-data/1\"\n", + "liste_database = fs.ls(BUCKET)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "0cb92854-903b-4efd-ac1b-197e29f044b4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['bdc2324-data/1/1campaign_stats.csv', 'bdc2324-data/1/1campaigns.csv', 'bdc2324-data/1/1customer_target_mappings.csv', 'bdc2324-data/1/1customersplus.csv', 'bdc2324-data/1/1event_types.csv', 'bdc2324-data/1/1events.csv', 'bdc2324-data/1/1product_packs.csv', 'bdc2324-data/1/1products.csv', 'bdc2324-data/1/1products_groups.csv', 'bdc2324-data/1/1purchases.csv', 'bdc2324-data/1/1suppliers.csv', 'bdc2324-data/1/1target_types.csv', 'bdc2324-data/1/1targets.csv', 'bdc2324-data/1/1tickets.csv']\n" + ] + } + ], + "source": [ + "liste_database_select = ['suppliers', 'ticket', 'purchase', 'consumption', 'customer', 'event', 'target', 'prod', 'campa']\n", + "\n", + "# Filtrer la liste pour les éléments contenant au moins un élément de la liste à tester\n", + "liste_database_filtered = [element for element in liste_database if any(element_part in element for element_part in liste_database_select)]\n", + "\n", + "# Afficher le résultat\n", + "print(liste_database_filtered)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "dd6a3518-b752-4a1e-b77b-9e03e853c3ed", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_4561/4135596479.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " df = pd.read_csv(file_in)\n" + ] + } + ], + "source": [ + "# loop to create dataframes from file 2\n", + "files_path = liste_database_filtered\n", + "\n", + "client_number = files_path[0].split(\"/\")[1]\n", + "df_prefix = \"df\" + str(client_number) + \"_\"\n", + "\n", + "for i in range(len(files_path)) :\n", + " current_path = files_path[i]\n", + " with fs.open(current_path, mode=\"rb\") as file_in:\n", + " df = pd.read_csv(file_in)\n", + " # the pattern of the name is df1xxx\n", + " nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n", + " globals()[nom_dataframe] = df" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "7d1da9df-f423-4a9f-a2a6-6d8ceeab1c34", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "_\n", + "__\n", + "___\n", + "df\n", + "df1_purchases\n", + "df1_suppliers\n", + "df1_tickets\n", + "dataframe\n", + "_7\n", + "_10\n", + "_11\n", + "_18\n", + "_20\n", + "df1_customer_target_mappings\n", + "df1_customersplus\n", + "df1_event_types\n", + "df1_events\n", + "df1_target_types\n", + "df1_targets\n" + ] + } + ], + "source": [ + "# Obtenir toutes les variables globales\n", + "variables_globales = globals()\n", + "\n", + "# Filtrer les variables pour obtenir uniquement les DataFrames\n", + "dataframes = {nom: variable for nom, variable in variables_globales.items() if isinstance(variable, pd.DataFrame)}\n", + "\n", + "# Afficher les noms et les DataFrames\n", + "for nom, dataframe in dataframes.items():\n", + " print(f\"{nom}\")" + ] + }, + { + "cell_type": "markdown", + "id": "78453f3c-4f89-44ed-a6c6-2a7443b72b52", + "metadata": {}, + "source": [ + "## suppliers.csv" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "2e0dada0-9457-484c-aa55-77e44613ecca", + "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamemanually_addedlabelitrupdated_atcreated_atcommissionidentifier
01617j4 administrationFalseNaNNaN2021-07-29 09:21:37.325772+02:002021-07-29 09:21:37.325772+02:00NaN5958b2a060ac3e31678b438892a1bd2e
18non définiFalseNaNNaN2020-09-03 13:16:35.329062+02:002020-09-03 13:16:35.329062+02:00NaN52ff3466787b4d538407372e5f7afe0f
24vadFalseNaNNaN2020-09-03 13:11:23.896992+02:002020-09-03 13:11:23.896992+02:00NaN1225483c97b36018cab2bea14ab78ea6
31fort saint jeanFalseNaNNaN2020-09-03 13:11:23.833073+02:002020-09-03 13:11:23.833073+02:00NaN001b9b4a524fe407150b8235b304d4ec
42j4FalseNaNNaN2020-09-03 13:11:23.888993+02:002020-09-03 13:11:23.888993+02:00NaN6a0cf6edf20060344b465706b61719aa
55revendeurFalseNaNNaN2020-09-03 13:11:23.900987+02:002020-09-03 13:11:23.900987+02:00NaN931239d4acb6214d7e5c98edecfb4916
63vente en ligneFalseNaNNaN2020-09-03 13:11:23.893097+02:002020-09-03 13:11:23.893097+02:00NaNbde8f2ccff510df8572d3214d86b837d
76ccrFalseNaNNaN2020-09-03 13:11:23.904974+02:002020-09-03 13:11:23.904974+02:00NaNb48ec279411f7dbbb68393c61a9724d9
87dabFalseNaNNaN2020-09-03 13:11:23.908970+02:002020-09-03 13:11:23.908970+02:00NaN11c6d471fa4e354e62e684d293694202
\n", + "
" + ], "text/plain": [ - "['bdc2324-data/11/11campaign_stats.csv',\n", - " 'bdc2324-data/11/11campaigns.csv',\n", - " 'bdc2324-data/11/11categories.csv',\n", - " 'bdc2324-data/11/11countries.csv',\n", - " 'bdc2324-data/11/11currencies.csv',\n", - " 'bdc2324-data/11/11customer_target_mappings.csv',\n", - " 'bdc2324-data/11/11customersplus.csv',\n", - " 'bdc2324-data/11/11event_types.csv',\n", - " 'bdc2324-data/11/11events.csv',\n", - " 'bdc2324-data/11/11facilities.csv',\n", - " 'bdc2324-data/11/11link_stats.csv',\n", - " 'bdc2324-data/11/11pricing_formulas.csv',\n", - " 'bdc2324-data/11/11product_packs.csv',\n", - " 'bdc2324-data/11/11products.csv',\n", - " 'bdc2324-data/11/11products_groups.csv',\n", - " 'bdc2324-data/11/11purchases.csv',\n", - " 'bdc2324-data/11/11representation_category_capacities.csv',\n", - " 'bdc2324-data/11/11representations.csv',\n", - " 'bdc2324-data/11/11seasons.csv',\n", - " 'bdc2324-data/11/11structure_tag_mappings.csv',\n", - " 'bdc2324-data/11/11suppliers.csv',\n", - " 'bdc2324-data/11/11tags.csv',\n", - " 'bdc2324-data/11/11target_types.csv',\n", - " 'bdc2324-data/11/11targets.csv',\n", - " 'bdc2324-data/11/11tickets.csv']" + " id name manually_added label itr \\\n", + "0 1617 j4 administration False NaN NaN \n", + "1 8 non défini False NaN NaN \n", + "2 4 vad False NaN NaN \n", + "3 1 fort saint jean False NaN NaN \n", + "4 2 j4 False NaN NaN \n", + "5 5 revendeur False NaN NaN \n", + "6 3 vente en ligne False NaN NaN \n", + "7 6 ccr False NaN NaN \n", + "8 7 dab False NaN NaN \n", + "\n", + " updated_at created_at \\\n", + "0 2021-07-29 09:21:37.325772+02:00 2021-07-29 09:21:37.325772+02:00 \n", + "1 2020-09-03 13:16:35.329062+02:00 2020-09-03 13:16:35.329062+02:00 \n", + "2 2020-09-03 13:11:23.896992+02:00 2020-09-03 13:11:23.896992+02:00 \n", + "3 2020-09-03 13:11:23.833073+02:00 2020-09-03 13:11:23.833073+02:00 \n", + "4 2020-09-03 13:11:23.888993+02:00 2020-09-03 13:11:23.888993+02:00 \n", + "5 2020-09-03 13:11:23.900987+02:00 2020-09-03 13:11:23.900987+02:00 \n", + "6 2020-09-03 13:11:23.893097+02:00 2020-09-03 13:11:23.893097+02:00 \n", + "7 2020-09-03 13:11:23.904974+02:00 2020-09-03 13:11:23.904974+02:00 \n", + "8 2020-09-03 13:11:23.908970+02:00 2020-09-03 13:11:23.908970+02:00 \n", + "\n", + " commission identifier \n", + "0 NaN 5958b2a060ac3e31678b438892a1bd2e \n", + "1 NaN 52ff3466787b4d538407372e5f7afe0f \n", + "2 NaN 1225483c97b36018cab2bea14ab78ea6 \n", + "3 NaN 001b9b4a524fe407150b8235b304d4ec \n", + "4 NaN 6a0cf6edf20060344b465706b61719aa \n", + "5 NaN 931239d4acb6214d7e5c98edecfb4916 \n", + "6 NaN bde8f2ccff510df8572d3214d86b837d \n", + "7 NaN b48ec279411f7dbbb68393c61a9724d9 \n", + "8 NaN 11c6d471fa4e354e62e684d293694202 " ] }, - "execution_count": 10, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "BUCKET = \"bdc2324-data/11\"\n", - "fs.ls(BUCKET)" + "# Restriction aux DataFrame : ticket, purchase, consumption, suppliers\n", + "df1_suppliers" ] }, { "cell_type": "code", - "execution_count": 23, - "id": "6d6201cd-a00b-4984-bcd8-72838717ad13", + "execution_count": 19, + "id": "b583be02-ab60-4e14-9325-0204f203a1af", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 9 entries, 0 to 8\n", + "Data columns (total 9 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 9 non-null int64 \n", + " 1 name 9 non-null object \n", + " 2 manually_added 9 non-null bool \n", + " 3 label 0 non-null float64\n", + " 4 itr 0 non-null float64\n", + " 5 updated_at 9 non-null object \n", + " 6 created_at 9 non-null object \n", + " 7 commission 0 non-null float64\n", + " 8 identifier 9 non-null object \n", + "dtypes: bool(1), float64(3), int64(1), object(4)\n", + "memory usage: 713.0+ bytes\n" + ] + } + ], + "source": [ + "df1_suppliers.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "6d7f338e-e4d3-422b-9cdc-dec967c0b28e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0
id0.0
name0.0
manually_added0.0
label100.0
itr100.0
updated_at0.0
created_at0.0
commission100.0
identifier0.0
\n", + "
" + ], + "text/plain": [ + " 0\n", + "id 0.0\n", + "name 0.0\n", + "manually_added 0.0\n", + "label 100.0\n", + "itr 100.0\n", + "updated_at 0.0\n", + "created_at 0.0\n", + "commission 100.0\n", + "identifier 0.0" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame(df1_suppliers.isna().mean()*100)" + ] + }, + { + "cell_type": "markdown", + "id": "676a9869-9a8b-4cd2-8b1c-0644b5229c72", + "metadata": {}, + "source": [ + "## purchases.csv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14f4158e-c9c0-4beb-826a-5e0f949434a4", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "79c9eb43-002e-460d-acb2-206ebb2ab6dd", + "metadata": {}, + "source": [ + "## tickets.csv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f3c35394-b586-4ae4-b5ab-b03bb01bb618", "metadata": {}, "outputs": [], "source": [ - "# Chargement de toutes les données\n", - "liste_base = ['customer_target_mappings', 'customersplus', 'target_types', 'tags', 'events', 'tickets', 'representations', 'purchases', 'products']\n", - "\n", - "for nom_base in liste_base:\n", - " FILE_PATH_S3 = 'bdc2324-data/11/11' + nom_base + '.csv'\n", - " with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", - " globals()[nom_base] = pd.read_csv(file_in, sep=\",\")" + "df1_purchases\n", + "df1_tickets" + ] + }, + { + "cell_type": "markdown", + "id": "355f5489-7904-4161-a85b-6eb70b3a4c89", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "# Fusion et exploration" ] }, {