Exploration suppliers.csv

This commit is contained in:
Antoine JOUBREL 2024-01-13 09:38:10 +00:00
parent ce65bf37ff
commit d508eb0173

View File

@ -10,13 +10,16 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 12,
"id": "15103481-8d74-404c-aa09-7601fe7730da",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n"
"import numpy as np\n",
"import os\n",
"import s3fs\n",
"import re"
]
},
{
@ -29,16 +32,14 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 13,
"id": "5d83bb1a-d341-446e-91f6-1c428607f6d4",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import s3fs\n",
"# Create filesystem object\n",
"S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n",
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n"
"fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})"
]
},
{
@ -46,69 +47,500 @@
"id": "f99da24f-0d93-4618-92bc-3ba81dc0445c",
"metadata": {},
"source": [
"## Exemple sur bdc2324-data/11"
"# Exemple sur Company 1"
]
},
{
"cell_type": "markdown",
"id": "9d74b68f-ba07-4a15-9a27-dae931762d70",
"metadata": {},
"source": [
"## Chargement données"
]
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 14,
"id": "699664b9-eee4-4f8d-a207-e524526560c5",
"metadata": {},
"outputs": [],
"source": [
"BUCKET = \"bdc2324-data/1\"\n",
"liste_database = fs.ls(BUCKET)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "0cb92854-903b-4efd-ac1b-197e29f044b4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['bdc2324-data/1/1campaign_stats.csv', 'bdc2324-data/1/1campaigns.csv', 'bdc2324-data/1/1customer_target_mappings.csv', 'bdc2324-data/1/1customersplus.csv', 'bdc2324-data/1/1event_types.csv', 'bdc2324-data/1/1events.csv', 'bdc2324-data/1/1product_packs.csv', 'bdc2324-data/1/1products.csv', 'bdc2324-data/1/1products_groups.csv', 'bdc2324-data/1/1purchases.csv', 'bdc2324-data/1/1suppliers.csv', 'bdc2324-data/1/1target_types.csv', 'bdc2324-data/1/1targets.csv', 'bdc2324-data/1/1tickets.csv']\n"
]
}
],
"source": [
"liste_database_select = ['suppliers', 'ticket', 'purchase', 'consumption', 'customer', 'event', 'target', 'prod', 'campa']\n",
"\n",
"# Filtrer la liste pour les éléments contenant au moins un élément de la liste à tester\n",
"liste_database_filtered = [element for element in liste_database if any(element_part in element for element_part in liste_database_select)]\n",
"\n",
"# Afficher le résultat\n",
"print(liste_database_filtered)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "dd6a3518-b752-4a1e-b77b-9e03e853c3ed",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_4561/4135596479.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" df = pd.read_csv(file_in)\n"
]
}
],
"source": [
"# loop to create dataframes from file 2\n",
"files_path = liste_database_filtered\n",
"\n",
"client_number = files_path[0].split(\"/\")[1]\n",
"df_prefix = \"df\" + str(client_number) + \"_\"\n",
"\n",
"for i in range(len(files_path)) :\n",
" current_path = files_path[i]\n",
" with fs.open(current_path, mode=\"rb\") as file_in:\n",
" df = pd.read_csv(file_in)\n",
" # the pattern of the name is df1xxx\n",
" nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n",
" globals()[nom_dataframe] = df"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "7d1da9df-f423-4a9f-a2a6-6d8ceeab1c34",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"_\n",
"__\n",
"___\n",
"df\n",
"df1_purchases\n",
"df1_suppliers\n",
"df1_tickets\n",
"dataframe\n",
"_7\n",
"_10\n",
"_11\n",
"_18\n",
"_20\n",
"df1_customer_target_mappings\n",
"df1_customersplus\n",
"df1_event_types\n",
"df1_events\n",
"df1_target_types\n",
"df1_targets\n"
]
}
],
"source": [
"# Obtenir toutes les variables globales\n",
"variables_globales = globals()\n",
"\n",
"# Filtrer les variables pour obtenir uniquement les DataFrames\n",
"dataframes = {nom: variable for nom, variable in variables_globales.items() if isinstance(variable, pd.DataFrame)}\n",
"\n",
"# Afficher les noms et les DataFrames\n",
"for nom, dataframe in dataframes.items():\n",
" print(f\"{nom}\")"
]
},
{
"cell_type": "markdown",
"id": "78453f3c-4f89-44ed-a6c6-2a7443b72b52",
"metadata": {},
"source": [
"## suppliers.csv"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "2e0dada0-9457-484c-aa55-77e44613ecca",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>name</th>\n",
" <th>manually_added</th>\n",
" <th>label</th>\n",
" <th>itr</th>\n",
" <th>updated_at</th>\n",
" <th>created_at</th>\n",
" <th>commission</th>\n",
" <th>identifier</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1617</td>\n",
" <td>j4 administration</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2021-07-29 09:21:37.325772+02:00</td>\n",
" <td>2021-07-29 09:21:37.325772+02:00</td>\n",
" <td>NaN</td>\n",
" <td>5958b2a060ac3e31678b438892a1bd2e</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>8</td>\n",
" <td>non défini</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2020-09-03 13:16:35.329062+02:00</td>\n",
" <td>2020-09-03 13:16:35.329062+02:00</td>\n",
" <td>NaN</td>\n",
" <td>52ff3466787b4d538407372e5f7afe0f</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>4</td>\n",
" <td>vad</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2020-09-03 13:11:23.896992+02:00</td>\n",
" <td>2020-09-03 13:11:23.896992+02:00</td>\n",
" <td>NaN</td>\n",
" <td>1225483c97b36018cab2bea14ab78ea6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>fort saint jean</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2020-09-03 13:11:23.833073+02:00</td>\n",
" <td>2020-09-03 13:11:23.833073+02:00</td>\n",
" <td>NaN</td>\n",
" <td>001b9b4a524fe407150b8235b304d4ec</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2</td>\n",
" <td>j4</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2020-09-03 13:11:23.888993+02:00</td>\n",
" <td>2020-09-03 13:11:23.888993+02:00</td>\n",
" <td>NaN</td>\n",
" <td>6a0cf6edf20060344b465706b61719aa</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>5</td>\n",
" <td>revendeur</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2020-09-03 13:11:23.900987+02:00</td>\n",
" <td>2020-09-03 13:11:23.900987+02:00</td>\n",
" <td>NaN</td>\n",
" <td>931239d4acb6214d7e5c98edecfb4916</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>3</td>\n",
" <td>vente en ligne</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2020-09-03 13:11:23.893097+02:00</td>\n",
" <td>2020-09-03 13:11:23.893097+02:00</td>\n",
" <td>NaN</td>\n",
" <td>bde8f2ccff510df8572d3214d86b837d</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>6</td>\n",
" <td>ccr</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2020-09-03 13:11:23.904974+02:00</td>\n",
" <td>2020-09-03 13:11:23.904974+02:00</td>\n",
" <td>NaN</td>\n",
" <td>b48ec279411f7dbbb68393c61a9724d9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>7</td>\n",
" <td>dab</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2020-09-03 13:11:23.908970+02:00</td>\n",
" <td>2020-09-03 13:11:23.908970+02:00</td>\n",
" <td>NaN</td>\n",
" <td>11c6d471fa4e354e62e684d293694202</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"['bdc2324-data/11/11campaign_stats.csv',\n",
" 'bdc2324-data/11/11campaigns.csv',\n",
" 'bdc2324-data/11/11categories.csv',\n",
" 'bdc2324-data/11/11countries.csv',\n",
" 'bdc2324-data/11/11currencies.csv',\n",
" 'bdc2324-data/11/11customer_target_mappings.csv',\n",
" 'bdc2324-data/11/11customersplus.csv',\n",
" 'bdc2324-data/11/11event_types.csv',\n",
" 'bdc2324-data/11/11events.csv',\n",
" 'bdc2324-data/11/11facilities.csv',\n",
" 'bdc2324-data/11/11link_stats.csv',\n",
" 'bdc2324-data/11/11pricing_formulas.csv',\n",
" 'bdc2324-data/11/11product_packs.csv',\n",
" 'bdc2324-data/11/11products.csv',\n",
" 'bdc2324-data/11/11products_groups.csv',\n",
" 'bdc2324-data/11/11purchases.csv',\n",
" 'bdc2324-data/11/11representation_category_capacities.csv',\n",
" 'bdc2324-data/11/11representations.csv',\n",
" 'bdc2324-data/11/11seasons.csv',\n",
" 'bdc2324-data/11/11structure_tag_mappings.csv',\n",
" 'bdc2324-data/11/11suppliers.csv',\n",
" 'bdc2324-data/11/11tags.csv',\n",
" 'bdc2324-data/11/11target_types.csv',\n",
" 'bdc2324-data/11/11targets.csv',\n",
" 'bdc2324-data/11/11tickets.csv']"
" id name manually_added label itr \\\n",
"0 1617 j4 administration False NaN NaN \n",
"1 8 non défini False NaN NaN \n",
"2 4 vad False NaN NaN \n",
"3 1 fort saint jean False NaN NaN \n",
"4 2 j4 False NaN NaN \n",
"5 5 revendeur False NaN NaN \n",
"6 3 vente en ligne False NaN NaN \n",
"7 6 ccr False NaN NaN \n",
"8 7 dab False NaN NaN \n",
"\n",
" updated_at created_at \\\n",
"0 2021-07-29 09:21:37.325772+02:00 2021-07-29 09:21:37.325772+02:00 \n",
"1 2020-09-03 13:16:35.329062+02:00 2020-09-03 13:16:35.329062+02:00 \n",
"2 2020-09-03 13:11:23.896992+02:00 2020-09-03 13:11:23.896992+02:00 \n",
"3 2020-09-03 13:11:23.833073+02:00 2020-09-03 13:11:23.833073+02:00 \n",
"4 2020-09-03 13:11:23.888993+02:00 2020-09-03 13:11:23.888993+02:00 \n",
"5 2020-09-03 13:11:23.900987+02:00 2020-09-03 13:11:23.900987+02:00 \n",
"6 2020-09-03 13:11:23.893097+02:00 2020-09-03 13:11:23.893097+02:00 \n",
"7 2020-09-03 13:11:23.904974+02:00 2020-09-03 13:11:23.904974+02:00 \n",
"8 2020-09-03 13:11:23.908970+02:00 2020-09-03 13:11:23.908970+02:00 \n",
"\n",
" commission identifier \n",
"0 NaN 5958b2a060ac3e31678b438892a1bd2e \n",
"1 NaN 52ff3466787b4d538407372e5f7afe0f \n",
"2 NaN 1225483c97b36018cab2bea14ab78ea6 \n",
"3 NaN 001b9b4a524fe407150b8235b304d4ec \n",
"4 NaN 6a0cf6edf20060344b465706b61719aa \n",
"5 NaN 931239d4acb6214d7e5c98edecfb4916 \n",
"6 NaN bde8f2ccff510df8572d3214d86b837d \n",
"7 NaN b48ec279411f7dbbb68393c61a9724d9 \n",
"8 NaN 11c6d471fa4e354e62e684d293694202 "
]
},
"execution_count": 10,
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"BUCKET = \"bdc2324-data/11\"\n",
"fs.ls(BUCKET)"
"# Restriction aux DataFrame : ticket, purchase, consumption, suppliers\n",
"df1_suppliers"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "6d6201cd-a00b-4984-bcd8-72838717ad13",
"execution_count": 19,
"id": "b583be02-ab60-4e14-9325-0204f203a1af",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 9 entries, 0 to 8\n",
"Data columns (total 9 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 9 non-null int64 \n",
" 1 name 9 non-null object \n",
" 2 manually_added 9 non-null bool \n",
" 3 label 0 non-null float64\n",
" 4 itr 0 non-null float64\n",
" 5 updated_at 9 non-null object \n",
" 6 created_at 9 non-null object \n",
" 7 commission 0 non-null float64\n",
" 8 identifier 9 non-null object \n",
"dtypes: bool(1), float64(3), int64(1), object(4)\n",
"memory usage: 713.0+ bytes\n"
]
}
],
"source": [
"df1_suppliers.info()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "6d7f338e-e4d3-422b-9cdc-dec967c0b28e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>id</th>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>name</th>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>manually_added</th>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>label</th>\n",
" <td>100.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>itr</th>\n",
" <td>100.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>updated_at</th>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>created_at</th>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>commission</th>\n",
" <td>100.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>identifier</th>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 0\n",
"id 0.0\n",
"name 0.0\n",
"manually_added 0.0\n",
"label 100.0\n",
"itr 100.0\n",
"updated_at 0.0\n",
"created_at 0.0\n",
"commission 100.0\n",
"identifier 0.0"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.DataFrame(df1_suppliers.isna().mean()*100)"
]
},
{
"cell_type": "markdown",
"id": "676a9869-9a8b-4cd2-8b1c-0644b5229c72",
"metadata": {},
"source": [
"## purchases.csv"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "14f4158e-c9c0-4beb-826a-5e0f949434a4",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "79c9eb43-002e-460d-acb2-206ebb2ab6dd",
"metadata": {},
"source": [
"## tickets.csv"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f3c35394-b586-4ae4-b5ab-b03bb01bb618",
"metadata": {},
"outputs": [],
"source": [
"# Chargement de toutes les données\n",
"liste_base = ['customer_target_mappings', 'customersplus', 'target_types', 'tags', 'events', 'tickets', 'representations', 'purchases', 'products']\n",
"\n",
"for nom_base in liste_base:\n",
" FILE_PATH_S3 = 'bdc2324-data/11/11' + nom_base + '.csv'\n",
" with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
" globals()[nom_base] = pd.read_csv(file_in, sep=\",\")"
"df1_purchases\n",
"df1_tickets"
]
},
{
"cell_type": "markdown",
"id": "355f5489-7904-4161-a85b-6eb70b3a4c89",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"# Fusion et exploration"
]
},
{