diff --git a/TP_access_merge_data.ipynb b/TP_access_merge_data.ipynb
new file mode 100644
index 0000000..c8adbd5
--- /dev/null
+++ b/TP_access_merge_data.ipynb
@@ -0,0 +1,1215 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "5ce2ffc5-66b6-4709-9e2c-7a50f49d1361",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Imports: environment access (os), S3 filesystem (s3fs), dataframes (pandas), regular expressions (re)\n",
+ "\n",
+ "import os \n",
+ "import s3fs\n",
+ "import pandas as pd\n",
+ "import re"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "id": "f579ff01-f009-4fb1-ba79-0cb3ce58ab7f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['bdc2324-data/1',\n",
+ " 'bdc2324-data/10',\n",
+ " 'bdc2324-data/101',\n",
+ " 'bdc2324-data/11',\n",
+ " 'bdc2324-data/12',\n",
+ " 'bdc2324-data/13',\n",
+ " 'bdc2324-data/14',\n",
+ " 'bdc2324-data/2',\n",
+ " 'bdc2324-data/3',\n",
+ " 'bdc2324-data/4',\n",
+ " 'bdc2324-data/5',\n",
+ " 'bdc2324-data/6',\n",
+ " 'bdc2324-data/7',\n",
+ " 'bdc2324-data/8',\n",
+ " 'bdc2324-data/9']"
+ ]
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Connect to the S3-compatible storage; the endpoint host is read from the environment\n",
+ "S3_ENDPOINT_URL = f\"https://{os.environ['AWS_S3_ENDPOINT']}\"\n",
+ "BUCKET = \"bdc2324-data\"\n",
+ "\n",
+ "fs = s3fs.S3FileSystem(client_kwargs={\"endpoint_url\": S3_ENDPOINT_URL})\n",
+ "\n",
+ "# List the top-level entries of the bucket\n",
+ "fs.ls(BUCKET)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "c8b2c797-271f-43ee-8823-d0aee5b8782d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# fs.ls(BUCKET) returns the entries in the order displayed by the previous cell:\n",
+ "# index 0 is 'bdc2324-data/1' and index 1 is 'bdc2324-data/10'.\n",
+ "bucket_entries = fs.ls(BUCKET)\n",
+ "\n",
+ "# files_path_1 is read by later cells but was never assigned in this notebook,\n",
+ "# which raises NameError on a fresh run.\n",
+ "# NOTE(review): presumably it is the listing of the first entry - confirm.\n",
+ "files_path_1 = fs.ls(bucket_entries[0])\n",
+ "\n",
+ "# Listing of the second entry ('bdc2324-data/10')\n",
+ "FILE_PATH_S3 = bucket_entries[1]\n",
+ "files_path_2 = fs.ls(FILE_PATH_S3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "18cee687-1462-4169-9bfe-f39786135cdd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Read the second file of the files_path_1 listing into a dataframe\n",
+ "with fs.open(files_path_1[1], mode=\"rb\") as file_in:\n",
+ "    df_campaigns = pd.read_csv(file_in)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "33e8d14c-c649-4b9c-8290-4a2aa635f999",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " name | \n",
+ " service_id | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " process_id | \n",
+ " report_url | \n",
+ " category | \n",
+ " to_be_synced | \n",
+ " identifier | \n",
+ " sent_at | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1319613 | \n",
+ " newsletter enseignants janvier 2022 | \n",
+ " 721 | \n",
+ " 2022-01-14 16:06:42.586321+01:00 | \n",
+ " 2022-02-03 14:17:27.112963+01:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " False | \n",
+ " aba3b6fd5d186d28e06ff97135cade7f | \n",
+ " 2022-01-14 00:00:00+01:00 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1319586 | \n",
+ " lsf_janvier_2022 | \n",
+ " 717 | \n",
+ " 2022-01-07 11:30:35.315895+01:00 | \n",
+ " 2022-02-03 14:17:27.116171+01:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " False | \n",
+ " 788d986905533aba051261497ecffcbb | \n",
+ " 2022-01-07 00:00:00+01:00 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 1319282 | \n",
+ " Invitation à déjeuner au Mucem | Vernissage « ... | \n",
+ " 591 | \n",
+ " 2021-09-28 12:50:24.448752+02:00 | \n",
+ " 2022-02-03 14:17:27.119582+01:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " False | \n",
+ " 3493894fa4ea036cfc6433c3e2ee63b0 | \n",
+ " 2021-09-28 00:00:00+02:00 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1319283 | \n",
+ " Vacances de la Toussaint - centres des loisirs | \n",
+ " 590 | \n",
+ " 2021-09-28 18:01:04.692073+02:00 | \n",
+ " 2022-02-03 14:17:27.124408+01:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " False | \n",
+ " 08b255a5d42b89b0585260b6f2360bdd | \n",
+ " 2021-09-28 00:00:00+02:00 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1319636 | \n",
+ " ddcp_promo_md_livemag | \n",
+ " 730 | \n",
+ " 2022-01-27 18:00:41.053069+01:00 | \n",
+ " 2022-02-03 14:17:27.127607+01:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " False | \n",
+ " d5cfead94f5350c12c322b5b664544c1 | \n",
+ " 2022-01-27 00:00:00+01:00 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 952 | \n",
+ " 1320072 | \n",
+ " dre_gaza0106 | \n",
+ " 881 | \n",
+ " 2022-05-26 09:01:35.523639+02:00 | \n",
+ " 2022-12-02 17:51:22.614046+01:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " False | \n",
+ " 7504adad8bb96320eb3afdd4df6e1f60 | \n",
+ " 2022-05-26 00:00:00+02:00 | \n",
+ "
\n",
+ " \n",
+ " 953 | \n",
+ " 661398 | \n",
+ " DDCP Plan Bis 4 - Marketing direct - MJ5C | \n",
+ " 183 | \n",
+ " 2021-06-18 10:30:01.259578+02:00 | \n",
+ " 2021-09-24 11:56:09.082785+02:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " False | \n",
+ " cedebb6e872f539bef8c3f919874e9d7 | \n",
+ " 2020-07-27 00:00:00+02:00 | \n",
+ "
\n",
+ " \n",
+ " 954 | \n",
+ " 1320487 | \n",
+ " Invitation portes ouvertes amitiés | \n",
+ " 988 | \n",
+ " 2022-09-29 18:01:33.834090+02:00 | \n",
+ " 2022-12-02 17:51:23.258324+01:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " False | \n",
+ " 9908279ebbf1f9b250ba689db6a0222b | \n",
+ " 2022-09-29 00:00:00+02:00 | \n",
+ "
\n",
+ " \n",
+ " 955 | \n",
+ " 906903 | \n",
+ " DDCP PROMO La méditerranée des philosophes #3 ... | \n",
+ " 310 | \n",
+ " 2021-07-19 14:07:16.177390+02:00 | \n",
+ " 2021-09-24 11:56:09.086101+02:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " False | \n",
+ " 06eb61b839a0cefee4967c67ccb099dc | \n",
+ " 2020-12-23 00:00:00+01:00 | \n",
+ "
\n",
+ " \n",
+ " 956 | \n",
+ " 579313 | \n",
+ " ddcp_promo_automation_manuel_pre_visit | \n",
+ " 481 | \n",
+ " 2021-06-08 17:38:54.041310+02:00 | \n",
+ " 2021-09-24 11:56:09.089394+02:00 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " False | \n",
+ " 9461cce28ebe3e76fb4b931c35a169b0 | \n",
+ " 2021-06-08 00:00:00+02:00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
957 rows × 11 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id name service_id \\\n",
+ "0 1319613 newsletter enseignants janvier 2022 721 \n",
+ "1 1319586 lsf_janvier_2022 717 \n",
+ "2 1319282 Invitation à déjeuner au Mucem | Vernissage « ... 591 \n",
+ "3 1319283 Vacances de la Toussaint - centres des loisirs 590 \n",
+ "4 1319636 ddcp_promo_md_livemag 730 \n",
+ ".. ... ... ... \n",
+ "952 1320072 dre_gaza0106 881 \n",
+ "953 661398 DDCP Plan Bis 4 - Marketing direct - MJ5C 183 \n",
+ "954 1320487 Invitation portes ouvertes amitiés 988 \n",
+ "955 906903 DDCP PROMO La méditerranée des philosophes #3 ... 310 \n",
+ "956 579313 ddcp_promo_automation_manuel_pre_visit 481 \n",
+ "\n",
+ " created_at updated_at \\\n",
+ "0 2022-01-14 16:06:42.586321+01:00 2022-02-03 14:17:27.112963+01:00 \n",
+ "1 2022-01-07 11:30:35.315895+01:00 2022-02-03 14:17:27.116171+01:00 \n",
+ "2 2021-09-28 12:50:24.448752+02:00 2022-02-03 14:17:27.119582+01:00 \n",
+ "3 2021-09-28 18:01:04.692073+02:00 2022-02-03 14:17:27.124408+01:00 \n",
+ "4 2022-01-27 18:00:41.053069+01:00 2022-02-03 14:17:27.127607+01:00 \n",
+ ".. ... ... \n",
+ "952 2022-05-26 09:01:35.523639+02:00 2022-12-02 17:51:22.614046+01:00 \n",
+ "953 2021-06-18 10:30:01.259578+02:00 2021-09-24 11:56:09.082785+02:00 \n",
+ "954 2022-09-29 18:01:33.834090+02:00 2022-12-02 17:51:23.258324+01:00 \n",
+ "955 2021-07-19 14:07:16.177390+02:00 2021-09-24 11:56:09.086101+02:00 \n",
+ "956 2021-06-08 17:38:54.041310+02:00 2021-09-24 11:56:09.089394+02:00 \n",
+ "\n",
+ " process_id report_url category to_be_synced \\\n",
+ "0 NaN NaN 0.0 False \n",
+ "1 NaN NaN 0.0 False \n",
+ "2 NaN NaN 0.0 False \n",
+ "3 NaN NaN 0.0 False \n",
+ "4 NaN NaN 0.0 False \n",
+ ".. ... ... ... ... \n",
+ "952 NaN NaN 0.0 False \n",
+ "953 NaN NaN 0.0 False \n",
+ "954 NaN NaN 0.0 False \n",
+ "955 NaN NaN 0.0 False \n",
+ "956 NaN NaN 0.0 False \n",
+ "\n",
+ " identifier sent_at \n",
+ "0 aba3b6fd5d186d28e06ff97135cade7f 2022-01-14 00:00:00+01:00 \n",
+ "1 788d986905533aba051261497ecffcbb 2022-01-07 00:00:00+01:00 \n",
+ "2 3493894fa4ea036cfc6433c3e2ee63b0 2021-09-28 00:00:00+02:00 \n",
+ "3 08b255a5d42b89b0585260b6f2360bdd 2021-09-28 00:00:00+02:00 \n",
+ "4 d5cfead94f5350c12c322b5b664544c1 2022-01-27 00:00:00+01:00 \n",
+ ".. ... ... \n",
+ "952 7504adad8bb96320eb3afdd4df6e1f60 2022-05-26 00:00:00+02:00 \n",
+ "953 cedebb6e872f539bef8c3f919874e9d7 2020-07-27 00:00:00+02:00 \n",
+ "954 9908279ebbf1f9b250ba689db6a0222b 2022-09-29 00:00:00+02:00 \n",
+ "955 06eb61b839a0cefee4967c67ccb099dc 2020-12-23 00:00:00+01:00 \n",
+ "956 9461cce28ebe3e76fb4b931c35a169b0 2021-06-08 00:00:00+02:00 \n",
+ "\n",
+ "[957 rows x 11 columns]"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_campaigns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "id": "b04f39e7-7d53-4734-b125-4dc1843172d6",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['bdc2324-data', '10', '10campaign_stats.csv']"
+ ]
+ },
+ "execution_count": 39,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "files_path_2[0].split(\"/\")[1]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 74,
+ "id": "d9bd97df-67bf-48ef-812a-975deb890163",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/tmp/ipykernel_521/1596461036.py:11: DtypeWarning: Columns (19,20,33,34,35,39) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+ " df = pd.read_csv(file_in)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Load every CSV of the files_path_2 listing into a dataframe named\n",
+ "# '<df_prefix><table>', e.g. 'bdc2324-data/10/10tickets.csv' -> df10_tickets.\n",
+ "\n",
+ "files_path = files_path_2\n",
+ "\n",
+ "# The second path component is the client folder, e.g. '10'\n",
+ "client_number = files_path[0].split(\"/\")[1]\n",
+ "df_prefix = \"df\" + str(client_number) + \"_\"\n",
+ "\n",
+ "# Compiled once outside the loop; group(3) is the table name that follows\n",
+ "# the numeric prefix of the file name\n",
+ "table_name_pattern = re.compile(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$')\n",
+ "\n",
+ "for current_path in files_path:\n",
+ "    name_match = table_name_pattern.search(current_path)\n",
+ "    if name_match is None:\n",
+ "        # A non-matching path used to fail with AttributeError on None.group(3);\n",
+ "        # report it and move on instead\n",
+ "        print(f\"skipped (unexpected file name) : {current_path}\")\n",
+ "        continue\n",
+ "    with fs.open(current_path, mode=\"rb\") as file_in:\n",
+ "        # low_memory=False infers each column dtype from the whole file,\n",
+ "        # which avoids the DtypeWarning about mixed-type columns\n",
+ "        df = pd.read_csv(file_in, low_memory=False)\n",
+ "    # the dataframe name follows the pattern df<client>_<table>\n",
+ "    nom_dataframe = df_prefix + name_match.group(3)\n",
+ "    # Later cells access the tables as plain variables (e.g. df10_tickets)\n",
+ "    globals()[nom_dataframe] = df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 78,
+ "id": "7f46e38e-413c-48cb-a171-eb6bc7219d9c",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "client number :10\n",
+ "prefix used : df10_\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(f\"client number :{client_number}\")\n",
+ "print(f\"prefix used : {df_prefix}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 79,
+ "id": "bdfd388c-7971-4f4d-99ef-c5b0435a4567",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['bdc2324-data/10/10campaign_stats.csv',\n",
+ " 'bdc2324-data/10/10campaigns.csv',\n",
+ " 'bdc2324-data/10/10categories.csv',\n",
+ " 'bdc2324-data/10/10countries.csv',\n",
+ " 'bdc2324-data/10/10currencies.csv',\n",
+ " 'bdc2324-data/10/10customer_target_mappings.csv',\n",
+ " 'bdc2324-data/10/10customersplus.csv',\n",
+ " 'bdc2324-data/10/10event_types.csv',\n",
+ " 'bdc2324-data/10/10events.csv',\n",
+ " 'bdc2324-data/10/10facilities.csv',\n",
+ " 'bdc2324-data/10/10link_stats.csv',\n",
+ " 'bdc2324-data/10/10pricing_formulas.csv',\n",
+ " 'bdc2324-data/10/10product_packs.csv',\n",
+ " 'bdc2324-data/10/10products.csv',\n",
+ " 'bdc2324-data/10/10products_groups.csv',\n",
+ " 'bdc2324-data/10/10purchases.csv',\n",
+ " 'bdc2324-data/10/10representation_category_capacities.csv',\n",
+ " 'bdc2324-data/10/10representation_types.csv',\n",
+ " 'bdc2324-data/10/10representations.csv',\n",
+ " 'bdc2324-data/10/10seasons.csv',\n",
+ " 'bdc2324-data/10/10suppliers.csv',\n",
+ " 'bdc2324-data/10/10tags.csv',\n",
+ " 'bdc2324-data/10/10target_types.csv',\n",
+ " 'bdc2324-data/10/10targets.csv',\n",
+ " 'bdc2324-data/10/10tickets.csv',\n",
+ " 'bdc2324-data/10/10type_of_pricing_formulas.csv',\n",
+ " 'bdc2324-data/10/10type_ofs.csv']"
+ ]
+ },
+ "execution_count": 79,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "files_path_2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 80,
+ "id": "e7bd02dc-1925-46ff-9d59-231d18f9f4f1",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " number | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " purchase_id | \n",
+ " product_id | \n",
+ " is_from_subscription | \n",
+ " type_of | \n",
+ " supplier_id | \n",
+ " barcode | \n",
+ " identifier | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1799177 | \n",
+ " 60_0_0_0_1_k-5 | \n",
+ " 2021-12-29 07:27:27.868513+01:00 | \n",
+ " 2021-12-29 07:27:27.868513+01:00 | \n",
+ " 409613 | \n",
+ " 321683 | \n",
+ " False | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " NaN | \n",
+ " 56c3db5a02c87af7e525676092cb7c4a | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1799178 | \n",
+ " 71_0_0_0_1_k-5 | \n",
+ " 2021-12-29 07:27:27.976380+01:00 | \n",
+ " 2021-12-29 07:27:27.976380+01:00 | \n",
+ " 409613 | \n",
+ " 321684 | \n",
+ " False | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " NaN | \n",
+ " 1ecad1dc6b42b4cdb75784dd9dcd9d5c | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 1799179 | \n",
+ " 93_0_0_0_1_k-5 | \n",
+ " 2021-12-29 07:27:27.978719+01:00 | \n",
+ " 2021-12-29 07:27:27.978719+01:00 | \n",
+ " 409613 | \n",
+ " 321685 | \n",
+ " False | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " NaN | \n",
+ " b3d207bdb47bcdb27a52f6bae0db7ec2 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1799180 | \n",
+ " 103_0_0_0_1_k-5 | \n",
+ " 2021-12-29 07:27:27.984621+01:00 | \n",
+ " 2021-12-29 07:27:27.984621+01:00 | \n",
+ " 409613 | \n",
+ " 321686 | \n",
+ " False | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " NaN | \n",
+ " 10df9591b617cc177516e9ddf91ddae3 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1799181 | \n",
+ " 107_0_3_2_1_h-1 | \n",
+ " 2021-12-29 07:27:27.988602+01:00 | \n",
+ " 2021-12-29 07:27:27.988602+01:00 | \n",
+ " 409613 | \n",
+ " 321687 | \n",
+ " False | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " NaN | \n",
+ " 3a8c7d5882fe9f20f0f59c8d90c9873c | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 492309 | \n",
+ " 3252232 | \n",
+ " 336359 | \n",
+ " 2023-03-10 01:31:52.543375+01:00 | \n",
+ " 2023-03-10 01:31:52.543375+01:00 | \n",
+ " 710062 | \n",
+ " 572547 | \n",
+ " False | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " NaN | \n",
+ " fc96f582931209501ed186d709664980 | \n",
+ "
\n",
+ " \n",
+ " 492310 | \n",
+ " 3252233 | \n",
+ " 336360 | \n",
+ " 2023-03-10 01:31:52.543869+01:00 | \n",
+ " 2023-03-10 01:31:52.543869+01:00 | \n",
+ " 710062 | \n",
+ " 572547 | \n",
+ " False | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " NaN | \n",
+ " d4ccfb00a9b22b62654bbf98b4d9a5a5 | \n",
+ "
\n",
+ " \n",
+ " 492311 | \n",
+ " 3252234 | \n",
+ " 336361 | \n",
+ " 2023-03-10 01:31:52.545783+01:00 | \n",
+ " 2023-03-10 01:31:52.545783+01:00 | \n",
+ " 710062 | \n",
+ " 572547 | \n",
+ " False | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " NaN | \n",
+ " d5f76662d6571b8eaceaf19c781fa514 | \n",
+ "
\n",
+ " \n",
+ " 492312 | \n",
+ " 3252235 | \n",
+ " 336362 | \n",
+ " 2023-03-10 01:31:52.547043+01:00 | \n",
+ " 2023-03-10 01:31:52.547043+01:00 | \n",
+ " 710062 | \n",
+ " 572547 | \n",
+ " False | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " NaN | \n",
+ " 093225db5cd5e06cc8e06242b4cbba37 | \n",
+ "
\n",
+ " \n",
+ " 492313 | \n",
+ " 3252236 | \n",
+ " 336363 | \n",
+ " 2023-03-10 01:31:52.548311+01:00 | \n",
+ " 2023-03-10 01:31:52.548311+01:00 | \n",
+ " 710062 | \n",
+ " 572547 | \n",
+ " False | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " NaN | \n",
+ " 9bace0d0cd7a5ec559aca8ac8bf67700 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
492314 rows × 11 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id number created_at \\\n",
+ "0 1799177 60_0_0_0_1_k-5 2021-12-29 07:27:27.868513+01:00 \n",
+ "1 1799178 71_0_0_0_1_k-5 2021-12-29 07:27:27.976380+01:00 \n",
+ "2 1799179 93_0_0_0_1_k-5 2021-12-29 07:27:27.978719+01:00 \n",
+ "3 1799180 103_0_0_0_1_k-5 2021-12-29 07:27:27.984621+01:00 \n",
+ "4 1799181 107_0_3_2_1_h-1 2021-12-29 07:27:27.988602+01:00 \n",
+ "... ... ... ... \n",
+ "492309 3252232 336359 2023-03-10 01:31:52.543375+01:00 \n",
+ "492310 3252233 336360 2023-03-10 01:31:52.543869+01:00 \n",
+ "492311 3252234 336361 2023-03-10 01:31:52.545783+01:00 \n",
+ "492312 3252235 336362 2023-03-10 01:31:52.547043+01:00 \n",
+ "492313 3252236 336363 2023-03-10 01:31:52.548311+01:00 \n",
+ "\n",
+ " updated_at purchase_id product_id \\\n",
+ "0 2021-12-29 07:27:27.868513+01:00 409613 321683 \n",
+ "1 2021-12-29 07:27:27.976380+01:00 409613 321684 \n",
+ "2 2021-12-29 07:27:27.978719+01:00 409613 321685 \n",
+ "3 2021-12-29 07:27:27.984621+01:00 409613 321686 \n",
+ "4 2021-12-29 07:27:27.988602+01:00 409613 321687 \n",
+ "... ... ... ... \n",
+ "492309 2023-03-10 01:31:52.543375+01:00 710062 572547 \n",
+ "492310 2023-03-10 01:31:52.543869+01:00 710062 572547 \n",
+ "492311 2023-03-10 01:31:52.545783+01:00 710062 572547 \n",
+ "492312 2023-03-10 01:31:52.547043+01:00 710062 572547 \n",
+ "492313 2023-03-10 01:31:52.548311+01:00 710062 572547 \n",
+ "\n",
+ " is_from_subscription type_of supplier_id barcode \\\n",
+ "0 False 1 2 NaN \n",
+ "1 False 1 2 NaN \n",
+ "2 False 1 2 NaN \n",
+ "3 False 1 2 NaN \n",
+ "4 False 1 2 NaN \n",
+ "... ... ... ... ... \n",
+ "492309 False 1 2 NaN \n",
+ "492310 False 1 2 NaN \n",
+ "492311 False 1 2 NaN \n",
+ "492312 False 1 2 NaN \n",
+ "492313 False 1 2 NaN \n",
+ "\n",
+ " identifier \n",
+ "0 56c3db5a02c87af7e525676092cb7c4a \n",
+ "1 1ecad1dc6b42b4cdb75784dd9dcd9d5c \n",
+ "2 b3d207bdb47bcdb27a52f6bae0db7ec2 \n",
+ "3 10df9591b617cc177516e9ddf91ddae3 \n",
+ "4 3a8c7d5882fe9f20f0f59c8d90c9873c \n",
+ "... ... \n",
+ "492309 fc96f582931209501ed186d709664980 \n",
+ "492310 d4ccfb00a9b22b62654bbf98b4d9a5a5 \n",
+ "492311 d5f76662d6571b8eaceaf19c781fa514 \n",
+ "492312 093225db5cd5e06cc8e06242b4cbba37 \n",
+ "492313 9bace0d0cd7a5ec559aca8ac8bf67700 \n",
+ "\n",
+ "[492314 rows x 11 columns]"
+ ]
+ },
+ "execution_count": 80,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Example: display the tickets table loaded for client 10\n",
+ "\n",
+ "df10_tickets"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "48ae6de5-2353-4fa8-a2a8-20da3b77e2ff",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'\\nfor i in range(len(files_path_1)) :\\n current_path = files_path_1[i]\\n nom_dataframe = \"df\" + re.search(r\\'/([^/]+)\\\\.csv$\\', current_path).group(1)\\n df = globals()[nom_dataframe]\\n print(nom_dataframe)\\n print(df.head(20))\\n'"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# loop to have a look at dataframes from file 1\n",
+ "\n",
+ "\"\"\"\n",
+ "for i in range(len(files_path_1)) :\n",
+ " current_path = files_path_1[i]\n",
+ " nom_dataframe = \"df\" + re.search(r'/([^/]+)\\.csv$', current_path).group(1)\n",
+ " df = globals()[nom_dataframe]\n",
+ " print(nom_dataframe)\n",
+ " print(df.head(20))\n",
+ "\"\"\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d72166db-dcef-45bd-9f8c-7cb2ee6bcbde",
+ "metadata": {},
+ "source": [
+ "## Beginning of the exploratory analysis of dataframes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 82,
+ "id": "17966ab2-9038-4dd6-a59c-7739ee05c964",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " lastname | \n",
+ " firstname | \n",
+ " birthdate | \n",
+ " email | \n",
+ " street_id | \n",
+ " created_at | \n",
+ " updated_at | \n",
+ " civility | \n",
+ " is_partner | \n",
+ " ... | \n",
+ " preferred_category | \n",
+ " preferred_supplier | \n",
+ " preferred_formula | \n",
+ " purchase_count | \n",
+ " first_buying_date | \n",
+ " last_visiting_date | \n",
+ " zipcode | \n",
+ " country | \n",
+ " age | \n",
+ " tenant_id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 821538 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " email821538 | \n",
+ " 139 | \n",
+ " 2023-07-14 11:43:34.261637+02:00 | \n",
+ " 2023-07-14 11:43:34.261637+02:00 | \n",
+ " NaN | \n",
+ " False | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 875 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 809126 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " email809126 | \n",
+ " 1063 | \n",
+ " 2023-05-04 17:17:24.456829+02:00 | \n",
+ " 2023-05-04 17:17:24.456829+02:00 | \n",
+ " NaN | \n",
+ " False | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " fr | \n",
+ " NaN | \n",
+ " 875 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 11005 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1063 | \n",
+ " 2017-07-06 03:01:57.242998+02:00 | \n",
+ " 2018-11-12 18:01:18.283492+01:00 | \n",
+ " NaN | \n",
+ " False | \n",
+ " ... | \n",
+ " zone tarif 1 | \n",
+ " NaN | \n",
+ " invite rp | \n",
+ " 14 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " fr | \n",
+ " NaN | \n",
+ " 875 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 17663 | \n",
+ " lastname17663 | \n",
+ " firstname17663 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 12731 | \n",
+ " 2018-09-23 02:39:17.778100+02:00 | \n",
+ " 2018-09-23 02:39:17.778100+02:00 | \n",
+ " NaN | \n",
+ " False | \n",
+ " ... | \n",
+ " zone tarif 1 | \n",
+ " NaN | \n",
+ " detaxe | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 44220 | \n",
+ " fr | \n",
+ " NaN | \n",
+ " 875 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 38100 | \n",
+ " lastname38100 | \n",
+ " firstname38100 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 12395 | \n",
+ " 2019-02-11 11:05:58.581121+01:00 | \n",
+ " 2022-12-06 23:15:33.485866+01:00 | \n",
+ " NaN | \n",
+ " False | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 44100 | \n",
+ " fr | \n",
+ " NaN | \n",
+ " 875 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 98789 | \n",
+ " 766266 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " email766266 | \n",
+ " 139 | \n",
+ " 2022-12-06 18:26:04.142337+01:00 | \n",
+ " 2023-05-03 18:01:01.799141+02:00 | \n",
+ " NaN | \n",
+ " False | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 875 | \n",
+ "
\n",
+ " \n",
+ " 98790 | \n",
+ " 766336 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " email766336 | \n",
+ " 139 | \n",
+ " 2022-12-06 18:28:49.139502+01:00 | \n",
+ " 2022-12-06 23:15:33.485866+01:00 | \n",
+ " NaN | \n",
+ " False | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 875 | \n",
+ "
\n",
+ " \n",
+ " 98791 | \n",
+ " 766348 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " email766348 | \n",
+ " 139 | \n",
+ " 2022-12-06 18:28:51.140745+01:00 | \n",
+ " 2022-12-06 23:15:33.485866+01:00 | \n",
+ " NaN | \n",
+ " False | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 875 | \n",
+ "
\n",
+ " \n",
+ " 98792 | \n",
+ " 766363 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " email766363 | \n",
+ " 139 | \n",
+ " 2022-12-06 18:29:44.081056+01:00 | \n",
+ " 2022-12-06 23:15:33.485866+01:00 | \n",
+ " NaN | \n",
+ " False | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 875 | \n",
+ "
\n",
+ " \n",
+ " 98793 | \n",
+ " 766366 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " email766366 | \n",
+ " 139 | \n",
+ " 2022-12-06 18:29:44.934174+01:00 | \n",
+ " 2022-12-06 23:15:33.485866+01:00 | \n",
+ " NaN | \n",
+ " False | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 875 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
98794 rows × 43 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id lastname firstname birthdate email \\\n",
+ "0 821538 NaN NaN NaN email821538 \n",
+ "1 809126 NaN NaN NaN email809126 \n",
+ "2 11005 NaN NaN NaN NaN \n",
+ "3 17663 lastname17663 firstname17663 NaN NaN \n",
+ "4 38100 lastname38100 firstname38100 NaN NaN \n",
+ "... ... ... ... ... ... \n",
+ "98789 766266 NaN NaN NaN email766266 \n",
+ "98790 766336 NaN NaN NaN email766336 \n",
+ "98791 766348 NaN NaN NaN email766348 \n",
+ "98792 766363 NaN NaN NaN email766363 \n",
+ "98793 766366 NaN NaN NaN email766366 \n",
+ "\n",
+ " street_id created_at \\\n",
+ "0 139 2023-07-14 11:43:34.261637+02:00 \n",
+ "1 1063 2023-05-04 17:17:24.456829+02:00 \n",
+ "2 1063 2017-07-06 03:01:57.242998+02:00 \n",
+ "3 12731 2018-09-23 02:39:17.778100+02:00 \n",
+ "4 12395 2019-02-11 11:05:58.581121+01:00 \n",
+ "... ... ... \n",
+ "98789 139 2022-12-06 18:26:04.142337+01:00 \n",
+ "98790 139 2022-12-06 18:28:49.139502+01:00 \n",
+ "98791 139 2022-12-06 18:28:51.140745+01:00 \n",
+ "98792 139 2022-12-06 18:29:44.081056+01:00 \n",
+ "98793 139 2022-12-06 18:29:44.934174+01:00 \n",
+ "\n",
+ " updated_at civility is_partner ... \\\n",
+ "0 2023-07-14 11:43:34.261637+02:00 NaN False ... \n",
+ "1 2023-05-04 17:17:24.456829+02:00 NaN False ... \n",
+ "2 2018-11-12 18:01:18.283492+01:00 NaN False ... \n",
+ "3 2018-09-23 02:39:17.778100+02:00 NaN False ... \n",
+ "4 2022-12-06 23:15:33.485866+01:00 NaN False ... \n",
+ "... ... ... ... ... \n",
+ "98789 2023-05-03 18:01:01.799141+02:00 NaN False ... \n",
+ "98790 2022-12-06 23:15:33.485866+01:00 NaN False ... \n",
+ "98791 2022-12-06 23:15:33.485866+01:00 NaN False ... \n",
+ "98792 2022-12-06 23:15:33.485866+01:00 NaN False ... \n",
+ "98793 2022-12-06 23:15:33.485866+01:00 NaN False ... \n",
+ "\n",
+ " preferred_category preferred_supplier preferred_formula \\\n",
+ "0 NaN NaN NaN \n",
+ "1 NaN NaN NaN \n",
+ "2 zone tarif 1 NaN invite rp \n",
+ "3 zone tarif 1 NaN detaxe \n",
+ "4 NaN NaN NaN \n",
+ "... ... ... ... \n",
+ "98789 NaN NaN NaN \n",
+ "98790 NaN NaN NaN \n",
+ "98791 NaN NaN NaN \n",
+ "98792 NaN NaN NaN \n",
+ "98793 NaN NaN NaN \n",
+ "\n",
+ " purchase_count first_buying_date last_visiting_date zipcode country \\\n",
+ "0 0 NaN NaN NaN NaN \n",
+ "1 0 NaN NaN NaN fr \n",
+ "2 14 NaN NaN NaN fr \n",
+ "3 1 NaN NaN 44220 fr \n",
+ "4 1 NaN NaN 44100 fr \n",
+ "... ... ... ... ... ... \n",
+ "98789 0 NaN NaN NaN NaN \n",
+ "98790 0 NaN NaN NaN NaN \n",
+ "98791 0 NaN NaN NaN NaN \n",
+ "98792 0 NaN NaN NaN NaN \n",
+ "98793 0 NaN NaN NaN NaN \n",
+ "\n",
+ " age tenant_id \n",
+ "0 NaN 875 \n",
+ "1 NaN 875 \n",
+ "2 NaN 875 \n",
+ "3 NaN 875 \n",
+ "4 NaN 875 \n",
+ "... ... ... \n",
+ "98789 NaN 875 \n",
+ "98790 NaN 875 \n",
+ "98791 NaN 875 \n",
+ "98792 NaN 875 \n",
+ "98793 NaN 875 \n",
+ "\n",
+ "[98794 rows x 43 columns]"
+ ]
+ },
+ "execution_count": 82,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# The loading loop builds names as df_prefix + table name, so the file\n",
+ "# '10customersplus.csv' is stored as df10_customersplus\n",
+ "df10_customersplus"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "932812b1-7a24-4f2d-ae48-7fe8e06b9f62",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# How many missing values are there?\n",
+ "\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}