From 31dd436432ba9afa541e0f5b8d6aa9cec72d7606 Mon Sep 17 00:00:00 2001 From: tpique-ensae Date: Thu, 11 Jan 2024 16:33:11 +0000 Subject: [PATCH] thomas first commit --- TP_access_merge_data.ipynb | 1215 ++++++++++++++++++++++++++++++++++++ 1 file changed, 1215 insertions(+) create mode 100644 TP_access_merge_data.ipynb diff --git a/TP_access_merge_data.ipynb b/TP_access_merge_data.ipynb new file mode 100644 index 0000000..c8adbd5 --- /dev/null +++ b/TP_access_merge_data.ipynb @@ -0,0 +1,1215 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "5ce2ffc5-66b6-4709-9e2c-7a50f49d1361", + "metadata": {}, + "outputs": [], + "source": [ + "# test\n", + "\n", + "import os \n", + "import s3fs\n", + "import pandas as pd\n", + "import re" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "f579ff01-f009-4fb1-ba79-0cb3ce58ab7f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['bdc2324-data/1',\n", + " 'bdc2324-data/10',\n", + " 'bdc2324-data/101',\n", + " 'bdc2324-data/11',\n", + " 'bdc2324-data/12',\n", + " 'bdc2324-data/13',\n", + " 'bdc2324-data/14',\n", + " 'bdc2324-data/2',\n", + " 'bdc2324-data/3',\n", + " 'bdc2324-data/4',\n", + " 'bdc2324-data/5',\n", + " 'bdc2324-data/6',\n", + " 'bdc2324-data/7',\n", + " 'bdc2324-data/8',\n", + " 'bdc2324-data/9']" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", + "\n", + "fs = s3fs.S3FileSystem(client_kwargs = {\"endpoint_url\" : S3_ENDPOINT_URL})\n", + "BUCKET = \"bdc2324-data\"\n", + "fs.ls(BUCKET)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "c8b2c797-271f-43ee-8823-d0aee5b8782d", + "metadata": {}, + "outputs": [], + "source": [ + "FILE_PATH_S3 = fs.ls(BUCKET)[1] # +\".csv\"\n", + "files_path_2 = fs.ls(FILE_PATH_S3)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "18cee687-1462-4169-9bfe-f39786135cdd", + "metadata": {}, + "outputs": [], + "source": [ + "with fs.open(files_path_1[1], mode=\"rb\") as file_in:\n", + " # print(file_in)\n", + " df_campaigns = pd.read_csv(file_in)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "33e8d14c-c649-4b9c-8290-4a2aa635f999", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnameservice_idcreated_atupdated_atprocess_idreport_urlcategoryto_be_syncedidentifiersent_at
01319613newsletter enseignants janvier 20227212022-01-14 16:06:42.586321+01:002022-02-03 14:17:27.112963+01:00NaNNaN0.0Falseaba3b6fd5d186d28e06ff97135cade7f2022-01-14 00:00:00+01:00
11319586lsf_janvier_20227172022-01-07 11:30:35.315895+01:002022-02-03 14:17:27.116171+01:00NaNNaN0.0False788d986905533aba051261497ecffcbb2022-01-07 00:00:00+01:00
21319282Invitation à déjeuner au Mucem | Vernissage « ...5912021-09-28 12:50:24.448752+02:002022-02-03 14:17:27.119582+01:00NaNNaN0.0False3493894fa4ea036cfc6433c3e2ee63b02021-09-28 00:00:00+02:00
31319283Vacances de la Toussaint - centres des loisirs5902021-09-28 18:01:04.692073+02:002022-02-03 14:17:27.124408+01:00NaNNaN0.0False08b255a5d42b89b0585260b6f2360bdd2021-09-28 00:00:00+02:00
41319636ddcp_promo_md_livemag7302022-01-27 18:00:41.053069+01:002022-02-03 14:17:27.127607+01:00NaNNaN0.0Falsed5cfead94f5350c12c322b5b664544c12022-01-27 00:00:00+01:00
....................................
9521320072dre_gaza01068812022-05-26 09:01:35.523639+02:002022-12-02 17:51:22.614046+01:00NaNNaN0.0False7504adad8bb96320eb3afdd4df6e1f602022-05-26 00:00:00+02:00
953661398DDCP Plan Bis 4 - Marketing direct - MJ5C1832021-06-18 10:30:01.259578+02:002021-09-24 11:56:09.082785+02:00NaNNaN0.0Falsecedebb6e872f539bef8c3f919874e9d72020-07-27 00:00:00+02:00
9541320487Invitation portes ouvertes amitiés9882022-09-29 18:01:33.834090+02:002022-12-02 17:51:23.258324+01:00NaNNaN0.0False9908279ebbf1f9b250ba689db6a0222b2022-09-29 00:00:00+02:00
955906903DDCP PROMO La méditerranée des philosophes #3 ...3102021-07-19 14:07:16.177390+02:002021-09-24 11:56:09.086101+02:00NaNNaN0.0False06eb61b839a0cefee4967c67ccb099dc2020-12-23 00:00:00+01:00
956579313ddcp_promo_automation_manuel_pre_visit4812021-06-08 17:38:54.041310+02:002021-09-24 11:56:09.089394+02:00NaNNaN0.0False9461cce28ebe3e76fb4b931c35a169b02021-06-08 00:00:00+02:00
\n", + "

957 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " id name service_id \\\n", + "0 1319613 newsletter enseignants janvier 2022 721 \n", + "1 1319586 lsf_janvier_2022 717 \n", + "2 1319282 Invitation à déjeuner au Mucem | Vernissage « ... 591 \n", + "3 1319283 Vacances de la Toussaint - centres des loisirs 590 \n", + "4 1319636 ddcp_promo_md_livemag 730 \n", + ".. ... ... ... \n", + "952 1320072 dre_gaza0106 881 \n", + "953 661398 DDCP Plan Bis 4 - Marketing direct - MJ5C 183 \n", + "954 1320487 Invitation portes ouvertes amitiés 988 \n", + "955 906903 DDCP PROMO La méditerranée des philosophes #3 ... 310 \n", + "956 579313 ddcp_promo_automation_manuel_pre_visit 481 \n", + "\n", + " created_at updated_at \\\n", + "0 2022-01-14 16:06:42.586321+01:00 2022-02-03 14:17:27.112963+01:00 \n", + "1 2022-01-07 11:30:35.315895+01:00 2022-02-03 14:17:27.116171+01:00 \n", + "2 2021-09-28 12:50:24.448752+02:00 2022-02-03 14:17:27.119582+01:00 \n", + "3 2021-09-28 18:01:04.692073+02:00 2022-02-03 14:17:27.124408+01:00 \n", + "4 2022-01-27 18:00:41.053069+01:00 2022-02-03 14:17:27.127607+01:00 \n", + ".. ... ... \n", + "952 2022-05-26 09:01:35.523639+02:00 2022-12-02 17:51:22.614046+01:00 \n", + "953 2021-06-18 10:30:01.259578+02:00 2021-09-24 11:56:09.082785+02:00 \n", + "954 2022-09-29 18:01:33.834090+02:00 2022-12-02 17:51:23.258324+01:00 \n", + "955 2021-07-19 14:07:16.177390+02:00 2021-09-24 11:56:09.086101+02:00 \n", + "956 2021-06-08 17:38:54.041310+02:00 2021-09-24 11:56:09.089394+02:00 \n", + "\n", + " process_id report_url category to_be_synced \\\n", + "0 NaN NaN 0.0 False \n", + "1 NaN NaN 0.0 False \n", + "2 NaN NaN 0.0 False \n", + "3 NaN NaN 0.0 False \n", + "4 NaN NaN 0.0 False \n", + ".. ... ... ... ... \n", + "952 NaN NaN 0.0 False \n", + "953 NaN NaN 0.0 False \n", + "954 NaN NaN 0.0 False \n", + "955 NaN NaN 0.0 False \n", + "956 NaN NaN 0.0 False \n", + "\n", + " identifier sent_at \n", + "0 aba3b6fd5d186d28e06ff97135cade7f 2022-01-14 00:00:00+01:00 \n", + "1 788d986905533aba051261497ecffcbb 2022-01-07 00:00:00+01:00 \n", + "2 3493894fa4ea036cfc6433c3e2ee63b0 2021-09-28 00:00:00+02:00 \n", + "3 08b255a5d42b89b0585260b6f2360bdd 2021-09-28 00:00:00+02:00 \n", + "4 d5cfead94f5350c12c322b5b664544c1 2022-01-27 00:00:00+01:00 \n", + ".. ... ... \n", + "952 7504adad8bb96320eb3afdd4df6e1f60 2022-05-26 00:00:00+02:00 \n", + "953 cedebb6e872f539bef8c3f919874e9d7 2020-07-27 00:00:00+02:00 \n", + "954 9908279ebbf1f9b250ba689db6a0222b 2022-09-29 00:00:00+02:00 \n", + "955 06eb61b839a0cefee4967c67ccb099dc 2020-12-23 00:00:00+01:00 \n", + "956 9461cce28ebe3e76fb4b931c35a169b0 2021-06-08 00:00:00+02:00 \n", + "\n", + "[957 rows x 11 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_campaigns" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "b04f39e7-7d53-4734-b125-4dc1843172d6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['bdc2324-data', '10', '10campaign_stats.csv']" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "files_path_2[0].split(\"/\")[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "d9bd97df-67bf-48ef-812a-975deb890163", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_521/1596461036.py:11: DtypeWarning: Columns (19,20,33,34,35,39) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " df = pd.read_csv(file_in)\n" + ] + } + ], + "source": [ + "# loop to create dataframes from file 2\n", + "\n", + "files_path = files_path_2\n", + "\n", + "client_number = files_path[0].split(\"/\")[1]\n", + "df_prefix = \"df\" + str(client_number) + \"_\"\n", + "\n", + "for i in range(len(files_path)) :\n", + " current_path = files_path[i]\n", + " with fs.open(current_path, mode=\"rb\") as file_in:\n", + " df = pd.read_csv(file_in)\n", + " # the pattern of the name is df1xxx\n", + " nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n", + " globals()[nom_dataframe] = df" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "7f46e38e-413c-48cb-a171-eb6bc7219d9c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "client number :10\n", + "prefix used : df10_\n" + ] + } + ], + "source": [ + "print(f\"client number :{client_number}\")\n", + "print(f\"prefix used : {df_prefix}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "id": "bdfd388c-7971-4f4d-99ef-c5b0435a4567", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['bdc2324-data/10/10campaign_stats.csv',\n", + " 'bdc2324-data/10/10campaigns.csv',\n", + " 'bdc2324-data/10/10categories.csv',\n", + " 'bdc2324-data/10/10countries.csv',\n", + " 'bdc2324-data/10/10currencies.csv',\n", + " 'bdc2324-data/10/10customer_target_mappings.csv',\n", + " 'bdc2324-data/10/10customersplus.csv',\n", + " 'bdc2324-data/10/10event_types.csv',\n", + " 'bdc2324-data/10/10events.csv',\n", + " 'bdc2324-data/10/10facilities.csv',\n", + " 'bdc2324-data/10/10link_stats.csv',\n", + " 'bdc2324-data/10/10pricing_formulas.csv',\n", + " 'bdc2324-data/10/10product_packs.csv',\n", + " 'bdc2324-data/10/10products.csv',\n", + " 'bdc2324-data/10/10products_groups.csv',\n", + " 'bdc2324-data/10/10purchases.csv',\n", + " 'bdc2324-data/10/10representation_category_capacities.csv',\n", + " 'bdc2324-data/10/10representation_types.csv',\n", + " 'bdc2324-data/10/10representations.csv',\n", + " 'bdc2324-data/10/10seasons.csv',\n", + " 'bdc2324-data/10/10suppliers.csv',\n", + " 'bdc2324-data/10/10tags.csv',\n", + " 'bdc2324-data/10/10target_types.csv',\n", + " 'bdc2324-data/10/10targets.csv',\n", + " 'bdc2324-data/10/10tickets.csv',\n", + " 'bdc2324-data/10/10type_of_pricing_formulas.csv',\n", + " 'bdc2324-data/10/10type_ofs.csv']" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "files_path_2" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "id": "e7bd02dc-1925-46ff-9d59-231d18f9f4f1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnumbercreated_atupdated_atpurchase_idproduct_idis_from_subscriptiontype_ofsupplier_idbarcodeidentifier
0179917760_0_0_0_1_k-52021-12-29 07:27:27.868513+01:002021-12-29 07:27:27.868513+01:00409613321683False12NaN56c3db5a02c87af7e525676092cb7c4a
1179917871_0_0_0_1_k-52021-12-29 07:27:27.976380+01:002021-12-29 07:27:27.976380+01:00409613321684False12NaN1ecad1dc6b42b4cdb75784dd9dcd9d5c
2179917993_0_0_0_1_k-52021-12-29 07:27:27.978719+01:002021-12-29 07:27:27.978719+01:00409613321685False12NaNb3d207bdb47bcdb27a52f6bae0db7ec2
31799180103_0_0_0_1_k-52021-12-29 07:27:27.984621+01:002021-12-29 07:27:27.984621+01:00409613321686False12NaN10df9591b617cc177516e9ddf91ddae3
41799181107_0_3_2_1_h-12021-12-29 07:27:27.988602+01:002021-12-29 07:27:27.988602+01:00409613321687False12NaN3a8c7d5882fe9f20f0f59c8d90c9873c
....................................
49230932522323363592023-03-10 01:31:52.543375+01:002023-03-10 01:31:52.543375+01:00710062572547False12NaNfc96f582931209501ed186d709664980
49231032522333363602023-03-10 01:31:52.543869+01:002023-03-10 01:31:52.543869+01:00710062572547False12NaNd4ccfb00a9b22b62654bbf98b4d9a5a5
49231132522343363612023-03-10 01:31:52.545783+01:002023-03-10 01:31:52.545783+01:00710062572547False12NaNd5f76662d6571b8eaceaf19c781fa514
49231232522353363622023-03-10 01:31:52.547043+01:002023-03-10 01:31:52.547043+01:00710062572547False12NaN093225db5cd5e06cc8e06242b4cbba37
49231332522363363632023-03-10 01:31:52.548311+01:002023-03-10 01:31:52.548311+01:00710062572547False12NaN9bace0d0cd7a5ec559aca8ac8bf67700
\n", + "

492314 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " id number created_at \\\n", + "0 1799177 60_0_0_0_1_k-5 2021-12-29 07:27:27.868513+01:00 \n", + "1 1799178 71_0_0_0_1_k-5 2021-12-29 07:27:27.976380+01:00 \n", + "2 1799179 93_0_0_0_1_k-5 2021-12-29 07:27:27.978719+01:00 \n", + "3 1799180 103_0_0_0_1_k-5 2021-12-29 07:27:27.984621+01:00 \n", + "4 1799181 107_0_3_2_1_h-1 2021-12-29 07:27:27.988602+01:00 \n", + "... ... ... ... \n", + "492309 3252232 336359 2023-03-10 01:31:52.543375+01:00 \n", + "492310 3252233 336360 2023-03-10 01:31:52.543869+01:00 \n", + "492311 3252234 336361 2023-03-10 01:31:52.545783+01:00 \n", + "492312 3252235 336362 2023-03-10 01:31:52.547043+01:00 \n", + "492313 3252236 336363 2023-03-10 01:31:52.548311+01:00 \n", + "\n", + " updated_at purchase_id product_id \\\n", + "0 2021-12-29 07:27:27.868513+01:00 409613 321683 \n", + "1 2021-12-29 07:27:27.976380+01:00 409613 321684 \n", + "2 2021-12-29 07:27:27.978719+01:00 409613 321685 \n", + "3 2021-12-29 07:27:27.984621+01:00 409613 321686 \n", + "4 2021-12-29 07:27:27.988602+01:00 409613 321687 \n", + "... ... ... ... \n", + "492309 2023-03-10 01:31:52.543375+01:00 710062 572547 \n", + "492310 2023-03-10 01:31:52.543869+01:00 710062 572547 \n", + "492311 2023-03-10 01:31:52.545783+01:00 710062 572547 \n", + "492312 2023-03-10 01:31:52.547043+01:00 710062 572547 \n", + "492313 2023-03-10 01:31:52.548311+01:00 710062 572547 \n", + "\n", + " is_from_subscription type_of supplier_id barcode \\\n", + "0 False 1 2 NaN \n", + "1 False 1 2 NaN \n", + "2 False 1 2 NaN \n", + "3 False 1 2 NaN \n", + "4 False 1 2 NaN \n", + "... ... ... ... ... \n", + "492309 False 1 2 NaN \n", + "492310 False 1 2 NaN \n", + "492311 False 1 2 NaN \n", + "492312 False 1 2 NaN \n", + "492313 False 1 2 NaN \n", + "\n", + " identifier \n", + "0 56c3db5a02c87af7e525676092cb7c4a \n", + "1 1ecad1dc6b42b4cdb75784dd9dcd9d5c \n", + "2 b3d207bdb47bcdb27a52f6bae0db7ec2 \n", + "3 10df9591b617cc177516e9ddf91ddae3 \n", + "4 3a8c7d5882fe9f20f0f59c8d90c9873c \n", + "... ... \n", + "492309 fc96f582931209501ed186d709664980 \n", + "492310 d4ccfb00a9b22b62654bbf98b4d9a5a5 \n", + "492311 d5f76662d6571b8eaceaf19c781fa514 \n", + "492312 093225db5cd5e06cc8e06242b4cbba37 \n", + "492313 9bace0d0cd7a5ec559aca8ac8bf67700 \n", + "\n", + "[492314 rows x 11 columns]" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# example : get the table \n", + "\n", + "df10_tickets" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "48ae6de5-2353-4fa8-a2a8-20da3b77e2ff", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'\\nfor i in range(len(files_path_1)) :\\n current_path = files_path_1[i]\\n nom_dataframe = \"df\" + re.search(r\\'/([^/]+)\\\\.csv$\\', current_path).group(1)\\n df = globals()[nom_dataframe]\\n print(nom_dataframe)\\n print(df.head(20))\\n'" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# loop to have a look at dataframes from file 1\n", + "\n", + "\"\"\"\n", + "for i in range(len(files_path_1)) :\n", + " current_path = files_path_1[i]\n", + " nom_dataframe = \"df\" + re.search(r'/([^/]+)\\.csv$', current_path).group(1)\n", + " df = globals()[nom_dataframe]\n", + " print(nom_dataframe)\n", + " print(df.head(20))\n", + "\"\"\"" + ] + }, + { + "cell_type": "markdown", + "id": "d72166db-dcef-45bd-9f8c-7cb2ee6bcbde", + "metadata": {}, + "source": [ + "## Beginning of the exploratory analysis of dataframes" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "id": "17966ab2-9038-4dd6-a59c-7739ee05c964", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idlastnamefirstnamebirthdateemailstreet_idcreated_atupdated_atcivilityis_partner...preferred_categorypreferred_supplierpreferred_formulapurchase_countfirst_buying_datelast_visiting_datezipcodecountryagetenant_id
0821538NaNNaNNaNemail8215381392023-07-14 11:43:34.261637+02:002023-07-14 11:43:34.261637+02:00NaNFalse...NaNNaNNaN0NaNNaNNaNNaNNaN875
1809126NaNNaNNaNemail80912610632023-05-04 17:17:24.456829+02:002023-05-04 17:17:24.456829+02:00NaNFalse...NaNNaNNaN0NaNNaNNaNfrNaN875
211005NaNNaNNaNNaN10632017-07-06 03:01:57.242998+02:002018-11-12 18:01:18.283492+01:00NaNFalse...zone tarif 1NaNinvite rp14NaNNaNNaNfrNaN875
317663lastname17663firstname17663NaNNaN127312018-09-23 02:39:17.778100+02:002018-09-23 02:39:17.778100+02:00NaNFalse...zone tarif 1NaNdetaxe1NaNNaN44220frNaN875
438100lastname38100firstname38100NaNNaN123952019-02-11 11:05:58.581121+01:002022-12-06 23:15:33.485866+01:00NaNFalse...NaNNaNNaN1NaNNaN44100frNaN875
..................................................................
98789766266NaNNaNNaNemail7662661392022-12-06 18:26:04.142337+01:002023-05-03 18:01:01.799141+02:00NaNFalse...NaNNaNNaN0NaNNaNNaNNaNNaN875
98790766336NaNNaNNaNemail7663361392022-12-06 18:28:49.139502+01:002022-12-06 23:15:33.485866+01:00NaNFalse...NaNNaNNaN0NaNNaNNaNNaNNaN875
98791766348NaNNaNNaNemail7663481392022-12-06 18:28:51.140745+01:002022-12-06 23:15:33.485866+01:00NaNFalse...NaNNaNNaN0NaNNaNNaNNaNNaN875
98792766363NaNNaNNaNemail7663631392022-12-06 18:29:44.081056+01:002022-12-06 23:15:33.485866+01:00NaNFalse...NaNNaNNaN0NaNNaNNaNNaNNaN875
98793766366NaNNaNNaNemail7663661392022-12-06 18:29:44.934174+01:002022-12-06 23:15:33.485866+01:00NaNFalse...NaNNaNNaN0NaNNaNNaNNaNNaN875
\n", + "

98794 rows × 43 columns

\n", + "
" + ], + "text/plain": [ + " id lastname firstname birthdate email \\\n", + "0 821538 NaN NaN NaN email821538 \n", + "1 809126 NaN NaN NaN email809126 \n", + "2 11005 NaN NaN NaN NaN \n", + "3 17663 lastname17663 firstname17663 NaN NaN \n", + "4 38100 lastname38100 firstname38100 NaN NaN \n", + "... ... ... ... ... ... \n", + "98789 766266 NaN NaN NaN email766266 \n", + "98790 766336 NaN NaN NaN email766336 \n", + "98791 766348 NaN NaN NaN email766348 \n", + "98792 766363 NaN NaN NaN email766363 \n", + "98793 766366 NaN NaN NaN email766366 \n", + "\n", + " street_id created_at \\\n", + "0 139 2023-07-14 11:43:34.261637+02:00 \n", + "1 1063 2023-05-04 17:17:24.456829+02:00 \n", + "2 1063 2017-07-06 03:01:57.242998+02:00 \n", + "3 12731 2018-09-23 02:39:17.778100+02:00 \n", + "4 12395 2019-02-11 11:05:58.581121+01:00 \n", + "... ... ... \n", + "98789 139 2022-12-06 18:26:04.142337+01:00 \n", + "98790 139 2022-12-06 18:28:49.139502+01:00 \n", + "98791 139 2022-12-06 18:28:51.140745+01:00 \n", + "98792 139 2022-12-06 18:29:44.081056+01:00 \n", + "98793 139 2022-12-06 18:29:44.934174+01:00 \n", + "\n", + " updated_at civility is_partner ... \\\n", + "0 2023-07-14 11:43:34.261637+02:00 NaN False ... \n", + "1 2023-05-04 17:17:24.456829+02:00 NaN False ... \n", + "2 2018-11-12 18:01:18.283492+01:00 NaN False ... \n", + "3 2018-09-23 02:39:17.778100+02:00 NaN False ... \n", + "4 2022-12-06 23:15:33.485866+01:00 NaN False ... \n", + "... ... ... ... ... \n", + "98789 2023-05-03 18:01:01.799141+02:00 NaN False ... \n", + "98790 2022-12-06 23:15:33.485866+01:00 NaN False ... \n", + "98791 2022-12-06 23:15:33.485866+01:00 NaN False ... \n", + "98792 2022-12-06 23:15:33.485866+01:00 NaN False ... \n", + "98793 2022-12-06 23:15:33.485866+01:00 NaN False ... \n", + "\n", + " preferred_category preferred_supplier preferred_formula \\\n", + "0 NaN NaN NaN \n", + "1 NaN NaN NaN \n", + "2 zone tarif 1 NaN invite rp \n", + "3 zone tarif 1 NaN detaxe \n", + "4 NaN NaN NaN \n", + "... ... ... ... \n", + "98789 NaN NaN NaN \n", + "98790 NaN NaN NaN \n", + "98791 NaN NaN NaN \n", + "98792 NaN NaN NaN \n", + "98793 NaN NaN NaN \n", + "\n", + " purchase_count first_buying_date last_visiting_date zipcode country \\\n", + "0 0 NaN NaN NaN NaN \n", + "1 0 NaN NaN NaN fr \n", + "2 14 NaN NaN NaN fr \n", + "3 1 NaN NaN 44220 fr \n", + "4 1 NaN NaN 44100 fr \n", + "... ... ... ... ... ... \n", + "98789 0 NaN NaN NaN NaN \n", + "98790 0 NaN NaN NaN NaN \n", + "98791 0 NaN NaN NaN NaN \n", + "98792 0 NaN NaN NaN NaN \n", + "98793 0 NaN NaN NaN NaN \n", + "\n", + " age tenant_id \n", + "0 NaN 875 \n", + "1 NaN 875 \n", + "2 NaN 875 \n", + "3 NaN 875 \n", + "4 NaN 875 \n", + "... ... ... \n", + "98789 NaN 875 \n", + "98790 NaN 875 \n", + "98791 NaN 875 \n", + "98792 NaN 875 \n", + "98793 NaN 875 \n", + "\n", + "[98794 rows x 43 columns]" + ] + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df10_0customersplus" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "932812b1-7a24-4f2d-ae48-7fe8e06b9f62", + "metadata": {}, + "outputs": [], + "source": [ + "# how many missing values ?\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}