{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "5ce2ffc5-66b6-4709-9e2c-7a50f49d1361", "metadata": {}, "outputs": [], "source": [ "# test\n", "\n", "import os \n", "import s3fs\n", "import pandas as pd\n", "import re" ] }, { "cell_type": "code", "execution_count": 35, "id": "f579ff01-f009-4fb1-ba79-0cb3ce58ab7f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['bdc2324-data/1',\n", " 'bdc2324-data/10',\n", " 'bdc2324-data/101',\n", " 'bdc2324-data/11',\n", " 'bdc2324-data/12',\n", " 'bdc2324-data/13',\n", " 'bdc2324-data/14',\n", " 'bdc2324-data/2',\n", " 'bdc2324-data/3',\n", " 'bdc2324-data/4',\n", " 'bdc2324-data/5',\n", " 'bdc2324-data/6',\n", " 'bdc2324-data/7',\n", " 'bdc2324-data/8',\n", " 'bdc2324-data/9']" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", "\n", "fs = s3fs.S3FileSystem(client_kwargs = {\"endpoint_url\" : S3_ENDPOINT_URL})\n", "BUCKET = \"bdc2324-data\"\n", "fs.ls(BUCKET)" ] }, { "cell_type": "code", "execution_count": 27, "id": "c8b2c797-271f-43ee-8823-d0aee5b8782d", "metadata": {}, "outputs": [], "source": [
 "# NOTE: relies on the order of the bucket listing; index 1 is 'bdc2324-data/10'\n",
 "# in the listing displayed by the previous cell.\n",
 "FILE_PATH_S3 = fs.ls(BUCKET)[1] # +\".csv\"\n",
 "files_path_2 = fs.ls(FILE_PATH_S3)\n",
 "\n"
 ] }, { "cell_type": "code", "execution_count": 4, "id": "18cee687-1462-4169-9bfe-f39786135cdd", "metadata": {}, "outputs": [], "source": [
 "# Read the campaigns table of the selected client folder.\n",
 "# files_path_2[1] is '.../10/10campaigns.csv' in the saved listing of that folder.\n",
 "# (this cell used `files_path_1`, which is not defined anywhere in the notebook\n",
 "# and raised a NameError on a fresh kernel)\n",
 "with fs.open(files_path_2[1], mode=\"rb\") as file_in:\n",
 "    df_campaigns = pd.read_csv(file_in)"
 ] }, { "cell_type": "code", "execution_count": 5, "id": "33e8d14c-c649-4b9c-8290-4a2aa635f999", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnameservice_idcreated_atupdated_atprocess_idreport_urlcategoryto_be_syncedidentifiersent_at
01319613newsletter enseignants janvier 20227212022-01-14 16:06:42.586321+01:002022-02-03 14:17:27.112963+01:00NaNNaN0.0Falseaba3b6fd5d186d28e06ff97135cade7f2022-01-14 00:00:00+01:00
11319586lsf_janvier_20227172022-01-07 11:30:35.315895+01:002022-02-03 14:17:27.116171+01:00NaNNaN0.0False788d986905533aba051261497ecffcbb2022-01-07 00:00:00+01:00
21319282Invitation à déjeuner au Mucem | Vernissage « ...5912021-09-28 12:50:24.448752+02:002022-02-03 14:17:27.119582+01:00NaNNaN0.0False3493894fa4ea036cfc6433c3e2ee63b02021-09-28 00:00:00+02:00
31319283Vacances de la Toussaint - centres des loisirs5902021-09-28 18:01:04.692073+02:002022-02-03 14:17:27.124408+01:00NaNNaN0.0False08b255a5d42b89b0585260b6f2360bdd2021-09-28 00:00:00+02:00
41319636ddcp_promo_md_livemag7302022-01-27 18:00:41.053069+01:002022-02-03 14:17:27.127607+01:00NaNNaN0.0Falsed5cfead94f5350c12c322b5b664544c12022-01-27 00:00:00+01:00
....................................
9521320072dre_gaza01068812022-05-26 09:01:35.523639+02:002022-12-02 17:51:22.614046+01:00NaNNaN0.0False7504adad8bb96320eb3afdd4df6e1f602022-05-26 00:00:00+02:00
953661398DDCP Plan Bis 4 - Marketing direct - MJ5C1832021-06-18 10:30:01.259578+02:002021-09-24 11:56:09.082785+02:00NaNNaN0.0Falsecedebb6e872f539bef8c3f919874e9d72020-07-27 00:00:00+02:00
9541320487Invitation portes ouvertes amitiés9882022-09-29 18:01:33.834090+02:002022-12-02 17:51:23.258324+01:00NaNNaN0.0False9908279ebbf1f9b250ba689db6a0222b2022-09-29 00:00:00+02:00
955906903DDCP PROMO La méditerranée des philosophes #3 ...3102021-07-19 14:07:16.177390+02:002021-09-24 11:56:09.086101+02:00NaNNaN0.0False06eb61b839a0cefee4967c67ccb099dc2020-12-23 00:00:00+01:00
956579313ddcp_promo_automation_manuel_pre_visit4812021-06-08 17:38:54.041310+02:002021-09-24 11:56:09.089394+02:00NaNNaN0.0False9461cce28ebe3e76fb4b931c35a169b02021-06-08 00:00:00+02:00
\n", "

957 rows × 11 columns

\n", "
" ], "text/plain": [ " id name service_id \\\n", "0 1319613 newsletter enseignants janvier 2022 721 \n", "1 1319586 lsf_janvier_2022 717 \n", "2 1319282 Invitation à déjeuner au Mucem | Vernissage « ... 591 \n", "3 1319283 Vacances de la Toussaint - centres des loisirs 590 \n", "4 1319636 ddcp_promo_md_livemag 730 \n", ".. ... ... ... \n", "952 1320072 dre_gaza0106 881 \n", "953 661398 DDCP Plan Bis 4 - Marketing direct - MJ5C 183 \n", "954 1320487 Invitation portes ouvertes amitiés 988 \n", "955 906903 DDCP PROMO La méditerranée des philosophes #3 ... 310 \n", "956 579313 ddcp_promo_automation_manuel_pre_visit 481 \n", "\n", " created_at updated_at \\\n", "0 2022-01-14 16:06:42.586321+01:00 2022-02-03 14:17:27.112963+01:00 \n", "1 2022-01-07 11:30:35.315895+01:00 2022-02-03 14:17:27.116171+01:00 \n", "2 2021-09-28 12:50:24.448752+02:00 2022-02-03 14:17:27.119582+01:00 \n", "3 2021-09-28 18:01:04.692073+02:00 2022-02-03 14:17:27.124408+01:00 \n", "4 2022-01-27 18:00:41.053069+01:00 2022-02-03 14:17:27.127607+01:00 \n", ".. ... ... \n", "952 2022-05-26 09:01:35.523639+02:00 2022-12-02 17:51:22.614046+01:00 \n", "953 2021-06-18 10:30:01.259578+02:00 2021-09-24 11:56:09.082785+02:00 \n", "954 2022-09-29 18:01:33.834090+02:00 2022-12-02 17:51:23.258324+01:00 \n", "955 2021-07-19 14:07:16.177390+02:00 2021-09-24 11:56:09.086101+02:00 \n", "956 2021-06-08 17:38:54.041310+02:00 2021-09-24 11:56:09.089394+02:00 \n", "\n", " process_id report_url category to_be_synced \\\n", "0 NaN NaN 0.0 False \n", "1 NaN NaN 0.0 False \n", "2 NaN NaN 0.0 False \n", "3 NaN NaN 0.0 False \n", "4 NaN NaN 0.0 False \n", ".. ... ... ... ... 
\n", "952 NaN NaN 0.0 False \n", "953 NaN NaN 0.0 False \n", "954 NaN NaN 0.0 False \n", "955 NaN NaN 0.0 False \n", "956 NaN NaN 0.0 False \n", "\n", " identifier sent_at \n", "0 aba3b6fd5d186d28e06ff97135cade7f 2022-01-14 00:00:00+01:00 \n", "1 788d986905533aba051261497ecffcbb 2022-01-07 00:00:00+01:00 \n", "2 3493894fa4ea036cfc6433c3e2ee63b0 2021-09-28 00:00:00+02:00 \n", "3 08b255a5d42b89b0585260b6f2360bdd 2021-09-28 00:00:00+02:00 \n", "4 d5cfead94f5350c12c322b5b664544c1 2022-01-27 00:00:00+01:00 \n", ".. ... ... \n", "952 7504adad8bb96320eb3afdd4df6e1f60 2022-05-26 00:00:00+02:00 \n", "953 cedebb6e872f539bef8c3f919874e9d7 2020-07-27 00:00:00+02:00 \n", "954 9908279ebbf1f9b250ba689db6a0222b 2022-09-29 00:00:00+02:00 \n", "955 06eb61b839a0cefee4967c67ccb099dc 2020-12-23 00:00:00+01:00 \n", "956 9461cce28ebe3e76fb4b931c35a169b0 2021-06-08 00:00:00+02:00 \n", "\n", "[957 rows x 11 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_campaigns" ] }, { "cell_type": "code", "execution_count": 39, "id": "b04f39e7-7d53-4734-b125-4dc1843172d6", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['bdc2324-data', '10', '10campaign_stats.csv']" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "files_path_2[0].split(\"/\")[1]" ] }, { "cell_type": "code", "execution_count": 74, "id": "d9bd97df-67bf-48ef-812a-975deb890163", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_521/1596461036.py:11: DtypeWarning: Columns (19,20,33,34,35,39) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", " df = pd.read_csv(file_in)\n" ] } ], "source": [
 "# Load every CSV of the selected client folder into its own dataframe.\n",
 "# Each table is stored in the `dataframes` dict and, for backward compatibility\n",
 "# with the cells below, also exposed as a global named df<client>_<table>\n",
 "# (e.g. df10_tickets).\n",
 "\n",
 "files_path = files_path_2\n",
 "\n",
 "# paths look like '<bucket>/<client>/<client><table>.csv'\n",
 "client_number = files_path[0].split(\"/\")[1]\n",
 "df_prefix = \"df\" + str(client_number) + \"_\"\n",
 "\n",
 "# group(3) is the table name: letters and underscores only, without the client digits\n",
 "table_pattern = re.compile(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$')\n",
 "\n",
 "dataframes = {}\n",
 "for current_path in files_path:\n",
 "    match = table_pattern.search(current_path)\n",
 "    if match is None:\n",
 "        # not a '<client><table>.csv' path: skip it instead of failing on .group()\n",
 "        print(f\"skipped (unexpected file name) : {current_path}\")\n",
 "        continue\n",
 "    with fs.open(current_path, mode=\"rb\") as file_in:\n",
 "        # low_memory=False infers each column dtype from the whole file,\n",
 "        # which avoids the mixed-types DtypeWarning raised by chunked parsing\n",
 "        df = pd.read_csv(file_in, low_memory=False)\n",
 "    # the pattern of the name is df<client>_<table>\n",
 "    nom_dataframe = df_prefix + match.group(3)\n",
 "    dataframes[nom_dataframe] = df\n",
 "    globals()[nom_dataframe] = df"
 ] }, { "cell_type": "code", "execution_count": 78, "id": "7f46e38e-413c-48cb-a171-eb6bc7219d9c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "client number :10\n", "prefix used : df10_\n" ] } ], "source": [ "print(f\"client number :{client_number}\")\n", "print(f\"prefix used : {df_prefix}\")" ] }, { "cell_type": "code", "execution_count": 79, "id": "bdfd388c-7971-4f4d-99ef-c5b0435a4567", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['bdc2324-data/10/10campaign_stats.csv',\n", " 'bdc2324-data/10/10campaigns.csv',\n", " 'bdc2324-data/10/10categories.csv',\n", " 'bdc2324-data/10/10countries.csv',\n", " 'bdc2324-data/10/10currencies.csv',\n", " 'bdc2324-data/10/10customer_target_mappings.csv',\n", " 'bdc2324-data/10/10customersplus.csv',\n", " 'bdc2324-data/10/10event_types.csv',\n", " 'bdc2324-data/10/10events.csv',\n", " 'bdc2324-data/10/10facilities.csv',\n", " 'bdc2324-data/10/10link_stats.csv',\n", " 'bdc2324-data/10/10pricing_formulas.csv',\n", " 'bdc2324-data/10/10product_packs.csv',\n", " 'bdc2324-data/10/10products.csv',\n", " 'bdc2324-data/10/10products_groups.csv',\n", " 'bdc2324-data/10/10purchases.csv',\n", " 'bdc2324-data/10/10representation_category_capacities.csv',\n", " 'bdc2324-data/10/10representation_types.csv',\n", " 
'bdc2324-data/10/10representations.csv',\n", " 'bdc2324-data/10/10seasons.csv',\n", " 'bdc2324-data/10/10suppliers.csv',\n", " 'bdc2324-data/10/10tags.csv',\n", " 'bdc2324-data/10/10target_types.csv',\n", " 'bdc2324-data/10/10targets.csv',\n", " 'bdc2324-data/10/10tickets.csv',\n", " 'bdc2324-data/10/10type_of_pricing_formulas.csv',\n", " 'bdc2324-data/10/10type_ofs.csv']" ] }, "execution_count": 79, "metadata": {}, "output_type": "execute_result" } ], "source": [ "files_path_2" ] }, { "cell_type": "code", "execution_count": 80, "id": "e7bd02dc-1925-46ff-9d59-231d18f9f4f1", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnumbercreated_atupdated_atpurchase_idproduct_idis_from_subscriptiontype_ofsupplier_idbarcodeidentifier
0179917760_0_0_0_1_k-52021-12-29 07:27:27.868513+01:002021-12-29 07:27:27.868513+01:00409613321683False12NaN56c3db5a02c87af7e525676092cb7c4a
1179917871_0_0_0_1_k-52021-12-29 07:27:27.976380+01:002021-12-29 07:27:27.976380+01:00409613321684False12NaN1ecad1dc6b42b4cdb75784dd9dcd9d5c
2179917993_0_0_0_1_k-52021-12-29 07:27:27.978719+01:002021-12-29 07:27:27.978719+01:00409613321685False12NaNb3d207bdb47bcdb27a52f6bae0db7ec2
31799180103_0_0_0_1_k-52021-12-29 07:27:27.984621+01:002021-12-29 07:27:27.984621+01:00409613321686False12NaN10df9591b617cc177516e9ddf91ddae3
41799181107_0_3_2_1_h-12021-12-29 07:27:27.988602+01:002021-12-29 07:27:27.988602+01:00409613321687False12NaN3a8c7d5882fe9f20f0f59c8d90c9873c
....................................
49230932522323363592023-03-10 01:31:52.543375+01:002023-03-10 01:31:52.543375+01:00710062572547False12NaNfc96f582931209501ed186d709664980
49231032522333363602023-03-10 01:31:52.543869+01:002023-03-10 01:31:52.543869+01:00710062572547False12NaNd4ccfb00a9b22b62654bbf98b4d9a5a5
49231132522343363612023-03-10 01:31:52.545783+01:002023-03-10 01:31:52.545783+01:00710062572547False12NaNd5f76662d6571b8eaceaf19c781fa514
49231232522353363622023-03-10 01:31:52.547043+01:002023-03-10 01:31:52.547043+01:00710062572547False12NaN093225db5cd5e06cc8e06242b4cbba37
49231332522363363632023-03-10 01:31:52.548311+01:002023-03-10 01:31:52.548311+01:00710062572547False12NaN9bace0d0cd7a5ec559aca8ac8bf67700
\n", "

492314 rows × 11 columns

\n", "
" ], "text/plain": [ " id number created_at \\\n", "0 1799177 60_0_0_0_1_k-5 2021-12-29 07:27:27.868513+01:00 \n", "1 1799178 71_0_0_0_1_k-5 2021-12-29 07:27:27.976380+01:00 \n", "2 1799179 93_0_0_0_1_k-5 2021-12-29 07:27:27.978719+01:00 \n", "3 1799180 103_0_0_0_1_k-5 2021-12-29 07:27:27.984621+01:00 \n", "4 1799181 107_0_3_2_1_h-1 2021-12-29 07:27:27.988602+01:00 \n", "... ... ... ... \n", "492309 3252232 336359 2023-03-10 01:31:52.543375+01:00 \n", "492310 3252233 336360 2023-03-10 01:31:52.543869+01:00 \n", "492311 3252234 336361 2023-03-10 01:31:52.545783+01:00 \n", "492312 3252235 336362 2023-03-10 01:31:52.547043+01:00 \n", "492313 3252236 336363 2023-03-10 01:31:52.548311+01:00 \n", "\n", " updated_at purchase_id product_id \\\n", "0 2021-12-29 07:27:27.868513+01:00 409613 321683 \n", "1 2021-12-29 07:27:27.976380+01:00 409613 321684 \n", "2 2021-12-29 07:27:27.978719+01:00 409613 321685 \n", "3 2021-12-29 07:27:27.984621+01:00 409613 321686 \n", "4 2021-12-29 07:27:27.988602+01:00 409613 321687 \n", "... ... ... ... \n", "492309 2023-03-10 01:31:52.543375+01:00 710062 572547 \n", "492310 2023-03-10 01:31:52.543869+01:00 710062 572547 \n", "492311 2023-03-10 01:31:52.545783+01:00 710062 572547 \n", "492312 2023-03-10 01:31:52.547043+01:00 710062 572547 \n", "492313 2023-03-10 01:31:52.548311+01:00 710062 572547 \n", "\n", " is_from_subscription type_of supplier_id barcode \\\n", "0 False 1 2 NaN \n", "1 False 1 2 NaN \n", "2 False 1 2 NaN \n", "3 False 1 2 NaN \n", "4 False 1 2 NaN \n", "... ... ... ... ... \n", "492309 False 1 2 NaN \n", "492310 False 1 2 NaN \n", "492311 False 1 2 NaN \n", "492312 False 1 2 NaN \n", "492313 False 1 2 NaN \n", "\n", " identifier \n", "0 56c3db5a02c87af7e525676092cb7c4a \n", "1 1ecad1dc6b42b4cdb75784dd9dcd9d5c \n", "2 b3d207bdb47bcdb27a52f6bae0db7ec2 \n", "3 10df9591b617cc177516e9ddf91ddae3 \n", "4 3a8c7d5882fe9f20f0f59c8d90c9873c \n", "... ... 
\n", "492309 fc96f582931209501ed186d709664980 \n", "492310 d4ccfb00a9b22b62654bbf98b4d9a5a5 \n", "492311 d5f76662d6571b8eaceaf19c781fa514 \n", "492312 093225db5cd5e06cc8e06242b4cbba37 \n", "492313 9bace0d0cd7a5ec559aca8ac8bf67700 \n", "\n", "[492314 rows x 11 columns]" ] }, "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# example : get the table \n", "\n", "df10_tickets" ] }, { "cell_type": "code", "execution_count": 21, "id": "48ae6de5-2353-4fa8-a2a8-20da3b77e2ff", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'\\nfor i in range(len(files_path_1)) :\\n current_path = files_path_1[i]\\n nom_dataframe = \"df\" + re.search(r\\'/([^/]+)\\\\.csv$\\', current_path).group(1)\\n df = globals()[nom_dataframe]\\n print(nom_dataframe)\\n print(df.head(20))\\n'" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# loop to have a look at dataframes from file 1\n", "\n", "\"\"\"\n", "for i in range(len(files_path_1)) :\n", " current_path = files_path_1[i]\n", " nom_dataframe = \"df\" + re.search(r'/([^/]+)\\.csv$', current_path).group(1)\n", " df = globals()[nom_dataframe]\n", " print(nom_dataframe)\n", " print(df.head(20))\n", "\"\"\"" ] }, { "cell_type": "markdown", "id": "d72166db-dcef-45bd-9f8c-7cb2ee6bcbde", "metadata": {}, "source": [ "## Beginning of the exploratory analysis of dataframes" ] }, { "cell_type": "code", "execution_count": 82, "id": "17966ab2-9038-4dd6-a59c-7739ee05c964", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idlastnamefirstnamebirthdateemailstreet_idcreated_atupdated_atcivilityis_partner...preferred_categorypreferred_supplierpreferred_formulapurchase_countfirst_buying_datelast_visiting_datezipcodecountryagetenant_id
0821538NaNNaNNaNemail8215381392023-07-14 11:43:34.261637+02:002023-07-14 11:43:34.261637+02:00NaNFalse...NaNNaNNaN0NaNNaNNaNNaNNaN875
1809126NaNNaNNaNemail80912610632023-05-04 17:17:24.456829+02:002023-05-04 17:17:24.456829+02:00NaNFalse...NaNNaNNaN0NaNNaNNaNfrNaN875
211005NaNNaNNaNNaN10632017-07-06 03:01:57.242998+02:002018-11-12 18:01:18.283492+01:00NaNFalse...zone tarif 1NaNinvite rp14NaNNaNNaNfrNaN875
317663lastname17663firstname17663NaNNaN127312018-09-23 02:39:17.778100+02:002018-09-23 02:39:17.778100+02:00NaNFalse...zone tarif 1NaNdetaxe1NaNNaN44220frNaN875
438100lastname38100firstname38100NaNNaN123952019-02-11 11:05:58.581121+01:002022-12-06 23:15:33.485866+01:00NaNFalse...NaNNaNNaN1NaNNaN44100frNaN875
..................................................................
98789766266NaNNaNNaNemail7662661392022-12-06 18:26:04.142337+01:002023-05-03 18:01:01.799141+02:00NaNFalse...NaNNaNNaN0NaNNaNNaNNaNNaN875
98790766336NaNNaNNaNemail7663361392022-12-06 18:28:49.139502+01:002022-12-06 23:15:33.485866+01:00NaNFalse...NaNNaNNaN0NaNNaNNaNNaNNaN875
98791766348NaNNaNNaNemail7663481392022-12-06 18:28:51.140745+01:002022-12-06 23:15:33.485866+01:00NaNFalse...NaNNaNNaN0NaNNaNNaNNaNNaN875
98792766363NaNNaNNaNemail7663631392022-12-06 18:29:44.081056+01:002022-12-06 23:15:33.485866+01:00NaNFalse...NaNNaNNaN0NaNNaNNaNNaNNaN875
98793766366NaNNaNNaNemail7663661392022-12-06 18:29:44.934174+01:002022-12-06 23:15:33.485866+01:00NaNFalse...NaNNaNNaN0NaNNaNNaNNaNNaN875
\n", "

98794 rows × 43 columns

\n", "
" ], "text/plain": [ " id lastname firstname birthdate email \\\n", "0 821538 NaN NaN NaN email821538 \n", "1 809126 NaN NaN NaN email809126 \n", "2 11005 NaN NaN NaN NaN \n", "3 17663 lastname17663 firstname17663 NaN NaN \n", "4 38100 lastname38100 firstname38100 NaN NaN \n", "... ... ... ... ... ... \n", "98789 766266 NaN NaN NaN email766266 \n", "98790 766336 NaN NaN NaN email766336 \n", "98791 766348 NaN NaN NaN email766348 \n", "98792 766363 NaN NaN NaN email766363 \n", "98793 766366 NaN NaN NaN email766366 \n", "\n", " street_id created_at \\\n", "0 139 2023-07-14 11:43:34.261637+02:00 \n", "1 1063 2023-05-04 17:17:24.456829+02:00 \n", "2 1063 2017-07-06 03:01:57.242998+02:00 \n", "3 12731 2018-09-23 02:39:17.778100+02:00 \n", "4 12395 2019-02-11 11:05:58.581121+01:00 \n", "... ... ... \n", "98789 139 2022-12-06 18:26:04.142337+01:00 \n", "98790 139 2022-12-06 18:28:49.139502+01:00 \n", "98791 139 2022-12-06 18:28:51.140745+01:00 \n", "98792 139 2022-12-06 18:29:44.081056+01:00 \n", "98793 139 2022-12-06 18:29:44.934174+01:00 \n", "\n", " updated_at civility is_partner ... \\\n", "0 2023-07-14 11:43:34.261637+02:00 NaN False ... \n", "1 2023-05-04 17:17:24.456829+02:00 NaN False ... \n", "2 2018-11-12 18:01:18.283492+01:00 NaN False ... \n", "3 2018-09-23 02:39:17.778100+02:00 NaN False ... \n", "4 2022-12-06 23:15:33.485866+01:00 NaN False ... \n", "... ... ... ... ... \n", "98789 2023-05-03 18:01:01.799141+02:00 NaN False ... \n", "98790 2022-12-06 23:15:33.485866+01:00 NaN False ... \n", "98791 2022-12-06 23:15:33.485866+01:00 NaN False ... \n", "98792 2022-12-06 23:15:33.485866+01:00 NaN False ... \n", "98793 2022-12-06 23:15:33.485866+01:00 NaN False ... \n", "\n", " preferred_category preferred_supplier preferred_formula \\\n", "0 NaN NaN NaN \n", "1 NaN NaN NaN \n", "2 zone tarif 1 NaN invite rp \n", "3 zone tarif 1 NaN detaxe \n", "4 NaN NaN NaN \n", "... ... ... ... 
\n", "98789 NaN NaN NaN \n", "98790 NaN NaN NaN \n", "98791 NaN NaN NaN \n", "98792 NaN NaN NaN \n", "98793 NaN NaN NaN \n", "\n", " purchase_count first_buying_date last_visiting_date zipcode country \\\n", "0 0 NaN NaN NaN NaN \n", "1 0 NaN NaN NaN fr \n", "2 14 NaN NaN NaN fr \n", "3 1 NaN NaN 44220 fr \n", "4 1 NaN NaN 44100 fr \n", "... ... ... ... ... ... \n", "98789 0 NaN NaN NaN NaN \n", "98790 0 NaN NaN NaN NaN \n", "98791 0 NaN NaN NaN NaN \n", "98792 0 NaN NaN NaN NaN \n", "98793 0 NaN NaN NaN NaN \n", "\n", " age tenant_id \n", "0 NaN 875 \n", "1 NaN 875 \n", "2 NaN 875 \n", "3 NaN 875 \n", "4 NaN 875 \n", "... ... ... \n", "98789 NaN 875 \n", "98790 NaN 875 \n", "98791 NaN 875 \n", "98792 NaN 875 \n", "98793 NaN 875 \n", "\n", "[98794 rows x 43 columns]" ] }, "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Customers table of client 10. The loading loop names each dataframe\n",
 "# df<client>_<table>, and its regex keeps only letters/underscores in <table>,\n",
 "# so the name is df10_customersplus (df10_0customersplus is never created\n",
 "# on a fresh kernel).\n",
 "df10_customersplus"
 ] }, { "cell_type": "code", "execution_count": null, "id": "932812b1-7a24-4f2d-ae48-7fe8e06b9f62", "metadata": {}, "outputs": [], "source": [ "# how many missing values ?\n", "\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" } }, "nbformat": 4, "nbformat_minor": 5 }