diff --git a/Clean-Notebook.ipynb b/Clean-Notebook.ipynb index 99ea3e5..ad9d465 100644 --- a/Clean-Notebook.ipynb +++ b/Clean-Notebook.ipynb @@ -51,47 +51,49 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 23, "id": "699664b9-eee4-4f8d-a207-e524526560c5", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['bdc2324-data/11/11campaign_stats.csv',\n", - " 'bdc2324-data/11/11campaigns.csv',\n", - " 'bdc2324-data/11/11categories.csv',\n", - " 'bdc2324-data/11/11countries.csv',\n", - " 'bdc2324-data/11/11currencies.csv',\n", - " 'bdc2324-data/11/11customer_target_mappings.csv',\n", - " 'bdc2324-data/11/11customersplus.csv',\n", - " 'bdc2324-data/11/11event_types.csv',\n", - " 'bdc2324-data/11/11events.csv',\n", - " 'bdc2324-data/11/11facilities.csv',\n", - " 'bdc2324-data/11/11link_stats.csv',\n", - " 'bdc2324-data/11/11pricing_formulas.csv',\n", - " 'bdc2324-data/11/11product_packs.csv',\n", - " 'bdc2324-data/11/11products.csv',\n", - " 'bdc2324-data/11/11products_groups.csv',\n", - " 'bdc2324-data/11/11purchases.csv',\n", - " 'bdc2324-data/11/11representation_category_capacities.csv',\n", - " 'bdc2324-data/11/11representations.csv',\n", - " 'bdc2324-data/11/11seasons.csv',\n", - " 'bdc2324-data/11/11structure_tag_mappings.csv',\n", - " 'bdc2324-data/11/11suppliers.csv',\n", - " 'bdc2324-data/11/11tags.csv',\n", - " 'bdc2324-data/11/11target_types.csv',\n", - " 'bdc2324-data/11/11targets.csv',\n", - " 'bdc2324-data/11/11tickets.csv']" + "['bdc2324-data/2/2campaign_stats.csv',\n", + " 'bdc2324-data/2/2campaigns.csv',\n", + " 'bdc2324-data/2/2categories.csv',\n", + " 'bdc2324-data/2/2contribution_sites.csv',\n", + " 'bdc2324-data/2/2contributions.csv',\n", + " 'bdc2324-data/2/2countries.csv',\n", + " 'bdc2324-data/2/2currencies.csv',\n", + " 'bdc2324-data/2/2customer_target_mappings.csv',\n", + " 'bdc2324-data/2/2customersplus.csv',\n", + " 'bdc2324-data/2/2event_types.csv',\n", + " 'bdc2324-data/2/2events.csv',\n", + " 
'bdc2324-data/2/2facilities.csv',\n", + " 'bdc2324-data/2/2link_stats.csv',\n", + " 'bdc2324-data/2/2pricing_formulas.csv',\n", + " 'bdc2324-data/2/2product_packs.csv',\n", + " 'bdc2324-data/2/2products.csv',\n", + " 'bdc2324-data/2/2products_groups.csv',\n", + " 'bdc2324-data/2/2purchases.csv',\n", + " 'bdc2324-data/2/2representation_category_capacities.csv',\n", + " 'bdc2324-data/2/2representations.csv',\n", + " 'bdc2324-data/2/2seasons.csv',\n", + " 'bdc2324-data/2/2structure_tag_mappings.csv',\n", + " 'bdc2324-data/2/2suppliers.csv',\n", + " 'bdc2324-data/2/2tags.csv',\n", + " 'bdc2324-data/2/2target_types.csv',\n", + " 'bdc2324-data/2/2targets.csv',\n", + " 'bdc2324-data/2/2tickets.csv']" ] }, - "execution_count": 5, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "BUCKET = \"bdc2324-data/11\"\n", + "BUCKET = \"bdc2324-data/2\"\n", "fs.ls(BUCKET)" ] }, @@ -337,6 +339,48 @@ "customer_target_mappings" ] }, + { + "cell_type": "code", + "execution_count": 26, + "id": "c4b6bdcc-9f13-449b-9a8b-c5ca794637be", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([nan])" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "customer_target_mappings['extra_field'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "47bc8453-0693-4838-8bd8-4d800a82c496", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([nan])" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "customer_target_mappings['name'].unique()" + ] + }, { "cell_type": "code", "execution_count": 11, @@ -381,7 +425,9 @@ "cell_type": "code", "execution_count": 12, "id": "b70488b9-38fc-40a8-9e2f-3330b3f9eef5", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [ { "data": { @@ -476,6 +522,1026 @@ "target_types" ] }, + { + "cell_type": "code", + "execution_count": 
17, + "id": "8dd74e87-97c2-493d-b19f-971b684078d3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['id', 'name', 'created_at', 'updated_at', 'identifier'], dtype='object')\n", + "(20, 5)\n", + "\n", + "RangeIndex: 20 entries, 0 to 19\n", + "Data columns (total 5 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 20 non-null int64 \n", + " 1 name 19 non-null object\n", + " 2 created_at 20 non-null object\n", + " 3 updated_at 20 non-null object\n", + " 4 identifier 20 non-null object\n", + "dtypes: int64(1), object(4)\n", + "memory usage: 928.0+ bytes\n" + ] + } + ], + "source": [ + "# Tags = clients\n", + "FILE_PATH_S3 = 'bdc2324-data/11/11tags.csv'\n", + "\n", + "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", + " tags = pd.read_csv(file_in, sep=\",\")\n", + "\n", + "print(tags.columns)\n", + "print(tags.shape)\n", + "tags.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "91d54732-666c-4250-ba91-5c9b83d4712a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamecreated_atupdated_atidentifier
02ens-écoles2021-05-07 15:24:19.808501+02:002021-05-07 15:24:19.808501+02:00b6a360c5f84595940c5774f13fd39cc3
11NaN2021-05-07 15:24:19.805589+02:002021-05-07 15:24:19.805589+02:00d41d8cd98f00b204e9800998ecf8427e
24ecoles primaires rennes2021-05-07 15:29:06.388415+02:002021-05-07 15:29:06.388415+02:00ca8649dd64c240d118f60b07d11a7053
35Angers Nantes Opéra2023-01-27 15:59:58.187557+01:002023-01-27 15:59:58.187557+01:00f8f500f937fe312542399299cdc13f7e
46Opéras2023-01-27 16:03:59.654938+01:002023-01-27 16:03:59.654938+01:0022eb2c616983ec7b54a093f84b230505
57Ministère de la Culture2023-01-30 11:22:29.636813+01:002023-01-30 11:22:29.636813+01:001b8c5c08fde000d90905a3d14af7763d
68Orchestres2023-01-30 11:33:56.392799+01:002023-01-30 11:33:56.392799+01:007c2aee0c80642d7e325a450f2dec45e5
79Cooperative2023-01-31 14:44:38.471146+01:002023-01-31 14:44:38.471146+01:006c88c36ffaab88d255865aa3111d7686
810Théâtres2023-01-31 14:45:17.804428+01:002023-01-31 14:45:17.804428+01:00b2c19672df82021702b79482c8cda85a
911La co[opera]tive2023-02-16 17:11:35.004478+01:002023-02-16 17:11:35.004478+01:005dbaa3a1f278c0fcf981d447ad20957a
1012Ville de Rennes2023-02-16 17:37:13.816196+01:002023-02-16 17:37:13.816196+01:00bc483d04d9c3a08f167a3ce64366ca72
1113Ensembles en résidence2023-02-16 17:55:54.877374+01:002023-02-16 17:55:54.877374+01:00e70635e771de13268dccf02bb2abfaf9
1214Ministère2023-02-17 11:17:54.429462+01:002023-02-17 11:17:54.429462+01:00a3f0582853fd19f5b57e3651f8a20e7a
1315Rennes métropole2023-02-17 11:53:24.490786+01:002023-02-17 11:53:24.490786+01:00e98b8db5941b96c29c353b6f2f502055
1416Ville de Rennes - équipements culturels2023-02-17 12:00:10.649104+01:002023-02-17 12:00:10.649104+01:00a44edffc7edb852982efa7f4aa6d0e25
1517Structures culturelles rennaises2023-02-17 12:05:55.583016+01:002023-02-17 12:05:55.583016+01:00241550517e4e3b1c926e9aeab0f621cd
1618Université Rennes 22023-02-17 14:23:44.832959+01:002023-02-17 14:23:44.832959+01:004057c5cee51c4e10aa819f0cf48adc3f
1719Centres chorégraphiques nationaux2023-02-17 15:29:41.827321+01:002023-02-17 15:29:41.827321+01:0041e75941dfb766365498d917abe0102f
1820Télévision2023-02-17 15:46:13.746092+01:002023-02-17 15:46:13.746092+01:0036d6409c539dd79c1f3af8c5948603eb
1921structures culturelles nationales2023-02-17 15:56:00.555722+01:002023-02-17 15:56:00.555722+01:005311cf7e42aac53289e1c4a338d5cfa4
\n", + "
" + ], + "text/plain": [ + " id name \\\n", + "0 2 ens-écoles \n", + "1 1 NaN \n", + "2 4 ecoles primaires rennes \n", + "3 5 Angers Nantes Opéra \n", + "4 6 Opéras \n", + "5 7 Ministère de la Culture \n", + "6 8 Orchestres \n", + "7 9 Cooperative \n", + "8 10 Théâtres \n", + "9 11 La co[opera]tive \n", + "10 12 Ville de Rennes \n", + "11 13 Ensembles en résidence \n", + "12 14 Ministère \n", + "13 15 Rennes métropole \n", + "14 16 Ville de Rennes - équipements culturels \n", + "15 17 Structures culturelles rennaises \n", + "16 18 Université Rennes 2 \n", + "17 19 Centres chorégraphiques nationaux \n", + "18 20 Télévision \n", + "19 21 structures culturelles nationales \n", + "\n", + " created_at updated_at \\\n", + "0 2021-05-07 15:24:19.808501+02:00 2021-05-07 15:24:19.808501+02:00 \n", + "1 2021-05-07 15:24:19.805589+02:00 2021-05-07 15:24:19.805589+02:00 \n", + "2 2021-05-07 15:29:06.388415+02:00 2021-05-07 15:29:06.388415+02:00 \n", + "3 2023-01-27 15:59:58.187557+01:00 2023-01-27 15:59:58.187557+01:00 \n", + "4 2023-01-27 16:03:59.654938+01:00 2023-01-27 16:03:59.654938+01:00 \n", + "5 2023-01-30 11:22:29.636813+01:00 2023-01-30 11:22:29.636813+01:00 \n", + "6 2023-01-30 11:33:56.392799+01:00 2023-01-30 11:33:56.392799+01:00 \n", + "7 2023-01-31 14:44:38.471146+01:00 2023-01-31 14:44:38.471146+01:00 \n", + "8 2023-01-31 14:45:17.804428+01:00 2023-01-31 14:45:17.804428+01:00 \n", + "9 2023-02-16 17:11:35.004478+01:00 2023-02-16 17:11:35.004478+01:00 \n", + "10 2023-02-16 17:37:13.816196+01:00 2023-02-16 17:37:13.816196+01:00 \n", + "11 2023-02-16 17:55:54.877374+01:00 2023-02-16 17:55:54.877374+01:00 \n", + "12 2023-02-17 11:17:54.429462+01:00 2023-02-17 11:17:54.429462+01:00 \n", + "13 2023-02-17 11:53:24.490786+01:00 2023-02-17 11:53:24.490786+01:00 \n", + "14 2023-02-17 12:00:10.649104+01:00 2023-02-17 12:00:10.649104+01:00 \n", + "15 2023-02-17 12:05:55.583016+01:00 2023-02-17 12:05:55.583016+01:00 \n", + "16 2023-02-17 14:23:44.832959+01:00 2023-02-17 
14:23:44.832959+01:00 \n", + "17 2023-02-17 15:29:41.827321+01:00 2023-02-17 15:29:41.827321+01:00 \n", + "18 2023-02-17 15:46:13.746092+01:00 2023-02-17 15:46:13.746092+01:00 \n", + "19 2023-02-17 15:56:00.555722+01:00 2023-02-17 15:56:00.555722+01:00 \n", + "\n", + " identifier \n", + "0 b6a360c5f84595940c5774f13fd39cc3 \n", + "1 d41d8cd98f00b204e9800998ecf8427e \n", + "2 ca8649dd64c240d118f60b07d11a7053 \n", + "3 f8f500f937fe312542399299cdc13f7e \n", + "4 22eb2c616983ec7b54a093f84b230505 \n", + "5 1b8c5c08fde000d90905a3d14af7763d \n", + "6 7c2aee0c80642d7e325a450f2dec45e5 \n", + "7 6c88c36ffaab88d255865aa3111d7686 \n", + "8 b2c19672df82021702b79482c8cda85a \n", + "9 5dbaa3a1f278c0fcf981d447ad20957a \n", + "10 bc483d04d9c3a08f167a3ce64366ca72 \n", + "11 e70635e771de13268dccf02bb2abfaf9 \n", + "12 a3f0582853fd19f5b57e3651f8a20e7a \n", + "13 e98b8db5941b96c29c353b6f2f502055 \n", + "14 a44edffc7edb852982efa7f4aa6d0e25 \n", + "15 241550517e4e3b1c926e9aeab0f621cd \n", + "16 4057c5cee51c4e10aa819f0cf48adc3f \n", + "17 41e75941dfb766365498d917abe0102f \n", + "18 36d6409c539dd79c1f3af8c5948603eb \n", + "19 5311cf7e42aac53289e1c4a338d5cfa4 " + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tags" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "4cc9f444-b7e6-4ee5-8ce8-64c63ab7825a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['id', 'structure_id', 'tag_id', 'created_at', 'updated_at'], dtype='object')\n", + "(179, 5)\n", + "\n", + "RangeIndex: 179 entries, 0 to 178\n", + "Data columns (total 5 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 179 non-null int64 \n", + " 1 structure_id 179 non-null int64 \n", + " 2 tag_id 179 non-null int64 \n", + " 3 created_at 179 non-null object\n", + " 4 updated_at 179 non-null object\n", + "dtypes: int64(3), object(2)\n", + "memory usage: 
7.1+ KB\n" + ] + } + ], + "source": [ + "# Structure = clients\n", + "FILE_PATH_S3 = 'bdc2324-data/11/11structure_tag_mappings.csv'\n", + "\n", + "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", + " structure_tag_mappings = pd.read_csv(file_in, sep=\",\")\n", + "\n", + "print(structure_tag_mappings.columns)\n", + "print(structure_tag_mappings.shape)\n", + "structure_tag_mappings.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "dcf776df-5c8e-4972-b2c1-b41291ba7e66", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idstructure_idtag_idcreated_atupdated_at
012318762023-01-27 16:03:59.680222+01:002023-01-27 16:03:59.680222+01:00
12222021-05-07 15:24:19.872895+02:002021-05-07 15:24:19.872895+02:00
23322021-05-07 15:24:19.873830+02:002021-05-07 15:24:19.873830+02:00
34422021-05-07 15:24:19.874628+02:002021-05-07 15:24:19.874628+02:00
45522021-05-07 15:24:19.875421+02:002021-05-07 15:24:19.875421+02:00
..................
174184236102023-02-17 16:35:25.041114+01:002023-02-17 16:35:25.041114+01:00
175185237172023-02-17 16:39:10.799478+01:002023-02-17 16:39:10.799478+01:00
176186238192023-02-17 16:53:21.098690+01:002023-02-17 16:53:21.098690+01:00
177187239102023-02-17 16:57:42.623481+01:002023-02-17 16:57:42.623481+01:00
178188240102023-02-17 16:59:22.067723+01:002023-02-17 16:59:22.067723+01:00
\n", + "

179 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " id structure_id tag_id created_at \\\n", + "0 123 187 6 2023-01-27 16:03:59.680222+01:00 \n", + "1 2 2 2 2021-05-07 15:24:19.872895+02:00 \n", + "2 3 3 2 2021-05-07 15:24:19.873830+02:00 \n", + "3 4 4 2 2021-05-07 15:24:19.874628+02:00 \n", + "4 5 5 2 2021-05-07 15:24:19.875421+02:00 \n", + ".. ... ... ... ... \n", + "174 184 236 10 2023-02-17 16:35:25.041114+01:00 \n", + "175 185 237 17 2023-02-17 16:39:10.799478+01:00 \n", + "176 186 238 19 2023-02-17 16:53:21.098690+01:00 \n", + "177 187 239 10 2023-02-17 16:57:42.623481+01:00 \n", + "178 188 240 10 2023-02-17 16:59:22.067723+01:00 \n", + "\n", + " updated_at \n", + "0 2023-01-27 16:03:59.680222+01:00 \n", + "1 2021-05-07 15:24:19.872895+02:00 \n", + "2 2021-05-07 15:24:19.873830+02:00 \n", + "3 2021-05-07 15:24:19.874628+02:00 \n", + "4 2021-05-07 15:24:19.875421+02:00 \n", + ".. ... \n", + "174 2023-02-17 16:35:25.041114+01:00 \n", + "175 2023-02-17 16:39:10.799478+01:00 \n", + "176 2023-02-17 16:53:21.098690+01:00 \n", + "177 2023-02-17 16:57:42.623481+01:00 \n", + "178 2023-02-17 16:59:22.067723+01:00 \n", + "\n", + "[179 rows x 5 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "structure_tag_mappings" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "41bf1529-5a7c-409e-9791-2024c08c11f0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['id', 'lastname', 'firstname', 'birthdate', 'email', 'street_id',\n", + " 'created_at', 'updated_at', 'civility', 'is_partner', 'extra',\n", + " 'deleted_at', 'reference', 'gender', 'is_email_true', 'extra_field',\n", + " 'identifier', 'opt_in', 'structure_id', 'note', 'profession',\n", + " 'language', 'mcp_contact_id', 'need_reload', 'last_buying_date',\n", + " 'max_price', 'ticket_sum', 'average_price', 'fidelity',\n", + " 'average_purchase_delay', 'average_price_basket',\n", + " 
'average_ticket_basket', 'total_price', 'preferred_category',\n", + " 'preferred_supplier', 'preferred_formula', 'purchase_count',\n", + " 'first_buying_date', 'last_visiting_date', 'zipcode', 'country', 'age',\n", + " 'tenant_id'],\n", + " dtype='object')\n", + "(71307, 43)\n", + "\n", + "RangeIndex: 71307 entries, 0 to 71306\n", + "Data columns (total 43 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 71307 non-null int64 \n", + " 1 lastname 41045 non-null object \n", + " 2 firstname 39140 non-null object \n", + " 3 birthdate 18174 non-null object \n", + " 4 email 58203 non-null object \n", + " 5 street_id 71307 non-null int64 \n", + " 6 created_at 71307 non-null object \n", + " 7 updated_at 71307 non-null object \n", + " 8 civility 0 non-null float64\n", + " 9 is_partner 71307 non-null bool \n", + " 10 extra 0 non-null float64\n", + " 11 deleted_at 0 non-null float64\n", + " 12 reference 0 non-null float64\n", + " 13 gender 71307 non-null int64 \n", + " 14 is_email_true 71307 non-null bool \n", + " 15 extra_field 0 non-null float64\n", + " 16 identifier 71307 non-null object \n", + " 17 opt_in 71307 non-null bool \n", + " 18 structure_id 616 non-null float64\n", + " 19 note 451 non-null object \n", + " 20 profession 812 non-null object \n", + " 21 language 0 non-null float64\n", + " 22 mcp_contact_id 22417 non-null float64\n", + " 23 need_reload 71307 non-null bool \n", + " 24 last_buying_date 34040 non-null object \n", + " 25 max_price 34040 non-null float64\n", + " 26 ticket_sum 71307 non-null int64 \n", + " 27 average_price 68694 non-null float64\n", + " 28 fidelity 71307 non-null int64 \n", + " 29 average_purchase_delay 34040 non-null float64\n", + " 30 average_price_basket 34040 non-null float64\n", + " 31 average_ticket_basket 34040 non-null float64\n", + " 32 total_price 36653 non-null float64\n", + " 33 preferred_category 0 non-null float64\n", + " 34 preferred_supplier 0 non-null float64\n", + " 35 
preferred_formula 0 non-null float64\n", + " 36 purchase_count 71307 non-null int64 \n", + " 37 first_buying_date 34040 non-null object \n", + " 38 last_visiting_date 0 non-null float64\n", + " 39 zipcode 33756 non-null object \n", + " 40 country 39910 non-null object \n", + " 41 age 18174 non-null float64\n", + " 42 tenant_id 71307 non-null int64 \n", + "dtypes: bool(4), float64(19), int64(7), object(13)\n", + "memory usage: 21.5+ MB\n" + ] + } + ], + "source": [ + "# Customersplus = clients\n", + "FILE_PATH_S3 = 'bdc2324-data/11/11customersplus.csv'\n", + "\n", + "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", + " customersplus = pd.read_csv(file_in, sep=\",\")\n", + "\n", + "print(customersplus.columns)\n", + "print(customersplus.shape)\n", + "customersplus.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "948a0b2b-8d1c-4afb-802e-670d67dd8c20", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idlastnamefirstnamebirthdateemailstreet_idcreated_atupdated_atcivilityis_partner...preferred_categorypreferred_supplierpreferred_formulapurchase_countfirst_buying_datelast_visiting_datezipcodecountryagetenant_id
0286834lastname286834firstname286834NaNemail28683462022-05-19 10:09:09.361137+02:002022-05-19 10:09:09.361137+02:00NaNFalse...NaNNaNNaN0NaNNaNNaNfrNaN1556
1330695NaNNaNNaNemail33069512022-07-16 04:10:34.135134+02:002022-07-16 04:10:34.156704+02:00NaNFalse...NaNNaNNaN0NaNNaNNaNNaNNaN1556
2330978NaNNaNNaNemail33097812022-07-21 22:14:09.811721+02:002022-07-21 22:14:09.836051+02:00NaNFalse...NaNNaNNaN0NaNNaNNaNNaNNaN1556
3338697NaNNaNNaNemail33869712022-09-15 19:02:03.950536+02:002022-09-15 19:02:03.985642+02:00NaNFalse...NaNNaNNaN0NaNNaNNaNNaNNaN1556
4338726NaNNaNNaNemail33872612022-09-16 01:24:40.719882+02:002022-09-16 01:24:40.742753+02:00NaNFalse...NaNNaNNaN0NaNNaNNaNNaNNaN1556
..................................................................
7130227105lastname27105firstname271051957-01-26email271052050242021-04-22 15:12:59.986534+02:002023-09-12 18:59:31.613235+02:00NaNFalse...NaNNaNNaN22018-12-31 18:56:57+01:00NaN35700fr66.01556
7130327108lastname27108firstname27108NaNNaN2050242021-04-22 15:12:59.989197+02:002023-09-12 18:27:34.380843+02:00NaNFalse...NaNNaNNaN62015-12-29 14:51:46+01:00NaN35700frNaN1556
7130427110lastname27110firstname27110NaNNaN62021-04-22 15:12:59.991029+02:002022-04-14 11:41:33.738500+02:00NaNFalse...NaNNaNNaN12018-12-31 19:12:59+01:00NaNNaNfrNaN1556
7130510607lastname10607firstname106071963-01-04email106073133322021-04-22 14:56:45.742226+02:002023-09-12 17:55:17.723195+02:00NaNFalse...NaNNaNNaN262015-10-10 14:11:21+02:00NaN35850fr60.01556
7130619095lastname19095firstname190951979-07-16email1909562021-04-22 15:06:30.120537+02:002023-09-12 18:27:36.904104+02:00NaNFalse...NaNNaNNaN22019-05-19 21:18:36+02:00NaNNaNfr44.01556
\n", + "

71307 rows × 43 columns

\n", + "
" + ], + "text/plain": [ + " id lastname firstname birthdate email \\\n", + "0 286834 lastname286834 firstname286834 NaN email286834 \n", + "1 330695 NaN NaN NaN email330695 \n", + "2 330978 NaN NaN NaN email330978 \n", + "3 338697 NaN NaN NaN email338697 \n", + "4 338726 NaN NaN NaN email338726 \n", + "... ... ... ... ... ... \n", + "71302 27105 lastname27105 firstname27105 1957-01-26 email27105 \n", + "71303 27108 lastname27108 firstname27108 NaN NaN \n", + "71304 27110 lastname27110 firstname27110 NaN NaN \n", + "71305 10607 lastname10607 firstname10607 1963-01-04 email10607 \n", + "71306 19095 lastname19095 firstname19095 1979-07-16 email19095 \n", + "\n", + " street_id created_at \\\n", + "0 6 2022-05-19 10:09:09.361137+02:00 \n", + "1 1 2022-07-16 04:10:34.135134+02:00 \n", + "2 1 2022-07-21 22:14:09.811721+02:00 \n", + "3 1 2022-09-15 19:02:03.950536+02:00 \n", + "4 1 2022-09-16 01:24:40.719882+02:00 \n", + "... ... ... \n", + "71302 205024 2021-04-22 15:12:59.986534+02:00 \n", + "71303 205024 2021-04-22 15:12:59.989197+02:00 \n", + "71304 6 2021-04-22 15:12:59.991029+02:00 \n", + "71305 313332 2021-04-22 14:56:45.742226+02:00 \n", + "71306 6 2021-04-22 15:06:30.120537+02:00 \n", + "\n", + " updated_at civility is_partner ... \\\n", + "0 2022-05-19 10:09:09.361137+02:00 NaN False ... \n", + "1 2022-07-16 04:10:34.156704+02:00 NaN False ... \n", + "2 2022-07-21 22:14:09.836051+02:00 NaN False ... \n", + "3 2022-09-15 19:02:03.985642+02:00 NaN False ... \n", + "4 2022-09-16 01:24:40.742753+02:00 NaN False ... \n", + "... ... ... ... ... \n", + "71302 2023-09-12 18:59:31.613235+02:00 NaN False ... \n", + "71303 2023-09-12 18:27:34.380843+02:00 NaN False ... \n", + "71304 2022-04-14 11:41:33.738500+02:00 NaN False ... \n", + "71305 2023-09-12 17:55:17.723195+02:00 NaN False ... \n", + "71306 2023-09-12 18:27:36.904104+02:00 NaN False ... 
\n", + "\n", + " preferred_category preferred_supplier preferred_formula \\\n", + "0 NaN NaN NaN \n", + "1 NaN NaN NaN \n", + "2 NaN NaN NaN \n", + "3 NaN NaN NaN \n", + "4 NaN NaN NaN \n", + "... ... ... ... \n", + "71302 NaN NaN NaN \n", + "71303 NaN NaN NaN \n", + "71304 NaN NaN NaN \n", + "71305 NaN NaN NaN \n", + "71306 NaN NaN NaN \n", + "\n", + " purchase_count first_buying_date last_visiting_date zipcode \\\n", + "0 0 NaN NaN NaN \n", + "1 0 NaN NaN NaN \n", + "2 0 NaN NaN NaN \n", + "3 0 NaN NaN NaN \n", + "4 0 NaN NaN NaN \n", + "... ... ... ... ... \n", + "71302 2 2018-12-31 18:56:57+01:00 NaN 35700 \n", + "71303 6 2015-12-29 14:51:46+01:00 NaN 35700 \n", + "71304 1 2018-12-31 19:12:59+01:00 NaN NaN \n", + "71305 26 2015-10-10 14:11:21+02:00 NaN 35850 \n", + "71306 2 2019-05-19 21:18:36+02:00 NaN NaN \n", + "\n", + " country age tenant_id \n", + "0 fr NaN 1556 \n", + "1 NaN NaN 1556 \n", + "2 NaN NaN 1556 \n", + "3 NaN NaN 1556 \n", + "4 NaN NaN 1556 \n", + "... ... ... ... \n", + "71302 fr 66.0 1556 \n", + "71303 fr NaN 1556 \n", + "71304 fr NaN 1556 \n", + "71305 fr 60.0 1556 \n", + "71306 fr 44.0 1556 \n", + "\n", + "[71307 rows x 43 columns]" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "customersplus" + ] + }, { "cell_type": "code", "execution_count": 6,