From 78664094c5bf5d9e8bcbe816c9ab27c88942f2f6 Mon Sep 17 00:00:00 2001 From: frodrigue-ensae Date: Wed, 10 Jan 2024 18:24:41 +0000 Subject: [PATCH] update --- Notebook_Fanta.ipynb | 821 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 819 insertions(+), 2 deletions(-) diff --git a/Notebook_Fanta.ipynb b/Notebook_Fanta.ipynb index 363fcab..5fd0f44 100644 --- a/Notebook_Fanta.ipynb +++ b/Notebook_Fanta.ipynb @@ -1,6 +1,823 @@ { - "cells": [], - "metadata": {}, + "cells": [ + { + "cell_type": "markdown", + "id": "aa74dbe0-f974-4b5c-94f4-4dba9fbc64fa", + "metadata": {}, + "source": [ + "# Business Data Challenge - Team 1" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "94c498e7-7c50-45f9-b3f4-a1ab19b7ccc4", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "id": "7a3b50ac-b1ff-4f3d-9938-e048fdc8e027", + "metadata": {}, + "source": [ + "Configuration de l'accès aux données" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "0b029d42-fb02-481e-a407-7e41886198a6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['bdc2324-data/1',\n", + " 'bdc2324-data/10',\n", + " 'bdc2324-data/101',\n", + " 'bdc2324-data/11',\n", + " 'bdc2324-data/12',\n", + " 'bdc2324-data/13',\n", + " 'bdc2324-data/14',\n", + " 'bdc2324-data/2',\n", + " 'bdc2324-data/3',\n", + " 'bdc2324-data/4',\n", + " 'bdc2324-data/5',\n", + " 'bdc2324-data/6',\n", + " 'bdc2324-data/7',\n", + " 'bdc2324-data/8',\n", + " 'bdc2324-data/9']" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import os\n", + "import s3fs\n", + "# Create filesystem object\n", + "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", + "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n", + "\n", + "BUCKET = \"bdc2324-data\"\n", + "fs.ls(BUCKET)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "fbaf9aa7-ff70-4dbe-a969-b801c593510b", + "metadata": {}, + "outputs": [], + "source": [ + "# Chargement des fichiers campaign_stats.csv\n", + "FILE_PATH_S3 = 'bdc2324-data/1/1campaign_stats.csv'\n", + "\n", + "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", + " campaign_stats_1 = pd.read_csv(file_in, sep=\",\")\n", + "\n", + "FILE_PATH_S3 = 'bdc2324-data/2/2campaign_stats.csv'\n", + "\n", + "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", + " campaign_stats_2 = pd.read_csv(file_in, sep=\",\")\n", + "\n", + "FILE_PATH_S3 = 'bdc2324-data/3/3campaign_stats.csv'\n", + "\n", + "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", + " campaign_stats_3 = pd.read_csv(file_in, sep=\",\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "1e0418bc-8e97-4a04-b7f3-bda3bef7d36e", + "metadata": {}, + "outputs": [], + "source": [ + "# Conversion des dates 'sent_at'\n", + "campaign_stats_1['sent_at'] = pd.to_datetime(campaign_stats_1['sent_at'], format = 'ISO8601', utc = True)\n", + "campaign_stats_2['sent_at'] = pd.to_datetime(campaign_stats_2['sent_at'], format = 'ISO8601', utc = True)\n", + "campaign_stats_3['sent_at'] = pd.to_datetime(campaign_stats_3['sent_at'], format = 'ISO8601', utc = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "cc5c20ba-e827-4e5a-97a5-7f3947e0621c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-11-09 18:10:45+00:00\n", + "2020-06-02 08:24:08+00:00\n", + "2023-10-12 01:39:48+00:00\n", + "2023-10-10 17:06:29+00:00\n", + "2023-11-01 09:20:48+00:00\n", + "2021-03-31 14:59:02+00:00\n" + ] + } + ], + "source": [ + "# Chaque unites correspond à une période ? --> Non, les dossiers ont juste pour but de réduire la taille des fichiers\n", + "print(campaign_stats_1['sent_at'].max())\n", + "print(campaign_stats_1['sent_at'].min())\n", + "\n", + "print(campaign_stats_2['sent_at'].max())\n", + "print(campaign_stats_2['sent_at'].min())\n", + "\n", + "print(campaign_stats_3['sent_at'].max())\n", + "print(campaign_stats_3['sent_at'].min())" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c75632df-b018-4bb8-a99d-83f15af94369", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 2021-03-28 16:01:09+00:00\n", + "1 2021-03-28 16:01:09+00:00\n", + "2 2021-03-28 16:00:59+00:00\n", + "3 2021-03-28 16:00:59+00:00\n", + "4 2021-03-28 16:01:06+00:00\n", + " ... \n", + "6214803 2023-10-23 09:32:33+00:00\n", + "6214804 2023-10-23 09:32:49+00:00\n", + "6214805 2023-10-23 09:33:28+00:00\n", + "6214806 2023-10-23 09:31:53+00:00\n", + "6214807 2023-10-23 09:33:54+00:00\n", + "Name: sent_at, Length: 6214808, dtype: datetime64[ns, UTC]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "campaign_stats_1['sent_at']" + ] + }, + { + "cell_type": "markdown", + "id": "f4c0c63e-0418-4cfe-a57d-7af57bca0c22", + "metadata": {}, + "source": [ + "### Customersplus.csv" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "d3bf880d-1065-4d5b-9954-1830aa5081af", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_1362/4118060109.py:9: DtypeWarning: Columns (20) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " customers_plus_2 = pd.read_csv(file_in, sep=\",\")\n" + ] + } + ], + "source": [ + "FILE_PATH_S3 = 'bdc2324-data/1/1customersplus.csv'\n", + "\n", + "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", + " customers_plus_1 = pd.read_csv(file_in, sep=\",\")\n", + "\n", + "FILE_PATH_S3 = 'bdc2324-data/2/2customersplus.csv'\n", + "\n", + "with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", + " customers_plus_2 = pd.read_csv(file_in, sep=\",\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "7368f381-db8e-4a4d-9fe2-5947eb55be58", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['id', 'lastname', 'firstname', 'birthdate', 'email', 'street_id',\n", + " 'created_at', 'updated_at', 'civility', 'is_partner', 'extra',\n", + " 'deleted_at', 'reference', 'gender', 'is_email_true', 'extra_field',\n", + " 'identifier', 'opt_in', 'structure_id', 'note', 'profession',\n", + " 'language', 'mcp_contact_id', 'need_reload', 'last_buying_date',\n", + " 'max_price', 'ticket_sum', 'average_price', 'fidelity',\n", + " 'average_purchase_delay', 'average_price_basket',\n", + " 'average_ticket_basket', 'total_price', 'preferred_category',\n", + " 'preferred_supplier', 'preferred_formula', 'purchase_count',\n", + " 'first_buying_date', 'last_visiting_date', 'zipcode', 'country', 'age',\n", + " 'tenant_id'],\n", + " dtype='object')" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "customers_plus_1.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "08091935-b159-47fa-806c-e1444f3b227e", + "metadata": {}, + "outputs": [], + "source": [ + "customers_plus_1.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f8c8868-c1ac-4cee-af08-533d928f6764", + "metadata": {}, + "outputs": [], + "source": [ + "customers_plus_1['id'].nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf95daf2-4852-4718-b474-207a1ebd8ac4", + "metadata": {}, + "outputs": [], + "source": [ + "customers_plus_2['id'].nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1425c385-3216-4e4f-ae8f-a121624721ba", + "metadata": {}, + "outputs": [], + "source": [ + "common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "92533026-e27c-4f1f-81ca-64eda32a34c0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "common_id = set(customers_plus_2['id']).intersection(customers_plus_1['id'])\n", + "# Exemple id commun = caractéristiques communes\n", + "print(customers_plus_2[customers_plus_2['id'] == list(common_id)[0]])\n", + "\n", + "print(customers_plus_1[customers_plus_1['id'] == list(common_id)[0]])" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "bf9ebc94-0ba6-443d-8e53-22477a6e79a7", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id 0.000000\n", + "lastname 43.461341\n", + "firstname 44.995588\n", + "birthdate 96.419870\n", + "email 8.622075\n", + "street_id 0.000000\n", + "created_at 0.000000\n", + "updated_at 0.000000\n", + "civility 100.000000\n", + "is_partner 0.000000\n", + "extra 100.000000\n", + "deleted_at 100.000000\n", + "reference 100.000000\n", + "gender 0.000000\n", + "is_email_true 0.000000\n", + "extra_field 100.000000\n", + "identifier 0.000000\n", + "opt_in 0.000000\n", + "structure_id 88.072380\n", + "note 99.403421\n", + "profession 95.913503\n", + "language 99.280945\n", + "mcp_contact_id 34.876141\n", + "need_reload 0.000000\n", + "last_buying_date 51.653431\n", + "max_price 51.653431\n", + "ticket_sum 0.000000\n", + "average_price 8.639195\n", + "fidelity 0.000000\n", + "average_purchase_delay 51.653431\n", + "average_price_basket 51.653431\n", + "average_ticket_basket 51.653431\n", + "total_price 43.014236\n", + "preferred_category 100.000000\n", + "preferred_supplier 100.000000\n", + "preferred_formula 100.000000\n", + "purchase_count 0.000000\n", + "first_buying_date 51.653431\n", + "last_visiting_date 100.000000\n", + "zipcode 71.176564\n", + "country 5.459418\n", + "age 96.419870\n", + "tenant_id 0.000000\n", + "dtype: float64\n" + ] + } + ], + "source": [ + "pd.DataFrame(customers_plus_1.isna().mean()*100)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "6d62e73f-3925-490f-9fd4-d0e838903cb2", + "metadata": {}, + "outputs": [], + "source": [ + "# Chargement de toutes les données\n", + "liste_base = ['customer_target_mappings', 'customersplus', 'target_types', 'tags', 'events', 'tickets', 'representations', 'purchases', 'products']\n", + "\n", + "for nom_base in liste_base:\n", + " FILE_PATH_S3 = 'bdc2324-data/11/11' + nom_base + '.csv'\n", + " with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n", + " globals()[nom_base] = pd.read_csv(file_in, sep=\",\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "12b24f1c-eb3e-45be-aaf3-b9273180caa3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idlastnamefirstnamebirthdateemailstreet_idcreated_atupdated_atcivilityis_partner...tenant_idid_xcustomer_idpurchase_datetype_ofis_from_subscriptionamountis_full_pricestart_date_timeevent_name
0405082lastname405082NaNNaNNaN62023-01-12 06:30:31.197484+01:002023-01-12 06:30:31.197484+01:00NaNFalse...15569924234050822023-01-11 17:08:41+01:003False13.0False2023-02-06 20:00:00+01:00zaide
1405082lastname405082NaNNaNNaN62023-01-12 06:30:31.197484+01:002023-01-12 06:30:31.197484+01:00NaNFalse...15569924234050822023-01-11 17:08:41+01:003False13.0False2023-02-06 20:00:00+01:00zaide
2411168lastname411168NaNNaNNaN62023-03-17 06:30:35.431967+01:002023-03-17 06:30:35.431967+01:00NaNFalse...155610539344111682023-03-16 16:23:10+01:003False62.0False2023-03-19 16:00:00+01:00luisa miller
3411168lastname411168NaNNaNNaN62023-03-17 06:30:35.431967+01:002023-03-17 06:30:35.431967+01:00NaNFalse...155610539344111682023-03-16 16:23:10+01:003False62.0False2023-03-19 16:00:00+01:00luisa miller
44380lastname4380firstname4380NaNNaN12021-04-22 14:51:55.432952+02:002022-04-14 11:41:33.738500+02:00NaNFalse...1556118914143802020-11-26 13:12:53+01:003False51.3False2020-12-01 20:00:00+01:00iphigenie en tauride
..................................................................
31896419095lastname19095firstname190951979-07-16email1909562021-04-22 15:06:30.120537+02:002023-09-12 18:27:36.904104+02:00NaNFalse...15561090839190952019-05-19 21:18:36+02:001False4.5False2019-05-27 20:00:00+02:00entre femmes
31896519095lastname19095firstname190951979-07-16email1909562021-04-22 15:06:30.120537+02:002023-09-12 18:27:36.904104+02:00NaNFalse...15561090839190952019-05-19 21:18:36+02:001False4.5False2019-05-27 20:00:00+02:00entre femmes
31896619095lastname19095firstname190951979-07-16email1909562021-04-22 15:06:30.120537+02:002023-09-12 18:27:36.904104+02:00NaNFalse...15561090839190952019-05-19 21:18:36+02:001False4.5False2019-05-27 20:00:00+02:00entre femmes
31896719095lastname19095firstname190951979-07-16email1909562021-04-22 15:06:30.120537+02:002023-09-12 18:27:36.904104+02:00NaNFalse...15561244277190952019-12-31 11:04:07+01:001False5.5False2020-02-03 20:00:00+01:00a boire et a manger
31896819095lastname19095firstname190951979-07-16email1909562021-04-22 15:06:30.120537+02:002023-09-12 18:27:36.904104+02:00NaNFalse...15561244277190952019-12-31 11:04:07+01:001False5.5False2020-02-03 20:00:00+01:00a boire et a manger
\n", + "

318969 rows × 52 columns

\n", + "
" + ], + "text/plain": [ + " id lastname firstname birthdate email \\\n", + "0 405082 lastname405082 NaN NaN NaN \n", + "1 405082 lastname405082 NaN NaN NaN \n", + "2 411168 lastname411168 NaN NaN NaN \n", + "3 411168 lastname411168 NaN NaN NaN \n", + "4 4380 lastname4380 firstname4380 NaN NaN \n", + "... ... ... ... ... ... \n", + "318964 19095 lastname19095 firstname19095 1979-07-16 email19095 \n", + "318965 19095 lastname19095 firstname19095 1979-07-16 email19095 \n", + "318966 19095 lastname19095 firstname19095 1979-07-16 email19095 \n", + "318967 19095 lastname19095 firstname19095 1979-07-16 email19095 \n", + "318968 19095 lastname19095 firstname19095 1979-07-16 email19095 \n", + "\n", + " street_id created_at \\\n", + "0 6 2023-01-12 06:30:31.197484+01:00 \n", + "1 6 2023-01-12 06:30:31.197484+01:00 \n", + "2 6 2023-03-17 06:30:35.431967+01:00 \n", + "3 6 2023-03-17 06:30:35.431967+01:00 \n", + "4 1 2021-04-22 14:51:55.432952+02:00 \n", + "... ... ... \n", + "318964 6 2021-04-22 15:06:30.120537+02:00 \n", + "318965 6 2021-04-22 15:06:30.120537+02:00 \n", + "318966 6 2021-04-22 15:06:30.120537+02:00 \n", + "318967 6 2021-04-22 15:06:30.120537+02:00 \n", + "318968 6 2021-04-22 15:06:30.120537+02:00 \n", + "\n", + " updated_at civility is_partner ... \\\n", + "0 2023-01-12 06:30:31.197484+01:00 NaN False ... \n", + "1 2023-01-12 06:30:31.197484+01:00 NaN False ... \n", + "2 2023-03-17 06:30:35.431967+01:00 NaN False ... \n", + "3 2023-03-17 06:30:35.431967+01:00 NaN False ... \n", + "4 2022-04-14 11:41:33.738500+02:00 NaN False ... \n", + "... ... ... ... ... \n", + "318964 2023-09-12 18:27:36.904104+02:00 NaN False ... \n", + "318965 2023-09-12 18:27:36.904104+02:00 NaN False ... \n", + "318966 2023-09-12 18:27:36.904104+02:00 NaN False ... \n", + "318967 2023-09-12 18:27:36.904104+02:00 NaN False ... \n", + "318968 2023-09-12 18:27:36.904104+02:00 NaN False ... \n", + "\n", + " tenant_id id_x customer_id purchase_date type_of \\\n", + "0 1556 992423 405082 2023-01-11 17:08:41+01:00 3 \n", + "1 1556 992423 405082 2023-01-11 17:08:41+01:00 3 \n", + "2 1556 1053934 411168 2023-03-16 16:23:10+01:00 3 \n", + "3 1556 1053934 411168 2023-03-16 16:23:10+01:00 3 \n", + "4 1556 1189141 4380 2020-11-26 13:12:53+01:00 3 \n", + "... ... ... ... ... ... \n", + "318964 1556 1090839 19095 2019-05-19 21:18:36+02:00 1 \n", + "318965 1556 1090839 19095 2019-05-19 21:18:36+02:00 1 \n", + "318966 1556 1090839 19095 2019-05-19 21:18:36+02:00 1 \n", + "318967 1556 1244277 19095 2019-12-31 11:04:07+01:00 1 \n", + "318968 1556 1244277 19095 2019-12-31 11:04:07+01:00 1 \n", + "\n", + " is_from_subscription amount is_full_price start_date_time \\\n", + "0 False 13.0 False 2023-02-06 20:00:00+01:00 \n", + "1 False 13.0 False 2023-02-06 20:00:00+01:00 \n", + "2 False 62.0 False 2023-03-19 16:00:00+01:00 \n", + "3 False 62.0 False 2023-03-19 16:00:00+01:00 \n", + "4 False 51.3 False 2020-12-01 20:00:00+01:00 \n", + "... ... ... ... ... \n", + "318964 False 4.5 False 2019-05-27 20:00:00+02:00 \n", + "318965 False 4.5 False 2019-05-27 20:00:00+02:00 \n", + "318966 False 4.5 False 2019-05-27 20:00:00+02:00 \n", + "318967 False 5.5 False 2020-02-03 20:00:00+01:00 \n", + "318968 False 5.5 False 2020-02-03 20:00:00+01:00 \n", + "\n", + " event_name \n", + "0 zaide \n", + "1 zaide \n", + "2 luisa miller \n", + "3 luisa miller \n", + "4 iphigenie en tauride \n", + "... ... \n", + "318964 entre femmes \n", + "318965 entre femmes \n", + "318966 entre femmes \n", + "318967 a boire et a manger \n", + "318968 a boire et a manger \n", + "\n", + "[318969 rows x 52 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Jointure\n", + "merge_1 = pd.merge(purchases, tickets, left_on='id', right_on='purchase_id', how='inner')[['id_x', 'customer_id','product_id', 'purchase_date', 'type_of', 'is_from_subscription']]\n", + "merge_2 = pd.merge(products, merge_1, left_on='id', right_on='product_id', how='inner')[['id_x', 'customer_id', 'representation_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price']]\n", + "merge_3 = pd.merge(representations, merge_2, left_on='id', right_on='representation_id', how='inner')[['id_x', 'customer_id', 'event_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time']]\n", + "merge_4 = pd.merge(events, merge_3, left_on='id', right_on='event_id', how='inner')[['id_x', 'customer_id', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'name']]\n", + "merge_4 = merge_4.rename(columns={'name': 'event_name'})\n", + "df_customer_event = pd.merge(customersplus, merge_4, left_on = 'id', right_on = 'customer_id', how = 'inner')[['id_x', 'purchase_date', 'type_of', 'is_from_subscription', 'amount', 'is_full_price', 'start_date_time', 'event_name']]\n", + "df_customer_event" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, "nbformat": 4, "nbformat_minor": 5 }