BDC-team-1/Spectacle/Stat_desc.ipynb
2024-03-02 11:16:24 +00:00

410 lines
14 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "aa915888-cede-4eb0-8a26-7df573d29a3e",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import os\n",
"import s3fs\n",
"import warnings\n",
"from datetime import date, timedelta, datetime\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "17949e81-c30b-4fdf-9872-d7dc2b22ba9e",
"metadata": {},
"outputs": [],
"source": [
"# Import KPI construction functions into the notebook namespace.\n",
"# NOTE(review): presumably this script defines custom_date_parser and the *_kpi_function\n",
"# helpers called in later cells - confirm against ../0_KPI_functions.py.\n",
"# The context manager closes the file handle once the script has been read.\n",
"with open('../0_KPI_functions.py') as kpi_script:\n",
"    exec(kpi_script.read())\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "9c1737a2-bad8-4266-8dec-452085d8cfe7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['projet-bdc2324-team1/0_Input/Company_10/campaigns_information.csv',\n",
" 'projet-bdc2324-team1/0_Input/Company_10/customerplus_cleaned.csv',\n",
" 'projet-bdc2324-team1/0_Input/Company_10/products_purchased_reduced.csv',\n",
" 'projet-bdc2324-team1/0_Input/Company_10/target_information.csv']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Build the S3 client from the endpoint exposed in the environment\n",
"S3_ENDPOINT_URL = f\"https://{os.environ['AWS_S3_ENDPOINT']}\"\n",
"fs = s3fs.S3FileSystem(client_kwargs=dict(endpoint_url=S3_ENDPOINT_URL))\n",
"\n",
"# List the files stored under the company 10 input folder\n",
"BUCKET = \"projet-bdc2324-team1/0_Input/Company_10\"\n",
"fs.ls(BUCKET)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "a35dc2f6-2017-4b21-abd2-2c4c112c96b2",
"metadata": {},
"outputs": [],
"source": [
"# Read every raw input table listed in dic_base into a module-level DataFrame\n",
"# that carries the same name as its CSV file.\n",
"# NOTE: globals() is kept so that campaigns_information, customerplus_cleaned,\n",
"# products_purchased_reduced and target_information keep being defined as before.\n",
"dic_base = ['campaigns_information', 'customerplus_cleaned', 'products_purchased_reduced', 'target_information']\n",
"for nom_base in dic_base:\n",
"    # Derive the path from BUCKET instead of repeating the folder literal\n",
"    FILE_PATH_S3_fanta = BUCKET + '/' + nom_base + '.csv'\n",
"    with fs.open(FILE_PATH_S3_fanta, mode=\"rb\") as file_in:\n",
"        globals()[nom_base] = pd.read_csv(file_in, sep=\",\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "40b705eb-fd18-436b-b150-61611a3c6a84",
"metadata": {},
"outputs": [],
"source": [
"\n",
"def display_databases(directory_path, file_name, datetime_col = None):\n",
"    \"\"\"\n",
"    Read one input CSV file of a company from S3 storage and return it as a DataFrame.\n",
"\n",
"    Args:\n",
"        directory_path: company number, appended to the 'Company_' folder prefix (e.g. '10').\n",
"        file_name: name of the CSV file, without the '.csv' extension.\n",
"        datetime_col: list of columns to parse as dates; when None, no date parsing is requested.\n",
"\n",
"    Returns:\n",
"        The content of the CSV file as a pandas DataFrame.\n",
"    \"\"\"\n",
"    file_path = f\"projet-bdc2324-team1/0_Input/Company_{directory_path}/{file_name}.csv\"\n",
"    print(\"File path : \", file_path)\n",
"\n",
"    # Only hand the deprecated date_parser argument to pandas when date columns are requested:\n",
"    # passing it without parse_dates still triggers the FutureWarning.\n",
"    # NOTE(review): custom_date_parser is not defined in this notebook; presumably it comes from\n",
"    # ../0_KPI_functions.py. Moving to date_format / pd.to_datetime requires checking what it does.\n",
"    date_options = {}\n",
"    if datetime_col is not None:\n",
"        date_options = {\"parse_dates\": datetime_col, \"date_parser\": custom_date_parser}\n",
"\n",
"    with fs.open(file_path, mode=\"rb\") as file_in:\n",
"        df = pd.read_csv(file_in, sep=\",\", **date_options)\n",
"    return df\n"
]
},
{
"cell_type": "markdown",
"id": "e22eb500-80da-4dd9-8b20-9e4deec64831",
"metadata": {},
"source": [
"*Not executed: this cell is stored as markdown. It holds the code that loads the three tables of companies 10 to 14 and stacks them into one DataFrame per table.*\n",
"\n",
"```python\n",
"nb_compagnie=['10','11','12','13','14']\n",
"for directory_path in nb_compagnie:\n",
"    df_customerplus_clean = display_databases(directory_path, file_name = \"customerplus_cleaned\")\n",
"    df_campaigns_information = display_databases(directory_path, file_name = \"campaigns_information\", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])\n",
"    df_products_purchased_reduced = display_databases(directory_path, file_name = \"products_purchased_reduced\", datetime_col = ['purchase_date'])\n",
"    df_customerplus_clean[\"Number_compagnie\"]=int(directory_path)\n",
"    df_campaigns_information[\"Number_compagnie\"]=int(directory_path)\n",
"    df_products_purchased_reduced[\"Number_compagnie\"]=int(directory_path)\n",
"\n",
"    if nb_compagnie.index(directory_path)>=1:\n",
"        customerplus_clean_spectacle=pd.concat([customerplus_clean_spectacle,df_customerplus_clean],axis=0)\n",
"        campaigns_information_spectacle=pd.concat([campaigns_information_spectacle,df_campaigns_information],axis=0)\n",
"        products_purchased_reduced_spectacle=pd.concat([products_purchased_reduced_spectacle,df_products_purchased_reduced],axis=0)\n",
"    else:\n",
"        customerplus_clean_spectacle=df_customerplus_clean\n",
"        campaigns_information_spectacle=df_campaigns_information\n",
"        products_purchased_reduced_spectacle=df_products_purchased_reduced\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "1c4a07ec-cf8e-420e-88f0-5eb4b83e2bc1",
"metadata": {},
"outputs": [],
"source": [
"# Company number whose tables are loaded by the following cells\n",
"directory_path = \"10\""
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "4249cdcd-ef59-4dd6-a345-4218bb90e526",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : projet-bdc2324-team1/0_Input/Company_10/customerplus_cleaned.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_1282/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n",
" df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : projet-bdc2324-team1/0_Input/Company_10/campaigns_information.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_1282/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n",
" df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"File path : projet-bdc2324-team1/0_Input/Company_10/products_purchased_reduced.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_1282/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n",
" df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n"
]
}
],
"source": [
"# Load the three raw tables of the selected company\n",
"df_customerplus_clean_0 = display_databases(\n",
"    directory_path,\n",
"    file_name=\"customerplus_cleaned\",\n",
")\n",
"df_campaigns_information = display_databases(\n",
"    directory_path,\n",
"    file_name=\"campaigns_information\",\n",
"    datetime_col=['opened_at', 'sent_at', 'campaign_sent_at'],\n",
")\n",
"df_products_purchased_reduced = display_databases(\n",
"    directory_path,\n",
"    file_name=\"products_purchased_reduced\",\n",
"    datetime_col=['purchase_date'],\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "e56aa16f-a167-4bff-9f8b-f764d1f28ebd",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"<string>:27: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"
]
}
],
"source": [
"# Build the KPI tables\n",
"\n",
"# KPIs on advertising campaigns\n",
"df_campaigns_kpi = campaigns_kpi_function(campaigns_information=df_campaigns_information)\n",
"\n",
"# KPIs on purchasing behaviour\n",
"df_tickets_kpi = tickets_kpi_function(tickets_information=df_products_purchased_reduced)\n",
"\n",
"# KPIs on socio-demographic data\n",
"df_customerplus_clean = customerplus_kpi_function(customerplus_clean=df_customerplus_clean_0)\n",
"\n",
"# Add the Number_compagnie column to every KPI table\n",
"for kpi_table in (df_tickets_kpi, df_campaigns_kpi, df_customerplus_clean):\n",
"    kpi_table[\"Number_compagnie\"] = int(directory_path)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "838722c3-4c78-4ffa-a6b6-01ac60f4bdbd",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>customer_id</th>\n",
" <th>nb_campaigns</th>\n",
" <th>nb_campaigns_opened</th>\n",
" <th>time_to_open</th>\n",
" <th>Number_compagnie</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>29</td>\n",
" <td>4</td>\n",
" <td>0.0</td>\n",
" <td>NaT</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>37</td>\n",
" <td>3</td>\n",
" <td>0.0</td>\n",
" <td>NaT</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>39</td>\n",
" <td>4</td>\n",
" <td>1.0</td>\n",
" <td>0 days 05:16:38</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>41</td>\n",
" <td>4</td>\n",
" <td>1.0</td>\n",
" <td>0 days 01:12:29</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>44</td>\n",
" <td>4</td>\n",
" <td>0.0</td>\n",
" <td>NaT</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>57138</th>\n",
" <td>827940</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>NaT</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>57139</th>\n",
" <td>827941</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>NaT</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>57140</th>\n",
" <td>827942</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>NaT</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>57141</th>\n",
" <td>827943</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>NaT</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>57142</th>\n",
" <td>827944</td>\n",
" <td>1</td>\n",
" <td>0.0</td>\n",
" <td>NaT</td>\n",
" <td>10</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>57143 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" customer_id nb_campaigns nb_campaigns_opened time_to_open \\\n",
"0 29 4 0.0 NaT \n",
"1 37 3 0.0 NaT \n",
"2 39 4 1.0 0 days 05:16:38 \n",
"3 41 4 1.0 0 days 01:12:29 \n",
"4 44 4 0.0 NaT \n",
"... ... ... ... ... \n",
"57138 827940 1 0.0 NaT \n",
"57139 827941 1 0.0 NaT \n",
"57140 827942 1 0.0 NaT \n",
"57141 827943 1 0.0 NaT \n",
"57142 827944 1 0.0 NaT \n",
"\n",
" Number_compagnie \n",
"0 10 \n",
"1 10 \n",
"2 10 \n",
"3 10 \n",
"4 10 \n",
"... ... \n",
"57138 10 \n",
"57139 10 \n",
"57140 10 \n",
"57141 10 \n",
"57142 10 \n",
"\n",
"[57143 rows x 5 columns]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_campaigns_kpi"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}