{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "aa915888-cede-4eb0-8a26-7df573d29a3e", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import os\n", "import s3fs\n", "import warnings\n", "from datetime import date, timedelta, datetime\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 2, "id": "17949e81-c30b-4fdf-9872-d7dc2b22ba9e", "metadata": {}, "outputs": [], "source": [ "# Import KPI construction functions\n", "#exec(open('0_KPI_functions.py').read())\n", "exec(open('../0_KPI_functions.py').read())\n" ] }, { "cell_type": "code", "execution_count": 3, "id": "9c1737a2-bad8-4266-8dec-452085d8cfe7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['projet-bdc2324-team1/0_Input/Company_10/campaigns_information.csv',\n", " 'projet-bdc2324-team1/0_Input/Company_10/customerplus_cleaned.csv',\n", " 'projet-bdc2324-team1/0_Input/Company_10/products_purchased_reduced.csv',\n", " 'projet-bdc2324-team1/0_Input/Company_10/target_information.csv']" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Create filesystem object\n", "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n", "\n", "BUCKET = \"projet-bdc2324-team1/0_Input/Company_10\"\n", "fs.ls(BUCKET)" ] }, { "cell_type": "code", "execution_count": 4, "id": "a35dc2f6-2017-4b21-abd2-2c4c112c96b2", "metadata": {}, "outputs": [], "source": [ "dic_base=['campaigns_information','customerplus_cleaned','products_purchased_reduced','target_information']\n", "for nom_base in dic_base:\n", " FILE_PATH_S3_fanta = 'projet-bdc2324-team1/0_Input/Company_10/' + nom_base + '.csv'\n", " with fs.open(FILE_PATH_S3_fanta, mode=\"rb\") as file_in:\n", " globals()[nom_base] = pd.read_csv(file_in, sep=\",\")" ] }, { "cell_type": "code", "execution_count": 5, "id": "40b705eb-fd18-436b-b150-61611a3c6a84", "metadata": {}, "outputs": [], "source": [ "\n", "def display_databases(directory_path, file_name, datetime_col = None):\n", " \"\"\"\n", " This function returns the file from s3 storage \n", " \"\"\"\n", " file_path = \"projet-bdc2324-team1\" + \"/0_Input/Company_\" + directory_path + \"/\" + file_name + \".csv\"\n", " print(\"File path : \", file_path)\n", " with fs.open(file_path, mode=\"rb\") as file_in:\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser) \n", " return df \n" ] }, { "cell_type": "code", "execution_count": 25, "id": "afd044b8-ac83-4a35-b959-700cae0b3b41", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "File path : projet-bdc2324-team1/0_Input/Company_10/customerplus_cleaned.csv\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_16962/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "File path : projet-bdc2324-team1/0_Input/Company_10/campaigns_information.csv\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_16962/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "File path : projet-bdc2324-team1/0_Input/Company_10/products_purchased_reduced.csv\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_16962/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "File path : projet-bdc2324-team1/0_Input/Company_10/target_information.csv\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_16962/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n", ":27: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "File path : projet-bdc2324-team1/0_Input/Company_11/customerplus_cleaned.csv\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_16962/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "File path : projet-bdc2324-team1/0_Input/Company_11/campaigns_information.csv\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_16962/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "File path : projet-bdc2324-team1/0_Input/Company_11/products_purchased_reduced.csv\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_16962/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "File path : projet-bdc2324-team1/0_Input/Company_11/target_information.csv\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_16962/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n", ":27: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "File path : projet-bdc2324-team1/0_Input/Company_12/customerplus_cleaned.csv\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_16962/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "File path : projet-bdc2324-team1/0_Input/Company_12/campaigns_information.csv\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_16962/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "File path : projet-bdc2324-team1/0_Input/Company_12/products_purchased_reduced.csv\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_16962/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n", "/tmp/ipykernel_16962/2987234667.py:8: DtypeWarning: Columns (4,8,10) have mixed types. Specify dtype option on import or set low_memory=False.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "File path : projet-bdc2324-team1/0_Input/Company_12/target_information.csv\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_16962/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n", ":27: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "File path : projet-bdc2324-team1/0_Input/Company_13/customerplus_cleaned.csv\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_16962/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "File path : projet-bdc2324-team1/0_Input/Company_13/campaigns_information.csv\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_16962/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "File path : projet-bdc2324-team1/0_Input/Company_13/products_purchased_reduced.csv\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_16962/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "File path : projet-bdc2324-team1/0_Input/Company_13/target_information.csv\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_16962/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n", ":27: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "File path : projet-bdc2324-team1/0_Input/Company_14/customerplus_cleaned.csv\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_16962/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "File path : projet-bdc2324-team1/0_Input/Company_14/campaigns_information.csv\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_16962/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "File path : projet-bdc2324-team1/0_Input/Company_14/products_purchased_reduced.csv\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_16962/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n", "/tmp/ipykernel_16962/2987234667.py:8: DtypeWarning: Columns (8,9) have mixed types. Specify dtype option on import or set low_memory=False.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "File path : projet-bdc2324-team1/0_Input/Company_14/target_information.csv\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_16962/2987234667.py:8: FutureWarning: The argument 'date_parser' is deprecated and will be removed in a future version. Please use 'date_format' instead, or read your data in as 'object' dtype and then call 'to_datetime'.\n", " df = pd.read_csv(file_in, sep=\",\", parse_dates = datetime_col, date_parser=custom_date_parser)\n", ":27: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n" ] } ], "source": [ "#creation des base de KPI aggreger pour les 5 entreprises\n", "\n", "nb_compagnie=['10','11','12','13','14']\n", "for directory_path in nb_compagnie:\n", " df_customerplus_clean_0 = display_databases(directory_path, file_name = \"customerplus_cleaned\")\n", " df_campaigns_information = display_databases(directory_path, file_name = \"campaigns_information\", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])\n", " df_products_purchased_reduced = display_databases(directory_path, file_name = \"products_purchased_reduced\", datetime_col = ['purchase_date'])\n", " df_target_information = display_databases(directory_path, file_name = \"target_information\")\n", " df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information) \n", " df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced)\n", " df_customerplus_clean = customerplus_kpi_function(customerplus_clean = df_customerplus_clean_0)\n", "\n", " \n", "#creation de la colonne Number compagnie\n", " df_tickets_kpi[\"Number_compagnie\"]=int(directory_path)\n", " df_campaigns_kpi[\"Number_compagnie\"]=int(directory_path)\n", " df_customerplus_clean[\"Number_compagnie\"]=int(directory_path)\n", " df_target_information[\"Number_compagnie\"]=int(directory_path)\n", "\n", " if nb_compagnie.index(directory_path)>=1:\n", " customerplus_clean_spectacle=pd.concat([customerplus_clean_spectacle,df_customerplus_clean],axis=0)\n", " campaigns_information_spectacle=pd.concat([campaigns_information_spectacle,df_campaigns_kpi],axis=0)\n", " products_purchased_reduced_spectacle=pd.concat([products_purchased_reduced_spectacle,df_tickets_kpi],axis=0)\n", " target_information_spectacle=pd.concat([target_information_spectacle,df_target_information],axis=0)\n", " else:\n", " customerplus_clean_spectacle=df_customerplus_clean\n", " campaigns_information_spectacle=df_campaigns_kpi\n", " products_purchased_reduced_spectacle=df_tickets_kpi\n", " target_information_spectacle=df_target_information" ] }, { "cell_type": "code", "execution_count": 38, "id": "05b9a396-dcd7-4d3d-8b39-5ca48beba4b0", "metadata": {}, "outputs": [], "source": [ "#customerplus_clean_spectacle.isna().sum()\n", "#campaigns_information_spectacle.isna().sum()\n", "#products_purchased_reduced_spectacle.isna().sum()\n", "#target_information_spectacle.isna().sum()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.6" } }, "nbformat": 4, "nbformat_minor": 5 }