diff --git a/0_Cleaning_and_merge.ipynb b/0_Cleaning_and_merge.ipynb index 5077370..e77968c 100644 --- a/0_Cleaning_and_merge.ipynb +++ b/0_Cleaning_and_merge.ipynb @@ -1529,7 +1529,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 25, "id": "98f78cd5-b694-4cc6-b033-20170aa13e8d", "metadata": {}, "outputs": [], @@ -1559,7 +1559,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 26, "id": "e2c88552-b863-47a2-be23-8d2898fb28bc", "metadata": {}, "outputs": [], @@ -1593,7 +1593,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 27, "id": "24537647-bc29-4777-9848-ac4120a4aa60", "metadata": {}, "outputs": [], @@ -1603,7 +1603,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 28, "id": "6be2a9a6-056b-4e19-8c26-a18ba3df36b3", "metadata": {}, "outputs": [ @@ -1639,7 +1639,7 @@ " 0\n", " 2\n", " 4\n", - " 0.0\n", + " NaN\n", " NaT\n", " \n", " \n", @@ -1660,14 +1660,14 @@ " 3\n", " 5\n", " 4\n", - " 0.0\n", + " NaN\n", " NaT\n", " \n", " \n", " 4\n", " 6\n", " 20\n", - " 0.0\n", + " NaN\n", " NaT\n", " \n", " \n", @@ -1676,14 +1676,14 @@ ], "text/plain": [ " customer_id nb_campaigns nb_campaigns_opened time_to_open\n", - "0 2 4 0.0 NaT\n", + "0 2 4 NaN NaT\n", "1 3 222 124.0 1 days 00:28:30.169354838\n", "2 4 7 7.0 1 days 04:31:01.428571428\n", - "3 5 4 0.0 NaT\n", - "4 6 20 0.0 NaT" + "3 5 4 NaN NaT\n", + "4 6 20 NaN NaT" ] }, - "execution_count": 29, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -1702,7 +1702,7 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 29, "id": "043303fe-e90f-4689-a2a9-5d690555a045", "metadata": {}, "outputs": [], @@ -1765,7 +1765,7 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 30, "id": "5882234a-1ed5-4269-87a6-0d75613476e3", "metadata": {}, "outputs": [], @@ -1775,7 +1775,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 31, "id": 
"5f2046cf-ffde-4521-91e7-b727b8bc17f5", "metadata": {}, "outputs": [ @@ -1811,6 +1811,8 @@ " purchase_date_max\n", " time_between_purchase\n", " nb_tickets_internet\n", + " name_event_types\n", + " avg_amount\n", " \n", " \n", " \n", @@ -1827,6 +1829,8 @@ " 4.179306\n", " 3258.011562\n", " 51.0\n", + " offre muséale individuel\n", + " 6.150659\n", " \n", " \n", " 1\n", @@ -1841,6 +1845,8 @@ " 5.221840\n", " 3692.976389\n", " 2988.0\n", + " spectacle vivant\n", + " 7.762474\n", " \n", " \n", " 2\n", @@ -1855,6 +1861,8 @@ " 0.146331\n", " 3803.223461\n", " 9.0\n", + " offre muséale groupe\n", + " 4.452618\n", " \n", " \n", " 3\n", @@ -1869,6 +1877,8 @@ " 1408.715532\n", " 1093.999977\n", " 5.0\n", + " formule adhésion\n", + " 6.439463\n", " \n", " \n", " 4\n", @@ -1883,6 +1893,8 @@ " 1340.308160\n", " 700.966389\n", " 0.0\n", + " offre muséale individuel\n", + " 6.150659\n", " \n", " \n", "\n", @@ -1903,15 +1915,22 @@ "3 5 1 2502.715509 1408.715532 \n", "4 1 0 2041.274549 1340.308160 \n", "\n", - " time_between_purchase nb_tickets_internet \n", - "0 3258.011562 51.0 \n", - "1 3692.976389 2988.0 \n", - "2 3803.223461 9.0 \n", - "3 1093.999977 5.0 \n", - "4 700.966389 0.0 " + " time_between_purchase nb_tickets_internet name_event_types \\\n", + "0 3258.011562 51.0 offre muséale individuel \n", + "1 3692.976389 2988.0 spectacle vivant \n", + "2 3803.223461 9.0 offre muséale groupe \n", + "3 1093.999977 5.0 formule adhésion \n", + "4 700.966389 0.0 offre muséale individuel \n", + "\n", + " avg_amount \n", + "0 6.150659 \n", + "1 7.762474 \n", + "2 4.452618 \n", + "3 6.439463 \n", + "4 6.150659 " ] }, - "execution_count": 36, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -1922,7 +1941,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 32, "id": "a4a2311d-8a72-4030-afd5-218004d5d2a5", "metadata": {}, "outputs": [], @@ -1946,7 +1965,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 33, 
"id": "83230baa-9a8a-4614-b629-e99c2505c696", "metadata": {}, "outputs": [ @@ -2159,7 +2178,7 @@ "[5 rows x 37 columns]" ] }, - "execution_count": 43, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -2173,7 +2192,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 34, "id": "433921de-03ad-4024-9462-ecd267db1756", "metadata": {}, "outputs": [ @@ -2339,7 +2358,7 @@ " formule adhésion\n", " 6.439463\n", " 4.0\n", - " 0.0\n", + " NaN\n", " NaT\n", " \n", " \n", @@ -2381,12 +2400,12 @@ "1 NaN NaT \n", "2 NaN NaT \n", "3 NaN NaT \n", - "4 0.0 NaT \n", + "4 NaN NaT \n", "\n", "[5 rows x 40 columns]" ] }, - "execution_count": 44, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } @@ -2400,7 +2419,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 35, "id": "25e54131-6835-4e94-86d3-1a78520ed7bc", "metadata": {}, "outputs": [], @@ -2426,7 +2445,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 36, "id": "8710611c-7eb8-45ca-bdcc-009f4081f9e2", "metadata": {}, "outputs": [], @@ -2468,7 +2487,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 38, "id": "46de1912-4a66-46e5-8b9e-7768b2d2723b", "metadata": {}, "outputs": [], @@ -2482,7 +2501,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 39, "id": "d53825e4-6453-45bc-94f2-7b2504ec4afb", "metadata": {}, "outputs": [ @@ -2688,7 +2707,7 @@ "[5 rows x 28 columns]" ] }, - "execution_count": 64, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } @@ -2699,7 +2718,7 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 40, "id": "1e42a790-b215-4107-a969-85005da06ebd", "metadata": {}, "outputs": [], @@ -2713,28 +2732,394 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 41, "id": "d950f24d-a5d1-4f1e-aeaa-ca826470365f", "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idevent_type_idnb_ticketsnb_purchasestotal_amountnb_suppliersvente_internet_maxpurchase_date_minpurchase_date_maxtime_between_purchase...average_ticket_baskettotal_pricepurchase_countfirst_buying_datecountryagetenant_idnb_campaignsnb_campaigns_openedtime_to_open
012.0384226.0194790.02686540.57.01.03262.1908684.1793063258.011562...1.9560878821221.5641472.02013-06-10 10:37:58+00:00frNaN1311.00.00.0NaT
114.0453242.0228945.03248965.56.01.03698.1982295.2218403692.976389...1.9560878821221.5641472.02013-06-10 10:37:58+00:00frNaN1311.00.00.0NaT
215.0201750.0107110.01459190.06.01.03803.3697920.1463313803.223461...1.9560878821221.5641472.02013-06-10 10:37:58+00:00frNaN1311.00.00.0NaT
316.0217356.0111786.01435871.55.01.02502.7155091408.7155321093.999977...1.9560878821221.5641472.02013-06-10 10:37:58+00:00frNaN1311.00.00.0NaT
422.0143.0143.00.01.00.02041.2745491340.308160700.966389...1.0000000.0307.02018-04-07 12:55:07+00:00frNaN1311.04.00.0NaT
..................................................................
15629112561335.03.01.033.01.01.00.1105210.1105210.000000...NaNNaNNaNNaTNaNNaNNaNNaNNaNNaT
15629212561345.04.01.044.01.01.00.0920950.0920950.000000...NaNNaNNaNNaTNaNNaNNaNNaNNaNNaT
15629312561355.01.01.011.01.01.00.0878940.0878940.000000...NaNNaNNaNNaTNaNNaNNaNNaNNaNNaT
15629412561365.02.01.022.01.01.00.0403940.0403940.000000...NaNNaNNaNNaTNaNNaNNaNNaNNaNNaT
15629512561375.02.01.022.01.01.00.0000000.0000000.000000...NaNNaNNaNNaTNaNNaNNaNNaNNaNNaT
\n", + "

156296 rows × 40 columns

\n", + "
" + ], "text/plain": [ - "Index(['customer_id', 'event_type_id', 'nb_tickets', 'nb_purchases',\n", - " 'total_amount', 'nb_suppliers', 'vente_internet_max',\n", - " 'purchase_date_min', 'purchase_date_max', 'time_between_purchase',\n", - " 'nb_tickets_internet', 'name_event_types', 'avg_amount', 'birthdate',\n", - " 'street_id', 'is_partner', 'gender', 'is_email_true', 'opt_in',\n", - " 'structure_id', 'profession', 'language', 'mcp_contact_id',\n", - " 'last_buying_date', 'max_price', 'ticket_sum', 'average_price',\n", - " 'fidelity', 'average_purchase_delay', 'average_price_basket',\n", - " 'average_ticket_basket', 'total_price', 'purchase_count',\n", - " 'first_buying_date', 'country', 'age', 'tenant_id', 'nb_campaigns',\n", - " 'nb_campaigns_opened', 'time_to_open'],\n", - " dtype='object')" + " customer_id event_type_id nb_tickets nb_purchases total_amount \\\n", + "0 1 2.0 384226.0 194790.0 2686540.5 \n", + "1 1 4.0 453242.0 228945.0 3248965.5 \n", + "2 1 5.0 201750.0 107110.0 1459190.0 \n", + "3 1 6.0 217356.0 111786.0 1435871.5 \n", + "4 2 2.0 143.0 143.0 0.0 \n", + "... ... ... ... ... ... \n", + "156291 1256133 5.0 3.0 1.0 33.0 \n", + "156292 1256134 5.0 4.0 1.0 44.0 \n", + "156293 1256135 5.0 1.0 1.0 11.0 \n", + "156294 1256136 5.0 2.0 1.0 22.0 \n", + "156295 1256137 5.0 2.0 1.0 22.0 \n", + "\n", + " nb_suppliers vente_internet_max purchase_date_min \\\n", + "0 7.0 1.0 3262.190868 \n", + "1 6.0 1.0 3698.198229 \n", + "2 6.0 1.0 3803.369792 \n", + "3 5.0 1.0 2502.715509 \n", + "4 1.0 0.0 2041.274549 \n", + "... ... ... ... \n", + "156291 1.0 1.0 0.110521 \n", + "156292 1.0 1.0 0.092095 \n", + "156293 1.0 1.0 0.087894 \n", + "156294 1.0 1.0 0.040394 \n", + "156295 1.0 1.0 0.000000 \n", + "\n", + " purchase_date_max time_between_purchase ... average_ticket_basket \\\n", + "0 4.179306 3258.011562 ... 1.956087 \n", + "1 5.221840 3692.976389 ... 1.956087 \n", + "2 0.146331 3803.223461 ... 1.956087 \n", + "3 1408.715532 1093.999977 ... 
1.956087 \n", + "4 1340.308160 700.966389 ... 1.000000 \n", + "... ... ... ... ... \n", + "156291 0.110521 0.000000 ... NaN \n", + "156292 0.092095 0.000000 ... NaN \n", + "156293 0.087894 0.000000 ... NaN \n", + "156294 0.040394 0.000000 ... NaN \n", + "156295 0.000000 0.000000 ... NaN \n", + "\n", + " total_price purchase_count first_buying_date country age \\\n", + "0 8821221.5 641472.0 2013-06-10 10:37:58+00:00 fr NaN \n", + "1 8821221.5 641472.0 2013-06-10 10:37:58+00:00 fr NaN \n", + "2 8821221.5 641472.0 2013-06-10 10:37:58+00:00 fr NaN \n", + "3 8821221.5 641472.0 2013-06-10 10:37:58+00:00 fr NaN \n", + "4 0.0 307.0 2018-04-07 12:55:07+00:00 fr NaN \n", + "... ... ... ... ... .. \n", + "156291 NaN NaN NaT NaN NaN \n", + "156292 NaN NaN NaT NaN NaN \n", + "156293 NaN NaN NaT NaN NaN \n", + "156294 NaN NaN NaT NaN NaN \n", + "156295 NaN NaN NaT NaN NaN \n", + "\n", + " tenant_id nb_campaigns nb_campaigns_opened time_to_open \n", + "0 1311.0 0.0 0.0 NaT \n", + "1 1311.0 0.0 0.0 NaT \n", + "2 1311.0 0.0 0.0 NaT \n", + "3 1311.0 0.0 0.0 NaT \n", + "4 1311.0 4.0 0.0 NaT \n", + "... ... ... ... ... 
\n", + "156291 NaN NaN NaN NaT \n", + "156292 NaN NaN NaN NaT \n", + "156293 NaN NaN NaN NaT \n", + "156294 NaN NaN NaN NaT \n", + "156295 NaN NaN NaN NaT \n", + "\n", + "[156296 rows x 40 columns]" ] }, - "execution_count": 66, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } @@ -2745,7 +3130,7 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 42, "id": "ebf6d843-dcc0-4e83-b063-94806c0bac17", "metadata": {}, "outputs": [], @@ -2778,7 +3163,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.11.6" } }, "nbformat": 4, diff --git a/0_Cleaning_and_merge.py b/0_Cleaning_and_merge.py index f461547..55fd043 100644 --- a/0_Cleaning_and_merge.py +++ b/0_Cleaning_and_merge.py @@ -34,6 +34,8 @@ for i in range(len(liste_database)) : nom_dataframe = df_prefix + re.search(r'\/(\d+)\/(\d+)([a-zA-Z_]+)\.csv$', current_path).group(3) globals()[nom_dataframe] = df +## 1 - Cleaning of the datasets + # Cleaning customerplus df1_customerplus_clean = preprocessing_customerplus(df1_customersplus) @@ -61,29 +63,91 @@ df1_products_purchased = pd.merge(df1_ticket_information, products_global, left_ # Selection des variables d'intérêts df1_products_purchased_reduced = df1_products_purchased[['ticket_id', 'customer_id', 'purchase_id' ,'event_type_id', 'supplier_name', 'purchase_date', 'type_of_ticket_name', 'amount', 'children', 'is_full_price', 'name_event_types', 'name_facilities', 'name_categories', 'name_events', 'name_seasons']] -# Fusion de l'ensemble et creation des KPI -df1_campaigns_kpi = campaigns_kpi_function(campaigns_information = df1_campaigns_information) +## 2 - Construction of KPIs on a given period -df1_tickets_kpi = tickets_kpi_function(tickets_information = df1_products_purchased_reduced) +def explanatory_variables(min_date = "2021-09-01", max_date = "2023-09-01", df_campaigns_information = df1_campaigns_information, df_products_purchased_reduced = 
df1_products_purchased_reduced, df_customerplus_clean = df1_customerplus_clean): + + # Filtre de cohérence pour la mise en pratique de notre méthode + max_date = pd.to_datetime(max_date, utc = True, format = 'ISO8601') + min_date = pd.to_datetime(min_date, utc = True, format = 'ISO8601') -# Fill NaN values -df1_customer[['nb_campaigns', 'nb_campaigns_opened']] = df1_customer[['nb_campaigns', 'nb_campaigns_opened']].fillna(0) + #Filtre de la base df_campaigns_information + df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] <= max_date) & (df_campaigns_information['sent_at'] >= min_date)].copy() + df_campaigns_information.loc[df_campaigns_information['opened_at'] >= max_date, 'opened_at'] = pd.NaT + + #Filtre de la base df_products_purchased_reduced + df_products_purchased_reduced = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= max_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)] -# Fusion avec KPI liés au comportement d'achat -df1_customer_product = pd.merge(df1_tickets_kpi, df1_customer, on = 'customer_id', how = 'outer') + print("Data filtering : SUCCESS") + + # Fusion de l'ensemble et creation des KPI + df_campaigns_kpi = campaigns_kpi_function(campaigns_information = df_campaigns_information) + df_tickets_kpi = tickets_kpi_function(tickets_information = df_products_purchased_reduced) -# Fill NaN values -df1_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df1_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0) + print("KPIs construction : SUCCESS") + # Fusion avec KPI liés au customer + df_customer = pd.merge(df_customerplus_clean, df_campaigns_kpi, on = 'customer_id', how 
= 'left') + + # Fill NaN values + df_customer[['nb_campaigns', 'nb_campaigns_opened']] = df_customer[['nb_campaigns', 'nb_campaigns_opened']].fillna(0) + + # Fusion avec KPI liés au comportement d'achat + df_customer_product = pd.merge(df_tickets_kpi, df_customer, on = 'customer_id', how = 'outer') + + # Fill NaN values + df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']] = df_customer_product[['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'nb_tickets_internet']].fillna(0) + + return df_customer_product + +# Fonction pour créer les variables expliquée +def explained_variable(min_date = "2023-08-01", max_date = "2023-11-01", df_products_purchased_reduced = df1_products_purchased_reduced): + + # Filtrer la base d'achat + df_products_purchased_reduced = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= max_date) & (df_products_purchased_reduced['purchase_date'] > min_date)] + + # Indicatrice d'achat + df_products_purchased_reduced['y_has_purchased'] = 1 + + y = df_products_purchased_reduced[['customer_id', 'event_type_id', 'y_has_purchased']].drop_duplicates() + + return y ## Exportation -# Exportation vers 'projet-bdc2324-team1' -BUCKET_OUT = "projet-bdc2324-team1" -FILE_KEY_OUT_S3 = "1_Output/Company 1 - Segmentation base.csv" +# Dossier d'exportation +BUCKET_OUT = "projet-bdc2324-team1/1_Output/Logistique Regression databases - First approach" + +X_test = explanatory_variables(min_date = "2021-08-01", max_date = "2023-08-01", df_campaigns_information = df1_campaigns_information, df_products_purchased_reduced = df1_products_purchased_reduced, df_customerplus_clean = df1_customerplus_clean) + +y_test = explained_variable(min_date = "2023-08-01", max_date = "2023-11-01", df_products_purchased_reduced = df1_products_purchased_reduced) + +dataset_test = pd.merge(X_test, y_test, on = ['customer_id', 'event_type_id'], how = 
'left') + +FILE_KEY_OUT_S3 = "dataset_test.csv" FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3 with fs.open(FILE_PATH_OUT_S3, 'w') as file_out: - df1_customer_product.to_csv(file_out, index = False) + dataset_test.to_csv(file_out, index = False) + +print("Exportation dataset test : SUCCESS") + +X_train = explanatory_variables(min_date = "2021-05-01", max_date = "2023-05-01", df_campaigns_information = df1_campaigns_information, df_products_purchased_reduced = df1_products_purchased_reduced, df_customerplus_clean = df1_customerplus_clean) + +y_train = explained_variable(min_date = "2023-05-01", max_date = "2023-08-01", df_products_purchased_reduced = df1_products_purchased_reduced) + +dataset_train = pd.merge(X_train, y_train, on = ['customer_id', 'event_type_id'], how = 'left') + +FILE_KEY_OUT_S3 = "dataset_train.csv" +FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3 + +with fs.open(FILE_PATH_OUT_S3, 'w') as file_out: + dataset_train.to_csv(file_out, index = False) + +print("Exportation dataset train : SUCCESS") + + + +# # Exportation vers 'projet-bdc2324-team1' + + +print("Exportation base de la base X d'entraînement : SUCCESS") diff --git a/0_KPI_functions.py b/0_KPI_functions.py index d79638a..69a5294 100644 --- a/0_KPI_functions.py +++ b/0_KPI_functions.py @@ -5,7 +5,7 @@ def campaigns_kpi_function(campaigns_information = None): nb_campaigns = campaigns_information[['customer_id', 'campaign_name']].groupby('customer_id').count().reset_index() nb_campaigns.rename(columns = {'campaign_name' : 'nb_campaigns'}, inplace = True) # Temps d'ouverture en min moyen - campaigns_information['time_to_open'] = campaigns_information['opened_at'] - campaigns_information['delivered_at'] + campaigns_information['time_to_open'] = pd.to_datetime(campaigns_information['opened_at'], utc = True, format = 'ISO8601') - pd.to_datetime(campaigns_information['delivered_at'], utc = True, format = 'ISO8601') time_to_open = campaigns_information[['customer_id', 
'time_to_open']].groupby('customer_id').mean().reset_index() # Nombre de mail ouvert @@ -57,7 +57,7 @@ def tickets_kpi_function(tickets_information = None): tickets_kpi.columns = tickets_kpi.columns.map('_'.join) - tickets_kpi.rename(columns = {'ticket_id_count' : 'nb_tickets', + tickets_kpi.rename(columns = {'ticket_id_count' : 'nb_tickets', 'purchase_id_nunique' : 'nb_purchases', 'amount_sum' : 'total_amount', 'supplier_name_nunique' : 'nb_suppliers',