# Sales forecast script.
#
# Loads a previously fitted purchase-score model for one type of activity,
# adjusts the predicted scores with a bias estimated on the train set, and
# writes comparison / projection tables (CSV) to S3.

# importations
# NOTE(review): several of these names are not referenced directly below;
# they are kept because the helper file exec'd further down runs in this same
# namespace and may rely on them - confirm before removing any.
import io
import os
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import s3fs
from pandas import DataFrame
from scipy.optimize import fsolve

# ignore warnings
warnings.filterwarnings('ignore')

# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

# importation of functions defined
# The helper file is executed in this module's namespace, so every function it
# defines becomes a global here. The file is opened in a `with` block so the
# handle is closed once the source has been read.
# NOTE(review): exec runs arbitrary code - the file must be trusted.
with open('utils_sales_forecast.py') as utils_file:
    exec(utils_file.read())
# from utils_CA_segment import *

# define type of activity
# NOTE(review): the raw answer is interpolated into the S3 path without any
# validation; an unexpected value yields an unexpected path.
type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
PATH = f"projet-bdc2324-team1/2_Output/2_3_Sales_Forecast/{type_of_activity}/"

# type of model for the score
type_of_model = "LogisticRegression_cv"
# type_of_model = "LogisticRegression_Benchmark"

# load train and test sets
dataset_train, dataset_test = load_train_test(type_of_activity)

# make features - define X train and X test
X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)

# choose model - logit cross validated
model = load_model(type_of_activity, type_of_model)

# create table X test segment from X test
X_test_segment = df_segment(X_test, y_test, model)

# comparison with bias of the train set - X train to be defined
# Second column of predict_proba is used as the train-set score.
X_train_score = model.predict_proba(X_train)[:, 1]

# The objective passed to find_bias is the sum of y_has_purchased on the
# train set; presumably find_bias solves for the bias matching that total
# (helper defined elsewhere - TODO confirm).
bias_train_set = find_bias(
    odd_ratios=odd_ratio(adjust_score_1(X_train_score)),
    y_objective=y_train["y_has_purchased"].sum(),
    initial_guess=10,
)
# The value printed is the natural log of the bias returned by find_bias.
print("Bias estimated :", np.log(bias_train_set))

# create a score adjusted with the bias computed
# NOTE: despite its name, score_adjusted_train is computed from the scores of
# X_test_segment (built from the test set), using the train-set bias.
score_adjusted_train = adjusted_score(
    odd_ratio(adjust_score_1(X_test_segment["score"])),
    bias=bias_train_set,
)
X_test_segment["score_adjusted"] = score_adjusted_train

print("The score was successfully adjusted")

# Mean absolute error of each score against the has_purchased column.
MAE_score = abs(X_test_segment["score"] - X_test_segment["has_purchased"]).mean()
MAE_ajusted_score = abs(
    X_test_segment["score_adjusted"] - X_test_segment["has_purchased"]
).mean()
print(f"MAE for score : {MAE_score}")
print(f"MAE for adjusted score : {MAE_ajusted_score}")

### 1. plot adjusted scores and save (to be tested)
plot_hist_scores(
    X_test_segment,
    score="score",
    score_adjusted="score_adjusted",
    type_of_activity=type_of_activity,
)

save_file_s3_ca("hist_score_adjusted_", type_of_activity)

### 2. comparison between score and adjusted score
# Per-quartile means of the two scores and of has_purchased, multiplied by
# 100 and rounded to 2 decimals.
X_test_table_adjusted_scores = (
    100
    * X_test_segment.groupby("quartile")[
        ["score", "score_adjusted", "has_purchased"]
    ].mean()
).round(2).reset_index()

# Append " (%)" to the three value columns; "quartile" keeps its name.
X_test_table_adjusted_scores = X_test_table_adjusted_scores.rename(
    columns={
        col: f"{col} (%)"
        for col in X_test_table_adjusted_scores.columns
        if col in ["score", "score_adjusted", "has_purchased"]
    }
)

print(X_test_table_adjusted_scores)

# save table
file_name = "table_adjusted_score_"
FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + ".csv"
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
    X_test_table_adjusted_scores.to_csv(file_out, index=False)

# project revenue
# NOTE(review): the units of duration_ref / duration_projection are not
# visible here (presumably months) - confirm against project_tickets_CA.
X_test_segment = project_tickets_CA(
    X_test_segment,
    "nb_purchases",
    "nb_tickets",
    "total_amount",
    "score_adjusted",
    duration_ref=17,
    duration_projection=12,
)

### 3. table summarizing projections (nb tickets, revenue)
X_test_expected_CA = round(
    summary_expected_CA(
        df=X_test_segment,
        segment="quartile",
        nb_tickets_expected="nb_tickets_expected",
        total_amount_expected="total_amount_expected",
        total_amount="total_amount",
        pace_purchase="pace_purchase",
    ),
    2,
)

# rename columns
# "perct" becomes "(%)" and underscores become spaces in the column names.
mapping_dict = {
    col: col.replace("perct", "(%)").replace("_", " ")
    for col in X_test_expected_CA.columns
}
X_test_expected_CA = X_test_expected_CA.rename(columns=mapping_dict)

# save table
file_name = "table_expected_CA_"
FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + ".csv"
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
    X_test_expected_CA.to_csv(file_out, index=False)