# BDC-team-1/7_Sales_Forecast.py
# 2024-03-31 17:16:46 +00:00
#
# 101 lines
# 3.9 KiB
# Python

# imports
import pandas as pd
from pandas import DataFrame
import numpy as np
import os
import s3fs
import matplotlib.pyplot as plt
from scipy.optimize import fsolve
import pickle
import warnings
import io
# Suppress all warnings for the rest of the run.
warnings.filterwarnings('ignore')

# Filesystem object for the S3 store. The endpoint host is read from the
# AWS_S3_ENDPOINT environment variable (a KeyError is raised if it is unset).
S3_ENDPOINT_URL = f"https://{os.environ['AWS_S3_ENDPOINT']}"
fs = s3fs.S3FileSystem(client_kwargs={"endpoint_url": S3_ENDPOINT_URL})
# Load the project helper functions by executing utils_sales_forecast.py
# inside this script's namespace. The path is relative to the current
# working directory, so the script must be launched from its own folder.
# The file is opened in a `with` block so the handle is closed once its
# content has been read.
# NOTE(review): exec is kept instead of an import, presumably so that the
# helpers share this script's globals (e.g. `fs`) -- confirm before
# converting it to a regular import. exec runs whatever the file contains,
# so it must only ever point at the trusted, version-controlled helper file.
with open('utils_sales_forecast.py') as utils_file:
    exec(utils_file.read())
# from utils_CA_segment import *
# Ask which type of company to process. The answer is passed to the data and
# model loaders and is embedded in the S3 output paths, so surrounding
# whitespace is stripped to keep a stray space out of those paths.
type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?').strip()
# S3 folder receiving the tables written by this script.
PATH = f"projet-bdc2324-team1/2_Output/2_3_Sales_Forecast/{type_of_activity}/"
# Name of the model used to compute the score.
# type_of_model = "LogisticRegression_cv"
type_of_model = "LogisticRegression_Benchmark"
# load the train and test sets for the chosen type of activity
dataset_train, dataset_test = load_train_test(type_of_activity)
# make features - split both sets into X (features) and y (target)
X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)
# load the model named by type_of_model for this type of activity
# NOTE(review): presumably already fitted, since predict_proba is called on it
# later without any fit step - confirm
model = load_model(type_of_activity, type_of_model)
# create the table X test segment from X test, y test and the model;
# later steps read its "score", "has_purchased" and "quartile" columns
X_test_segment = df_segment(X_test, y_test, model)
# Score the train set: keep the second column returned by predict_proba
# (presumably the probability of the positive class -- confirm).
X_train_score = model.predict_proba(X_train)[:, 1]

# Estimate the bias on the train set: find_bias receives the odd ratios of
# the adjusted train scores and, as objective, the total of the
# "y_has_purchased" column of y_train.
train_set_odd_ratios = odd_ratio(adjust_score_1(X_train_score))
train_set_purchases = y_train["y_has_purchased"].sum()
bias_train_set = find_bias(
    odd_ratios=train_set_odd_ratios,
    y_objective=train_set_purchases,
    initial_guess=10,
)
# The printed value is the natural logarithm of the estimated bias.
print("Bias estimated :", np.log(bias_train_set))
# Adjust the scores of the test segment table with the bias computed on the
# train set.
# NOTE(review): despite its name, score_adjusted_train holds the adjusted
# scores of X_test_segment; only the bias comes from the train set.
test_segment_odd_ratios = odd_ratio(adjust_score_1(X_test_segment["score"]))
score_adjusted_train = adjusted_score(test_segment_odd_ratios, bias=bias_train_set)
X_test_segment["score_adjusted"] = score_adjusted_train
print("The score was successfully adjusted")

# Mean absolute error of each score against the "has_purchased" column.
raw_score_error = X_test_segment["score"] - X_test_segment["has_purchased"]
adjusted_score_error = X_test_segment["score_adjusted"] - X_test_segment["has_purchased"]
MAE_score = raw_score_error.abs().mean()
MAE_ajusted_score = adjusted_score_error.abs().mean()
print(f"MAE for score : {MAE_score}")
print(f"MAE for adjusted score : {MAE_ajusted_score}")
### 1. Plot the raw and adjusted scores, then save the figure (to be tested)
plot_hist_scores(
    X_test_segment,
    score="score",
    score_adjusted="score_adjusted",
    type_of_activity=type_of_activity,
)
# NOTE(review): presumably writes the current figure to S3 under a name built
# from this prefix and the type of activity -- confirm in the helper file.
save_file_s3_ca("hist_score_adjusted_", type_of_activity)
### 2. Comparison between score and adjusted score
# Per-quartile mean of the two scores and of the purchase column, expressed
# in percent and rounded to 2 decimals.
compared_columns = ["score", "score_adjusted", "has_purchased"]
quartile_means = X_test_segment.groupby("quartile")[compared_columns].mean()
X_test_table_adjusted_scores = (100 * quartile_means).round(2).reset_index()
# Flag the percentage columns in their header; "quartile" is left untouched.
percent_headers = {col: f"{col} (%)" for col in compared_columns}
X_test_table_adjusted_scores = X_test_table_adjusted_scores.rename(columns=percent_headers)
print(X_test_table_adjusted_scores)
# Write the comparison table as a CSV file (without the index) in the S3
# output folder.
file_name = "table_adjusted_score_"
FILE_PATH_OUT_S3 = f"{PATH}{file_name}{type_of_activity}.csv"
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
    X_test_table_adjusted_scores.to_csv(file_out, index=False)
# Project tickets and revenue from the adjusted score; the result replaces
# X_test_segment.
# NOTE(review): the unit of duration_ref / duration_projection is not visible
# here (presumably months) -- confirm in the helper file.
X_test_segment = project_tickets_CA(
    X_test_segment,
    "nb_purchases",
    "nb_tickets",
    "total_amount",
    "score_adjusted",
    duration_ref=17,
    duration_projection=12,
)
### 3. Table summarizing projections (nb tickets, revenue)
# Summary per quartile, rounded to 2 decimals.
expected_CA_summary = summary_expected_CA(
    df=X_test_segment,
    segment="quartile",
    nb_tickets_expected="nb_tickets_expected",
    total_amount_expected="total_amount_expected",
    total_amount="total_amount",
    pace_purchase="pace_purchase",
)
X_test_expected_CA = expected_CA_summary.round(2)
# Readable headers: "perct" becomes "(%)" and underscores become spaces.
mapping_dict = {
    col: col.replace("perct", "(%)").replace("_", " ")
    for col in X_test_expected_CA.columns
}
X_test_expected_CA = X_test_expected_CA.rename(columns=mapping_dict)
# Write the projection summary as a CSV file (without the index) in the S3
# output folder.
file_name = "table_expected_CA_"
FILE_PATH_OUT_S3 = f"{PATH}{file_name}{type_of_activity}.csv"
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
    X_test_expected_CA.to_csv(file_out, index=False)