BDC-team-1/7_Sales_Forecast.py

101 lines
3.9 KiB
Python
Raw Normal View History

2024-03-23 17:23:59 +01:00
# importations
import pandas as pd
from pandas import DataFrame
import numpy as np
import os
import s3fs
import matplotlib.pyplot as plt
from scipy.optimize import fsolve
import pickle
import warnings
2024-03-24 10:42:44 +01:00
import io
2024-03-23 17:23:59 +01:00
2024-03-27 19:59:05 +01:00
# Silence all warnings emitted from here on.
warnings.filterwarnings('ignore')

# Build the S3 filesystem handle from the endpoint found in the environment.
S3_ENDPOINT_URL = f"https://{os.environ['AWS_S3_ENDPOINT']}"
fs = s3fs.S3FileSystem(client_kwargs=dict(endpoint_url=S3_ENDPOINT_URL))
2024-03-27 19:59:05 +01:00
# Load the project helper functions used below.
# NOTE(review): exec() runs the helper file inside this script's own global
# namespace, so it must only ever point at a trusted, project-owned file.
UTILS_FILE = 'utils_sales_forecast.py'
# Read through a context manager so the file handle is closed deterministically.
with open(UTILS_FILE) as utils_file:
    utils_source = utils_file.read()
# Compile with the real filename so tracebacks point at the helper file
# instead of "<string>".
exec(compile(utils_source, UTILS_FILE, 'exec'))
# from utils_CA_segment import *
2024-03-23 17:23:59 +01:00
# Ask which kind of company to process; the answer selects the S3 output folder.
# Surrounding whitespace is stripped so a stray space typed around the answer
# does not end up inside the S3 path.
type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?').strip()
PATH = f"projet-bdc2324-team1/Output_expected_CA/{type_of_activity}/"

# Name of the scoring model to load.
# type_of_model = "LogisticRegression_cv"
type_of_model = "LogisticRegression_Benchmark"
2024-03-27 18:58:30 +01:00
# Fetch the train and test datasets for the selected activity.
dataset_train, dataset_test = load_train_test(type_of_activity)

# Split both datasets into features (X) and target (y).
(X_train, X_test,
 y_train, y_test) = features_target_split(dataset_train, dataset_test)

# Load the scoring model identified by type_of_model.
model = load_model(type_of_activity, type_of_model)

# Build the segment table of the test set from its features, target and the model.
X_test_segment = df_segment(X_test, y_test, model)
2024-03-23 17:23:59 +01:00
# Score the train set: keep the second column returned by predict_proba.
X_train_score = model.predict_proba(X_train)[:, 1]

# Estimate the bias on the train set, using the number of observed purchases
# in the train target as the objective.
train_odd_ratios = odd_ratio(adjust_score_1(X_train_score))
train_purchases = y_train["y_has_purchased"].sum()
bias_train_set = find_bias(
    odd_ratios=train_odd_ratios,
    y_objective=train_purchases,
    initial_guess=10,
)
print("Bias estimated :", np.log(bias_train_set))

# Apply the train-set bias to the scores of the test segment table.
# NOTE(review): despite its name, score_adjusted_train holds scores computed
# from X_test_segment (the test set); only the bias comes from the train set.
test_odd_ratios = odd_ratio(adjust_score_1(X_test_segment["score"]))
score_adjusted_train = adjusted_score(test_odd_ratios, bias=bias_train_set)
X_test_segment["score_adjusted"] = score_adjusted_train
print("The score was successfully adjusted")

# Mean absolute error of each score against the observed purchase flag.
raw_score_error = X_test_segment["score"] - X_test_segment["has_purchased"]
MAE_score = abs(raw_score_error).mean()
adjusted_score_error = X_test_segment["score_adjusted"] - X_test_segment["has_purchased"]
MAE_ajusted_score = abs(adjusted_score_error).mean()
print(f"MAE for score : {MAE_score}")
print(f"MAE for adjusted score : {MAE_ajusted_score}")
2024-03-23 17:23:59 +01:00
2024-03-27 18:58:30 +01:00
### 1. Plot the raw and adjusted scores, then save the result (to be tested)
plot_hist_scores(
    X_test_segment,
    score="score",
    score_adjusted="score_adjusted",
    type_of_activity=type_of_activity,
)
save_file_s3_ca("hist_score_adjusted_", type_of_activity)
2024-03-23 17:23:59 +01:00
2024-03-27 18:58:30 +01:00
### 2. Compare score and adjusted score, quartile by quartile
score_columns = ["score", "score_adjusted", "has_purchased"]

# Per-quartile means, expressed as percentages rounded to two decimals.
quartile_means = X_test_segment.groupby("quartile")[score_columns].mean()
X_test_table_adjusted_scores = (100 * quartile_means).round(2).reset_index()

# Append " (%)" to the name of every percentage column.
percent_labels = {
    col: f"{col} (%)"
    for col in X_test_table_adjusted_scores.columns
    if col in score_columns
}
X_test_table_adjusted_scores = X_test_table_adjusted_scores.rename(columns=percent_labels)

print(X_test_table_adjusted_scores)

# Write the table as CSV through the S3 filesystem handle.
file_name = "table_adjusted_score_"
FILE_PATH_OUT_S3 = f"{PATH}{file_name}{type_of_activity}.csv"
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
    X_test_table_adjusted_scores.to_csv(file_out, index=False)
# Project purchases, tickets and revenue from the adjusted score.
X_test_segment = project_tickets_CA(
    X_test_segment,
    "nb_purchases",
    "nb_tickets",
    "total_amount",
    "score_adjusted",
    duration_ref=17,
    duration_projection=12,
)
2024-03-23 17:23:59 +01:00
2024-03-27 18:58:30 +01:00
### 3. Table summarizing the projections (nb tickets, revenue)
expected_CA_summary = summary_expected_CA(
    df=X_test_segment,
    segment="quartile",
    nb_tickets_expected="nb_tickets_expected",
    total_amount_expected="total_amount_expected",
    total_amount="total_amount",
    pace_purchase="pace_purchase",
)
X_test_expected_CA = round(expected_CA_summary, 2)

# Make the column names readable: "perct" becomes "(%)", underscores become spaces.
mapping_dict = {
    col: col.replace("perct", "(%)").replace("_", " ")
    for col in X_test_expected_CA.columns
}
X_test_expected_CA = X_test_expected_CA.rename(columns=mapping_dict)

# Write the table as CSV through the S3 filesystem handle.
file_name = "table_expected_CA_"
FILE_PATH_OUT_S3 = f"{PATH}{file_name}{type_of_activity}.csv"
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
    X_test_expected_CA.to_csv(file_out, index=False)