BDC-team-1/7_Sales_Forecast.py

# importations
import pandas as pd
from pandas import DataFrame
import numpy as np
import os
import s3fs
import matplotlib.pyplot as plt
from scipy.optimize import fsolve
import pickle
import warnings
import io


# ignore warnings
# NOTE: this silences every warning category for the rest of the run,
# including warnings raised by the helper functions loaded below.
warnings.filterwarnings('ignore')

# Create filesystem object
# AWS_S3_ENDPOINT must be set in the environment (a KeyError is raised
# otherwise). "https://" is prepended here, so the variable is expected to
# hold the endpoint without a scheme.
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

# importation of functions defined
# The helper file is executed in this script's global namespace rather than
# imported as a module, so the functions it defines share the globals of this
# script. The file is read through a context manager so its handle is closed
# as soon as the source is loaded, and the source is compiled with its real
# filename so that tracebacks point at the helper file instead of "<string>".
UTILS_FILE = 'utils_sales_forecast.py'
with open(UTILS_FILE, encoding='utf-8') as utils_source:
    exec(compile(utils_source.read(), UTILS_FILE, 'exec'))

# define type of activity
# The answer is embedded verbatim in the S3 output path below, so whitespace
# typed around it is stripped to avoid pointing at a non-existent folder.
type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?').strip()
# Folder under which the figures and tables of this run are written
PATH = f"projet-bdc2324-team1/Output_expected_CA/{type_of_activity}/"

# type of model for the score
# Switch the active line to use the other saved model.
# type_of_model = "LogisticRegression_cv"
type_of_model = "LogisticRegression_Benchmark"

# load train and test sets
# load_train_test is not defined in this script; presumably it comes from
# utils_sales_forecast.py, which is executed above - confirm there.
dataset_train, dataset_test = load_train_test(type_of_activity)

# make features - define X train and X test
# The targets are returned alongside the features; y_train is read further
# down through its "y_has_purchased" column.
X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)

# choose model - the one named by type_of_model (currently the benchmark
# logistic regression, the cross-validated variant being commented out)
model = load_model(type_of_activity, type_of_model)

# create table X test segment from X test
# The rest of the script reads the "score", "has_purchased" and "quartile"
# columns of this table; presumably df_segment builds them from the model
# predictions and y_test - confirm in the helper file.
X_test_segment = df_segment(X_test, y_test, model)

# Estimate the bias on the train set.
# The train score is the second column returned by predict_proba.
train_probabilities = model.predict_proba(X_train)
X_train_score = train_probabilities[:, 1]

# find_bias receives the transformed train scores and the total of the
# "y_has_purchased" column as its objective; 10 is the starting guess.
train_odd_ratios = odd_ratio(adjust_score_1(X_train_score))
train_purchase_total = y_train["y_has_purchased"].sum()
bias_train_set = find_bias(
    odd_ratios=train_odd_ratios,
    y_objective=train_purchase_total,
    initial_guess=10,
)
# The bias is reported on the log scale.
print("Bias estimated :", np.log(bias_train_set))

# Apply the bias estimated on the train set to the scores of the test table.
# NOTE(review): despite its name, score_adjusted_train holds the adjusted
# scores of the test table.
test_odd_ratios = odd_ratio(adjust_score_1(X_test_segment["score"]))
score_adjusted_train = adjusted_score(test_odd_ratios, bias=bias_train_set)
X_test_segment["score_adjusted"] = score_adjusted_train

print("The score was successfully adjusted")

# Mean absolute error of each score against the observed purchase flag
observed_purchase = X_test_segment["has_purchased"]
MAE_score = (X_test_segment["score"] - observed_purchase).abs().mean()
MAE_ajusted_score = (X_test_segment["score_adjusted"] - observed_purchase).abs().mean()
print(f"MAE for score : {MAE_score}")
print(f"MAE for adjusted score : {MAE_ajusted_score}")

### 1. plot adjusted scores and save (to be tested)
plot_hist_scores(
    X_test_segment,
    score="score",
    score_adjusted="score_adjusted",
    type_of_activity=type_of_activity,
)
# NOTE(review): presumably uploads the figure drawn just above - confirm in
# the helper file.
save_file_s3_ca("hist_score_adjusted_", type_of_activity)


### 2. comparison between score and adjusted score
# Per-quartile mean of both scores and of the purchase flag, expressed in
# percent and rounded to two decimals.
percent_columns = ["score", "score_adjusted", "has_purchased"]
quartile_means = X_test_segment.groupby("quartile")[percent_columns].mean()
X_test_table_adjusted_scores = (100 * quartile_means).round(2).reset_index()

# Tag the three averaged columns with a percent suffix; "quartile" is kept.
percent_labels = {col: f"{col} (%)" for col in percent_columns}
X_test_table_adjusted_scores = X_test_table_adjusted_scores.rename(columns=percent_labels)

print(X_test_table_adjusted_scores)

# save table as CSV on S3, without the positional index
file_name = "table_adjusted_score_"
FILE_PATH_OUT_S3 = f"{PATH}{file_name}{type_of_activity}.csv"
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
    X_test_table_adjusted_scores.to_csv(file_out, index=False)


# project revenue
# NOTE(review): 17 and 12 are the reference and projection durations; their
# unit is not visible here (presumably months) - confirm in the helper file.
X_test_segment = project_tickets_CA(
    X_test_segment,
    "nb_purchases",
    "nb_tickets",
    "total_amount",
    "score_adjusted",
    duration_ref=17,
    duration_projection=12,
)


### 3. table summarizing projections (nb tickets, revenue)
# Summary per quartile, rounded to two decimals.
expected_ca_summary = summary_expected_CA(
    df=X_test_segment,
    segment="quartile",
    nb_tickets_expected="nb_tickets_expected",
    total_amount_expected="total_amount_expected",
    total_amount="total_amount",
    pace_purchase="pace_purchase",
)
X_test_expected_CA = expected_ca_summary.round(2)

# rename columns: "perct" becomes "(%)" and underscores become spaces
mapping_dict = {}
for column_name in X_test_expected_CA.columns:
    mapping_dict[column_name] = column_name.replace("perct", "(%)").replace("_", " ")
X_test_expected_CA = X_test_expected_CA.rename(columns=mapping_dict)

# save table as CSV on S3
# NOTE(review): index=False drops the index, so this assumes the summary
# carries the quartile as a regular column - confirm in the helper file.
file_name = "table_expected_CA_"
FILE_PATH_OUT_S3 = f"{PATH}{file_name}{type_of_activity}.csv"
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
    X_test_expected_CA.to_csv(file_out, index=False)
update CA segment analysis 2024-03-23 17:23:59 +01:00			`# importations`
			`import pandas as pd`
			`from pandas import DataFrame`
			`import numpy as np`
			`import os`
			`import s3fs`
			`import matplotlib.pyplot as plt`
			`from scipy.optimize import fsolve`
			`import pickle`
			`import warnings`
added exportation to MinIo option 2024-03-24 10:42:44 +01:00			`import io`
update CA segment analysis 2024-03-23 17:23:59 +01:00
CA estimation by segment works well 2024-03-27 19:59:05 +01:00
			`# ignore warnings`
			`warnings.filterwarnings('ignore')`
completed CA projection 2024-03-27 18:58:30 +01:00
			`# Create filesystem object`
			`S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]`
			`fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})`

CA estimation by segment works well 2024-03-27 19:59:05 +01:00			`# importation of functions defined`
Changement nom et path 2024-03-31 18:54:46 +02:00			`exec(open('utils_sales_forecast.py').read())`
CA estimation by segment works well 2024-03-27 19:59:05 +01:00			`# from utils_CA_segment import *`

update CA segment analysis 2024-03-23 17:23:59 +01:00			`# define type of activity`
CA estimation by segment works well 2024-03-27 19:59:05 +01:00			`type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')`
update CA segment analysis 2024-03-23 17:23:59 +01:00			`PATH = f"projet-bdc2324-team1/Output_expected_CA/{type_of_activity}/"`

completed CA projection 2024-03-27 18:58:30 +01:00			`# type of model for the score`
take new databases as input 2024-03-30 12:00:49 +01:00			`# type_of_model = "LogisticRegression_cv"`
			`type_of_model = "LogisticRegression_Benchmark"`
completed CA projection 2024-03-27 18:58:30 +01:00
			`# load train and test sets`
			`dataset_train, dataset_test = load_train_test(type_of_activity)`

			`# make features - define X train and X test`
			`X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)`

			`# choose model - logit cross validated`
			`model = load_model(type_of_activity, type_of_model)`

			`# create table X test segment from X test`
			`X_test_segment = df_segment(X_test, y_test, model)`

update CA segment analysis 2024-03-23 17:23:59 +01:00			`# comparison with bias of the train set - X train to be defined`
completed CA projection 2024-03-27 18:58:30 +01:00			`X_train_score = model.predict_proba(X_train)[:, 1]`
update CA segment analysis 2024-03-23 17:23:59 +01:00
			`bias_train_set = find_bias(odd_ratios = odd_ratio(adjust_score_1(X_train_score)),`
			`y_objective = y_train["y_has_purchased"].sum(),`
CA estimation by segment works well 2024-03-27 19:59:05 +01:00			`initial_guess=10)`
			`print("Bias estimated :", np.log(bias_train_set))`
update CA segment analysis 2024-03-23 17:23:59 +01:00
completed CA projection 2024-03-27 18:58:30 +01:00			`# create a score adjusted with the bias computed`
update CA segment analysis 2024-03-23 17:23:59 +01:00			`score_adjusted_train = adjusted_score(odd_ratio(adjust_score_1(X_test_segment["score"])), bias = bias_train_set)`
			`X_test_segment["score_adjusted"] = score_adjusted_train`

CA estimation by segment works well 2024-03-27 19:59:05 +01:00			`print("The score was successfully adjusted")`
			`MAE_score = abs(X_test_segment["score"]-X_test_segment["has_purchased"]).mean()`
			`MAE_ajusted_score = abs(X_test_segment["score_adjusted"]-X_test_segment["has_purchased"]).mean()`
			`print(f"MAE for score : {MAE_score}")`
			`print(f"MAE for adjusted score : {MAE_ajusted_score}")`
update CA segment analysis 2024-03-23 17:23:59 +01:00
completed CA projection 2024-03-27 18:58:30 +01:00			`### 1. plot adjusted scores and save (to be tested)`
added exportation to MinIo option 2024-03-24 10:42:44 +01:00			`plot_hist_scores(X_test_segment, score = "score", score_adjusted = "score_adjusted", type_of_activity = type_of_activity)`
completed CA projection 2024-03-27 18:58:30 +01:00			`save_file_s3_ca("hist_score_adjusted_", type_of_activity)`
update CA segment analysis 2024-03-23 17:23:59 +01:00

completed CA projection 2024-03-27 18:58:30 +01:00			`### 2. comparison between score and adjusted score`
added exportation to MinIo option 2024-03-24 10:42:44 +01:00			`X_test_table_adjusted_scores = (100 * X_test_segment.groupby("quartile")[["score","score_adjusted", "has_purchased"]].mean()).round(2).reset_index()`
			`X_test_table_adjusted_scores = X_test_table_adjusted_scores.rename(columns = {col : f"{col} (%)" for col in X_test_table_adjusted_scores.columns if col in ["score","score_adjusted", "has_purchased"]})`
update CA segment analysis 2024-03-23 17:23:59 +01:00
take new databases as input 2024-03-30 12:00:49 +01:00			`print(X_test_table_adjusted_scores)`

completed CA projection 2024-03-27 18:58:30 +01:00			`# save table`
			`file_name = "table_adjusted_score_"`
update CA segment analysis 2024-03-23 17:23:59 +01:00			`FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + ".csv"`
			`with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:`
			`X_test_table_adjusted_scores.to_csv(file_out, index = False)`


			`# project revenue`
CA estimation by segment works well 2024-03-27 19:59:05 +01:00			`X_test_segment = project_tickets_CA (X_test_segment, "nb_purchases", "nb_tickets", "total_amount", "score_adjusted",`
			`duration_ref=17, duration_projection=12)`
update CA segment analysis 2024-03-23 17:23:59 +01:00

completed CA projection 2024-03-27 18:58:30 +01:00			`### 3. table summarizing projections (nb tickets, revenue)`
CA estimation by segment works well 2024-03-27 19:59:05 +01:00			`X_test_expected_CA = round(summary_expected_CA(df=X_test_segment, segment="quartile",`
			`nb_tickets_expected="nb_tickets_expected", total_amount_expected="total_amount_expected",`
			`total_amount="total_amount", pace_purchase="pace_purchase"),2)`
update CA segment analysis 2024-03-23 17:23:59 +01:00
completed CA projection 2024-03-27 18:58:30 +01:00			`# rename columns`
			`mapping_dict = {col: col.replace("perct", "(%)").replace("_", " ") for col in X_test_expected_CA.columns}`
			`X_test_expected_CA = X_test_expected_CA.rename(columns=mapping_dict)`

			`# save table`
			`file_name = "table_expected_CA_"`
update CA segment analysis 2024-03-23 17:23:59 +01:00			`FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + ".csv"`
			`with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:`
			`X_test_expected_CA.to_csv(file_out, index = False)`