completed CA projection

This commit is contained in:
Thomas PIQUE 2024-03-27 17:58:30 +00:00
parent cf0b33c940
commit d3e13f4c56
3 changed files with 1929 additions and 1192 deletions

View File

@ -4,47 +4,55 @@ from pandas import DataFrame
import numpy as np
import os
import s3fs
import re
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
from sklearn.utils import class_weight
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.exceptions import ConvergenceWarning, DataConversionWarning
from sklearn.naive_bayes import GaussianNB
from scipy.optimize import fsolve
import pickle
import warnings
import io
# importation of functions defined
from utils_CA_segment import *
# Create the S3 filesystem object used for the reads and writes of this script.
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

# Type of activity studied; it selects both the input data and the output folder.
type_of_activity = "sport"
PATH = f"projet-bdc2324-team1/Output_expected_CA/{type_of_activity}/"

# Type of model used to compute the score.
type_of_model = "LogisticRegression_cv"

# Load train and test sets.
dataset_train, dataset_test = load_train_test(type_of_activity)

# Make features - define X and y for both sets.
X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)

# Load the fitted model (cross-validated logit).
model = load_model(type_of_activity, type_of_model)

# Create the segment table (score, predicted label, quartile) from X test.
X_test_segment = df_segment(X_test, y_test, model)

# Bias of the train set: the score of the positive class on X train is passed
# to find_bias together with the observed number of purchasers.
# NOTE(review): find_bias / adjust_score_1 / odd_ratio are defined outside this
# script; their exact semantics should be checked in utils_CA_segment.
X_train_score = model.predict_proba(X_train)[:, 1]
bias_train_set = find_bias(odd_ratios=odd_ratio(adjust_score_1(X_train_score)),
                           y_objective=y_train["y_has_purchased"].sum(),
                           initial_guess=6)

# Create a score adjusted with the bias computed on the train set.
score_adjusted_train = adjusted_score(odd_ratio(adjust_score_1(X_test_segment["score"])), bias=bias_train_set)
X_test_segment["score_adjusted"] = score_adjusted_train

### 1. plot adjusted scores and save (to be tested)
plot_hist_scores(X_test_segment, score="score", score_adjusted="score_adjusted", type_of_activity=type_of_activity)
save_file_s3_ca("hist_score_adjusted_", type_of_activity)
"""
image_buffer = io.BytesIO()
plt.savefig(image_buffer, format='png')
image_buffer.seek(0)
@ -53,27 +61,33 @@ FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + ".png"
with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
s3_file.write(image_buffer.read())
plt.close()
"""
### 2. comparison between score and adjusted score
# Mean of each column per quartile, expressed in percent and rounded to 2 decimals.
X_test_table_adjusted_scores = (100 * X_test_segment.groupby("quartile")[["score", "score_adjusted", "has_purchased"]].mean()).round(2).reset_index()
X_test_table_adjusted_scores = X_test_table_adjusted_scores.rename(columns={col: f"{col} (%)" for col in X_test_table_adjusted_scores.columns if col in ["score", "score_adjusted", "has_purchased"]})

# save table
file_name = "table_adjusted_score_"
FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + ".csv"
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
    X_test_table_adjusted_scores.to_csv(file_out, index=False)

# project revenue
# NOTE(review): duration_ref=17 and duration_projection=12 are presumably
# expressed in months - confirm against project_tickets_CA.
X_test_segment = project_tickets_CA(X_test_segment, "nb_tickets", "total_amount", "score_adjusted", duration_ref=17, duration_projection=12)

### 3. table summarizing projections (nb tickets, revenue)
X_test_expected_CA = round(summary_expected_CA(df=X_test_segment, segment="quartile", nb_tickets_expected="nb_tickets_expected", total_amount_expected="total_amount_expected", total_amount="total_amount"), 2)

# rename columns: "perct" becomes "(%)" and underscores become spaces
mapping_dict = {col: col.replace("perct", "(%)").replace("_", " ") for col in X_test_expected_CA.columns}
X_test_expected_CA = X_test_expected_CA.rename(columns=mapping_dict)

# save table
file_name = "table_expected_CA_"
FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + ".csv"
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
    X_test_expected_CA.to_csv(file_out, index=False)

File diff suppressed because one or more lines are too long

View File

@ -1,3 +1,83 @@
# importations
import pandas as pd
from pandas import DataFrame
import numpy as np
import os
import s3fs
import matplotlib.pyplot as plt
from scipy.optimize import fsolve
import pickle
import warnings
import io
# functions
def load_train_test(type_of_activity):
    """Read the train and test sets of one type of activity.

    Both files are comma-separated CSVs located under
    ``projet-bdc2324-team1/Generalization/<type_of_activity>`` and are opened
    through the ``fs`` filesystem object, which is not defined in this function.

    Args:
        type_of_activity: name of the activity sub-folder to read from.

    Returns:
        A tuple ``(dataset_train, dataset_test)`` of DataFrames, exactly as
        parsed by ``pd.read_csv`` (missing values of ``y_has_purchased`` are
        not filled here).
    """
    base_dir = f"projet-bdc2324-team1/Generalization/{type_of_activity}"

    def _read_csv(relative_name):
        # Open in binary mode and let pandas handle the decoding.
        with fs.open(base_dir + relative_name, mode="rb") as handle:
            return pd.read_csv(handle, sep=",")

    # The train set is read first, then the test set.
    train_set = _read_csv("/Train_set.csv")
    test_set = _read_csv("/Test_set.csv")
    return train_set, test_set
def features_target_split(dataset_train, dataset_test):
    """Split the train and test sets into feature tables and target tables.

    Args:
        dataset_train: DataFrame holding the feature columns listed below and
            the ``y_has_purchased`` column.
        dataset_test: DataFrame with the same columns as ``dataset_train``.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``. The targets are
        one-column DataFrames (not Series).
    """
    feature_columns = [
        'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',
        'vente_internet_max', 'purchase_date_min', 'purchase_date_max',
        'time_between_purchase', 'nb_tickets_internet', 'fidelity',
        'is_email_true', 'opt_in',  # 'is_partner' is deliberately left out
        'gender_female', 'gender_male', 'gender_other',
        'nb_campaigns', 'nb_campaigns_opened',
    ]
    target_column = ['y_has_purchased']

    # NOTE: a reduced list without 'fidelity', 'time_between_purchase' and
    # 'gender_other' (collinearity issue) was considered, but it is not the
    # one in use here.

    X_train, y_train = dataset_train[feature_columns], dataset_train[target_column]
    X_test, y_test = dataset_test[feature_columns], dataset_test[target_column]
    return X_train, X_test, y_train, y_test
def load_model(type_of_activity, model):
    """Load a pickled model from S3.

    The file read is
    ``projet-bdc2324-team1/Output_model/<type_of_activity>/<model>/<model>.pkl``
    and it is opened through the ``fs`` filesystem object, which is not
    defined in this function.

    Args:
        type_of_activity: name of the activity sub-folder.
        model: name of the model; used both as folder name and as file stem.

    Returns:
        The object obtained by unpickling the file content.
    """
    model_dir = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/"
    pickle_path = model_dir + model + '.pkl'

    with fs.open(pickle_path, mode="rb") as handle:
        raw_bytes = handle.read()

    # NOTE(review): unpickling executes arbitrary code - only point this at
    # files whose origin is trusted.
    return pickle.loads(raw_bytes)
def df_segment(df, y, model):
    """Build the segment table: features plus outcome, prediction, score and quartile.

    The input ``df`` is left untouched; the columns are added to a copy.

    Args:
        df: feature table passed as-is to ``model.predict`` and
            ``model.predict_proba``.
        y: observed outcome, stored in the ``has_purchased`` column.
        model: object exposing ``predict`` and ``predict_proba``; the second
            column of ``predict_proba`` is used as the score.

    Returns:
        A new DataFrame holding the columns of ``df`` plus ``has_purchased``,
        ``has_purchased_estim``, ``score`` and ``quartile``. ``quartile`` is a
        string label: '1' for score < 0.25, '2' for score < 0.5, '3' for
        score < 0.75 and '4' otherwise.
    """
    # Predict before adding any column, so the model only sees the features.
    y_pred = model.predict(df)
    y_pred_prob = model.predict_proba(df)[:, 1]

    # Work on a copy: adding columns to `df` itself would silently modify the
    # caller's table.
    segmented = df.copy()
    segmented["has_purchased"] = y
    segmented["has_purchased_estim"] = y_pred
    segmented["score"] = y_pred_prob

    # Fixed score thresholds (not empirical quantiles) define the segments.
    score = segmented["score"]
    segmented["quartile"] = np.where(score < 0.25, '1',
                                     np.where(score < 0.5, '2',
                                              np.where(score < 0.75, '3', '4')))
    return segmented
def odd_ratio(score) :
"""
Args:
@ -152,3 +232,14 @@ def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected,
df_expected_CA["pace_purchase"] = df_drop_null_pace.groupby(segment)[pace_purchase].mean().values
return df_expected_CA
def save_file_s3_ca(File_name, type_of_activity):
    """Save the current matplotlib figure as a PNG on S3, then close it.

    The figure is written to
    ``projet-bdc2324-team1/Output_expected_CA/<type_of_activity>/<File_name><type_of_activity>.png``
    through the ``fs`` filesystem object, which is not defined in this function.

    Args:
        File_name: prefix of the output file name (the activity name and the
            ``.png`` extension are appended).
        type_of_activity: name of the activity; used for the folder and as
            suffix of the file name.

    Returns:
        None.
    """
    image_buffer = io.BytesIO()
    try:
        # Render into memory first, then upload the bytes in one write.
        plt.savefig(image_buffer, format='png')
        PATH = f"projet-bdc2324-team1/Output_expected_CA/{type_of_activity}/"
        FILE_PATH_OUT_S3 = PATH + File_name + type_of_activity + '.png'
        with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
            s3_file.write(image_buffer.getvalue())
    finally:
        # Close the figure even when rendering or the upload fails, so that
        # figures do not accumulate across calls.
        plt.close()