completed CA projection
This commit is contained in: parent cf0b33c940, commit d3e13f4c56
@@ -4,47 +4,55 @@ from pandas import DataFrame
 import numpy as np
 import os
 import s3fs
-import re
-from sklearn.linear_model import LogisticRegression
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
-from sklearn.utils import class_weight
-from sklearn.neighbors import KNeighborsClassifier
-from sklearn.pipeline import Pipeline
-from sklearn.compose import ColumnTransformer
-from sklearn.preprocessing import OneHotEncoder
-from sklearn.impute import SimpleImputer
-from sklearn.model_selection import GridSearchCV
-from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
-from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score
-import seaborn as sns
 import matplotlib.pyplot as plt
-from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
-from sklearn.exceptions import ConvergenceWarning, DataConversionWarning
-from sklearn.naive_bayes import GaussianNB
 from scipy.optimize import fsolve
 import pickle
 import warnings
 import io
 
+# import the functions defined for this step
+from utils_CA_segment import *
+
+# create the S3 filesystem object
+S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
+fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
+
 # define type of activity
 type_of_activity = "sport"
 PATH = f"projet-bdc2324-team1/Output_expected_CA/{type_of_activity}/"
+
+# type of model used for the score
+type_of_model = "LogisticRegression_cv"
+
+# load train and test sets
+dataset_train, dataset_test = load_train_test(type_of_activity)
+
+# build features - define X_train and X_test
+X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)
+
+# choose model - cross-validated logistic regression
+model = load_model(type_of_activity, type_of_model)
+
+# build the segmented table X_test_segment from X_test
+X_test_segment = df_segment(X_test, y_test, model)
 
 # comparison with the bias of the train set
-X_train_score = logit_cv.predict_proba(X_train)[:, 1]
+X_train_score = model.predict_proba(X_train)[:, 1]
 bias_train_set = find_bias(odd_ratios = odd_ratio(adjust_score_1(X_train_score)),
                            y_objective = y_train["y_has_purchased"].sum(),
                            initial_guess=6)
 
+# create a score adjusted with the computed bias
 score_adjusted_train = adjusted_score(odd_ratio(adjust_score_1(X_test_segment["score"])), bias = bias_train_set)
 X_test_segment["score_adjusted"] = score_adjusted_train
 
-# plot adjusted scores and save (to be tested)
+### 1. plot adjusted scores and save (to be tested)
 plot_hist_scores(X_test_segment, score = "score", score_adjusted = "score_adjusted", type_of_activity = type_of_activity)
+save_file_s3_ca("hist_score_adjusted_", type_of_activity)
+
+"""
 image_buffer = io.BytesIO()
 plt.savefig(image_buffer, format='png')
 image_buffer.seek(0)
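Review note on the bias correction above: the raw scores apparently need recalibration (the model was presumably trained on a rebalanced sample), and the code rescales the odds by a constant bias chosen so that the adjusted scores sum to the observed number of purchasers, y_train["y_has_purchased"].sum(). The helpers below are a minimal sketch reconstructed from the call sites alone (odd_ratio, adjusted_score, find_bias, initial_guess); the repository's actual implementations in utils_CA_segment may differ, and adjust_score_1 presumably nudges scores of exactly 0 or 1 so the odds stay finite.

import numpy as np
from scipy.optimize import fsolve

def odd_ratio(score):
    # probability -> odds: p / (1 - p)
    return score / (1 - score)

def adjusted_score(odd_ratios, bias):
    # rescale the odds by a constant bias, then map back to a probability
    adjusted = bias * odd_ratios
    return adjusted / (1 + adjusted)

def find_bias(odd_ratios, y_objective, initial_guess=6):
    # solve for the bias that makes the adjusted scores sum to the
    # observed number of purchasers in the train set
    return fsolve(lambda b: adjusted_score(odd_ratios, b).sum() - y_objective,
                  initial_guess)[0]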
@@ -53,27 +61,33 @@ FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + ".png"
 with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
     s3_file.write(image_buffer.read())
 plt.close()
+"""
 
-# comparison between score and adjusted score
+### 2. comparison between score and adjusted score
 X_test_table_adjusted_scores = (100 * X_test_segment.groupby("quartile")[["score", "score_adjusted", "has_purchased"]].mean()).round(2).reset_index()
 X_test_table_adjusted_scores = X_test_table_adjusted_scores.rename(columns = {col: f"{col} (%)" for col in X_test_table_adjusted_scores.columns if col in ["score", "score_adjusted", "has_purchased"]})
 
-file_name = "table_adjusted_score"
+# save table
+file_name = "table_adjusted_score_"
 FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + ".csv"
 with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
     X_test_table_adjusted_scores.to_csv(file_out, index = False)
 
 # project revenue
-X_test_segment = project_tickets_CA(X_test_segment, "nb_tickets", "total_amount", "score_adjusted", duration_ref=1.5, duration_projection=1)
+X_test_segment = project_tickets_CA(X_test_segment, "nb_tickets", "total_amount", "score_adjusted", duration_ref=17, duration_projection=12)
 
-# table summarizing projections
+### 3. table summarizing projections (nb tickets, revenue)
 X_test_expected_CA = round(summary_expected_CA(df=X_test_segment, segment="quartile", nb_tickets_expected="nb_tickets_expected", total_amount_expected="total_amount_expected", total_amount="total_amount"), 2)
 
-file_name = "table_expected_CA"
+# rename columns
+mapping_dict = {col: col.replace("perct", "(%)").replace("_", " ") for col in X_test_expected_CA.columns}
+X_test_expected_CA = X_test_expected_CA.rename(columns=mapping_dict)
+
+# save table
+file_name = "table_expected_CA_"
 FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + ".csv"
 with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
     X_test_expected_CA.to_csv(file_out, index = False)
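The switch of duration_ref from 1.5 to 17 (and duration_projection from 1 to 12) reads as a change of unit from years to months: roughly 17 months of observed history projected onto the next 12. project_tickets_CA itself is not shown in this diff; a plausible sketch of what it computes, inferred only from the argument names, would be:

# Hypothetical reconstruction, inferred from the signature; the real
# implementation lives in utils_CA_segment and may differ.
def project_tickets_CA(df, nb_tickets, total_amount, score_adjusted,
                       duration_ref, duration_projection):
    ratio = duration_projection / duration_ref
    out = df.copy()
    # expected volume = purchase probability x historical volume,
    # rescaled from the reference window to the projection window
    out["nb_tickets_expected"] = out[score_adjusted] * out[nb_tickets] * ratio
    out["total_amount_expected"] = out[score_adjusted] * out[total_amount] * ratio
    return out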
File diff suppressed because one or more lines are too long
@@ -1,3 +1,83 @@
+# imports
+import pandas as pd
+from pandas import DataFrame
+import numpy as np
+import os
+import s3fs
+import matplotlib.pyplot as plt
+from scipy.optimize import fsolve
+import pickle
+import warnings
+import io
+
+
+# functions
+
+def load_train_test(type_of_activity):
+    BUCKET = f"projet-bdc2324-team1/Generalization/{type_of_activity}"
+    File_path_train = BUCKET + "/Train_set.csv"
+    File_path_test = BUCKET + "/Test_set.csv"
+
+    with fs.open(File_path_train, mode="rb") as file_in:
+        dataset_train = pd.read_csv(file_in, sep=",")
+        # dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)
+
+    with fs.open(File_path_test, mode="rb") as file_in:
+        dataset_test = pd.read_csv(file_in, sep=",")
+        # dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)
+
+    return dataset_train, dataset_test
+
+
+def features_target_split(dataset_train, dataset_test):
+    features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max',
+                  'purchase_date_min', 'purchase_date_max', 'time_between_purchase', 'nb_tickets_internet',
+                  'fidelity', 'is_email_true', 'opt_in', #'is_partner',
+                  'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']
+
+    # alternative feature list without fidelity, time_between_purchase and gender_other (collinearity issue)
+    """
+    features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max',
+                  'purchase_date_min', 'purchase_date_max', 'nb_tickets_internet', 'is_email_true',
+                  'opt_in', 'gender_female', 'gender_male', 'nb_campaigns', 'nb_campaigns_opened']
+    """
+
+    X_train = dataset_train[features_l]
+    y_train = dataset_train[['y_has_purchased']]
+
+    X_test = dataset_test[features_l]
+    y_test = dataset_test[['y_has_purchased']]
+
+    return X_train, X_test, y_train, y_test
+
+
+def load_model(type_of_activity, model):
+    BUCKET = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/"
+    filename = model + '.pkl'
+    file_path = BUCKET + filename
+    with fs.open(file_path, mode="rb") as f:
+        model_bytes = f.read()
+
+    model = pickle.loads(model_bytes)
+    return model
+
+
+def df_segment(df, y, model):
+    y_pred = model.predict(df)
+    y_pred_prob = model.predict_proba(df)[:, 1]
+
+    df_segment = df
+    df_segment["has_purchased"] = y
+    df_segment["has_purchased_estim"] = y_pred
+    df_segment["score"] = y_pred_prob
+    df_segment["quartile"] = np.where(df_segment['score'] < 0.25, '1',
+                                      np.where(df_segment['score'] < 0.5, '2',
+                                               np.where(df_segment['score'] < 0.75, '3', '4')))
+
+    return df_segment
+
+
 def odd_ratio(score):
     """
     Args:
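Two review notes on the helpers above. First, df_segment binds df_segment = df, so it mutates the caller's X_test in place, and the "quartile" column actually holds fixed score bands (0.25 / 0.5 / 0.75 cut-offs), not true quartiles. Second, load_train_test and load_model call a module-level fs filesystem object that is not created in the imports shown here; presumably it is defined elsewhere in the module. A defensive variant of df_segment, identical in behaviour apart from the copy:

def df_segment(df, y, model):
    df_segment = df.copy()  # work on a copy so the caller's X_test stays intact
    df_segment["has_purchased"] = y
    df_segment["has_purchased_estim"] = model.predict(df)
    df_segment["score"] = model.predict_proba(df)[:, 1]
    # fixed score bands, labelled '1' to '4'
    df_segment["quartile"] = np.where(df_segment["score"] < 0.25, '1',
                             np.where(df_segment["score"] < 0.5, '2',
                             np.where(df_segment["score"] < 0.75, '3', '4')))
    return df_segment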
@@ -152,3 +232,14 @@ def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected,
     df_expected_CA["pace_purchase"] = df_drop_null_pace.groupby(segment)[pace_purchase].mean().values
 
     return df_expected_CA
+
+
+def save_file_s3_ca(File_name, type_of_activity):
+    image_buffer = io.BytesIO()
+    plt.savefig(image_buffer, format='png')
+    image_buffer.seek(0)
+    PATH = f"projet-bdc2324-team1/Output_expected_CA/{type_of_activity}/"
+    FILE_PATH_OUT_S3 = PATH + File_name + type_of_activity + '.png'
+    with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
+        s3_file.write(image_buffer.read())
+    plt.close()
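save_file_s3_ca persists whatever matplotlib's current figure is, so it must run immediately after the plotting helper and before anything else draws on or closes a figure. Minimal usage, matching the two calls in the main script (the fs object is assumed to be in scope, as above):

# plot the score histograms, then push the active figure to S3 as
# projet-bdc2324-team1/Output_expected_CA/sport/hist_score_adjusted_sport.png
plot_hist_scores(X_test_segment, score="score",
                 score_adjusted="score_adjusted",
                 type_of_activity=type_of_activity)
save_file_s3_ca("hist_score_adjusted_", type_of_activity)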