completed CA projection
This commit is contained in:
		
							parent
							
								
									cf0b33c940
								
							
						
					
					
						commit
						d3e13f4c56
					
				| 
						 | 
					@ -4,47 +4,55 @@ from pandas import DataFrame
 | 
				
			||||||
import numpy as np
 | 
					import numpy as np
 | 
				
			||||||
import os
 | 
					import os
 | 
				
			||||||
import s3fs
 | 
					import s3fs
 | 
				
			||||||
import re
 | 
					 | 
				
			||||||
from sklearn.linear_model import LogisticRegression
 | 
					 | 
				
			||||||
from sklearn.ensemble import RandomForestClassifier
 | 
					 | 
				
			||||||
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
 | 
					 | 
				
			||||||
from sklearn.utils import class_weight
 | 
					 | 
				
			||||||
from sklearn.neighbors import KNeighborsClassifier
 | 
					 | 
				
			||||||
from sklearn.pipeline import Pipeline
 | 
					 | 
				
			||||||
from sklearn.compose import ColumnTransformer
 | 
					 | 
				
			||||||
from sklearn.preprocessing import OneHotEncoder
 | 
					 | 
				
			||||||
from sklearn.impute import SimpleImputer
 | 
					 | 
				
			||||||
from sklearn.model_selection import GridSearchCV
 | 
					 | 
				
			||||||
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
 | 
					 | 
				
			||||||
from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score
 | 
					 | 
				
			||||||
import seaborn as sns
 | 
					 | 
				
			||||||
import matplotlib.pyplot as plt
 | 
					import matplotlib.pyplot as plt
 | 
				
			||||||
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
 | 
					 | 
				
			||||||
from sklearn.exceptions import ConvergenceWarning, DataConversionWarning
 | 
					 | 
				
			||||||
from sklearn.naive_bayes import GaussianNB
 | 
					 | 
				
			||||||
from scipy.optimize import fsolve
 | 
					from scipy.optimize import fsolve
 | 
				
			||||||
import pickle
 | 
					import pickle
 | 
				
			||||||
import warnings
 | 
					import warnings
 | 
				
			||||||
import io
 | 
					import io
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# importation of functions defined
 | 
				
			||||||
 | 
					from utils_CA_segment import *
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Create filesystem object
 | 
				
			||||||
 | 
					S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
 | 
				
			||||||
 | 
					fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# define type of activity 
 | 
					# define type of activity 
 | 
				
			||||||
type_of_activity = "sport"
 | 
					type_of_activity = "sport"
 | 
				
			||||||
PATH = f"projet-bdc2324-team1/Output_expected_CA/{type_of_activity}/"
 | 
					PATH = f"projet-bdc2324-team1/Output_expected_CA/{type_of_activity}/"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# type of model for the score
 | 
				
			||||||
 | 
					type_of_model = "LogisticRegression_cv"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# load train and test sets
 | 
				
			||||||
 | 
					dataset_train, dataset_test = load_train_test(type_of_activity)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# make features - define X train and X test
 | 
				
			||||||
 | 
					X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# choose model - logit cross validated
 | 
				
			||||||
 | 
					model = load_model(type_of_activity, type_of_model)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# create table X test segment from X test
 | 
				
			||||||
 | 
					X_test_segment = df_segment(X_test, y_test, model)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# comparison with bias of the train set - X train to be defined
 | 
					# comparison with bias of the train set - X train to be defined
 | 
				
			||||||
X_train_score = logit_cv.predict_proba(X_train)[:, 1]
 | 
					X_train_score = model.predict_proba(X_train)[:, 1]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
bias_train_set = find_bias(odd_ratios = odd_ratio(adjust_score_1(X_train_score)), 
 | 
					bias_train_set = find_bias(odd_ratios = odd_ratio(adjust_score_1(X_train_score)), 
 | 
				
			||||||
                           y_objective = y_train["y_has_purchased"].sum(),
 | 
					                           y_objective = y_train["y_has_purchased"].sum(),
 | 
				
			||||||
                           initial_guess=6)
 | 
					                           initial_guess=6)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# create a score adjusted with the bias computed
 | 
				
			||||||
score_adjusted_train = adjusted_score(odd_ratio(adjust_score_1(X_test_segment["score"])), bias = bias_train_set)
 | 
					score_adjusted_train = adjusted_score(odd_ratio(adjust_score_1(X_test_segment["score"])), bias = bias_train_set)
 | 
				
			||||||
X_test_segment["score_adjusted"] = score_adjusted_train
 | 
					X_test_segment["score_adjusted"] = score_adjusted_train
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# plot adjusted scores and save (to be tested)
 | 
					### 1. plot adjusted scores and save (to be tested)
 | 
				
			||||||
plot_hist_scores(X_test_segment, score = "score", score_adjusted = "score_adjusted", type_of_activity = type_of_activity)
 | 
					plot_hist_scores(X_test_segment, score = "score", score_adjusted = "score_adjusted", type_of_activity = type_of_activity)
 | 
				
			||||||
 | 
					save_file_s3_ca("hist_score_adjusted_", type_of_activity)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					"""
 | 
				
			||||||
image_buffer = io.BytesIO()
 | 
					image_buffer = io.BytesIO()
 | 
				
			||||||
plt.savefig(image_buffer, format='png')
 | 
					plt.savefig(image_buffer, format='png')
 | 
				
			||||||
image_buffer.seek(0)
 | 
					image_buffer.seek(0)
 | 
				
			||||||
| 
						 | 
					@ -53,27 +61,33 @@ FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + ".png"
 | 
				
			||||||
with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
 | 
					with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
 | 
				
			||||||
    s3_file.write(image_buffer.read())
 | 
					    s3_file.write(image_buffer.read())
 | 
				
			||||||
plt.close()
 | 
					plt.close()
 | 
				
			||||||
 | 
					"""
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# comparison between score and adjusted score
 | 
					### 2. comparison between score and adjusted score
 | 
				
			||||||
X_test_table_adjusted_scores = (100 * X_test_segment.groupby("quartile")[["score","score_adjusted", "has_purchased"]].mean()).round(2).reset_index()
 | 
					X_test_table_adjusted_scores = (100 * X_test_segment.groupby("quartile")[["score","score_adjusted", "has_purchased"]].mean()).round(2).reset_index()
 | 
				
			||||||
X_test_table_adjusted_scores = X_test_table_adjusted_scores.rename(columns = {col : f"{col} (%)" for col in X_test_table_adjusted_scores.columns if col in ["score","score_adjusted", "has_purchased"]})
 | 
					X_test_table_adjusted_scores = X_test_table_adjusted_scores.rename(columns = {col : f"{col} (%)" for col in X_test_table_adjusted_scores.columns if col in ["score","score_adjusted", "has_purchased"]})
 | 
				
			||||||
 | 
					
 | 
				
			||||||
file_name = "table_adjusted_score"
 | 
					# save table
 | 
				
			||||||
 | 
					file_name = "table_adjusted_score_"
 | 
				
			||||||
FILE_PATH_OUT_S3 = PATH + file_name +  type_of_activity + ".csv"
 | 
					FILE_PATH_OUT_S3 = PATH + file_name +  type_of_activity + ".csv"
 | 
				
			||||||
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
 | 
					with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
 | 
				
			||||||
    X_test_table_adjusted_scores.to_csv(file_out, index = False)
 | 
					    X_test_table_adjusted_scores.to_csv(file_out, index = False)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# project revenue
 | 
					# project revenue
 | 
				
			||||||
X_test_segment = project_tickets_CA (X_test_segment, "nb_tickets", "total_amount", "score_adjusted", duration_ref=1.5, duration_projection=1)
 | 
					X_test_segment = project_tickets_CA (X_test_segment, "nb_tickets", "total_amount", "score_adjusted", duration_ref=17, duration_projection=12)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# table summarizing projections
 | 
					### 3. table summarizing projections (nb tickets, revenue)
 | 
				
			||||||
X_test_expected_CA = round(summary_expected_CA(df=X_test_segment, segment="quartile", nb_tickets_expected="nb_tickets_expected", total_amount_expected="total_amount_expected", total_amount="total_amount"),2)
 | 
					X_test_expected_CA = round(summary_expected_CA(df=X_test_segment, segment="quartile", nb_tickets_expected="nb_tickets_expected", total_amount_expected="total_amount_expected", total_amount="total_amount"),2)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
file_name = "table_expected_CA"
 | 
					# rename columns
 | 
				
			||||||
 | 
					mapping_dict = {col: col.replace("perct", "(%)").replace("_", " ") for col in X_test_expected_CA.columns}
 | 
				
			||||||
 | 
					X_test_expected_CA = X_test_expected_CA.rename(columns=mapping_dict)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# save table
 | 
				
			||||||
 | 
					file_name = "table_expected_CA_"
 | 
				
			||||||
FILE_PATH_OUT_S3 = PATH + file_name +  type_of_activity + ".csv"
 | 
					FILE_PATH_OUT_S3 = PATH + file_name +  type_of_activity + ".csv"
 | 
				
			||||||
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
 | 
					with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
 | 
				
			||||||
    X_test_expected_CA.to_csv(file_out, index = False)
 | 
					    X_test_expected_CA.to_csv(file_out, index = False)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							| 
						 | 
					@ -1,3 +1,83 @@
 | 
				
			||||||
 | 
					# importations
 | 
				
			||||||
 | 
					import pandas as pd
 | 
				
			||||||
 | 
					from pandas import DataFrame
 | 
				
			||||||
 | 
					import numpy as np
 | 
				
			||||||
 | 
					import os
 | 
				
			||||||
 | 
					import s3fs
 | 
				
			||||||
 | 
					import matplotlib.pyplot as plt
 | 
				
			||||||
 | 
					from scipy.optimize import fsolve
 | 
				
			||||||
 | 
					import pickle
 | 
				
			||||||
 | 
					import warnings
 | 
				
			||||||
 | 
					import io
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# functions
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def load_train_test(type_of_activity, filesystem=None):
					    """Load the train and test sets of one activity type.

					    Reads ``Train_set.csv`` and ``Test_set.csv`` located under
					    ``projet-bdc2324-team1/Generalization/<type_of_activity>``.
					    The target column is returned as read: missing values are not filled.

					    Args:
					        type_of_activity: Name of the activity sub-folder, inserted as-is
					            in the path.
					        filesystem: Optional object exposing ``open(path, mode=...)`` that
					            returns a binary file object usable as a context manager.
					            When omitted, the module-level ``fs`` is used.

					    Returns:
					        Tuple ``(dataset_train, dataset_test)`` of pandas DataFrames.

					    Raises:
					        NameError: If ``filesystem`` is omitted and no module-level ``fs``
					            exists.
					    """
					    if filesystem is None:
					        # NOTE(review): no `fs` is created in the visible part of this
					        # module - confirm it is defined before relying on the default.
					        filesystem = fs

					    bucket = f"projet-bdc2324-team1/Generalization/{type_of_activity}"

					    datasets = []
					    for csv_name in ("Train_set.csv", "Test_set.csv"):
					        with filesystem.open(bucket + "/" + csv_name, mode="rb") as file_in:
					            datasets.append(pd.read_csv(file_in, sep=","))

					    dataset_train, dataset_test = datasets
					    return dataset_train, dataset_test
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def features_target_split(dataset_train, dataset_test):
					    """Split the train and test sets into feature and target tables.

					    Args:
					        dataset_train: DataFrame holding the feature columns listed below
					            and the ``y_has_purchased`` column.
					        dataset_test: DataFrame with the same columns.

					    Returns:
					        Tuple ``(X_train, X_test, y_train, y_test)``. The ``y_*`` tables are
					        one-column DataFrames. All four tables are independent copies, so
					        columns can be added to them later without touching the inputs
					        and without pandas' SettingWithCopyWarning.

					    Raises:
					        KeyError: If one of the expected columns is missing.
					    """
					    # 'is_partner' is left out of the list. A shorter list that also drops
					    # 'fidelity', 'time_between_purchase' and 'gender_other' (collinearity
					    # issue) was kept disabled in the original code; it is not used here.
					    features_l = [
					        'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',
					        'vente_internet_max', 'purchase_date_min', 'purchase_date_max',
					        'time_between_purchase', 'nb_tickets_internet', 'fidelity',
					        'is_email_true', 'opt_in',
					        'gender_female', 'gender_male', 'gender_other',
					        'nb_campaigns', 'nb_campaigns_opened',
					    ]
					    target_l = ['y_has_purchased']

					    X_train = dataset_train[features_l].copy()
					    y_train = dataset_train[target_l].copy()

					    X_test = dataset_test[features_l].copy()
					    y_test = dataset_test[target_l].copy()

					    return X_train, X_test, y_train, y_test
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def load_model(type_of_activity, model, filesystem=None):
					    """Load a pickled model object from storage.

					    The file read is
					    ``projet-bdc2324-team1/Output_model/<type_of_activity>/<model>/<model>.pkl``.

					    Args:
					        type_of_activity: Name of the activity sub-folder.
					        model: Model name (string), used both as folder name and as file
					            stem.
					        filesystem: Optional object exposing ``open(path, mode=...)`` that
					            returns a binary file object usable as a context manager.
					            When omitted, the module-level ``fs`` is used.

					    Returns:
					        The object obtained by unpickling the file content.

					    Raises:
					        NameError: If ``filesystem`` is omitted and no module-level ``fs``
					            exists.
					    """
					    if filesystem is None:
					        # NOTE(review): no `fs` is created in the visible part of this
					        # module - confirm it is defined before relying on the default.
					        filesystem = fs

					    bucket = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/"
					    file_path = bucket + model + '.pkl'

					    with filesystem.open(file_path, mode="rb") as f:
					        model_bytes = f.read()

					    # SECURITY: unpickling executes arbitrary code embedded in the payload.
					    # Only point this function at storage whose content is trusted.
					    return pickle.loads(model_bytes)
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def df_segment(df, y, model):
					    """Build a table of predictions and score segments from a feature table.

					    The input ``df`` is left untouched: the result is a copy of it with
					    four extra columns.

					    Args:
					        df: Feature DataFrame passed to ``model.predict`` and
					            ``model.predict_proba``.
					        y: Observed target, aligned with ``df`` (a Series, an array-like,
					            or a one-column DataFrame).
					        model: Object exposing ``predict`` and ``predict_proba``; the second
					            column of ``predict_proba`` is used as the score.

					    Returns:
					        Copy of ``df`` with the added columns ``has_purchased``,
					        ``has_purchased_estim``, ``score`` and ``quartile``.
					    """
					    # Predict on the untouched features, before any column is added.
					    y_pred = model.predict(df)
					    y_pred_prob = model.predict_proba(df)[:, 1]

					    # Work on a copy so the caller's DataFrame is not mutated in place.
					    segmented = df.copy()

					    # A one-column DataFrame is reduced to a Series before assignment.
					    observed = y.squeeze(axis=1) if isinstance(y, pd.DataFrame) else y

					    segmented["has_purchased"] = observed
					    segmented["has_purchased_estim"] = y_pred
					    segmented["score"] = y_pred_prob

					    # Despite the column name, segments are fixed score bands
					    # ([0, .25), [.25, .5), [.5, .75), [.75, 1]), not empirical quartiles.
					    score = segmented["score"]
					    segmented["quartile"] = np.where(
					        score < 0.25, '1',
					        np.where(score < 0.5, '2',
					                 np.where(score < 0.75, '3', '4')))

					    return segmented
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def odd_ratio(score) :
 | 
					def odd_ratio(score) :
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    Args:
 | 
					    Args:
 | 
				
			||||||
| 
						 | 
					@ -152,3 +232,14 @@ def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected,
 | 
				
			||||||
    df_expected_CA["pace_purchase"] = df_drop_null_pace.groupby(segment)[pace_purchase].mean().values
 | 
					    df_expected_CA["pace_purchase"] = df_drop_null_pace.groupby(segment)[pace_purchase].mean().values
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
    return df_expected_CA
 | 
					    return df_expected_CA
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def save_file_s3_ca(File_name, type_of_activity, filesystem=None):
					    """Save the current matplotlib figure as a PNG, then close it.

					    The figure is written to
					    ``projet-bdc2324-team1/Output_expected_CA/<type_of_activity>/<File_name><type_of_activity>.png``.

					    Args:
					        File_name: Prefix of the output file name; the activity name and
					            the ``.png`` extension are appended to it.
					        type_of_activity: Name of the activity, used for the sub-folder and
					            as file name suffix.
					        filesystem: Optional object exposing ``open(path, mode)`` that
					            returns a writable binary file object usable as a context
					            manager. When omitted, the module-level ``fs`` is used.

					    Raises:
					        NameError: If ``filesystem`` is omitted and no module-level ``fs``
					            exists.
					    """
					    if filesystem is None:
					        # NOTE(review): no `fs` is created in the visible part of this
					        # module - confirm it is defined before relying on the default.
					        filesystem = fs

					    PATH = f"projet-bdc2324-team1/Output_expected_CA/{type_of_activity}/"
					    FILE_PATH_OUT_S3 = PATH + File_name + type_of_activity + '.png'

					    image_buffer = io.BytesIO()
					    try:
					        # Render in memory first, then push the bytes to storage.
					        plt.savefig(image_buffer, format='png')
					        image_buffer.seek(0)
					        with filesystem.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
					            s3_file.write(image_buffer.read())
					    finally:
					        # Close the figure even if rendering or the upload fails, so that
					        # figures do not accumulate across calls.
					        plt.close()
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user