add path premium

This commit is contained in:
Alexis REVELLE 2024-03-27 14:08:40 +00:00
parent f4b6f23394
commit 133eb83e84
2 changed files with 65 additions and 48 deletions

View File

@ -34,6 +34,8 @@ warnings.filterwarnings("ignore", category=DataConversionWarning)
# choose the type of companies for which you want to run the pipeline # choose the type of companies for which you want to run the pipeline
type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?') type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?')
# choose the type of model
type_of_model = input('Choisissez le type de model : basique ? premium ?')
# load train and test set # load train and test set
# Create filesystem object # Create filesystem object
@ -54,30 +56,7 @@ weights = class_weight.compute_class_weight(class_weight = 'balanced', classes =
weight_dict = {np.unique(y_train['y_has_purchased'])[i]: weights[i] for i in range(len(np.unique(y_train['y_has_purchased'])))} weight_dict = {np.unique(y_train['y_has_purchased'])[i]: weights[i] for i in range(len(np.unique(y_train['y_has_purchased'])))}
preproc = preprocess(type_of_model)
numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max',
'purchase_date_min', 'purchase_date_max', 'time_between_purchase', 'nb_tickets_internet',
'nb_campaigns', 'nb_campaigns_opened']
numeric_transformer = Pipeline(steps=[
#("imputer", SimpleImputer(strategy="mean")),
("scaler", StandardScaler())
])
categorical_features = ['opt_in', 'gender_male', 'gender_female']
# Transformer for the categorical features
categorical_transformer = Pipeline(steps=[
#("imputer", SimpleImputer(strategy="most_frequent")), # Impute missing values with the most frequent
("onehot", OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])
preproc = ColumnTransformer(
transformers=[
("num", numeric_transformer, numeric_features),
("cat", categorical_transformer, categorical_features)
]
)
# Object for storing results # Object for storing results
model_result = pd.DataFrame(columns= ["Model", "Accuracy", "Recall", "F1_score", "AUC"]) model_result = pd.DataFrame(columns= ["Model", "Accuracy", "Recall", "F1_score", "AUC"])
@ -100,4 +79,4 @@ model_result = pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_resul
print("Random Forest CV: Done") print("Random Forest CV: Done")
# Save result # Save result
save_result_set_s3(model_result , "resultat", type_of_activity) save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model)

View File

@ -43,29 +43,29 @@ def load_train_test(type_of_activity):
return dataset_train, dataset_test return dataset_train, dataset_test
def save_file_s3(File_name, type_of_activity, model): def save_file_s3(File_name, type_of_activity, type_of_model, model):
image_buffer = io.BytesIO() image_buffer = io.BytesIO()
plt.savefig(image_buffer, format='png') plt.savefig(image_buffer, format='png')
image_buffer.seek(0) image_buffer.seek(0)
FILE_PATH = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/" FILE_PATH = f"projet-bdc2324-team1/{type_of_model}/{type_of_activity}/{model}/"
FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '_' + model + '.png' FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '_' + model + '.png'
with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file: with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
s3_file.write(image_buffer.read()) s3_file.write(image_buffer.read())
plt.close() plt.close()
def save_result_set_s3(result_set, File_name, type_of_activity, model=None, model_path=False): def save_result_set_s3(result_set, File_name, type_of_activity, type_of_model, model=None, model_path=False):
if model_path: if model_path:
FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/" + File_name + '.csv' FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/{type_of_model}/{type_of_activity}/{model}/" + File_name + '.csv'
else: else:
FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/Output_model/{type_of_activity}/" + File_name + '.csv' FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/{type_of_model}/{type_of_activity}/" + File_name + '.csv'
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out: with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
result_set.to_csv(file_out, index = False) result_set.to_csv(file_out, index = False)
def save_model_s3(File_name, type_of_activity, model, classifier): def save_model_s3(File_name, type_of_activity, type_of_model, model, classifier):
model_bytes = pickle.dumps(classifier) model_bytes = pickle.dumps(classifier)
FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/" + File_name + '.pkl' FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/{type_of_model}/{type_of_activity}/{model}/" + File_name + '.pkl'
with fs.open(FILE_PATH_OUT_S3, 'wb') as f: with fs.open(FILE_PATH_OUT_S3, 'wb') as f:
f.write(model_bytes) f.write(model_bytes)
@ -79,13 +79,13 @@ def compute_recall_companies(dataset_test, y_pred, type_of_activity, model):
test['prediction'] = y_pred test['prediction'] = y_pred
test['company'] = dataset_test['customer_id'].str.split('_', expand=True)[0] test['company'] = dataset_test['customer_id'].str.split('_', expand=True)[0]
recall_scores_by_company = test.groupby('company').apply(compute_recall).reset_index(name='recall_score') recall_scores_by_company = test.groupby('company').apply(compute_recall).reset_index(name='recall_score')
save_result_set_s3(recall_scores_by_company, 'recall_scores_by_company', type_of_activity, model=model, model_path=True) save_result_set_s3(recall_scores_by_company, 'recall_scores_by_company', type_of_activity, type_of_model, model=model, model_path=True)
def features_target_split(dataset_train, dataset_test): def features_target_split(dataset_train, dataset_test):
features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max', features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max',
'time_between_purchase', 'nb_tickets_internet', 'is_email_true', 'opt_in', #'is_partner', 'time_between_purchase', 'nb_tickets_internet', 'is_email_true', 'opt_in', #'is_partner',
'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened'] 'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened', 'country_fr']
X_train = dataset_train[features_l] X_train = dataset_train[features_l]
y_train = dataset_train[['y_has_purchased']] y_train = dataset_train[['y_has_purchased']]
@ -94,6 +94,44 @@ def features_target_split(dataset_train, dataset_test):
return X_train, X_test, y_train, y_test return X_train, X_test, y_train, y_test
def preprocess(type_of_model):
    """Build the (unfitted) column preprocessing step for a model variant.

    Parameters
    ----------
    type_of_model : str
        Model variant requested by the caller (e.g. 'premium').
        NOTE(review): the 'premium' variant and the default variant were
        configured with exactly the same columns, so this argument does not
        currently change the result. It is kept so the signature stays
        stable; add premium-only columns below once they are defined.

    Returns
    -------
    ColumnTransformer
        Transformer with three named groups:
        - "num": numeric columns, passed through a StandardScaler;
        - "cat": categorical columns, one-hot encoded (unknown categories
          are ignored at transform time, dense output);
        - "bin": binary indicator columns, imputed with the most frequent
          value.
    """
    # Column groups are declared once. Previously they were duplicated in
    # both branches of an if/else and `categorical_features` was then
    # reassigned unconditionally, which would have silently overridden any
    # per-variant choice.
    numeric_features = [
        'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',
        'vente_internet_max', 'purchase_date_min', 'purchase_date_max',
        'time_between_purchase', 'nb_tickets_internet',
        'nb_campaigns', 'nb_campaigns_opened',
    ]
    binary_features = ['gender_female', 'gender_male', 'gender_other', 'country_fr']
    categorical_features = ['opt_in']

    # Each group keeps its own single-step Pipeline so extra steps can be
    # added per group without touching the ColumnTransformer wiring.
    numeric_transformer = Pipeline(steps=[
        ("scaler", StandardScaler()),
    ])
    categorical_transformer = Pipeline(steps=[
        ("onehot", OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ])
    binary_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
    ])

    # Group names ("num", "cat", "bin") are part of the output feature
    # names, so they are kept exactly as before.
    preproc = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
            ("bin", binary_transformer, binary_features),
        ]
    )
    return preproc
def draw_confusion_matrix(y_test, y_pred, model): def draw_confusion_matrix(y_test, y_pred, model):
conf_matrix = confusion_matrix(y_test, y_pred) conf_matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1']) sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])
@ -101,7 +139,7 @@ def draw_confusion_matrix(y_test, y_pred, model):
plt.ylabel('Actual') plt.ylabel('Actual')
plt.title('Confusion Matrix') plt.title('Confusion Matrix')
plt.show() plt.show()
save_file_s3("Confusion_matrix_", type_of_activity, model) save_file_s3("Confusion_matrix_", type_of_activity, type_of_model, model)
def draw_roc_curve(X_test, y_pred_prob, model): def draw_roc_curve(X_test, y_pred_prob, model):
@ -120,7 +158,7 @@ def draw_roc_curve(X_test, y_pred_prob, model):
plt.title("ROC Curve", size=18) plt.title("ROC Curve", size=18)
plt.legend(loc="lower right") plt.legend(loc="lower right")
plt.show() plt.show()
save_file_s3("Roc_curve_", type_of_activity, model) save_file_s3("Roc_curve_", type_of_activity, type_of_model, model)
def draw_calibration_curve(X_test, y_pred_prob, model): def draw_calibration_curve(X_test, y_pred_prob, model):
@ -134,7 +172,7 @@ def draw_calibration_curve(X_test, y_pred_prob, model):
plt.title("Calibration Curve") plt.title("Calibration Curve")
plt.legend() plt.legend()
plt.show() plt.show()
save_file_s3("Calib_curve_", type_of_activity, model) save_file_s3("Calib_curve_", type_of_activity, type_of_model, model)
def draw_features_importance(pipeline, model, randomF = False): def draw_features_importance(pipeline, model, randomF = False):
@ -145,18 +183,18 @@ def draw_features_importance(pipeline, model, randomF = False):
feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out() feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()
# Tracer l'importance des caractéristiques # Tracer l'importance des caractéristiques
plt.figure(figsize=(10, 6)) plt.figure(figsize=(12, 8))
plt.barh(feature_names, coefficients, color='skyblue') plt.barh(feature_names, coefficients, color='skyblue')
plt.xlabel("Features' Importance") plt.xlabel("Features' Importance")
plt.ylabel('Caractéristiques') plt.ylabel('Caractéristiques')
plt.title("Features' Importance") plt.title("Features' Importance")
plt.grid(True) plt.grid(True)
plt.show() plt.show()
save_file_s3("Features_", type_of_activity, model) save_file_s3("Features_", type_of_activity, type_of_model, model)
def draw_prob_distribution(y_pred_prob, model): def draw_prob_distribution(y_pred_prob, model):
plt.figure(figsize=(8, 6)) plt.figure(figsize=(10, 8))
plt.hist(y_pred_prob, bins=10, range=(0, 1), color='blue', alpha=0.7) plt.hist(y_pred_prob, bins=10, range=(0, 1), color='blue', alpha=0.7)
plt.xlim(0, 1) plt.xlim(0, 1)
@ -167,7 +205,7 @@ def draw_prob_distribution(y_pred_prob, model):
plt.ylabel('Frequency') plt.ylabel('Frequency')
plt.grid(True) plt.grid(True)
plt.show() plt.show()
save_file_s3("prob_dist_", type_of_activity, model) save_file_s3("prob_dist_", type_of_activity, type_of_model, model)
def draw_prob_distribution_companies(y_pred_prob, model): def draw_prob_distribution_companies(y_pred_prob, model):
@ -183,7 +221,7 @@ def draw_prob_distribution_companies(y_pred_prob, model):
plt.ylabel('Frequency') plt.ylabel('Frequency')
plt.grid(True) plt.grid(True)
plt.show() plt.show()
save_file_s3("prob_dist_companies_", type_of_activity, model) save_file_s3("prob_dist_companies_", type_of_activity, type_of_model, model)
@ -215,7 +253,7 @@ def pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result):
draw_prob_distribution(y_pred_prob, model) draw_prob_distribution(y_pred_prob, model)
draw_prob_distribution_companies(y_pred_prob, model) draw_prob_distribution_companies(y_pred_prob, model)
draw_calibration_curve(X_test, y_pred_prob, model) draw_calibration_curve(X_test, y_pred_prob, model)
save_model_s3('LogisticRegression_Benchmark', type_of_activity, model, pipeline) save_model_s3('LogisticRegression_Benchmark', type_of_activity, type_of_model, model, pipeline)
return model_result return model_result
@ -252,7 +290,7 @@ def pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result):
draw_prob_distribution(y_pred_prob, model) draw_prob_distribution(y_pred_prob, model)
draw_prob_distribution_companies(y_pred_prob, model) draw_prob_distribution_companies(y_pred_prob, model)
draw_calibration_curve(X_test, y_pred_prob, model) draw_calibration_curve(X_test, y_pred_prob, model)
save_model_s3('LogisticRegression_cv', type_of_activity, model, grid_search) save_model_s3('LogisticRegression_cv', type_of_activity, type_of_model, model, grid_search)
return model_result return model_result
@ -284,7 +322,7 @@ def pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result):
draw_prob_distribution(y_pred_prob, model) draw_prob_distribution(y_pred_prob, model)
draw_prob_distribution_companies(y_pred_prob, model) draw_prob_distribution_companies(y_pred_prob, model)
draw_calibration_curve(X_test, y_pred_prob, model) draw_calibration_curve(X_test, y_pred_prob, model)
save_model_s3('randomF_Benchmark', type_of_activity, model, pipeline) save_model_s3('randomF_Benchmark', type_of_activity, type_of_model, model, pipeline)
return model_result return model_result
@ -325,7 +363,7 @@ def pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result):
draw_prob_distribution(y_pred_prob, model) draw_prob_distribution(y_pred_prob, model)
draw_prob_distribution_companies(y_pred_prob, model) draw_prob_distribution_companies(y_pred_prob, model)
draw_calibration_curve(X_test, y_pred_prob, model) draw_calibration_curve(X_test, y_pred_prob, model)
save_model_s3('randomF_cv', type_of_activity, model, grid_search) save_model_s3('randomF_cv', type_of_activity, type_of_model, model, grid_search)
return model_result return model_result
@ -356,5 +394,5 @@ def pipeline_naiveBayes_benchmark(X_train, y_train, X_test, y_test, model_result
draw_roc_curve(X_test, y_pred_prob, model) draw_roc_curve(X_test, y_pred_prob, model)
draw_prob_distribution(y_pred_prob, model) draw_prob_distribution(y_pred_prob, model)
draw_calibration_curve(X_test, y_pred_prob, model) draw_calibration_curve(X_test, y_pred_prob, model)
save_model_s3('Naive_Bayes_Benchmark', type_of_activity, model, pipeline) save_model_s3('Naive_Bayes_Benchmark', type_of_activity, type_of_model, model, pipeline)
return model_result return model_result