diff --git a/0_5_Machine_Learning.py b/0_5_Machine_Learning.py index 4e43afd..1700766 100644 --- a/0_5_Machine_Learning.py +++ b/0_5_Machine_Learning.py @@ -34,13 +34,15 @@ warnings.filterwarnings("ignore", category=DataConversionWarning) # choose the type of companies for which you want to run the pipeline type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?') +# choose the type of model +type_of_model = input('Choisissez le type de model : basique ? premium ?') # load train and test set # Create filesystem object S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"] fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL}) -dataset_train, dataset_test = load_train_test(type_of_activity ) +dataset_train, dataset_test = load_train_test(type_of_activity) X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test) @@ -54,30 +56,7 @@ weights = class_weight.compute_class_weight(class_weight = 'balanced', classes = weight_dict = {np.unique(y_train['y_has_purchased'])[i]: weights[i] for i in range(len(np.unique(y_train['y_has_purchased'])))} - -numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', - 'purchase_date_min', 'purchase_date_max', 'time_between_purchase', 'nb_tickets_internet', - 'nb_campaigns', 'nb_campaigns_opened'] - -numeric_transformer = Pipeline(steps=[ - #("imputer", SimpleImputer(strategy="mean")), - ("scaler", StandardScaler()) -]) - -categorical_features = ['opt_in', 'gender_male', 'gender_female'] - -# Transformer for the categorical features -categorical_transformer = Pipeline(steps=[ - #("imputer", SimpleImputer(strategy="most_frequent")), # Impute missing values with the most frequent - ("onehot", OneHotEncoder(handle_unknown='ignore', sparse_output=False)) -]) - -preproc = ColumnTransformer( - transformers=[ - ("num", numeric_transformer, numeric_features), - ("cat", categorical_transformer, categorical_features) - ] 
def save_file_s3(File_name, type_of_activity, type_of_model, model):
    """Upload the current matplotlib figure to S3 as a PNG, then close it.

    The object is written to
    ``projet-bdc2324-team1/{type_of_model}/{type_of_activity}/{model}/``
    under the name ``{File_name}{type_of_activity}_{model}.png``.

    Args:
        File_name: Prefix of the PNG file name (e.g. ``"Roc_curve_"``).
        type_of_activity: Activity label; used in the folder and file name.
        type_of_model: Model tier label; used as the first folder level.
        model: Model label; used as the last folder level and in the file
            name. It is concatenated with ``+``, so it must be a ``str``.

    Note:
        Relies on a filesystem object named ``fs`` being in scope at call
        time; it only needs to provide ``open(path, mode)``.
    """
    image_buffer = io.BytesIO()
    try:
        plt.savefig(image_buffer, format='png')
        image_buffer.seek(0)
        FILE_PATH = f"projet-bdc2324-team1/{type_of_model}/{type_of_activity}/{model}/"
        FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '_' + model + '.png'
        with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
            s3_file.write(image_buffer.read())
    finally:
        # Always release the figure, even when rendering or the upload fails,
        # so a failed save does not leave the figure open.
        plt.close()


def save_result_set_s3(result_set, File_name, type_of_activity, type_of_model, model=None, model_path=False):
    """Write ``result_set`` to S3 as ``{File_name}.csv`` (without the index).

    Args:
        result_set: Object exposing ``to_csv(file, index=False)``.
        File_name: CSV file name without the extension.
        type_of_activity: Activity label; second folder level.
        type_of_model: Model tier label; first folder level.
        model: Model label; only used when ``model_path`` is true.
        model_path: When true, the file goes into the ``{model}/`` sub-folder
            of the activity folder; otherwise into the activity folder.

    Raises:
        ValueError: If ``model_path`` is true but ``model`` is ``None``
            (the path would otherwise contain a literal ``None`` folder).

    Note:
        Relies on a filesystem object named ``fs`` being in scope at call
        time; it only needs to provide ``open(path, mode)``.
    """
    if model_path and model is None:
        raise ValueError("model must be provided when model_path=True")

    # Build the shared prefix once; only the optional model folder differs.
    folder = f"projet-bdc2324-team1/{type_of_model}/{type_of_activity}/"
    if model_path:
        folder += f"{model}/"
    FILE_PATH_OUT_S3 = folder + File_name + '.csv'

    with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
        result_set.to_csv(file_out, index = False)
def save_model_s3(File_name, type_of_activity, type_of_model, model, classifier):
    """Pickle ``classifier`` and upload it to S3 as ``{File_name}.pkl``.

    The object is written to
    ``projet-bdc2324-team1/{type_of_model}/{type_of_activity}/{model}/``.

    Args:
        File_name: File name without the ``.pkl`` extension.
        type_of_activity: Activity label; second folder level.
        type_of_model: Model tier label; first folder level.
        model: Model label; last folder level.
        classifier: Any picklable object (callers in this file pass a
            pipeline or a grid-search object).

    Note:
        Relies on a filesystem object named ``fs`` being in scope at call
        time; it only needs to provide ``open(path, mode)``.
    """
    # Serialise before opening the remote file so a pickling error does not
    # leave a truncated object behind.
    model_bytes = pickle.dumps(classifier)
    FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/{type_of_model}/{type_of_activity}/{model}/" + File_name + '.pkl'
    with fs.open(FILE_PATH_OUT_S3, 'wb') as f:
        f.write(model_bytes)


def preprocess(type_of_model):
    """Build the (unfitted) column preprocessor used by the model pipelines.

    Three groups of columns are handled:

    * numeric columns are standardised with ``StandardScaler``;
    * ``opt_in`` is one-hot encoded (unknown categories are ignored);
    * binary indicator columns get their missing values replaced by the
      most frequent value.

    Args:
        type_of_model: Model tier label (the caller prompts for
            ``basique`` / ``premium``). Currently it does not change the
            result: the original code listed identical columns for
            ``'premium'`` and for every other value.

    Returns:
        A ``ColumnTransformer`` with the transformers ``"num"``, ``"cat"``
        and ``"bin"``, in that order.
    """
    # NOTE(review): both tiers used exactly the same column lists, so the
    # duplicated if/else has been collapsed. Re-introduce a branch on
    # `type_of_model` here once the tiers are meant to differ.
    numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max',
                        'purchase_date_min', 'purchase_date_max', 'time_between_purchase', 'nb_tickets_internet',
                        'nb_campaigns', 'nb_campaigns_opened']
    binary_features = ['gender_female', 'gender_male', 'gender_other', 'country_fr']
    categorical_features = ['opt_in']

    numeric_transformer = Pipeline(steps=[
        ("scaler", StandardScaler())
    ])
    categorical_transformer = Pipeline(steps=[
        ("onehot", OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])
    binary_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
    ])

    # Columns not named in any of the three lists are not forwarded, since no
    # `remainder` is given (scikit-learn's default drops them).
    preproc = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
            ("bin", binary_transformer, binary_features)
        ]
    )
    return preproc
draw_calibration_curve(X_test, y_pred_prob, model): plt.title("Calibration Curve") plt.legend() plt.show() - save_file_s3("Calib_curve_", type_of_activity, model) + save_file_s3("Calib_curve_", type_of_activity, type_of_model, model) def draw_features_importance(pipeline, model, randomF = False): @@ -145,18 +183,18 @@ def draw_features_importance(pipeline, model, randomF = False): feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out() # Tracer l'importance des caractéristiques - plt.figure(figsize=(10, 6)) + plt.figure(figsize=(12, 8)) plt.barh(feature_names, coefficients, color='skyblue') plt.xlabel("Features' Importance") plt.ylabel('Caractéristiques') plt.title("Features' Importance") plt.grid(True) plt.show() - save_file_s3("Features_", type_of_activity, model) + save_file_s3("Features_", type_of_activity, type_of_model, model) def draw_prob_distribution(y_pred_prob, model): - plt.figure(figsize=(8, 6)) + plt.figure(figsize=(10, 8)) plt.hist(y_pred_prob, bins=10, range=(0, 1), color='blue', alpha=0.7) plt.xlim(0, 1) @@ -167,7 +205,7 @@ def draw_prob_distribution(y_pred_prob, model): plt.ylabel('Frequency') plt.grid(True) plt.show() - save_file_s3("prob_dist_", type_of_activity, model) + save_file_s3("prob_dist_", type_of_activity, type_of_model, model) def draw_prob_distribution_companies(y_pred_prob, model): @@ -183,7 +221,7 @@ def draw_prob_distribution_companies(y_pred_prob, model): plt.ylabel('Frequency') plt.grid(True) plt.show() - save_file_s3("prob_dist_companies_", type_of_activity, model) + save_file_s3("prob_dist_companies_", type_of_activity, type_of_model, model) @@ -215,7 +253,7 @@ def pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result): draw_prob_distribution(y_pred_prob, model) draw_prob_distribution_companies(y_pred_prob, model) draw_calibration_curve(X_test, y_pred_prob, model) - save_model_s3('LogisticRegression_Benchmark', type_of_activity, model, pipeline) + 
save_model_s3('LogisticRegression_Benchmark', type_of_activity, type_of_model, model, pipeline) return model_result @@ -252,7 +290,7 @@ def pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result): draw_prob_distribution(y_pred_prob, model) draw_prob_distribution_companies(y_pred_prob, model) draw_calibration_curve(X_test, y_pred_prob, model) - save_model_s3('LogisticRegression_cv', type_of_activity, model, grid_search) + save_model_s3('LogisticRegression_cv', type_of_activity, type_of_model, model, grid_search) return model_result @@ -284,7 +322,7 @@ def pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result): draw_prob_distribution(y_pred_prob, model) draw_prob_distribution_companies(y_pred_prob, model) draw_calibration_curve(X_test, y_pred_prob, model) - save_model_s3('randomF_Benchmark', type_of_activity, model, pipeline) + save_model_s3('randomF_Benchmark', type_of_activity, type_of_model, model, pipeline) return model_result @@ -325,7 +363,7 @@ def pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result): draw_prob_distribution(y_pred_prob, model) draw_prob_distribution_companies(y_pred_prob, model) draw_calibration_curve(X_test, y_pred_prob, model) - save_model_s3('randomF_cv', type_of_activity, model, grid_search) + save_model_s3('randomF_cv', type_of_activity, type_of_model, model, grid_search) return model_result @@ -356,5 +394,5 @@ def pipeline_naiveBayes_benchmark(X_train, y_train, X_test, y_test, model_result draw_roc_curve(X_test, y_pred_prob, model) draw_prob_distribution(y_pred_prob, model) draw_calibration_curve(X_test, y_pred_prob, model) - save_model_s3('Naive_Bayes_Benchmark', type_of_activity, model, pipeline) + save_model_s3('Naive_Bayes_Benchmark', type_of_activity, type_of_model, model, pipeline) return model_result \ No newline at end of file