add path premium
This commit is contained in:
		
							parent
							
								
									f4b6f23394
								
							
						
					
					
						commit
						133eb83e84
					
				|  | @ -34,6 +34,8 @@ warnings.filterwarnings("ignore", category=DataConversionWarning) | ||||||
| 
 | 
 | ||||||
| # choose the type of companies for which you want to run the pipeline | # choose the type of companies for which you want to run the pipeline | ||||||
| type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?') | type_of_activity = input('Choisissez le type de compagnie : sport ? musique ? musee ?') | ||||||
|  | # choose the type of model | ||||||
|  | type_of_model = input('Choisissez le type de model : basique ? premium ?') | ||||||
| 
 | 
 | ||||||
| # load train and test set | # load train and test set | ||||||
| # Create filesystem object | # Create filesystem object | ||||||
|  | @ -54,30 +56,7 @@ weights = class_weight.compute_class_weight(class_weight = 'balanced', classes = | ||||||
| 
 | 
 | ||||||
| weight_dict = {np.unique(y_train['y_has_purchased'])[i]: weights[i] for i in range(len(np.unique(y_train['y_has_purchased'])))} | weight_dict = {np.unique(y_train['y_has_purchased'])[i]: weights[i] for i in range(len(np.unique(y_train['y_has_purchased'])))} | ||||||
| 
 | 
 | ||||||
| 
 | preproc = preprocess(type_of_model) | ||||||
| numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max',  |  | ||||||
|                     'purchase_date_min', 'purchase_date_max', 'time_between_purchase', 'nb_tickets_internet', |  | ||||||
|                     'nb_campaigns', 'nb_campaigns_opened'] |  | ||||||
| 
 |  | ||||||
| numeric_transformer = Pipeline(steps=[ |  | ||||||
|     #("imputer", SimpleImputer(strategy="mean")),   |  | ||||||
|     ("scaler", StandardScaler())  |  | ||||||
| ]) |  | ||||||
| 
 |  | ||||||
| categorical_features = ['opt_in', 'gender_male', 'gender_female']   |  | ||||||
| 
 |  | ||||||
| # Transformer for the categorical features |  | ||||||
| categorical_transformer = Pipeline(steps=[ |  | ||||||
|     #("imputer", SimpleImputer(strategy="most_frequent")),  # Impute missing values with the most frequent |  | ||||||
|     ("onehot", OneHotEncoder(handle_unknown='ignore', sparse_output=False)) |  | ||||||
| ]) |  | ||||||
| 
 |  | ||||||
| preproc = ColumnTransformer( |  | ||||||
|     transformers=[ |  | ||||||
|         ("num", numeric_transformer, numeric_features), |  | ||||||
|         ("cat", categorical_transformer, categorical_features) |  | ||||||
|     ] |  | ||||||
| ) |  | ||||||
| 
 | 
 | ||||||
| # Object for storing results | # Object for storing results | ||||||
| model_result = pd.DataFrame(columns= ["Model", "Accuracy", "Recall", "F1_score", "AUC"]) | model_result = pd.DataFrame(columns= ["Model", "Accuracy", "Recall", "F1_score", "AUC"]) | ||||||
|  | @ -100,4 +79,4 @@ model_result = pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_resul | ||||||
| print("Random Forest CV: Done") | print("Random Forest CV: Done") | ||||||
| 
 | 
 | ||||||
| # Save result | # Save result | ||||||
| save_result_set_s3(model_result , "resultat", type_of_activity) | save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model) | ||||||
							
								
								
									
										82
									
								
								utils_ml.py
									
									
									
									
									
								
							
							
						
						
									
										82
									
								
								utils_ml.py
									
									
									
									
									
								
							|  | @ -43,29 +43,29 @@ def load_train_test(type_of_activity): | ||||||
|     return dataset_train, dataset_test |     return dataset_train, dataset_test | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def save_file_s3(File_name, type_of_activity, model): | def save_file_s3(File_name, type_of_activity, type_of_model, model): | ||||||
|     image_buffer = io.BytesIO() |     image_buffer = io.BytesIO() | ||||||
|     plt.savefig(image_buffer, format='png') |     plt.savefig(image_buffer, format='png') | ||||||
|     image_buffer.seek(0) |     image_buffer.seek(0) | ||||||
|     FILE_PATH = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/" |     FILE_PATH = f"projet-bdc2324-team1/{type_of_model}/{type_of_activity}/{model}/" | ||||||
|     FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '_' + model + '.png' |     FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '_' + model + '.png' | ||||||
|     with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file: |     with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file: | ||||||
|         s3_file.write(image_buffer.read()) |         s3_file.write(image_buffer.read()) | ||||||
|     plt.close() |     plt.close() | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def save_result_set_s3(result_set, File_name, type_of_activity, model=None, model_path=False): | def save_result_set_s3(result_set, File_name, type_of_activity, type_of_model, model=None, model_path=False): | ||||||
|     if model_path: |     if model_path: | ||||||
|         FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/" + File_name + '.csv' |         FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/{type_of_model}/{type_of_activity}/{model}/" + File_name + '.csv' | ||||||
|     else: |     else: | ||||||
|         FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/Output_model/{type_of_activity}/" + File_name + '.csv' |         FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/{type_of_model}/{type_of_activity}/" + File_name + '.csv' | ||||||
|     with fs.open(FILE_PATH_OUT_S3, 'w') as file_out: |     with fs.open(FILE_PATH_OUT_S3, 'w') as file_out: | ||||||
|         result_set.to_csv(file_out, index = False) |         result_set.to_csv(file_out, index = False) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def save_model_s3(File_name, type_of_activity, model, classifier): | def save_model_s3(File_name, type_of_activity, type_of_model, model, classifier): | ||||||
|     model_bytes = pickle.dumps(classifier) |     model_bytes = pickle.dumps(classifier) | ||||||
|     FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/" + File_name + '.pkl' |     FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/{type_of_model}/{type_of_activity}/{model}/" + File_name + '.pkl' | ||||||
|     with fs.open(FILE_PATH_OUT_S3, 'wb') as f: |     with fs.open(FILE_PATH_OUT_S3, 'wb') as f: | ||||||
|         f.write(model_bytes) |         f.write(model_bytes) | ||||||
|      |      | ||||||
|  | @ -79,13 +79,13 @@ def compute_recall_companies(dataset_test, y_pred, type_of_activity, model): | ||||||
|     test['prediction'] = y_pred |     test['prediction'] = y_pred | ||||||
|     test['company'] = dataset_test['customer_id'].str.split('_', expand=True)[0] |     test['company'] = dataset_test['customer_id'].str.split('_', expand=True)[0] | ||||||
|     recall_scores_by_company = test.groupby('company').apply(compute_recall).reset_index(name='recall_score') |     recall_scores_by_company = test.groupby('company').apply(compute_recall).reset_index(name='recall_score') | ||||||
|     save_result_set_s3(recall_scores_by_company, 'recall_scores_by_company', type_of_activity, model=model, model_path=True) |     save_result_set_s3(recall_scores_by_company, 'recall_scores_by_company', type_of_activity, type_of_model, model=model, model_path=True) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def features_target_split(dataset_train, dataset_test): | def features_target_split(dataset_train, dataset_test): | ||||||
|     features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max',  |     features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max',  | ||||||
|             'time_between_purchase', 'nb_tickets_internet',  'is_email_true', 'opt_in', #'is_partner', |             'time_between_purchase', 'nb_tickets_internet',  'is_email_true', 'opt_in', #'is_partner', | ||||||
|             'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened'] |             'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened', 'country_fr'] | ||||||
|     X_train = dataset_train[features_l] |     X_train = dataset_train[features_l] | ||||||
|     y_train = dataset_train[['y_has_purchased']] |     y_train = dataset_train[['y_has_purchased']] | ||||||
| 
 | 
 | ||||||
|  | @ -94,6 +94,44 @@ def features_target_split(dataset_train, dataset_test): | ||||||
|     return X_train, X_test, y_train, y_test |     return X_train, X_test, y_train, y_test | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
def preprocess(type_of_model):
    """Build the column-wise preprocessing step used by the model pipelines.

    Numeric columns are standardised, ``opt_in`` is one-hot encoded, and the
    binary indicator columns only have their missing values filled with the
    most frequent value.

    Args:
        type_of_model: Model flavour requested by the caller; the code only
            distinguishes ``'premium'`` from everything else. Both flavours
            currently use exactly the same feature lists, so the value does
            not yet change the returned transformer.

    Returns:
        An unfitted ``ColumnTransformer`` holding the ``num``, ``cat`` and
        ``bin`` transformers, in that order.
    """
    # The 'premium' and non-premium variants share the same columns for now,
    # so the lists are defined once instead of being duplicated in two
    # identical branches. Add premium-only columns behind a
    # `type_of_model == 'premium'` check when they exist.
    numeric_features = [
        'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',
        'vente_internet_max', 'purchase_date_min', 'purchase_date_max',
        'time_between_purchase', 'nb_tickets_internet',
        'nb_campaigns', 'nb_campaigns_opened',
    ]
    binary_features = ['gender_female', 'gender_male', 'gender_other', 'country_fr']
    categorical_features = ['opt_in']

    # Scale numeric columns to zero mean / unit variance.
    numeric_transformer = Pipeline(steps=[
        ("scaler", StandardScaler()),
    ])

    # One-hot encode categorical columns; categories unseen during fit are
    # ignored at transform time rather than raising.
    categorical_transformer = Pipeline(steps=[
        ("onehot", OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ])

    # Binary indicators are passed through with only imputation applied.
    binary_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
    ])

    # NOTE(review): no `remainder` is given, so columns not listed above are
    # handled by ColumnTransformer's default (expected to drop them, e.g.
    # 'is_email_true') - confirm this is intended.
    preproc = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
            ("bin", binary_transformer, binary_features),
        ]
    )
    return preproc
|  | 
 | ||||||
|  | 
 | ||||||
| def draw_confusion_matrix(y_test, y_pred, model): | def draw_confusion_matrix(y_test, y_pred, model): | ||||||
|     conf_matrix = confusion_matrix(y_test, y_pred) |     conf_matrix = confusion_matrix(y_test, y_pred) | ||||||
|     sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1']) |     sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1']) | ||||||
|  | @ -101,7 +139,7 @@ def draw_confusion_matrix(y_test, y_pred, model): | ||||||
|     plt.ylabel('Actual') |     plt.ylabel('Actual') | ||||||
|     plt.title('Confusion Matrix') |     plt.title('Confusion Matrix') | ||||||
|     plt.show() |     plt.show() | ||||||
|     save_file_s3("Confusion_matrix_", type_of_activity, model) |     save_file_s3("Confusion_matrix_", type_of_activity, type_of_model, model) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def draw_roc_curve(X_test, y_pred_prob, model): | def draw_roc_curve(X_test, y_pred_prob, model): | ||||||
|  | @ -120,7 +158,7 @@ def draw_roc_curve(X_test, y_pred_prob, model): | ||||||
|     plt.title("ROC Curve", size=18) |     plt.title("ROC Curve", size=18) | ||||||
|     plt.legend(loc="lower right") |     plt.legend(loc="lower right") | ||||||
|     plt.show() |     plt.show() | ||||||
|     save_file_s3("Roc_curve_", type_of_activity, model) |     save_file_s3("Roc_curve_", type_of_activity, type_of_model, model) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def draw_calibration_curve(X_test, y_pred_prob, model): | def draw_calibration_curve(X_test, y_pred_prob, model): | ||||||
|  | @ -134,7 +172,7 @@ def draw_calibration_curve(X_test, y_pred_prob, model): | ||||||
|     plt.title("Calibration Curve") |     plt.title("Calibration Curve") | ||||||
|     plt.legend() |     plt.legend() | ||||||
|     plt.show() |     plt.show() | ||||||
|     save_file_s3("Calib_curve_", type_of_activity, model) |     save_file_s3("Calib_curve_", type_of_activity, type_of_model, model) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def draw_features_importance(pipeline, model, randomF = False): | def draw_features_importance(pipeline, model, randomF = False): | ||||||
|  | @ -145,18 +183,18 @@ def draw_features_importance(pipeline, model, randomF = False): | ||||||
|      |      | ||||||
|     feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out() |     feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out() | ||||||
|     # Tracer l'importance des caractéristiques |     # Tracer l'importance des caractéristiques | ||||||
|     plt.figure(figsize=(10, 6)) |     plt.figure(figsize=(12, 8)) | ||||||
|     plt.barh(feature_names, coefficients, color='skyblue') |     plt.barh(feature_names, coefficients, color='skyblue') | ||||||
|     plt.xlabel("Features' Importance") |     plt.xlabel("Features' Importance") | ||||||
|     plt.ylabel('Caractéristiques') |     plt.ylabel('Caractéristiques') | ||||||
|     plt.title("Features' Importance") |     plt.title("Features' Importance") | ||||||
|     plt.grid(True) |     plt.grid(True) | ||||||
|     plt.show() |     plt.show() | ||||||
|     save_file_s3("Features_", type_of_activity, model) |     save_file_s3("Features_", type_of_activity, type_of_model, model) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def draw_prob_distribution(y_pred_prob, model): | def draw_prob_distribution(y_pred_prob, model): | ||||||
|     plt.figure(figsize=(8, 6)) |     plt.figure(figsize=(10, 8)) | ||||||
|     plt.hist(y_pred_prob, bins=10, range=(0, 1), color='blue', alpha=0.7) |     plt.hist(y_pred_prob, bins=10, range=(0, 1), color='blue', alpha=0.7) | ||||||
|      |      | ||||||
|     plt.xlim(0, 1) |     plt.xlim(0, 1) | ||||||
|  | @ -167,7 +205,7 @@ def draw_prob_distribution(y_pred_prob, model): | ||||||
|     plt.ylabel('Frequency') |     plt.ylabel('Frequency') | ||||||
|     plt.grid(True) |     plt.grid(True) | ||||||
|     plt.show() |     plt.show() | ||||||
|     save_file_s3("prob_dist_", type_of_activity, model) |     save_file_s3("prob_dist_", type_of_activity, type_of_model, model) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def draw_prob_distribution_companies(y_pred_prob, model): | def draw_prob_distribution_companies(y_pred_prob, model): | ||||||
|  | @ -183,7 +221,7 @@ def draw_prob_distribution_companies(y_pred_prob, model): | ||||||
|     plt.ylabel('Frequency') |     plt.ylabel('Frequency') | ||||||
|     plt.grid(True) |     plt.grid(True) | ||||||
|     plt.show() |     plt.show() | ||||||
|     save_file_s3("prob_dist_companies_", type_of_activity, model) |     save_file_s3("prob_dist_companies_", type_of_activity, type_of_model, model) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -215,7 +253,7 @@ def pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result): | ||||||
|     draw_prob_distribution(y_pred_prob, model) |     draw_prob_distribution(y_pred_prob, model) | ||||||
|     draw_prob_distribution_companies(y_pred_prob, model) |     draw_prob_distribution_companies(y_pred_prob, model) | ||||||
|     draw_calibration_curve(X_test, y_pred_prob, model) |     draw_calibration_curve(X_test, y_pred_prob, model) | ||||||
|     save_model_s3('LogisticRegression_Benchmark', type_of_activity, model, pipeline) |     save_model_s3('LogisticRegression_Benchmark', type_of_activity, type_of_model, model, pipeline) | ||||||
|     return model_result |     return model_result | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -252,7 +290,7 @@ def pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result): | ||||||
|     draw_prob_distribution(y_pred_prob, model) |     draw_prob_distribution(y_pred_prob, model) | ||||||
|     draw_prob_distribution_companies(y_pred_prob, model) |     draw_prob_distribution_companies(y_pred_prob, model) | ||||||
|     draw_calibration_curve(X_test, y_pred_prob, model) |     draw_calibration_curve(X_test, y_pred_prob, model) | ||||||
|     save_model_s3('LogisticRegression_cv', type_of_activity, model, grid_search) |     save_model_s3('LogisticRegression_cv', type_of_activity, type_of_model, model, grid_search) | ||||||
|     return model_result |     return model_result | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -284,7 +322,7 @@ def pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result): | ||||||
|     draw_prob_distribution(y_pred_prob, model) |     draw_prob_distribution(y_pred_prob, model) | ||||||
|     draw_prob_distribution_companies(y_pred_prob, model) |     draw_prob_distribution_companies(y_pred_prob, model) | ||||||
|     draw_calibration_curve(X_test, y_pred_prob, model) |     draw_calibration_curve(X_test, y_pred_prob, model) | ||||||
|     save_model_s3('randomF_Benchmark', type_of_activity, model, pipeline) |     save_model_s3('randomF_Benchmark', type_of_activity, type_of_model, model, pipeline) | ||||||
|     return model_result |     return model_result | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -325,7 +363,7 @@ def pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result): | ||||||
|     draw_prob_distribution(y_pred_prob, model) |     draw_prob_distribution(y_pred_prob, model) | ||||||
|     draw_prob_distribution_companies(y_pred_prob, model) |     draw_prob_distribution_companies(y_pred_prob, model) | ||||||
|     draw_calibration_curve(X_test, y_pred_prob, model) |     draw_calibration_curve(X_test, y_pred_prob, model) | ||||||
|     save_model_s3('randomF_cv', type_of_activity, model, grid_search) |     save_model_s3('randomF_cv', type_of_activity, type_of_model, model, grid_search) | ||||||
|     return model_result |     return model_result | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -356,5 +394,5 @@ def pipeline_naiveBayes_benchmark(X_train, y_train, X_test, y_test, model_result | ||||||
|     draw_roc_curve(X_test, y_pred_prob, model) |     draw_roc_curve(X_test, y_pred_prob, model) | ||||||
|     draw_prob_distribution(y_pred_prob, model) |     draw_prob_distribution(y_pred_prob, model) | ||||||
|     draw_calibration_curve(X_test, y_pred_prob, model) |     draw_calibration_curve(X_test, y_pred_prob, model) | ||||||
|     save_model_s3('Naive_Bayes_Benchmark', type_of_activity, model, pipeline) |     save_model_s3('Naive_Bayes_Benchmark', type_of_activity, type_of_model, model, pipeline) | ||||||
|     return model_result |     return model_result | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user