"""Adjust purchase scores, export diagnostics to S3 and project revenue.

For the activity type set in ``type_of_activity`` this script section:

1. derives a bias from the train-set scores and uses it to compute an
   adjusted score for every row of ``X_test_segment``;
2. calls ``plot_hist_scores`` and uploads the resulting matplotlib figure
   to S3 as a PNG;
3. uploads a per-quartile CSV comparing the mean score, mean adjusted score
   and mean ``has_purchased`` (all expressed in percent);
4. calls ``project_tickets_CA`` / ``summary_expected_CA`` and uploads the
   per-quartile summary as a CSV.

NOTE(review): ``fs``, ``logit_cv``, ``X_train``, ``y_train``,
``X_test_segment`` and the helpers ``find_bias``, ``odd_ratio``,
``adjust_score_1``, ``adjusted_score``, ``plot_hist_scores``,
``project_tickets_CA`` and ``summary_expected_CA`` are not defined in this
section; they must already be in scope when it runs. ``fs`` is presumably an
``s3fs.S3FileSystem`` instance -- confirm against the calling context.
"""

# importations
import io
import os
import pickle
import re
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import s3fs
import seaborn as sns
from pandas import DataFrame
from scipy.optimize import fsolve
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import ConvergenceWarning, DataConversionWarning
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    auc,
    average_precision_score,
    balanced_accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    make_scorer,
    precision_recall_curve,
    recall_score,
    roc_curve,
)
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    MaxAbsScaler,
    MinMaxScaler,
    OneHotEncoder,
    StandardScaler,
)
from sklearn.utils import class_weight

# define type of activity
type_of_activity = "sport"
PATH = f"projet-bdc2324-team1/Output_expected_CA/{type_of_activity}/"


def _s3_output_path(file_name, extension):
    """Return the S3 key used for an output file of the current activity.

    The key is ``PATH + file_name + type_of_activity + extension``.
    NOTE(review): no separator is inserted between ``file_name`` and
    ``type_of_activity`` (e.g. ``hist_score_adjustedsport.png``); kept as-is
    so that existing output locations do not change.
    """
    return PATH + file_name + type_of_activity + extension


# comparison with bias of the train set - X train to be defined
# Positive-class probability (column 1 of predict_proba) on the train set.
X_train_score = logit_cv.predict_proba(X_train)[:, 1]

# The bias is fitted against the number of purchases observed in the train
# set. NOTE(review): initial_guess is presumably the solver's starting
# point -- confirm in find_bias.
bias_train_set = find_bias(
    odd_ratios=odd_ratio(adjust_score_1(X_train_score)),
    y_objective=y_train["y_has_purchased"].sum(),
    initial_guess=6,
)

# Apply the train-set bias to the scores of the test segment.
score_adjusted_train = adjusted_score(
    odd_ratio(adjust_score_1(X_test_segment["score"])),
    bias=bias_train_set,
)
X_test_segment["score_adjusted"] = score_adjusted_train

# plot adjusted scores and save (to be tested)
plot_hist_scores(
    X_test_segment,
    score="score",
    score_adjusted="score_adjusted",
    type_of_activity=type_of_activity,
)
try:
    # Render the current figure into memory so it can be streamed to S3
    # without going through a local file.
    image_buffer = io.BytesIO()
    plt.savefig(image_buffer, format='png')
    image_buffer.seek(0)

    file_name = "hist_score_adjusted"
    FILE_PATH_OUT_S3 = _s3_output_path(file_name, ".png")
    with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
        s3_file.write(image_buffer.read())
finally:
    # Always release the figure, even when rendering or the upload fails.
    plt.close()

# comparison between score and adjusted score
# Per-quartile means, converted to percentages rounded to 2 decimals.
X_test_table_adjusted_scores = (
    100
    * X_test_segment.groupby("quartile")[
        ["score", "score_adjusted", "has_purchased"]
    ].mean()
).round(2).reset_index()

# Flag the percentage columns in their header; "quartile" keeps its name.
X_test_table_adjusted_scores = X_test_table_adjusted_scores.rename(
    columns={
        col: f"{col} (%)"
        for col in X_test_table_adjusted_scores.columns
        if col in ["score", "score_adjusted", "has_purchased"]
    }
)

file_name = "table_adjusted_score"
FILE_PATH_OUT_S3 = _s3_output_path(file_name, ".csv")
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
    X_test_table_adjusted_scores.to_csv(file_out, index=False)

# project revenue
# NOTE(review): the units of duration_ref / duration_projection are not
# visible here -- confirm in project_tickets_CA.
X_test_segment = project_tickets_CA(
    X_test_segment,
    "nb_tickets",
    "total_amount",
    "score_adjusted",
    duration_ref=1.5,
    duration_projection=1,
)

# table summarizing projections
X_test_expected_CA = round(
    summary_expected_CA(
        df=X_test_segment,
        segment="quartile",
        nb_tickets_expected="nb_tickets_expected",
        total_amount_expected="total_amount_expected",
        total_amount="total_amount",
    ),
    2,
)

file_name = "table_expected_CA"
FILE_PATH_OUT_S3 = _s3_output_path(file_name, ".csv")
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
    X_test_expected_CA.to_csv(file_out, index=False)