Merge pull request 'generalization' (#11) from generalization into main

Reviewed-on: #11
Commit: eb87cc6998
@@ -47,7 +47,9 @@ customer['has_purchased_target_period'] = np.where(customer['customer_id'].isin(
 # Generate graphs and automatically save them in the bucket
 compute_nb_clients(customer, type_of_activity)
 
-maximum_price_paid(customer, type_of_activity)
+#maximum_price_paid(customer, type_of_activity)
 
+target_proportion(customer, type_of_activity)
+
 mailing_consent(customer, type_of_activity)
 
@@ -34,6 +34,8 @@ warnings.filterwarnings("ignore", category=DataConversionWarning)
 
 # choose the type of companies for which you want to run the pipeline
 type_of_activity = input('Choose the company type: sport ? musique ? musee ?')
+# choose the type of model
+type_of_model = input('Choose the model type: basique ? premium ?')
 
 # load train and test set
 # Create filesystem object
@@ -54,50 +56,32 @@ weights = class_weight.compute_class_weight(class_weight = 'balanced', classes =
 weight_dict = {np.unique(y_train['y_has_purchased'])[i]: weights[i] for i in range(len(np.unique(y_train['y_has_purchased'])))}
 
-numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max',
-                    'time_between_purchase', 'nb_tickets_internet', 'is_email_true', 'opt_in', #'is_partner',
-                    'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']
-
-numeric_transformer = Pipeline(steps=[
-    #("imputer", SimpleImputer(strategy="mean")),
-    ("scaler", StandardScaler())
-])
-
-categorical_features = ['opt_in']
-
-# Transformer for the categorical features
-categorical_transformer = Pipeline(steps=[
-    #("imputer", SimpleImputer(strategy="most_frequent")), # Impute missing values with the most frequent
-    ("onehot", OneHotEncoder(handle_unknown='ignore', sparse_output=False))
-])
-
-preproc = ColumnTransformer(
-    transformers=[
-        ("num", numeric_transformer, numeric_features),
-        ("cat", categorical_transformer, categorical_features)
-    ]
-)
+preproc = preprocess(type_of_model, type_of_activity)
 
 # Object for storing results
 model_result = pd.DataFrame(columns= ["Model", "Accuracy", "Recall", "F1_score", "AUC"])
 
 # Naive Bayes
 model_result = pipeline_naiveBayes_benchmark(X_train, y_train, X_test, y_test, model_result)
+save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model)
 print("Naive Bayes : Done")
 
 # Logistic Regression
 model_result = pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result)
 print("Logistic : Done")
 
-"""
 model_result = pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result)
+save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model)
 print("Logistic CV : Done")
 
 # Random Forest
 model_result = pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result)
+save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model)
 print("Random Forest : Done")
 
 model_result = pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result)
+save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model)
 print("Random Forest CV: Done")
-"""
 
 # Save result
-save_result_set_s3(model_result , "resultat", type_of_activity)
+save_result_set_s3(model_result , "resultat", type_of_activity, type_of_model)
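For readers skimming the diff: a minimal sketch, on made-up toy data, of what the weight_dict comprehension kept above evaluates to.

import numpy as np
from sklearn.utils import class_weight

# toy binary target with a 3:1 class imbalance (illustrative only)
y = np.array([0.0, 0.0, 0.0, 1.0])
weights = class_weight.compute_class_weight(class_weight='balanced',
                                            classes=np.unique(y), y=y)
# 'balanced' gives n_samples / (n_classes * count(class)):
# class 0.0 -> 4 / (2 * 3) ~= 0.67, class 1.0 -> 4 / (2 * 1) = 2.0
weight_dict = {np.unique(y)[i]: weights[i] for i in range(len(np.unique(y)))}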
utils_ml.py (107 lines changed)
@@ -28,7 +28,7 @@ import warnings
 
 def load_train_test(type_of_activity):
-    BUCKET = f"projet-bdc2324-team1/Generalization/{type_of_activity}"
+    BUCKET = f"projet-bdc2324-team1/Generalization_v2/{type_of_activity}"
     File_path_train = BUCKET + "/Train_set.csv"
     File_path_test = BUCKET + "/Test_set.csv"
 
@@ -43,29 +43,29 @@ def load_train_test(type_of_activity):
     return dataset_train, dataset_test
 
 
-def save_file_s3(File_name, type_of_activity, model):
+def save_file_s3(File_name, type_of_activity, type_of_model, model):
     image_buffer = io.BytesIO()
     plt.savefig(image_buffer, format='png')
     image_buffer.seek(0)
-    FILE_PATH = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/"
+    FILE_PATH = f"projet-bdc2324-team1/{type_of_model}/{type_of_activity}/{model}/"
     FILE_PATH_OUT_S3 = FILE_PATH + File_name + type_of_activity + '_' + model + '.png'
     with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
         s3_file.write(image_buffer.read())
     plt.close()
 
 
-def save_result_set_s3(result_set, File_name, type_of_activity, model=None, model_path=False):
+def save_result_set_s3(result_set, File_name, type_of_activity, type_of_model, model=None, model_path=False):
     if model_path:
-        FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/" + File_name + '.csv'
+        FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/{type_of_model}/{type_of_activity}/{model}/" + File_name + '.csv'
     else:
-        FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/Output_model/{type_of_activity}/" + File_name + '.csv'
+        FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/{type_of_model}/{type_of_activity}/" + File_name + '.csv'
     with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
         result_set.to_csv(file_out, index = False)
 
 
-def save_model_s3(File_name, type_of_activity, model, classifier):
+def save_model_s3(File_name, type_of_activity, type_of_model, model, classifier):
     model_bytes = pickle.dumps(classifier)
-    FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/Output_model/{type_of_activity}/{model}/" + File_name + '.pkl'
+    FILE_PATH_OUT_S3 = f"projet-bdc2324-team1/{type_of_model}/{type_of_activity}/{model}/" + File_name + '.pkl'
     with fs.open(FILE_PATH_OUT_S3, 'wb') as f:
         f.write(model_bytes)
 
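For concreteness: with the hypothetical values type_of_model = 'premium', type_of_activity = 'sport' and model = 'LogisticRegression', save_file_s3 now writes to projet-bdc2324-team1/premium/sport/LogisticRegression/Confusion_matrix_sport_LogisticRegression.png, where the old layout wrote under projet-bdc2324-team1/Output_model/sport/LogisticRegression/.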
@@ -78,14 +78,19 @@ def compute_recall_companies(dataset_test, y_pred, type_of_activity, model):
     test = dataset_test.copy()
     test['prediction'] = y_pred
     test['company'] = dataset_test['customer_id'].str.split('_', expand=True)[0]
-    recall_scores_by_company = dataset_test.groupby('company').apply(compute_recall).reset_index(name='recall_score')
-    save_result_set_s3(recall_scores_by_company, 'recall_scores_by_company', type_of_activity, model=model, model_path=True)
+    recall_scores_by_company = test.groupby('company').apply(compute_recall).reset_index(name='recall_score')
+    save_result_set_s3(recall_scores_by_company, 'recall_scores_by_company', type_of_activity, type_of_model, model=model, model_path=True)
 
 
 def features_target_split(dataset_train, dataset_test):
-    features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max',
-                  'time_between_purchase', 'nb_tickets_internet', 'is_email_true', 'opt_in', #'is_partner',
-                  'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']
+    features_l = ['nb_campaigns', 'taux_ouverture_mail', 'prop_purchases_internet', 'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'time_to_open',
+                  'purchases_10_2021','purchases_10_2022', 'purchases_11_2021', 'purchases_12_2021','purchases_1_2022', 'purchases_2_2022', 'purchases_3_2022',
+                  'purchases_4_2022', 'purchases_5_2021', 'purchases_5_2022', 'purchases_6_2021', 'purchases_6_2022', 'purchases_7_2021', 'purchases_7_2022', 'purchases_8_2021',
+                  'purchases_8_2022','purchases_9_2021', 'purchases_9_2022', 'purchase_date_min', 'purchase_date_max', 'nb_targets', 'gender_female', 'gender_male',
+                  'achat_internet', 'categorie_age_0_10', 'categorie_age_10_20', 'categorie_age_20_30','categorie_age_30_40',
+                  'categorie_age_40_50', 'categorie_age_50_60', 'categorie_age_60_70', 'categorie_age_70_80', 'categorie_age_plus_80','categorie_age_inconnue',
+                  'country_fr', 'is_profession_known', 'is_zipcode_known', 'opt_in', 'target_optin', 'target_newsletter', 'target_scolaire', 'target_entreprise', 'target_famille',
+                  'target_jeune', 'target_abonne']
     X_train = dataset_train[features_l]
     y_train = dataset_train[['y_has_purchased']]
 
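compute_recall itself is not touched by this diff; a plausible reconstruction, assuming it compares each company's y_has_purchased labels with the prediction column added above (hypothetical, column names taken from the surrounding code):

from sklearn.metrics import recall_score

def compute_recall(group):
    # recall of the positive class within one company's slice of the test set
    return recall_score(group['y_has_purchased'], group['prediction'])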
@@ -94,6 +99,46 @@ def features_target_split(dataset_train, dataset_test):
     return X_train, X_test, y_train, y_test
 
 
+def preprocess(type_of_model, type_of_activity):
+    numeric_features = ['nb_campaigns', 'taux_ouverture_mail', 'prop_purchases_internet', 'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',
+                        'purchases_10_2021','purchases_10_2022', 'purchases_11_2021', 'purchases_12_2021','purchases_1_2022', 'purchases_2_2022', 'purchases_3_2022',
+                        'purchases_4_2022', 'purchases_5_2021', 'purchases_5_2022', 'purchases_6_2021', 'purchases_6_2022', 'purchases_7_2021', 'purchases_7_2022', 'purchases_8_2021',
+                        'purchases_8_2022','purchases_9_2021', 'purchases_9_2022', 'purchase_date_min', 'purchase_date_max', 'nb_targets', 'time_to_open']
+
+    binary_features = ['gender_female', 'gender_male', 'achat_internet', 'categorie_age_0_10', 'categorie_age_10_20', 'categorie_age_20_30','categorie_age_30_40',
+                       'categorie_age_40_50', 'categorie_age_50_60', 'categorie_age_60_70', 'categorie_age_70_80', 'categorie_age_plus_80','categorie_age_inconnue',
+                       'country_fr', 'is_profession_known', 'is_zipcode_known', 'opt_in']
+
+    if type_of_activity=='musee':
+        # 'time_to_open' is unavailable for museums; drop it from the numeric list
+        numeric_features.remove('time_to_open')
+
+    if type_of_model=='premium':
+        if type_of_activity=='musique':
+            binary_features.extend(['target_optin', 'target_newsletter'])
+        elif type_of_activity=='sport':
+            binary_features.extend(['target_jeune', 'target_entreprise', 'target_abonne'])
+        else:
+            binary_features.extend(['target_scolaire', 'target_entreprise', 'target_famille', 'target_newsletter'])
+
+    numeric_transformer = Pipeline(steps=[
+        ("imputer", SimpleImputer(strategy="constant", fill_value=0)),
+        ("scaler", StandardScaler())
+    ])
+
+    binary_transformer = Pipeline(steps=[
+        ("imputer", SimpleImputer(strategy="most_frequent")),
+    ])
+
+    preproc = ColumnTransformer(
+        transformers=[
+            ("num", numeric_transformer, numeric_features),
+            ("bin", binary_transformer, binary_features)
+        ]
+    )
+    return preproc
+
 
 def draw_confusion_matrix(y_test, y_pred, model):
     conf_matrix = confusion_matrix(y_test, y_pred)
     sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])
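A minimal sketch of how the new preprocess() output is presumably consumed, assuming the benchmark pipelines register it under the step name 'preprocessor' (the name draw_features_importance looks up) and reuse the class weights computed in the main script; the estimator and its settings here are illustrative, not taken from this diff:

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

preproc = preprocess(type_of_model, type_of_activity)
pipeline = Pipeline(steps=[
    ("preprocessor", preproc),  # imputation + scaling from utils_ml.preprocess
    ("classifier", LogisticRegression(class_weight=weight_dict,  # weights from the main script
                                      max_iter=5000)),
])
pipeline.fit(X_train, y_train['y_has_purchased'])
y_pred = pipeline.predict(X_test)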
@@ -101,7 +146,7 @@ def draw_confusion_matrix(y_test, y_pred, model):
     plt.ylabel('Actual')
     plt.title('Confusion Matrix')
     plt.show()
-    save_file_s3("Confusion_matrix_", type_of_activity, model)
+    save_file_s3("Confusion_matrix_", type_of_activity, type_of_model, model)
 
 
 def draw_roc_curve(X_test, y_pred_prob, model):
@@ -120,7 +165,7 @@ def draw_roc_curve(X_test, y_pred_prob, model):
     plt.title("ROC Curve", size=18)
     plt.legend(loc="lower right")
     plt.show()
-    save_file_s3("Roc_curve_", type_of_activity, model)
+    save_file_s3("Roc_curve_", type_of_activity, type_of_model, model)
 
 
 def draw_calibration_curve(X_test, y_pred_prob, model):
@@ -134,7 +179,7 @@ def draw_calibration_curve(X_test, y_pred_prob, model):
     plt.title("Calibration Curve")
     plt.legend()
     plt.show()
-    save_file_s3("Calib_curve_", type_of_activity, model)
+    save_file_s3("Calib_curve_", type_of_activity, type_of_model, model)
 
 
 def draw_features_importance(pipeline, model, randomF = False):
@@ -145,18 +190,18 @@ def draw_features_importance(pipeline, model, randomF = False):
 
     feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()
     # Plot feature importances
-    plt.figure(figsize=(10, 6))
+    plt.figure(figsize=(12, 8))
     plt.barh(feature_names, coefficients, color='skyblue')
     plt.xlabel("Features' Importance")
     plt.ylabel('Features')
     plt.title("Features' Importance")
     plt.grid(True)
     plt.show()
-    save_file_s3("Features_", type_of_activity, model)
+    save_file_s3("Features_", type_of_activity, type_of_model, model)
 
 
 def draw_prob_distribution(y_pred_prob, model):
-    plt.figure(figsize=(8, 6))
+    plt.figure(figsize=(10, 8))
     plt.hist(y_pred_prob, bins=10, range=(0, 1), color='blue', alpha=0.7)
 
     plt.xlim(0, 1)
@@ -167,7 +212,7 @@ def draw_prob_distribution(y_pred_prob, model):
     plt.ylabel('Frequency')
     plt.grid(True)
     plt.show()
-    save_file_s3("prob_dist_", type_of_activity, model)
+    save_file_s3("prob_dist_", type_of_activity, type_of_model, model)
 
 
 def draw_prob_distribution_companies(y_pred_prob, model):
@@ -183,7 +228,7 @@ def draw_prob_distribution_companies(y_pred_prob, model):
     plt.ylabel('Frequency')
     plt.grid(True)
     plt.show()
-    save_file_s3("prob_dist_companies_", type_of_activity, model)
+    save_file_s3("prob_dist_companies_", type_of_activity, type_of_model, model)
 
 
@@ -207,7 +252,7 @@ def pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result):
                            "AUC" : [auc(fpr, tpr)]}
                           )
     model_result = pd.concat([model_result, result])
-    #compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
+    compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
 
     draw_confusion_matrix(y_test, y_pred, model)
     draw_roc_curve(X_test, y_pred_prob, model)
@@ -215,7 +260,7 @@ def pipeline_logreg_benchmark(X_train, y_train, X_test, y_test, model_result):
     draw_prob_distribution(y_pred_prob, model)
     draw_prob_distribution_companies(y_pred_prob, model)
     draw_calibration_curve(X_test, y_pred_prob, model)
-    save_model_s3('LogisticRegression_Benchmark', type_of_activity, model, pipeline)
+    save_model_s3('LogisticRegression_Benchmark', type_of_activity, type_of_model, model, pipeline)
     return model_result
 
 
@@ -244,7 +289,7 @@ def pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result):
                            "AUC" : [auc(fpr, tpr)]}
                           )
     model_result = pd.concat([model_result, result])
-    #compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
+    compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
 
     draw_confusion_matrix(y_test, y_pred, model)
     draw_roc_curve(X_test, y_pred_prob, model)
@@ -252,7 +297,7 @@ def pipeline_logreg_cv(X_train, y_train, X_test, y_test, model_result):
     draw_prob_distribution(y_pred_prob, model)
     draw_prob_distribution_companies(y_pred_prob, model)
     draw_calibration_curve(X_test, y_pred_prob, model)
-    save_model_s3('LogisticRegression_cv', type_of_activity, model, grid_search)
+    save_model_s3('LogisticRegression_cv', type_of_activity, type_of_model, model, grid_search)
     return model_result
 
 
@@ -276,7 +321,7 @@ def pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result):
                            "AUC" : [auc(fpr, tpr)]}
                           )
     model_result = pd.concat([model_result, result])
-    #compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
+    compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
 
     draw_confusion_matrix(y_test, y_pred, model)
     draw_roc_curve(X_test, y_pred_prob, model)
@@ -284,7 +329,7 @@ def pipeline_randomF_benchmark(X_train, y_train, X_test, y_test, model_result):
     draw_prob_distribution(y_pred_prob, model)
     draw_prob_distribution_companies(y_pred_prob, model)
     draw_calibration_curve(X_test, y_pred_prob, model)
-    save_model_s3('randomF_Benchmark', type_of_activity, model, pipeline)
+    save_model_s3('randomF_Benchmark', type_of_activity, type_of_model, model, pipeline)
     return model_result
 
 
@@ -317,7 +362,7 @@ def pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result):
                            "AUC" : [auc(fpr, tpr)]}
                           )
     model_result = pd.concat([model_result, result])
-    #compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
+    compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
 
     draw_confusion_matrix(y_test, y_pred, model)
     draw_roc_curve(X_test, y_pred_prob, model)
@@ -325,7 +370,7 @@ def pipeline_randomF_cv(X_train, y_train, X_test, y_test, model_result):
     draw_prob_distribution(y_pred_prob, model)
     draw_prob_distribution_companies(y_pred_prob, model)
     draw_calibration_curve(X_test, y_pred_prob, model)
-    save_model_s3('randomF_cv', type_of_activity, model, gridsearch)
+    save_model_s3('randomF_cv', type_of_activity, type_of_model, model, grid_search)
     return model_result
 
 
@@ -350,9 +395,11 @@ def pipeline_naiveBayes_benchmark(X_train, y_train, X_test, y_test, model_result
                            "AUC" : [auc(fpr, tpr)]}
                           )
     model_result = pd.concat([model_result, result])
+    compute_recall_companies(dataset_test, y_pred, type_of_activity, model)
+
 
     draw_confusion_matrix(y_test, y_pred, model)
     draw_roc_curve(X_test, y_pred_prob, model)
     draw_prob_distribution(y_pred_prob, model)
     draw_calibration_curve(X_test, y_pred_prob, model)
-    save_model_s3('Naive_Bayes_Benchmark', type_of_activity, model, pipeline)
+    save_model_s3('Naive_Bayes_Benchmark', type_of_activity, type_of_model, model, pipeline)
     return model_result
@@ -50,9 +50,20 @@ def load_files(nb_compagnie):
         df_campaigns_kpi["customer_id"]= directory_path + '_' + df_campaigns_kpi['customer_id'].astype('str')
         df_customerplus_clean["customer_id"]= directory_path + '_' + df_customerplus_clean['customer_id'].astype('str')
         df_products_purchased_reduced["customer_id"]= directory_path + '_' + df_products_purchased_reduced['customer_id'].astype('str')
+
+        # Remove companies' outliers
+        df_tickets_kpi = remove_outlier_total_amount(df_tickets_kpi)
+        # Harmonize the set of customers across databases
+        customer_id = df_tickets_kpi['customer_id'].to_list()
+        # keep only customers present in the tickets data
+        df_campaigns_brut = df_campaigns_brut[df_campaigns_brut['customer_id'].isin(customer_id)]
+        df_campaigns_kpi = df_campaigns_kpi[df_campaigns_kpi['customer_id'].isin(customer_id)]
+        df_customerplus_clean = df_customerplus_clean[df_customerplus_clean['customer_id'].isin(customer_id)]
+        df_target_information = df_target_information[df_target_information['customer_id'].isin(customer_id)]
+
         df_target_KPI["customer_id"]= directory_path + '_' + df_target_KPI['customer_id'].astype('str')
 
         # Concatenation
         customer = pd.concat([customer, df_customerplus_clean], ignore_index=True)
         campaigns_kpi = pd.concat([campaigns_kpi, df_campaigns_kpi], ignore_index=True)
@@ -64,6 +75,16 @@ def load_files(nb_compagnie):
     return customer, campaigns_kpi, campaigns_brut, tickets, products, targets
 
 
+def remove_outlier_total_amount(tickets):
+    # IQR rule: drop customers whose total spending lies above Q3 + 1.5 * IQR
+    Q1 = tickets['total_amount'].quantile(0.25)
+    Q3 = tickets['total_amount'].quantile(0.75)
+    IQR = Q3 - Q1
+    upper = Q3 + 1.5 * IQR
+    outliers = tickets[tickets['total_amount'] > upper]['customer_id'].to_list()
+    tickets = tickets[~tickets['customer_id'].isin(outliers)]
+    return tickets
+
 def save_file_s3(File_name, type_of_activity):
     image_buffer = io.BytesIO()
     plt.savefig(image_buffer, format='png')
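As a quick worked example of the fence above (numbers are made up): Q1 = 20 and Q3 = 100 give IQR = 80, so the upper fence is 100 + 1.5 × 80 = 220, and any customer whose total_amount exceeds 220 is treated as an outlier and dropped.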
@@ -133,7 +154,7 @@ def compute_nb_clients(customer, type_of_activity):
 
     plt.xlabel('Company')
     plt.ylabel("Number of clients (thousands)")
-    plt.title(f"Number of clients for {type_of_activity}")
+    plt.title(f"Number of Clients Across {type_of_activity} Companies")
     plt.xticks(company_nb_clients["number_company"], ["{}".format(i) for i in company_nb_clients["number_company"]])
     plt.show()
     save_file_s3("nb_clients_", type_of_activity)
@@ -143,22 +164,35 @@ def maximum_price_paid(customer, type_of_activity):
     company_max_price = customer.groupby("number_company")["max_price"].max().reset_index()
     plt.bar(company_max_price["number_company"], company_max_price["max_price"])
 
-    plt.xlabel('Company')
-    plt.title(f"Maximal price of a ticket for {type_of_activity}")
+    plt.xlabel('Company Number')
+    plt.title(f"Maximal Price of a Ticket Across {type_of_activity} Companies")
     plt.ylabel("Maximal price of a ticket")
     plt.xticks(company_max_price["number_company"], ["{}".format(i) for i in company_max_price["number_company"]])
     plt.show()
     save_file_s3("Maximal_price_", type_of_activity)
 
 
+def target_proportion(customer, type_of_activity):
+    df_y = customer.groupby(["number_company"]).agg({"has_purchased_target_period" : 'sum',
+                                                     'customer_id' : 'nunique'}).reset_index()
+    df_y['prop_has_purchased_target_period'] = (df_y["has_purchased_target_period"]/df_y['customer_id'])*100
+    plt.bar(df_y["number_company"], df_y["prop_has_purchased_target_period"])
+    plt.xlabel('Company Number')
+    plt.ylabel('Share (%)')
+    plt.title(f'Share of Customers who Bought during the Target Period Across {type_of_activity} Companies')
+    plt.xticks(df_y["number_company"], ["{}".format(i) for i in df_y["number_company"]])
+    plt.show()
+    save_file_s3("share_target_", type_of_activity)
+
+
 def mailing_consent(customer, type_of_activity):
     mailing_consent = customer.groupby("number_company")["opt_in"].mean().reset_index()
+    mailing_consent["opt_in"] *= 100
     plt.bar(mailing_consent["number_company"], mailing_consent["opt_in"])
 
-    plt.xlabel('Company')
-    plt.ylabel('Consent')
-    plt.title(f'Consent of mailing for {type_of_activity}')
+    plt.xlabel('Company Number')
+    plt.ylabel('Mailing Consent (%)')
+    plt.title(f'Mailing Consent Across {type_of_activity} Companies')
     plt.xticks(mailing_consent["number_company"], ["{}".format(i) for i in mailing_consent["number_company"]])
     plt.show()
     save_file_s3("mailing_consent_", type_of_activity)
@@ -178,16 +212,16 @@ def mailing_consent_by_target(customer):
         label_data = df_graph[df_graph['has_purchased_target_period'] == label]
         values = [label_data[label_data['number_company'] == category]['opt_in'].values[0]*100 for category in categories]
 
-        label_printed = "purchased" if label else "no purchase"
+        label_printed = "Purchase" if label else "No purchase"
         ax.bar(bar_positions, values, bar_width, label=label_printed)
 
         # Update the bar positions for the next group
         bar_positions = [pos + bar_width for pos in bar_positions]
 
     # Add labels, legend, etc.
-    ax.set_xlabel('Company')
-    ax.set_ylabel('Consent')
-    ax.set_title(f'Consent of mailing according to target for {type_of_activity}')
+    ax.set_xlabel('Company Number')
+    ax.set_ylabel('Mailing Consent (%)')
+    ax.set_title(f'Mailing Consent by Target Across {type_of_activity} Companies')
     ax.set_xticks([pos + bar_width / 2 for pos in np.arange(len(categories))])
     ax.set_xticklabels(categories)
     ax.legend()
@@ -200,16 +234,20 @@ def mailing_consent_by_target(customer):
 def gender_bar(customer, type_of_activity):
     company_genders = customer.groupby("number_company")[["gender_male", "gender_female", "gender_other"]].mean().reset_index()
 
+    company_genders["gender_male"] *= 100
+    company_genders["gender_female"] *= 100
+    company_genders["gender_other"] *= 100
+
     # Create the bar plot
-    plt.bar(company_genders["number_company"], company_genders["gender_male"], label = "Homme")
-    plt.bar(company_genders["number_company"], company_genders["gender_female"],
-            bottom = company_genders["gender_male"], label = "Femme")
-    plt.bar(company_genders["number_company"], company_genders["gender_other"],
-            bottom = company_genders["gender_male"] + company_genders["gender_female"], label = "Inconnu")
+    plt.bar(company_genders["number_company"], company_genders["gender_male"], label = "Male")
+    plt.bar(company_genders["number_company"], company_genders["gender_female"],
+            bottom = company_genders["gender_male"], label = "Female")
+    plt.bar(company_genders["number_company"], company_genders["gender_other"],
+            bottom = company_genders["gender_male"] + company_genders["gender_female"], label = "Unknown")
 
-    plt.xlabel('Company')
-    plt.ylabel("Gender")
-    plt.title(f"Gender of Customer for {type_of_activity}")
+    plt.xlabel('Company Number')
+    plt.ylabel("Frequency (%)")
+    plt.title(f"Gender Distribution of Customers Across {type_of_activity} Companies")
     plt.legend()
     plt.xticks(company_genders["number_company"], ["{}".format(i) for i in company_genders["number_company"]])
     plt.show()
@@ -218,11 +256,12 @@ def gender_bar(customer, type_of_activity):
 
 def country_bar(customer, type_of_activity):
     company_country_fr = customer.groupby("number_company")["country_fr"].mean().reset_index()
+    company_country_fr["country_fr"] *= 100
     plt.bar(company_country_fr["number_company"], company_country_fr["country_fr"])
 
-    plt.xlabel('Company')
-    plt.ylabel("Share of French Customer")
-    plt.title(f"Share of French Customer for {type_of_activity}")
+    plt.xlabel('Company Number')
+    plt.ylabel("Share of French Customers (%)")
+    plt.title(f"Share of French Customers Across {type_of_activity} Companies")
     plt.xticks(company_country_fr["number_company"], ["{}".format(i) for i in company_country_fr["number_company"]])
     plt.show()
     save_file_s3("country_bar_", type_of_activity)
@@ -232,9 +271,8 @@ def lazy_customer_plot(campaigns_kpi, type_of_activity):
     company_lazy_customers = campaigns_kpi.groupby("number_company")["nb_campaigns_opened"].mean().reset_index()
     plt.bar(company_lazy_customers["number_company"], company_lazy_customers["nb_campaigns_opened"])
 
-    plt.xlabel('Company')
-    plt.ylabel("Share of Customers who did not open mail")
-    plt.title(f"Share of Customers who did not open mail for {type_of_activity}")
+    plt.xlabel('Company Number')
+    plt.title(f"Share of Customers who did not Open Mail Across {type_of_activity} Companies")
     plt.xticks(company_lazy_customers["number_company"], ["{}".format(i) for i in company_lazy_customers["number_company"]])
     plt.show()
     save_file_s3("lazy_customer_", type_of_activity)
@@ -255,16 +293,16 @@ def campaigns_effectiveness(customer, type_of_activity):
         label_data = campaigns_effectiveness[campaigns_effectiveness['has_purchased_target_period'] == label]
         values = [label_data[label_data['number_company'] == category]['opt_in'].values[0]*100 for category in categories]
 
-        label_printed = "purchased" if label else "no purchase"
+        label_printed = "Purchase" if label else "No purchase"
         ax.bar(bar_positions, values, bar_width, label=label_printed)
 
         # Update the bar positions for the next group
         bar_positions = [pos + bar_width for pos in bar_positions]
 
     # Add labels, legend, etc.
-    ax.set_xlabel('Company')
-    ax.set_ylabel('Consent')
-    ax.set_title(f"Number of Customers who have consent to received mails for {type_of_activity} dependy on target")
+    ax.set_xlabel('Company Number')
+    ax.set_ylabel('Share of Consent (%)')
+    ax.set_title(f"Proportion of customers who have given their consent to receive emails, by customer class ({type_of_activity} companies)")
     ax.set_xticks([pos + bar_width / 2 for pos in np.arange(len(categories))])
     ax.set_xticklabels(categories)
     ax.legend()
@@ -315,9 +353,9 @@ def sale_dynamics(products, campaigns_brut, type_of_activity):
 
     merged_data = pd.merge(purchases_graph_used_0, purchases_graph_used_1, on="purchase_date_month", suffixes=("_new", "_old"))
 
-    plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_new"], width=12, label="Nouveau client")
+    plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_new"], width=12, label="New Customers")
     plt.bar(merged_data["purchase_date_month"], merged_data["nb_purchases_old"],
-            bottom=merged_data["nb_purchases_new"], width=12, label="Ancien client")
+            bottom=merged_data["nb_purchases_new"], width=12, label="Existing Customers")
 
 
     # command to display only part of the plot
@@ -325,7 +363,7 @@ def sale_dynamics(products, campaigns_brut, type_of_activity):
 
     plt.xlabel('Month')
     plt.ylabel("Number of Sales")
-    plt.title(f"Number of Sales for {type_of_activity}")
+    plt.title(f"Number of Sales Across {type_of_activity} Companies")
     plt.legend()
     plt.show()
     save_file_s3("sale_dynamics_", type_of_activity)
@@ -333,12 +371,12 @@ def sale_dynamics(products, campaigns_brut, type_of_activity):
 
 def tickets_internet(tickets, type_of_activity):
     nb_tickets_internet = tickets.groupby("number_company")['prop_purchases_internet'].mean().reset_index()
+    nb_tickets_internet['prop_purchases_internet'] *= 100
     plt.bar(nb_tickets_internet["number_company"], nb_tickets_internet["prop_purchases_internet"])
 
-    plt.xlabel('Company')
-    plt.ylabel("Share of Purchases Bought Online")
-    plt.title(f"Share of Purchases Bought Online for {type_of_activity}")
+    plt.xlabel('Company Number')
+    plt.ylabel("Share of Purchases Bought Online (%)")
+    plt.title(f"Share of Online Purchases Across {type_of_activity} Companies")
     plt.xticks(nb_tickets_internet["number_company"], ["{}".format(i) for i in nb_tickets_internet["number_company"]])
     plt.show()
     save_file_s3("tickets_internet_", type_of_activity)
@@ -348,13 +386,13 @@ def already_bought_online(tickets, type_of_activity):
     nb_consumers_online = (tickets.groupby("number_company").agg({'achat_internet' : 'sum',
                                                                   'customer_id' : 'nunique'}
                                                                  ).reset_index())
-    nb_consumers_online["Share_consumers_internet"] = nb_consumers_online["achat_internet"]/ nb_consumers_online["customer_id"]
+    nb_consumers_online["Share_consumers_internet"] = (nb_consumers_online["achat_internet"]/ nb_consumers_online["customer_id"])*100
 
     plt.bar(nb_consumers_online["number_company"], nb_consumers_online["Share_consumers_internet"])
 
-    plt.xlabel('Company')
-    plt.ylabel("Share of Customers who Bought Online at least once")
-    plt.title(f"Share of Customers who Bought Online at least once for {type_of_activity}")
+    plt.xlabel('Company Number')
+    plt.ylabel("Share of Customers who Bought Online at least once (%)")
+    plt.title(f"Share of Customers who Bought Online at least once Across {type_of_activity} Companies")
     plt.xticks(nb_consumers_online["number_company"], ["{}".format(i) for i in nb_consumers_online["number_company"]])
     plt.show()
     save_file_s3("First_buy_internet_", type_of_activity)
@@ -363,7 +401,7 @@ def already_bought_online(tickets, type_of_activity):
 
 def box_plot_price_tickets(tickets, type_of_activity):
     price_tickets = tickets[(tickets['total_amount'] > 0)]
     sns.boxplot(data=price_tickets, y="total_amount", x="number_company", showfliers=False, showmeans=True)
-    plt.title(f"Box plot of price tickets for {type_of_activity}")
+    plt.title(f"Box Plot of Ticket Prices Across {type_of_activity} Companies")
     plt.show()
     save_file_s3("box_plot_price_tickets_", type_of_activity)