update CA segment analysis
This commit is contained in:
parent
c1cb3ab396
commit
ca30d1daa3
77
0_7_CA_segment.py
Normal file
77
0_7_CA_segment.py
Normal file
|
@ -0,0 +1,77 @@
|
||||||
|
# imports
|
||||||
|
import pandas as pd
|
||||||
|
from pandas import DataFrame
|
||||||
|
import numpy as np
|
||||||
|
import os
|
||||||
|
import s3fs
|
||||||
|
import re
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
from sklearn.ensemble import RandomForestClassifier
|
||||||
|
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
|
||||||
|
from sklearn.utils import class_weight
|
||||||
|
from sklearn.neighbors import KNeighborsClassifier
|
||||||
|
from sklearn.pipeline import Pipeline
|
||||||
|
from sklearn.compose import ColumnTransformer
|
||||||
|
from sklearn.preprocessing import OneHotEncoder
|
||||||
|
from sklearn.impute import SimpleImputer
|
||||||
|
from sklearn.model_selection import GridSearchCV
|
||||||
|
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler
|
||||||
|
from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score
|
||||||
|
import seaborn as sns
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
|
||||||
|
from sklearn.exceptions import ConvergenceWarning, DataConversionWarning
|
||||||
|
from sklearn.naive_bayes import GaussianNB
|
||||||
|
from scipy.optimize import fsolve
|
||||||
|
import pickle
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
# Activity type under study; it names the output sub-folder and is appended
# to every exported file name further down.
type_of_activity = "sport"

# S3 prefix under which the outputs of this script are written.
PATH = f"projet-bdc2324-team1/Output_expected_CA/{type_of_activity}/"
|
||||||
|
|
||||||
|
# Estimate the bias on the train set, then reuse it to adjust the test scores.
# NOTE(review): logit_cv, X_train and y_train are not defined anywhere in this
# script yet ("X train to be defined") - they must exist before this runs.

# Second column of predict_proba for every train observation.
X_train_score = logit_cv.predict_proba(X_train)[:, 1]

# Bias fitted so that the train scores match the observed number of purchases
# (y_objective is the sum of the "y_has_purchased" column).
bias_train_set = find_bias(
    odd_ratios=odd_ratio(adjust_score_1(X_train_score)),
    y_objective=y_train["y_has_purchased"].sum(),
    initial_guess=6,
)

# Apply the train-set bias to the test scores and store the result.
score_adjusted_train = adjusted_score(
    odd_ratio(adjust_score_1(X_test_segment["score"])),
    bias=bias_train_set,
)
X_test_segment["score_adjusted"] = score_adjusted_train
|
||||||
|
|
||||||
|
|
||||||
|
# Plot the raw and adjusted score histograms and upload the figure to S3
# (to be tested).
import io  # in-memory PNG buffer; `io` is missing from the imports at the top of the file

# NOTE(review): `fs` is not defined in this script; presumably an
# s3fs.S3FileSystem instance - confirm and create it before this section.
plot_hist_scores(X_test_segment, score="score", score_adjusted="score_adjusted")

# Render the current matplotlib figure into memory instead of a local file.
image_buffer = io.BytesIO()
plt.savefig(image_buffer, format='png')
image_buffer.seek(0)  # rewind so that read() returns the whole image

file_name = "hist_score_adjusted"
FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + ".png"
with fs.open(FILE_PATH_OUT_S3, 'wb') as s3_file:
    s3_file.write(image_buffer.read())

# Release the figure once it has been written out.
plt.close()
|
||||||
|
|
||||||
|
# Compare raw and adjusted scores: per-quartile mean of "score",
# "score_adjusted" and "has_purchased", rounded to 2 decimals.
X_test_table_adjusted_scores = (
    X_test_segment
    .groupby("quartile")[["score", "score_adjusted", "has_purchased"]]
    .mean()
    .reset_index()
    .round(2)
)

# Export the comparison table as CSV (without the index).
file_name = "table_adjusted_score"
FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + ".csv"
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
    X_test_table_adjusted_scores.to_csv(file_out, index=False)
|
||||||
|
|
||||||
|
|
||||||
|
# Project ticket counts and revenue using the adjusted score
# (reference duration 1.5, projection duration 1).
X_test_segment = project_tickets_CA(
    X_test_segment,
    "nb_tickets",
    "total_amount",
    "score_adjusted",
    duration_ref=1.5,
    duration_projection=1,
)
|
||||||
|
|
||||||
|
|
||||||
|
# Summarise the projections per quartile, rounded to 2 decimals.
X_test_expected_CA = round(
    summary_expected_CA(
        df=X_test_segment,
        segment="quartile",
        nb_tickets_expected="nb_tickets_expected",
        total_amount_expected="total_amount_expected",
        total_amount="total_amount",
    ),
    2,
)

# Export the summary table as CSV (without the index).
file_name = "table_expected_CA"
FILE_PATH_OUT_S3 = PATH + file_name + type_of_activity + ".csv"
with fs.open(FILE_PATH_OUT_S3, 'w') as file_out:
    X_test_expected_CA.to_csv(file_out, index=False)
|
||||||
|
|
||||||
|
|
File diff suppressed because one or more lines are too long
|
@ -1,7 +1,5 @@
|
||||||
def odd_ratio(score) :
|
def odd_ratio(score) :
|
||||||
"""
|
"""
|
||||||
Calculate the odd ratio from a score.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
- score (Union[float, int]): Score value.
|
- score (Union[float, int]): Score value.
|
||||||
|
|
||||||
|
@ -102,13 +100,15 @@ def project_tickets_CA (df, nb_tickets, total_amount, score_adjusted, duration_r
|
||||||
duration_ratio = duration_ref/duration_projection
|
duration_ratio = duration_ref/duration_projection
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
duration_ratio = duration_ref/duration_projection
|
||||||
|
|
||||||
df_output = df
|
df_output = df
|
||||||
|
|
||||||
df_output["nb_tickets_projected"] = df_output[nb_tickets] / duration_ratio
|
df_output.loc[:,"nb_tickets_projected"] = df_output.loc[:,nb_tickets] / duration_ratio
|
||||||
df_output["total_amount_projected"] = df_output[total_amount] / duration_ratio
|
df_output.loc[:,"total_amount_projected"] = df_output.loc[:,total_amount] / duration_ratio
|
||||||
|
|
||||||
df_output["nb_tickets_expected"] = df_output[score_adjusted] * df_output["nb_tickets_projected"]
|
df_output.loc[:,"nb_tickets_expected"] = df_output.loc[:,score_adjusted] * df_output.loc[:,"nb_tickets_projected"]
|
||||||
df_output["total_amount_expected"] = df_output[score_adjusted] * df_output["total_amount_projected"]
|
df_output.loc[:,"total_amount_expected"] = df_output.loc[:,score_adjusted] * df_output.loc[:,"total_amount_projected"]
|
||||||
|
|
||||||
return df_output
|
return df_output
|
||||||
|
|
||||||
|
@ -138,8 +138,6 @@ def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected,
|
||||||
df_expected_CA.insert(2, "size_perct", 100 * df_expected_CA["size"]/df_expected_CA["size"].sum())
|
df_expected_CA.insert(2, "size_perct", 100 * df_expected_CA["size"]/df_expected_CA["size"].sum())
|
||||||
|
|
||||||
# compute share of CA recovered
|
# compute share of CA recovered
|
||||||
duration_ref=1.5
|
|
||||||
duration_projection=1
|
|
||||||
duration_ratio=duration_ref/duration_projection
|
duration_ratio=duration_ref/duration_projection
|
||||||
|
|
||||||
df_expected_CA["perct_revenue_recovered"] = 100 * duration_ratio * df_expected_CA[total_amount_expected] / \
|
df_expected_CA["perct_revenue_recovered"] = 100 * duration_ratio * df_expected_CA[total_amount_expected] / \
|
||||||
|
|
Loading…
Reference in New Issue
Block a user