# imports
import pandas as pd
from pandas import DataFrame
import numpy as np
import os
import s3fs
import matplotlib.pyplot as plt
from scipy.optimize import fsolve
import pickle
import warnings
import io

# functions

def load_train_test(type_of_activity):
    """
    Load the train and test modelling datasets of one type of activity.

    Both CSV files are read through the module-level filesystem object ``fs``,
    which is not created in this function (presumably an s3fs filesystem --
    confirm where it is defined). Missing values in the target column are
    left untouched: no ``fillna`` is applied here.

    Args:
    - type_of_activity (str): Name of the sub-folder holding the datasets.

    Returns:
    Tuple[DataFrame, DataFrame]: Train set and test set, in that order.
    """
    root = f"projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/{type_of_activity}"

    # Train first, then test: the order drives the unpacking below.
    loaded = []
    for csv_name in ("/Train_set.csv", "/Test_set.csv"):
        with fs.open(root + csv_name, mode="rb") as file_in:
            loaded.append(pd.read_csv(file_in, sep=","))

    dataset_train, dataset_test = loaded
    return dataset_train, dataset_test


def features_target_split(dataset_train, dataset_test):
    """
    Split the train and test datasets into features and target.

    No column selection is performed: the returned feature frames are the
    input DataFrames themselves (same objects, target column included).
    Any feature selection therefore has to happen downstream (presumably
    inside the fitted model pipeline -- to be confirmed).

    Args:
    - dataset_train (DataFrame): Train set containing a 'y_has_purchased' column.
    - dataset_test (DataFrame): Test set containing a 'y_has_purchased' column.

    Returns:
    Tuple[DataFrame, DataFrame, DataFrame, DataFrame]: X_train, X_test,
    y_train, y_test. The targets are single-column DataFrames.
    """
    target_columns = ['y_has_purchased']

    # Features are deliberately left unfiltered (see docstring).
    X_train, X_test = dataset_train, dataset_test

    # Double-bracket selection keeps the targets as one-column DataFrames.
    y_train = dataset_train[target_columns]
    y_test = dataset_test[target_columns]

    return X_train, X_test, y_train, y_test
    

def load_model(type_of_activity, model):
    """
    Load a pickled model stored under the modelling results folder.

    The file is read through the module-level filesystem object ``fs``,
    which is not created in this function (presumably an s3fs filesystem --
    confirm where it is defined).

    Args:
    - type_of_activity (str): Name of the activity sub-folder.
    - model (str): Model name; used both as folder name and as file stem
      ('<model>.pkl').

    Returns:
    The object obtained by unpickling the file content.
    """
    folder = f"projet-bdc2324-team1/2_Output/2_1_Modeling_results/standard/{type_of_activity}/{model}/"
    pickle_path = folder + model + '.pkl'

    with fs.open(pickle_path, mode="rb") as f:
        serialized = f.read()

    # NOTE(review): unpickling can execute arbitrary code; only load files
    # written by a trusted process.
    return pickle.loads(serialized)
    

def df_segment(df, y, model):
    """
    Add observed purchases, predictions, scores and score segments to df.

    The input DataFrame is modified in place (no copy is made) and the same
    object is returned. Despite its name, the 'quartile' column is built from
    the fixed score thresholds 0.25 / 0.5 / 0.75, not from empirical quartiles.

    Args:
    - df (DataFrame): Features passed as-is to the model.
    - y: Observed target, stored in the 'has_purchased' column.
    - model: Object exposing predict(df) and predict_proba(df).

    Returns:
    DataFrame: df with the columns 'has_purchased', 'has_purchased_estim',
    'score' and 'quartile' (labels '1' to '4') added.
    """
    # Both predictions are computed before any column is appended to df.
    predicted_label = model.predict(df)
    purchase_probability = model.predict_proba(df)[:, 1]

    segmented = df
    segmented["has_purchased"] = y
    segmented["has_purchased_estim"] = predicted_label
    segmented["score"] = purchase_probability

    score = segmented["score"]
    # The first matching condition wins; anything else falls into segment '4'.
    segmented["quartile"] = np.select(
        [score < 0.25, score < 0.5, score < 0.75],
        ['1', '2', '3'],
        default='4',
    )

    return segmented
    

def odd_ratio(score):
    """
    Convert a probability-like score into its odds, score / (1 - score).

    A score equal to 1 makes the denominator zero; see adjust_score_1 for
    the way such scores are handled beforehand.

    Args:
    - score (Union[float, int]): Score value.

    Returns:
    float: Odd ratio value.
    """
    complement = 1 - score
    return score / complement


def adjust_score_1(score):
    """
    Adjust scores by replacing ones with the highest value different from 1.
    Allows to compute odd ratios then (a score of 1 has no finite odds).

    Args:
    - score (List[Union[float, int]]): One-dimensional collection of score
      values (list, numpy array or pandas Series).

    Returns:
    np.ndarray: Adjusted score values.

    Raises:
    ValueError: If no score differs from 1 (including empty input), since
    there is then no value to substitute.
    """
    scores = np.asarray(score)
    differs_from_one = scores != 1

    if not differs_from_one.any():
        raise ValueError("adjust_score_1 needs at least one score different from 1")

    # Highest score among those that are not exactly 1.
    second_best_score = scores[differs_from_one].max()

    return np.where(differs_from_one, scores, second_best_score)


def adjusted_score(odd_ratio, bias):
    """
    Turn an odd ratio into an adjusted score, odd_ratio / (bias + odd_ratio).

    Args:
    - odd_ratio (Union[float, int]): Odd ratio value.
    - bias (Union[float, int]): Bias value.

    Returns:
    float: Adjusted score value.
    """
    denominator = bias + odd_ratio
    return odd_ratio / denominator


def find_bias(odd_ratios, y_objective, initial_guess=10):
    """
    Find the bias needed to adjust scores according to the purchases observed.

    Solves, with scipy's fsolve, the equation
        sum(odd_ratio / (bias + odd_ratio)) = y_objective
    i.e. the bias for which the adjusted scores add up to y_objective.

    Args:
    - odd_ratios (List[float]): List of odd ratios.
    - y_objective (Union[float, int]): Objective value to achieve.
    - initial_guess (Union[float, int], optional): Initial guess for the bias. Default is 10.

    Returns:
    float: Estimated bias value.
    """
    # Convert once so that every solver iteration is a single vectorized pass.
    odds = np.asarray(odd_ratios, dtype=float)

    def residual(bias):
        # fsolve passes bias as a length-1 array; it broadcasts against odds.
        return np.sum(odds / (bias + odds)) - y_objective

    bias_estimated = fsolve(residual, x0=initial_guess)

    return bias_estimated[0]
    
    
def plot_hist_scores(df, score, score_adjusted, type_of_activity):
    """
    Draw overlaid histograms of the original and the adjusted scores.

    A new matplotlib figure is created and left open: it is neither shown
    nor closed by this function.

    Args:
    - df (DataFrame): DataFrame containing score data.
    - score (str): Name of the column in df representing the original scores.
    - score_adjusted (str): Name of the column in df representing the adjusted scores.
    - type_of_activity (str) : type of activity of the companies considered.

    Returns:
    None
    """
    plt.figure()

    # Original scores first, adjusted scores drawn on top of them.
    for column, legend_label in ((score, "score"), (score_adjusted, "adjusted score")):
        plt.hist(df[column], label=legend_label, alpha=0.6)

    plt.legend()
    plt.xlabel("probability of a future purchase")
    plt.ylabel("count")
    plt.title(f"Comparison between score and adjusted score for {type_of_activity} companies")


def project_tickets_CA(df, nb_purchases, nb_tickets, total_amount, score_adjusted, duration_ref, duration_projection):
    """
    Project ticket counts and total amount for a given duration and adjust based on a score.

    The reference-period figures are rescaled to the projection period by
    dividing them by duration_ref / duration_projection, then weighted by the
    adjusted score. The input DataFrame is modified in place (no copy is
    made) and the same object is returned.

    Args:
    - df (DataFrame): DataFrame containing ticket data.
    - nb_purchases (str) : Name of the column in df representing the number of purchases.
    - nb_tickets (str): Name of the column in df representing the number of tickets.
    - total_amount (str): Name of the column in df representing the total amount.
    - score_adjusted (str): Name of the column in df representing the adjusted score.
    - duration_ref (int or float): duration of the period of reference for the construction of the variables X.
    - duration_projection (int or float): Duration of the period of projection of sales / revenue.

    Returns:
    DataFrame: df with the columns 'nb_tickets_projected',
    'total_amount_projected', 'nb_tickets_expected', 'total_amount_expected'
    and 'pace_purchase' added.
    """
    duration_ratio = duration_ref / duration_projection

    df_output = df

    df_output.loc[:, "nb_tickets_projected"] = df_output.loc[:, nb_tickets] / duration_ratio
    df_output.loc[:, "total_amount_projected"] = df_output.loc[:, total_amount] / duration_ratio

    df_output.loc[:, "nb_tickets_expected"] = df_output.loc[:, score_adjusted] * df_output.loc[:, "nb_tickets_projected"]
    df_output.loc[:, "total_amount_expected"] = df_output.loc[:, score_adjusted] * df_output.loc[:, "total_amount_projected"]

    # Duration per purchase; rows with zero purchases give +inf, stored as NaN.
    pace = duration_ref / df_output.loc[:, nb_purchases]
    df_output.loc[:, "pace_purchase"] = pace.replace(np.inf, np.nan)

    return df_output
    

def summary_expected_CA(df, segment, nb_tickets_expected, total_amount_expected, total_amount, pace_purchase,
                       duration_ref=17, duration_projection=12):
    """
    Generate a per-segment summary of expected tickets and amounts.

    ("CA" presumably stands for revenue, as the 'revenue_recovered_perct'
    column suggests -- to be confirmed.)

    Args:
    - df (DataFrame): DataFrame containing customer data.
    - segment (str): Name of the column in df representing customer segments.
    - nb_tickets_expected (str): Name of the column in df representing the expected number of tickets.
    - total_amount_expected (str): Name of the column in df representing the expected total amount.
    - total_amount (str): Name of the column in df representing the total amount.
    - pace_purchase (str): Name of the column in df representing the pace of purchase.
    - duration_ref (int or float, optional): Duration of the period of reference. Default is 17.
    - duration_projection (int or float, optional): Duration of the period of projection. Default is 12.

    Returns:
    DataFrame: One row per segment with the segment, 'size', 'size_perct',
    the summed expected tickets and amount, 'revenue_recovered_perct' and
    'pace_purchase' (mean pace over non-missing values; NaN when a segment
    has no non-missing pace).
    """
    # Every aggregate below comes from this one grouping, so the rows of the
    # different aggregates line up segment by segment.
    grouped = df.groupby(segment)

    # compute nb tickets estimated and total amount expected
    df_expected_CA = grouped[[nb_tickets_expected, total_amount_expected]].sum().reset_index()

    # number of customers by segment
    df_expected_CA.insert(1, "size", grouped.size().values)

    # size in percent of all customers
    df_expected_CA.insert(2, "size_perct", 100 * df_expected_CA["size"] / df_expected_CA["size"].sum())

    # compute share of CA recovered
    duration_ratio = duration_ref / duration_projection
    observed_amount = grouped[total_amount].sum().values
    df_expected_CA["revenue_recovered_perct"] = (
        100 * duration_ratio * df_expected_CA[total_amount_expected] / observed_amount
    )

    # The group mean already ignores missing values. Averaging on the full
    # frame (instead of dropping NaN rows first) keeps a segment whose pace is
    # entirely missing, so the result stays aligned with the other columns.
    df_expected_CA["pace_purchase"] = grouped[pace_purchase].mean().values

    return df_expected_CA


def save_file_s3_ca(File_name, type_of_activity):
    """
    Save the current matplotlib figure as a PNG in the sales forecast folder.

    The figure is rendered in memory, written through the module-level
    filesystem object ``fs`` (not created in this function; presumably an
    s3fs filesystem -- confirm where it is defined), then closed.

    Args:
    - File_name (str): Prefix of the output file name.
    - type_of_activity (str): Used both as sub-folder and as file name suffix.

    Returns:
    None
    """
    png_buffer = io.BytesIO()
    plt.savefig(png_buffer, format='png', dpi=120)
    png_bytes = png_buffer.getvalue()

    destination = f"projet-bdc2324-team1/2_Output/2_3_Sales_Forecast/{type_of_activity}/" + File_name + type_of_activity + '.png'
    with fs.open(destination, 'wb') as s3_file:
        s3_file.write(png_bytes)

    plt.close()