From 09f4bd3fe4066f19a07350f3861aea153281da5b Mon Sep 17 00:00:00 2001 From: arevelle-ensae Date: Thu, 4 Apr 2024 06:50:49 +0000 Subject: [PATCH] push coefficient --- useless/Computes_log_coeff.ipynb | 436 +++++++++++++++++++++++++++++++ 1 file changed, 436 insertions(+) create mode 100644 useless/Computes_log_coeff.ipynb diff --git a/useless/Computes_log_coeff.ipynb b/useless/Computes_log_coeff.ipynb new file mode 100644 index 0000000..3c83cbc --- /dev/null +++ b/useless/Computes_log_coeff.ipynb @@ -0,0 +1,436 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "135a67de-cff8-4345-bacc-d9f9fa68a41f", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import os\n", + "import s3fs\n", + "import re\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score\n", + "from sklearn.utils import class_weight\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "from sklearn.impute import SimpleImputer\n", + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler\n", + "from sklearn.metrics import make_scorer, f1_score, balanced_accuracy_score\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score\n", + "from sklearn.exceptions import ConvergenceWarning, DataConversionWarning\n", + "\n", + "import statsmodels.api as sm\n", + "\n", + "import pickle\n", + "import warnings" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9a6254df-d496-4957-89ea-9ed2b74049dd", + "metadata": {}, + "outputs": [], + "source": [ + "# Create filesystem object\n", + "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", + "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "922cf05f-8343-4ed0-ad62-3ef1f17c0730", + "metadata": {}, + "outputs": [], + "source": [ + "def load_train_test():\n", + " BUCKET = \"projet-bdc2324-team1/1_Temp/1_0_Modelling_Datasets/musee\"\n", + " File_path_train = BUCKET + \"/Train_set.csv\"\n", + " File_path_test = BUCKET + \"/Test_set.csv\"\n", + " \n", + " with fs.open( File_path_train, mode=\"rb\") as file_in:\n", + " dataset_train = pd.read_csv(file_in, sep=\",\")\n", + " # dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)\n", + "\n", + " with fs.open(File_path_test, mode=\"rb\") as file_in:\n", + " dataset_test = pd.read_csv(file_in, sep=\",\")\n", + " # dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)\n", + " \n", + " return dataset_train, dataset_test\n", + "\n", + "\n", + "def features_target_split(dataset_train, dataset_test):\n", + " features_l = ['nb_campaigns', 'taux_ouverture_mail', 'prop_purchases_internet', 'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'time_to_open',\n", + " 'purchases_10_2021','purchases_10_2022', 'purchases_11_2021', 'purchases_12_2021','purchases_1_2022', 'purchases_2_2022', 'purchases_3_2022',\n", + " 'purchases_4_2022', 'purchases_5_2021', 'purchases_5_2022', 'purchases_6_2021', 'purchases_6_2022', 'purchases_7_2021', 'purchases_7_2022', 'purchases_8_2021',\n", + " 'purchases_8_2022','purchases_9_2021', 'purchases_9_2022', 'purchase_date_min', 'purchase_date_max', 'nb_targets', 'gender_female', 'gender_male',\n", + " 'achat_internet', 'categorie_age_0_10', 'categorie_age_10_20', 'categorie_age_20_30','categorie_age_30_40',\n", + " 'categorie_age_40_50', 'categorie_age_50_60', 'categorie_age_60_70', 'categorie_age_70_80', 'categorie_age_plus_80','categorie_age_inconnue',\n", + " 'country_fr', 'is_profession_known', 'is_zipcode_known', 'opt_in', 'target_optin', 'target_newsletter', 'target_scolaire', 'target_entreprise', 'target_famille',\n", + " 'target_jeune', 'target_abonne']\n", + " X_train = dataset_train[features_l]\n", + " y_train = dataset_train[['y_has_purchased']]\n", + "\n", + " X_test = dataset_test[features_l]\n", + " y_test = dataset_test[['y_has_purchased']]\n", + " return X_train, X_test, y_train, y_test" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "2584e454-111b-4c39-881b-676841cb5aa1", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_498/3950829189.py:7: DtypeWarning: Columns (10,24,25) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " dataset_train = pd.read_csv(file_in, sep=\",\")\n", + "/tmp/ipykernel_498/3950829189.py:11: DtypeWarning: Columns (10,24,25) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " dataset_test = pd.read_csv(file_in, sep=\",\")\n" + ] + } + ], + "source": [ + "dataset_train, dataset_test = load_train_test()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "a32ea7f8-e2d3-44db-8937-5afda9447b58", + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "3bdc8840-7f45-416f-8ee0-307db201c496", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "const 0\n", + "nb_campaigns 0\n", + "taux_ouverture_mail 0\n", + "prop_purchases_internet 0\n", + "nb_tickets 0\n", + "nb_purchases 0\n", + "total_amount 0\n", + "nb_suppliers 0\n", + "time_to_open 0\n", + "purchases_10_2021 0\n", + "purchases_10_2022 0\n", + "purchases_11_2021 0\n", + "purchases_12_2021 0\n", + "purchases_1_2022 0\n", + "purchases_2_2022 0\n", + "purchases_3_2022 0\n", + "purchases_4_2022 0\n", + "purchases_5_2021 0\n", + "purchases_5_2022 0\n", + "purchases_6_2021 0\n", + "purchases_6_2022 0\n", + "purchases_7_2021 0\n", + "purchases_7_2022 0\n", + "purchases_8_2021 0\n", + "purchases_8_2022 0\n", + "purchases_9_2021 0\n", + "purchases_9_2022 0\n", + "purchase_date_min 0\n", + "purchase_date_max 0\n", + "nb_targets 0\n", + "gender_female 0\n", + "gender_male 0\n", + "achat_internet 0\n", + "categorie_age_0_10 0\n", + "categorie_age_10_20 0\n", + "categorie_age_20_30 0\n", + "categorie_age_30_40 0\n", + "categorie_age_40_50 0\n", + "categorie_age_50_60 0\n", + "categorie_age_60_70 0\n", + "categorie_age_70_80 0\n", + "categorie_age_plus_80 0\n", + "categorie_age_inconnue 0\n", + "country_fr 0\n", + "is_profession_known 0\n", + "is_zipcode_known 0\n", + "opt_in 0\n", + "target_optin 0\n", + "target_newsletter 0\n", + "target_scolaire 0\n", + "target_entreprise 0\n", + "target_famille 0\n", + "target_jeune 0\n", + "target_abonne 0\n", + "dtype: int64" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "3c3ac545-52e0-4d0c-afdc-fff70f468a94", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.0" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "most_frequent_value = X_train['country_fr'].mode()[0]\n", + "most_frequent_value" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "0fcdc5ee-bcea-4436-be9b-92b79d27a230", + "metadata": {}, + "outputs": [], + "source": [ + "X_train['country_fr'] = X_train['country_fr'].fillna(most_frequent_value)\n", + "X_train['time_to_open'] = X_train['time_to_open'].fillna(0)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "7ecdaf1a-b5e4-4880-871e-363eae6fe4e1", + "metadata": {}, + "outputs": [], + "source": [ + "weights = class_weight.compute_class_weight(class_weight = 'balanced', classes = np.unique(y_train['y_has_purchased']),\n", + " y = y_train['y_has_purchased'])\n", + "\n", + "weight_dict = {np.unique(y_train['y_has_purchased'])[i]: weights[i] for i in range(len(np.unique(y_train['y_has_purchased'])))}" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "a6b56090-cfe9-4772-810c-d36bf12aceca", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0.52239696, 0.52239696, 0.52239696, ..., 0.52239696, 0.52239696,\n", + " 0.52239696])" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "class_counts = np.bincount(y_train['y_has_purchased'])\n", + "class_weights = len(y_train['y_has_purchased']) / (2 * class_counts)\n", + "\n", + "weights = class_weights[y_train['y_has_purchased'].values.astype(int)]\n", + "weights" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "bfaea23e-7d7a-4c0d-96f6-4ab4c7c2ff51", + "metadata": {}, + "outputs": [], + "source": [ + "X_train = sm.add_constant(X_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "4cf97ae5-9dcf-4f4c-91b3-3b1f339a6213", + "metadata": {}, + "outputs": [], + "source": [ + "numeric_features = ['nb_campaigns', 'taux_ouverture_mail', 'prop_purchases_internet', 'nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',\n", + " 'purchases_10_2021','purchases_10_2022', 'purchases_11_2021', 'purchases_12_2021','purchases_1_2022', 'purchases_2_2022', 'purchases_3_2022',\n", + " 'purchases_4_2022', 'purchases_5_2021', 'purchases_5_2022', 'purchases_6_2021', 'purchases_6_2022', 'purchases_7_2021', 'purchases_7_2022', 'purchases_8_2021',\n", + " 'purchases_8_2022','purchases_9_2021', 'purchases_9_2022', 'purchase_date_min', 'purchase_date_max', 'nb_targets', 'time_to_open']" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "debb36df-3c2f-4cf7-83a9-ad6e4f6b0470", + "metadata": {}, + "outputs": [], + "source": [ + "scaler = StandardScaler()\n", + "\n", + "X_train_scaled_columns = scaler.fit_transform(X_train[numeric_features])\n", + "\n", + "X_train_scaled = X_train.copy() #\n", + "X_train_scaled[numeric_features] = X_train_scaled_columns" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "7eaa6160-20a0-4a78-ac38-0411e19707ed", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/mamba/lib/python3.11/site-packages/statsmodels/base/optimizer.py:18: FutureWarning: Keyword arguments have been passed to the optimizer that have no effect. The list of allowed keyword arguments for method newton is: tol, ridge_factor. The list of unsupported keyword arguments passed include: weights. After release 0.14, this will raise.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Optimization terminated successfully.\n", + " Current function value: 0.136180\n", + " Iterations 9\n", + " Logit Regression Results \n", + "==============================================================================\n", + "Dep. Variable: y_has_purchased No. Observations: 434278\n", + "Model: Logit Df Residuals: 434226\n", + "Method: MLE Df Model: 51\n", + "Date: Thu, 04 Apr 2024 Pseudo R-squ.: 0.2305\n", + "Time: 06:09:09 Log-Likelihood: -59140.\n", + "converged: True LL-Null: -76855.\n", + "Covariance Type: nonrobust LLR p-value: 0.000\n", + "===========================================================================================\n", + " coef std err z P>|z| [0.025 0.975]\n", + "-------------------------------------------------------------------------------------------\n", + "const -4.0679 1.65e+06 -2.46e-06 1.000 -3.24e+06 3.24e+06\n", + "nb_campaigns 0.0916 0.012 7.352 0.000 0.067 0.116\n", + "taux_ouverture_mail 0.0012 0.011 0.106 0.916 -0.021 0.023\n", + "prop_purchases_internet -0.1995 0.067 -2.972 0.003 -0.331 -0.068\n", + "nb_tickets 0.5956 0.193 3.091 0.002 0.218 0.973\n", + "nb_purchases 0.1598 1.71e+06 9.37e-08 1.000 -3.34e+06 3.34e+06\n", + "total_amount -0.1938 0.071 -2.724 0.006 -0.333 -0.054\n", + "nb_suppliers 0.0282 0.021 1.348 0.178 -0.013 0.069\n", + "time_to_open 0.2785 0.018 15.534 0.000 0.243 0.314\n", + "purchases_10_2021 0.0417 4.76e+04 8.76e-07 1.000 -9.34e+04 9.34e+04\n", + "purchases_10_2022 0.4578 2.72e+05 1.68e-06 1.000 -5.33e+05 5.33e+05\n", + "purchases_11_2021 0.0252 4.92e+04 5.12e-07 1.000 -9.65e+04 9.65e+04\n", + "purchases_12_2021 0.0221 6.3e+04 3.5e-07 1.000 -1.24e+05 1.24e+05\n", + "purchases_1_2022 0.0083 5.49e+04 1.52e-07 1.000 -1.08e+05 1.08e+05\n", + "purchases_2_2022 0.0462 7.59e+04 6.09e-07 1.000 -1.49e+05 1.49e+05\n", + "purchases_3_2022 0.0928 1.07e+05 8.67e-07 1.000 -2.1e+05 2.1e+05\n", + "purchases_4_2022 0.1446 1.65e+05 8.75e-07 1.000 -3.24e+05 3.24e+05\n", + "purchases_5_2021 -0.0427 4.84e+04 -8.83e-07 1.000 -9.48e+04 9.48e+04\n", + "purchases_5_2022 0.1412 1.67e+05 8.46e-07 1.000 -3.27e+05 3.27e+05\n", + "purchases_6_2021 -0.0252 5.55e+04 -4.54e-07 1.000 -1.09e+05 1.09e+05\n", + "purchases_6_2022 0.1246 1.84e+05 6.77e-07 1.000 -3.6e+05 3.6e+05\n", + "purchases_7_2021 -0.0252 5.55e+04 -4.55e-07 1.000 -1.09e+05 1.09e+05\n", + "purchases_7_2022 -0.0074 2.1e+05 -3.54e-08 1.000 -4.12e+05 4.12e+05\n", + "purchases_8_2021 0.0116 5.26e+04 2.21e-07 1.000 -1.03e+05 1.03e+05\n", + "purchases_8_2022 0.0554 2.4e+05 2.31e-07 1.000 -4.7e+05 4.7e+05\n", + "purchases_9_2021 -0.0320 5.47e+04 -5.85e-07 1.000 -1.07e+05 1.07e+05\n", + "purchases_9_2022 0.2349 2.2e+05 1.07e-06 1.000 -4.32e+05 4.32e+05\n", + "purchase_date_min 0.0781 0.025 3.092 0.002 0.029 0.128\n", + "purchase_date_max -0.5228 0.026 -20.021 0.000 -0.574 -0.472\n", + "nb_targets 0.7083 0.010 74.555 0.000 0.690 0.727\n", + "gender_female 0.2961 0.038 7.701 0.000 0.221 0.371\n", + "gender_male 0.0450 0.040 1.137 0.256 -0.033 0.123\n", + "achat_internet 0.1869 0.158 1.186 0.236 -0.122 0.496\n", + "categorie_age_0_10 -0.2713 1.65e+06 -1.64e-07 1.000 -3.24e+06 3.24e+06\n", + "categorie_age_10_20 -0.1238 1.65e+06 -7.48e-08 1.000 -3.24e+06 3.24e+06\n", + "categorie_age_20_30 -0.6322 1.65e+06 -3.82e-07 1.000 -3.24e+06 3.24e+06\n", + "categorie_age_30_40 -0.5004 1.65e+06 -3.02e-07 1.000 -3.24e+06 3.24e+06\n", + "categorie_age_40_50 -0.4020 1.65e+06 -2.43e-07 1.000 -3.24e+06 3.24e+06\n", + "categorie_age_50_60 -0.4101 1.65e+06 -2.48e-07 1.000 -3.24e+06 3.24e+06\n", + "categorie_age_60_70 -0.3232 1.65e+06 -1.95e-07 1.000 -3.24e+06 3.24e+06\n", + "categorie_age_70_80 -0.1635 1.65e+06 -9.88e-08 1.000 -3.24e+06 3.24e+06\n", + "categorie_age_plus_80 -0.4677 1.65e+06 -2.83e-07 1.000 -3.24e+06 3.24e+06\n", + "categorie_age_inconnue -0.7737 1.65e+06 -4.68e-07 1.000 -3.24e+06 3.24e+06\n", + "country_fr 0.7419 0.065 11.422 0.000 0.615 0.869\n", + "is_profession_known -0.5947 0.066 -9.074 0.000 -0.723 -0.466\n", + "is_zipcode_known 1.1374 0.027 41.609 0.000 1.084 1.191\n", + "opt_in -1.0658 0.030 -35.485 0.000 -1.125 -1.007\n", + "target_optin 0.5946 0.034 17.361 0.000 0.527 0.662\n", + "target_newsletter -1.0237 0.035 -29.411 0.000 -1.092 -0.955\n", + "target_scolaire 0.0428 0.036 1.188 0.235 -0.028 0.113\n", + "target_entreprise -0.2645 0.058 -4.589 0.000 -0.377 -0.152\n", + "target_famille 0.5035 0.035 14.548 0.000 0.436 0.571\n", + "target_jeune -0.6795 0.029 -23.590 0.000 -0.736 -0.623\n", + "target_abonne 0.0677 0.037 1.833 0.067 -0.005 0.140\n", + "===========================================================================================\n" + ] + } + ], + "source": [ + "model_logit = sm.Logit(y_train, X_train_scaled)\n", + "\n", + "result = model_logit.fit(weights=weights)\n", + "\n", + "print(result.summary())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75dc92c7-cc1e-40f1-bc74-0b04043b7e44", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}