From a85036ad2331065a9ffdfa801f0f74071990d2f4 Mon Sep 17 00:00:00 2001 From: tpique-ensae Date: Thu, 21 Mar 2024 08:18:31 +0000 Subject: [PATCH] added summary for logit with penalty --- .../2_bis_logit_baseline_statsmodels.ipynb | 130 ++++++++++++++---- 1 file changed, 104 insertions(+), 26 deletions(-) diff --git a/Spectacle/2_bis_logit_baseline_statsmodels.ipynb b/Spectacle/2_bis_logit_baseline_statsmodels.ipynb index 93776c0..a240fee 100644 --- a/Spectacle/2_bis_logit_baseline_statsmodels.ipynb +++ b/Spectacle/2_bis_logit_baseline_statsmodels.ipynb @@ -65,7 +65,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 3, "id": "2f0d08c9-5b26-4eff-9c89-4a46f427dbf7", "metadata": {}, "outputs": [], @@ -115,9 +115,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_570/3642896088.py:7: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n", + "/tmp/ipykernel_426/3642896088.py:7: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n", " dataset_train = pd.read_csv(file_in, sep=\",\")\n", - "/tmp/ipykernel_570/3642896088.py:11: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n", + "/tmp/ipykernel_426/3642896088.py:11: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n", " dataset_test = pd.read_csv(file_in, sep=\",\")\n" ] } @@ -228,7 +228,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "id": "6224fd31-c190-4168-b395-e0bf5806d79d", "metadata": {}, "outputs": [ @@ -238,7 +238,7 @@ "{0.0: 0.5481283836040216, 1.0: 5.694439980716696}" ] }, - "execution_count": 10, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -254,7 +254,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 10, "id": "4680f202-979e-483f-89b8-9df877203bcf", "metadata": {}, "outputs": [ @@ -265,7 +265,7 @@ " 0.54812838])" ] }, - "execution_count": 58, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -282,7 +282,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 11, "id": "5f747be4-e70b-491c-8f0a-46cb278a2dee", "metadata": {}, "outputs": [ @@ -311,7 +311,7 @@ }, { "cell_type": "code", - "execution_count": 258, + "execution_count": 12, "id": "ab25a901-28da-4504-a7d1-bf41fa5068bc", "metadata": {}, "outputs": [ @@ -650,7 +650,7 @@ "[354365 rows x 17 columns]" ] }, - "execution_count": 258, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -662,7 +662,7 @@ }, { "cell_type": "code", - "execution_count": 259, + "execution_count": 13, "id": "648fb542-0186-493d-b274-be2c26a11967", "metadata": {}, "outputs": [], @@ -677,7 +677,7 @@ }, { "cell_type": "code", - "execution_count": 260, + "execution_count": 14, "id": "978b9ebc-aa97-41d7-a48f-d1f79c1ed482", "metadata": {}, "outputs": [ @@ -1016,7 +1016,7 @@ "[354365 rows x 17 columns]" ] }, - "execution_count": 260, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -1510,12 +1510,14 @@ "\n", "- variables à retirer : fidelity (valeurs trop grandes dont l'exp -> +inf, autre problème : st basé sur des infos qu'on a pas sur la période étudiée mais slt sur période d'évaluation), time between purchase (revoir sa construction), gender_other (colinéarité avec les autres var de genre)\n", "- ajouter un intercept\n", - "- pas besoin de standardiser pour le moment, mais à faire quand on passera au modèle LASSO " + "- pas besoin de standardiser pour le moment, mais à faire quand on passera au modèle LASSO\n", + "\n", + "#### A recopier dans la pipeline -> section 2 bis" ] }, { "cell_type": "code", - "execution_count": 266, + "execution_count": 15, "id": "e6c8ccc7-6ab8-4e3c-af28-e71d17c07bcb", "metadata": {}, "outputs": [ @@ -1817,7 +1819,7 @@ "[354365 rows x 15 columns]" ] }, - "execution_count": 266, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -1831,7 +1833,7 @@ }, { "cell_type": "code", - "execution_count": 267, + "execution_count": 16, "id": "0e968aa1-fbec-47db-b570-4730ef7eebf2", "metadata": {}, "outputs": [ @@ -1847,8 +1849,8 @@ "Dep. Variable: y No. Observations: 354365\n", "Model: Logit Df Residuals: 354350\n", "Method: MLE Df Model: 14\n", - "Date: Fri, 15 Mar 2024 Pseudo R-squ.: 0.2112\n", - "Time: 10:07:29 Log-Likelihood: -83135.\n", + "Date: Thu, 21 Mar 2024 Pseudo R-squ.: 0.2112\n", + "Time: 07:57:46 Log-Likelihood: -83135.\n", "converged: True LL-Null: -1.0540e+05\n", "Covariance Type: nonrobust LLR p-value: 0.000\n", "=======================================================================================\n", @@ -1887,7 +1889,7 @@ }, { "cell_type": "code", - "execution_count": 268, + "execution_count": 17, "id": "2475f2fe-3d1f-4845-9ede-0416dac83271", "metadata": {}, "outputs": [], @@ -1908,7 +1910,7 @@ }, { "cell_type": "code", - "execution_count": 269, + "execution_count": 18, "id": "696fcc04-e5df-45dc-a1b9-57c30d4d671d", "metadata": {}, "outputs": [ @@ -2210,7 +2212,7 @@ "[354365 rows x 15 columns]" ] }, - "execution_count": 269, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -2221,7 +2223,7 @@ }, { "cell_type": "code", - "execution_count": 289, + "execution_count": 19, "id": "54421677-640f-4f37-9a0d-d9a2cc3572b0", "metadata": {}, "outputs": [ @@ -2237,8 +2239,8 @@ "Dep. Variable: y No. Observations: 354365\n", "Model: Logit Df Residuals: 354350\n", "Method: MLE Df Model: 14\n", - "Date: Fri, 15 Mar 2024 Pseudo R-squ.: 0.2112\n", - "Time: 10:26:14 Log-Likelihood: -83135.\n", + "Date: Thu, 21 Mar 2024 Pseudo R-squ.: 0.2112\n", + "Time: 07:58:13 Log-Likelihood: -83135.\n", "converged: True LL-Null: -1.0540e+05\n", "Covariance Type: nonrobust LLR p-value: 0.000\n", "=======================================================================================\n", @@ -2276,12 +2278,88 @@ "print(result.summary())" ] }, + { + "cell_type": "code", + "execution_count": 29, + "id": "13cc3362-7bb2-46fa-8bd8-e5a8e53260b8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Optimization terminated successfully (Exit mode 0)\n", + " Current function value: 0.2349337504783076\n", + " Iterations: 268\n", + " Function evaluations: 269\n", + " Gradient evaluations: 268\n", + "const 0.000000e+00\n", + "nb_tickets 6.284219e-02\n", + "nb_purchases 5.681771e-04\n", + "total_amount 2.423593e-04\n", + "nb_suppliers 1.972437e-68\n", + "vente_internet_max 0.000000e+00\n", + "purchase_date_min 0.000000e+00\n", + "purchase_date_max 0.000000e+00\n", + "nb_tickets_internet 4.072862e-113\n", + "is_email_true 3.163634e-17\n", + "opt_in 0.000000e+00\n", + "gender_female 9.730017e-184\n", + "gender_male 8.018545e-235\n", + "nb_campaigns 9.847753e-206\n", + "nb_campaigns_opened 8.789998e-176\n", + "dtype: float64\n", + " Logit Regression Results \n", + "==============================================================================\n", + "Dep. Variable: y No. Observations: 354365\n", + "Model: Logit Df Residuals: 354350\n", + "Method: MLE Df Model: 14\n", + "Date: Thu, 21 Mar 2024 Pseudo R-squ.: 0.2112\n", + "Time: 08:15:18 Log-Likelihood: -83136.\n", + "converged: True LL-Null: -1.0540e+05\n", + "Covariance Type: nonrobust LLR p-value: 0.000\n", + "=======================================================================================\n", + " coef std err z P>|z| [0.025 0.975]\n", + "---------------------------------------------------------------------------------------\n", + "const -3.4386 0.087 -39.483 0.000 -3.609 -3.268\n", + "nb_tickets -0.0198 0.011 -1.860 0.063 -0.041 0.001\n", + "nb_purchases -0.0471 0.014 -3.446 0.001 -0.074 -0.020\n", + "total_amount 0.0724 0.020 3.670 0.000 0.034 0.111\n", + "nb_suppliers 0.1676 0.010 17.482 0.000 0.149 0.186\n", + "vente_internet_max -0.8729 0.011 -82.724 0.000 -0.894 -0.852\n", + "purchase_date_min 0.5852 0.015 39.762 0.000 0.556 0.614\n", + "purchase_date_max -1.4162 0.016 -89.437 0.000 -1.447 -1.385\n", + "nb_tickets_internet 0.2884 0.013 22.603 0.000 0.263 0.313\n", + "is_email_true 0.7152 0.085 8.440 0.000 0.549 0.881\n", + "opt_in -1.9927 0.019 -107.165 0.000 -2.029 -1.956\n", + "gender_female 0.6880 0.024 28.907 0.000 0.641 0.735\n", + "gender_male 0.7914 0.024 32.720 0.000 0.744 0.839\n", + "nb_campaigns 0.2845 0.009 30.607 0.000 0.266 0.303\n", + "nb_campaigns_opened 0.2061 0.007 28.267 0.000 0.192 0.220\n", + "=======================================================================================\n" + ] + } + ], + "source": [ + "# 2.bis on fait de même pour un modèle logit avec pénalité \n", + "# pas besoin de redefinir le modèle, il faut faire un fit_regularized\n", + "\n", + "# sans spécification, le alpha optimal est déterminé par cross validation\n", + "# remplacer alpha=10 par la valeur optimale trouvée par cross validation dans la pipeline avec .best_params\n", + "# attention, dans scikit learn, l'hyperparamètre est C = 1/alpha, pas oublier de prendre l'inverse de ce C optimal\n", + "\n", + "result = model_logit.fit_regularized(method='l1', alpha = 10)\n", + "\n", + "print(result.pvalues)\n", + "print(result.summary())" + ] + }, { "cell_type": "markdown", "id": "36c5e770-72b3-4482-ad61-45b511a11f06", "metadata": {}, "source": [ - "## graphique LASSO - quelles variables sont impotantes dans le modèle ? " + "## graphique LASSO - quelles variables sont importantes dans le modèle ? " ] }, {