added summary for logit with penalty

This commit is contained in:
Thomas PIQUE 2024-03-21 08:18:31 +00:00
parent a0256c551b
commit a85036ad23

View File

@ -65,7 +65,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 3,
"id": "2f0d08c9-5b26-4eff-9c89-4a46f427dbf7",
"metadata": {},
"outputs": [],
@ -115,9 +115,9 @@
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_570/3642896088.py:7: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n",
"/tmp/ipykernel_426/3642896088.py:7: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" dataset_train = pd.read_csv(file_in, sep=\",\")\n",
"/tmp/ipykernel_570/3642896088.py:11: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n",
"/tmp/ipykernel_426/3642896088.py:11: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" dataset_test = pd.read_csv(file_in, sep=\",\")\n"
]
}
@ -228,7 +228,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 9,
"id": "6224fd31-c190-4168-b395-e0bf5806d79d",
"metadata": {},
"outputs": [
@ -238,7 +238,7 @@
"{0.0: 0.5481283836040216, 1.0: 5.694439980716696}"
]
},
"execution_count": 10,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@ -254,7 +254,7 @@
},
{
"cell_type": "code",
"execution_count": 58,
"execution_count": 10,
"id": "4680f202-979e-483f-89b8-9df877203bcf",
"metadata": {},
"outputs": [
@ -265,7 +265,7 @@
" 0.54812838])"
]
},
"execution_count": 58,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@ -282,7 +282,7 @@
},
{
"cell_type": "code",
"execution_count": 65,
"execution_count": 11,
"id": "5f747be4-e70b-491c-8f0a-46cb278a2dee",
"metadata": {},
"outputs": [
@ -311,7 +311,7 @@
},
{
"cell_type": "code",
"execution_count": 258,
"execution_count": 12,
"id": "ab25a901-28da-4504-a7d1-bf41fa5068bc",
"metadata": {},
"outputs": [
@ -650,7 +650,7 @@
"[354365 rows x 17 columns]"
]
},
"execution_count": 258,
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
@ -662,7 +662,7 @@
},
{
"cell_type": "code",
"execution_count": 259,
"execution_count": 13,
"id": "648fb542-0186-493d-b274-be2c26a11967",
"metadata": {},
"outputs": [],
@ -677,7 +677,7 @@
},
{
"cell_type": "code",
"execution_count": 260,
"execution_count": 14,
"id": "978b9ebc-aa97-41d7-a48f-d1f79c1ed482",
"metadata": {},
"outputs": [
@ -1016,7 +1016,7 @@
"[354365 rows x 17 columns]"
]
},
"execution_count": 260,
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
@ -1510,12 +1510,14 @@
"\n",
"- variables à retirer : fidelity (valeurs trop grandes dont l'exp -> +inf, autre problème : st basé sur des infos qu'on a pas sur la période étudiée mais slt sur période d'évaluation), time between purchase (revoir sa construction), gender_other (colinéarité avec les autres var de genre)\n",
"- ajouter un intercept\n",
"- pas besoin de standardiser pour le moment, mais à faire quand on passera au modèle LASSO "
"- pas besoin de standardiser pour le moment, mais à faire quand on passera au modèle LASSO\n",
"\n",
"#### A recopier dans la pipeline -> section 2 bis"
]
},
{
"cell_type": "code",
"execution_count": 266,
"execution_count": 15,
"id": "e6c8ccc7-6ab8-4e3c-af28-e71d17c07bcb",
"metadata": {},
"outputs": [
@ -1817,7 +1819,7 @@
"[354365 rows x 15 columns]"
]
},
"execution_count": 266,
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
@ -1831,7 +1833,7 @@
},
{
"cell_type": "code",
"execution_count": 267,
"execution_count": 16,
"id": "0e968aa1-fbec-47db-b570-4730ef7eebf2",
"metadata": {},
"outputs": [
@ -1847,8 +1849,8 @@
"Dep. Variable: y No. Observations: 354365\n",
"Model: Logit Df Residuals: 354350\n",
"Method: MLE Df Model: 14\n",
"Date: Fri, 15 Mar 2024 Pseudo R-squ.: 0.2112\n",
"Time: 10:07:29 Log-Likelihood: -83135.\n",
"Date: Thu, 21 Mar 2024 Pseudo R-squ.: 0.2112\n",
"Time: 07:57:46 Log-Likelihood: -83135.\n",
"converged: True LL-Null: -1.0540e+05\n",
"Covariance Type: nonrobust LLR p-value: 0.000\n",
"=======================================================================================\n",
@ -1887,7 +1889,7 @@
},
{
"cell_type": "code",
"execution_count": 268,
"execution_count": 17,
"id": "2475f2fe-3d1f-4845-9ede-0416dac83271",
"metadata": {},
"outputs": [],
@ -1908,7 +1910,7 @@
},
{
"cell_type": "code",
"execution_count": 269,
"execution_count": 18,
"id": "696fcc04-e5df-45dc-a1b9-57c30d4d671d",
"metadata": {},
"outputs": [
@ -2210,7 +2212,7 @@
"[354365 rows x 15 columns]"
]
},
"execution_count": 269,
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
@ -2221,7 +2223,7 @@
},
{
"cell_type": "code",
"execution_count": 289,
"execution_count": 19,
"id": "54421677-640f-4f37-9a0d-d9a2cc3572b0",
"metadata": {},
"outputs": [
@ -2237,8 +2239,8 @@
"Dep. Variable: y No. Observations: 354365\n",
"Model: Logit Df Residuals: 354350\n",
"Method: MLE Df Model: 14\n",
"Date: Fri, 15 Mar 2024 Pseudo R-squ.: 0.2112\n",
"Time: 10:26:14 Log-Likelihood: -83135.\n",
"Date: Thu, 21 Mar 2024 Pseudo R-squ.: 0.2112\n",
"Time: 07:58:13 Log-Likelihood: -83135.\n",
"converged: True LL-Null: -1.0540e+05\n",
"Covariance Type: nonrobust LLR p-value: 0.000\n",
"=======================================================================================\n",
@ -2276,12 +2278,88 @@
"print(result.summary())"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "13cc3362-7bb2-46fa-8bd8-e5a8e53260b8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Optimization terminated successfully (Exit mode 0)\n",
" Current function value: 0.2349337504783076\n",
" Iterations: 268\n",
" Function evaluations: 269\n",
" Gradient evaluations: 268\n",
"const 0.000000e+00\n",
"nb_tickets 6.284219e-02\n",
"nb_purchases 5.681771e-04\n",
"total_amount 2.423593e-04\n",
"nb_suppliers 1.972437e-68\n",
"vente_internet_max 0.000000e+00\n",
"purchase_date_min 0.000000e+00\n",
"purchase_date_max 0.000000e+00\n",
"nb_tickets_internet 4.072862e-113\n",
"is_email_true 3.163634e-17\n",
"opt_in 0.000000e+00\n",
"gender_female 9.730017e-184\n",
"gender_male 8.018545e-235\n",
"nb_campaigns 9.847753e-206\n",
"nb_campaigns_opened 8.789998e-176\n",
"dtype: float64\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: y No. Observations: 354365\n",
"Model: Logit Df Residuals: 354350\n",
"Method: MLE Df Model: 14\n",
"Date: Thu, 21 Mar 2024 Pseudo R-squ.: 0.2112\n",
"Time: 08:15:18 Log-Likelihood: -83136.\n",
"converged: True LL-Null: -1.0540e+05\n",
"Covariance Type: nonrobust LLR p-value: 0.000\n",
"=======================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"---------------------------------------------------------------------------------------\n",
"const -3.4386 0.087 -39.483 0.000 -3.609 -3.268\n",
"nb_tickets -0.0198 0.011 -1.860 0.063 -0.041 0.001\n",
"nb_purchases -0.0471 0.014 -3.446 0.001 -0.074 -0.020\n",
"total_amount 0.0724 0.020 3.670 0.000 0.034 0.111\n",
"nb_suppliers 0.1676 0.010 17.482 0.000 0.149 0.186\n",
"vente_internet_max -0.8729 0.011 -82.724 0.000 -0.894 -0.852\n",
"purchase_date_min 0.5852 0.015 39.762 0.000 0.556 0.614\n",
"purchase_date_max -1.4162 0.016 -89.437 0.000 -1.447 -1.385\n",
"nb_tickets_internet 0.2884 0.013 22.603 0.000 0.263 0.313\n",
"is_email_true 0.7152 0.085 8.440 0.000 0.549 0.881\n",
"opt_in -1.9927 0.019 -107.165 0.000 -2.029 -1.956\n",
"gender_female 0.6880 0.024 28.907 0.000 0.641 0.735\n",
"gender_male 0.7914 0.024 32.720 0.000 0.744 0.839\n",
"nb_campaigns 0.2845 0.009 30.607 0.000 0.266 0.303\n",
"nb_campaigns_opened 0.2061 0.007 28.267 0.000 0.192 0.220\n",
"=======================================================================================\n"
]
}
],
"source": [
"# 2.bis on fait de même pour un modèle logit avec pénalité \n",
"# pas besoin de redefinir le modèle, il faut faire un fit_regularized\n",
"\n",
"# sans spécification, le alpha optimal est déterminé par cross validation\n",
"# remplacer alpha=10 par la valeur optimale trouvée par cross validation dans la pipeline avec .best_params\n",
"# attention, dans scikit learn, l'hyperparamètre est C = 1/alpha, pas oublier de prendre l'inverse de ce C optimal\n",
"\n",
"result = model_logit.fit_regularized(method='l1', alpha = 10)\n",
"\n",
"print(result.pvalues)\n",
"print(result.summary())"
]
},
{
"cell_type": "markdown",
"id": "36c5e770-72b3-4482-ad61-45b511a11f06",
"metadata": {},
"source": [
"## graphique LASSO - quelles variables sont impotantes dans le modèle ? "
"## graphique LASSO - quelles variables sont importantes dans le modèle ? "
]
},
{