added summary for logit with penalty
This commit is contained in:
parent
a0256c551b
commit
a85036ad23
|
@ -65,7 +65,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 3,
|
||||
"id": "2f0d08c9-5b26-4eff-9c89-4a46f427dbf7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
@ -115,9 +115,9 @@
|
|||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/tmp/ipykernel_570/3642896088.py:7: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n",
|
||||
"/tmp/ipykernel_426/3642896088.py:7: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n",
|
||||
" dataset_train = pd.read_csv(file_in, sep=\",\")\n",
|
||||
"/tmp/ipykernel_570/3642896088.py:11: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n",
|
||||
"/tmp/ipykernel_426/3642896088.py:11: DtypeWarning: Columns (38) have mixed types. Specify dtype option on import or set low_memory=False.\n",
|
||||
" dataset_test = pd.read_csv(file_in, sep=\",\")\n"
|
||||
]
|
||||
}
|
||||
|
@ -228,7 +228,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": 9,
|
||||
"id": "6224fd31-c190-4168-b395-e0bf5806d79d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -238,7 +238,7 @@
|
|||
"{0.0: 0.5481283836040216, 1.0: 5.694439980716696}"
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -254,7 +254,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 58,
|
||||
"execution_count": 10,
|
||||
"id": "4680f202-979e-483f-89b8-9df877203bcf",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -265,7 +265,7 @@
|
|||
" 0.54812838])"
|
||||
]
|
||||
},
|
||||
"execution_count": 58,
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -282,7 +282,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 65,
|
||||
"execution_count": 11,
|
||||
"id": "5f747be4-e70b-491c-8f0a-46cb278a2dee",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -311,7 +311,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 258,
|
||||
"execution_count": 12,
|
||||
"id": "ab25a901-28da-4504-a7d1-bf41fa5068bc",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -650,7 +650,7 @@
|
|||
"[354365 rows x 17 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 258,
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -662,7 +662,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 259,
|
||||
"execution_count": 13,
|
||||
"id": "648fb542-0186-493d-b274-be2c26a11967",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
@ -677,7 +677,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 260,
|
||||
"execution_count": 14,
|
||||
"id": "978b9ebc-aa97-41d7-a48f-d1f79c1ed482",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -1016,7 +1016,7 @@
|
|||
"[354365 rows x 17 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 260,
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -1510,12 +1510,14 @@
|
|||
"\n",
|
||||
"- variables à retirer : fidelity (valeurs trop grandes dont l'exp -> +inf ; autre problème : elle est basée sur des infos qu'on n'a pas sur la période étudiée mais seulement sur la période d'évaluation), time between purchase (revoir sa construction), gender_other (colinéarité avec les autres variables de genre)\n",
|
||||
"- ajouter un intercept\n",
|
||||
"- pas besoin de standardiser pour le moment, mais à faire quand on passera au modèle LASSO "
|
||||
"- pas besoin de standardiser pour le moment, mais à faire quand on passera au modèle LASSO\n",
|
||||
"\n",
|
||||
"#### A recopier dans la pipeline -> section 2 bis"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 266,
|
||||
"execution_count": 15,
|
||||
"id": "e6c8ccc7-6ab8-4e3c-af28-e71d17c07bcb",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -1817,7 +1819,7 @@
|
|||
"[354365 rows x 15 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 266,
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -1831,7 +1833,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 267,
|
||||
"execution_count": 16,
|
||||
"id": "0e968aa1-fbec-47db-b570-4730ef7eebf2",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -1847,8 +1849,8 @@
|
|||
"Dep. Variable: y No. Observations: 354365\n",
|
||||
"Model: Logit Df Residuals: 354350\n",
|
||||
"Method: MLE Df Model: 14\n",
|
||||
"Date: Fri, 15 Mar 2024 Pseudo R-squ.: 0.2112\n",
|
||||
"Time: 10:07:29 Log-Likelihood: -83135.\n",
|
||||
"Date: Thu, 21 Mar 2024 Pseudo R-squ.: 0.2112\n",
|
||||
"Time: 07:57:46 Log-Likelihood: -83135.\n",
|
||||
"converged: True LL-Null: -1.0540e+05\n",
|
||||
"Covariance Type: nonrobust LLR p-value: 0.000\n",
|
||||
"=======================================================================================\n",
|
||||
|
@ -1887,7 +1889,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 268,
|
||||
"execution_count": 17,
|
||||
"id": "2475f2fe-3d1f-4845-9ede-0416dac83271",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
@ -1908,7 +1910,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 269,
|
||||
"execution_count": 18,
|
||||
"id": "696fcc04-e5df-45dc-a1b9-57c30d4d671d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -2210,7 +2212,7 @@
|
|||
"[354365 rows x 15 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 269,
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -2221,7 +2223,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 289,
|
||||
"execution_count": 19,
|
||||
"id": "54421677-640f-4f37-9a0d-d9a2cc3572b0",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -2237,8 +2239,8 @@
|
|||
"Dep. Variable: y No. Observations: 354365\n",
|
||||
"Model: Logit Df Residuals: 354350\n",
|
||||
"Method: MLE Df Model: 14\n",
|
||||
"Date: Fri, 15 Mar 2024 Pseudo R-squ.: 0.2112\n",
|
||||
"Time: 10:26:14 Log-Likelihood: -83135.\n",
|
||||
"Date: Thu, 21 Mar 2024 Pseudo R-squ.: 0.2112\n",
|
||||
"Time: 07:58:13 Log-Likelihood: -83135.\n",
|
||||
"converged: True LL-Null: -1.0540e+05\n",
|
||||
"Covariance Type: nonrobust LLR p-value: 0.000\n",
|
||||
"=======================================================================================\n",
|
||||
|
@ -2276,12 +2278,88 @@
|
|||
"print(result.summary())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"id": "13cc3362-7bb2-46fa-8bd8-e5a8e53260b8",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Optimization terminated successfully (Exit mode 0)\n",
|
||||
" Current function value: 0.2349337504783076\n",
|
||||
" Iterations: 268\n",
|
||||
" Function evaluations: 269\n",
|
||||
" Gradient evaluations: 268\n",
|
||||
"const 0.000000e+00\n",
|
||||
"nb_tickets 6.284219e-02\n",
|
||||
"nb_purchases 5.681771e-04\n",
|
||||
"total_amount 2.423593e-04\n",
|
||||
"nb_suppliers 1.972437e-68\n",
|
||||
"vente_internet_max 0.000000e+00\n",
|
||||
"purchase_date_min 0.000000e+00\n",
|
||||
"purchase_date_max 0.000000e+00\n",
|
||||
"nb_tickets_internet 4.072862e-113\n",
|
||||
"is_email_true 3.163634e-17\n",
|
||||
"opt_in 0.000000e+00\n",
|
||||
"gender_female 9.730017e-184\n",
|
||||
"gender_male 8.018545e-235\n",
|
||||
"nb_campaigns 9.847753e-206\n",
|
||||
"nb_campaigns_opened 8.789998e-176\n",
|
||||
"dtype: float64\n",
|
||||
" Logit Regression Results \n",
|
||||
"==============================================================================\n",
|
||||
"Dep. Variable: y No. Observations: 354365\n",
|
||||
"Model: Logit Df Residuals: 354350\n",
|
||||
"Method: MLE Df Model: 14\n",
|
||||
"Date: Thu, 21 Mar 2024 Pseudo R-squ.: 0.2112\n",
|
||||
"Time: 08:15:18 Log-Likelihood: -83136.\n",
|
||||
"converged: True LL-Null: -1.0540e+05\n",
|
||||
"Covariance Type: nonrobust LLR p-value: 0.000\n",
|
||||
"=======================================================================================\n",
|
||||
" coef std err z P>|z| [0.025 0.975]\n",
|
||||
"---------------------------------------------------------------------------------------\n",
|
||||
"const -3.4386 0.087 -39.483 0.000 -3.609 -3.268\n",
|
||||
"nb_tickets -0.0198 0.011 -1.860 0.063 -0.041 0.001\n",
|
||||
"nb_purchases -0.0471 0.014 -3.446 0.001 -0.074 -0.020\n",
|
||||
"total_amount 0.0724 0.020 3.670 0.000 0.034 0.111\n",
|
||||
"nb_suppliers 0.1676 0.010 17.482 0.000 0.149 0.186\n",
|
||||
"vente_internet_max -0.8729 0.011 -82.724 0.000 -0.894 -0.852\n",
|
||||
"purchase_date_min 0.5852 0.015 39.762 0.000 0.556 0.614\n",
|
||||
"purchase_date_max -1.4162 0.016 -89.437 0.000 -1.447 -1.385\n",
|
||||
"nb_tickets_internet 0.2884 0.013 22.603 0.000 0.263 0.313\n",
|
||||
"is_email_true 0.7152 0.085 8.440 0.000 0.549 0.881\n",
|
||||
"opt_in -1.9927 0.019 -107.165 0.000 -2.029 -1.956\n",
|
||||
"gender_female 0.6880 0.024 28.907 0.000 0.641 0.735\n",
|
||||
"gender_male 0.7914 0.024 32.720 0.000 0.744 0.839\n",
|
||||
"nb_campaigns 0.2845 0.009 30.607 0.000 0.266 0.303\n",
|
||||
"nb_campaigns_opened 0.2061 0.007 28.267 0.000 0.192 0.220\n",
|
||||
"=======================================================================================\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# 2.bis on fait de même pour un modèle logit avec pénalité \n",
|
||||
"# pas besoin de redefinir le modèle, il faut faire un fit_regularized\n",
|
||||
"\n",
|
||||
"# sans spécification, alpha vaut 0 par défaut dans fit_regularized (aucune pénalité) : statsmodels ne détermine pas alpha par cross validation\n",
|
||||
"# remplacer alpha=10 par la valeur optimale trouvée par cross validation dans la pipeline avec .best_params_\n",
|
||||
"# attention, dans scikit learn, l'hyperparamètre est C = 1/alpha : ne pas oublier de prendre l'inverse de ce C optimal\n",
|
||||
"\n",
|
||||
"result = model_logit.fit_regularized(method='l1', alpha = 10)\n",
|
||||
"\n",
|
||||
"print(result.pvalues)\n",
|
||||
"print(result.summary())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "36c5e770-72b3-4482-ad61-45b511a11f06",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## graphique LASSO - quelles variables sont impotantes dans le modèle ? "
|
||||
"## graphique LASSO - quelles variables sont importantes dans le modèle ? "
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
Loading…
Reference in New Issue
Block a user