added summary with weights

This commit is contained in:
Thomas PIQUE 2024-03-21 13:21:40 +00:00
parent a85036ad23
commit 33df2fda4f

View File

@ -2280,7 +2280,7 @@
},
{
"cell_type": "code",
"execution_count": 29,
"execution_count": 48,
"id": "13cc3362-7bb2-46fa-8bd8-e5a8e53260b8",
"metadata": {},
"outputs": [
@ -2289,53 +2289,53 @@
"output_type": "stream",
"text": [
"Optimization terminated successfully (Exit mode 0)\n",
" Current function value: 0.2349337504783076\n",
" Iterations: 268\n",
" Function evaluations: 269\n",
" Gradient evaluations: 268\n",
" Current function value: 0.23562928627877766\n",
" Iterations: 240\n",
" Function evaluations: 243\n",
" Gradient evaluations: 240\n",
"const 0.000000e+00\n",
"nb_tickets 6.284219e-02\n",
"nb_purchases 5.681771e-04\n",
"total_amount 2.423593e-04\n",
"nb_suppliers 1.972437e-68\n",
"nb_tickets 2.477006e-01\n",
"nb_purchases 1.636902e-03\n",
"total_amount 8.839088e-04\n",
"nb_suppliers 1.906550e-65\n",
"vente_internet_max 0.000000e+00\n",
"purchase_date_min 0.000000e+00\n",
"purchase_date_max 0.000000e+00\n",
"nb_tickets_internet 4.072862e-113\n",
"is_email_true 3.163634e-17\n",
"nb_tickets_internet 7.232680e-112\n",
"is_email_true 8.202187e-08\n",
"opt_in 0.000000e+00\n",
"gender_female 9.730017e-184\n",
"gender_male 8.018545e-235\n",
"nb_campaigns 9.847753e-206\n",
"nb_campaigns_opened 8.789998e-176\n",
"gender_female 1.624424e-170\n",
"gender_male 4.961315e-220\n",
"nb_campaigns 6.276733e-205\n",
"nb_campaigns_opened 2.228531e-176\n",
"dtype: float64\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: y No. Observations: 354365\n",
"Model: Logit Df Residuals: 354350\n",
"Method: MLE Df Model: 14\n",
"Date: Thu, 21 Mar 2024 Pseudo R-squ.: 0.2112\n",
"Time: 08:15:18 Log-Likelihood: -83136.\n",
"Date: Thu, 21 Mar 2024 Pseudo R-squ.: 0.2111\n",
"Time: 10:45:37 Log-Likelihood: -83152.\n",
"converged: True LL-Null: -1.0540e+05\n",
"Covariance Type: nonrobust LLR p-value: 0.000\n",
"=======================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"---------------------------------------------------------------------------------------\n",
"const -3.4386 0.087 -39.483 0.000 -3.609 -3.268\n",
"nb_tickets -0.0198 0.011 -1.860 0.063 -0.041 0.001\n",
"nb_purchases -0.0471 0.014 -3.446 0.001 -0.074 -0.020\n",
"total_amount 0.0724 0.020 3.670 0.000 0.034 0.111\n",
"nb_suppliers 0.1676 0.010 17.482 0.000 0.149 0.186\n",
"vente_internet_max -0.8729 0.011 -82.724 0.000 -0.894 -0.852\n",
"purchase_date_min 0.5852 0.015 39.762 0.000 0.556 0.614\n",
"purchase_date_max -1.4162 0.016 -89.437 0.000 -1.447 -1.385\n",
"nb_tickets_internet 0.2884 0.013 22.603 0.000 0.263 0.313\n",
"is_email_true 0.7152 0.085 8.440 0.000 0.549 0.881\n",
"opt_in -1.9927 0.019 -107.165 0.000 -2.029 -1.956\n",
"gender_female 0.6880 0.024 28.907 0.000 0.641 0.735\n",
"gender_male 0.7914 0.024 32.720 0.000 0.744 0.839\n",
"nb_campaigns 0.2845 0.009 30.607 0.000 0.266 0.303\n",
"nb_campaigns_opened 0.2061 0.007 28.267 0.000 0.192 0.220\n",
"const -3.1162 0.081 -38.383 0.000 -3.275 -2.957\n",
"nb_tickets -0.0136 0.012 -1.156 0.248 -0.037 0.009\n",
"nb_purchases -0.0385 0.012 -3.149 0.002 -0.063 -0.015\n",
"total_amount 0.0588 0.018 3.325 0.001 0.024 0.094\n",
"nb_suppliers 0.1638 0.010 17.085 0.000 0.145 0.183\n",
"vente_internet_max -0.8651 0.011 -82.182 0.000 -0.886 -0.844\n",
"purchase_date_min 0.5790 0.015 39.391 0.000 0.550 0.608\n",
"purchase_date_max -1.4088 0.016 -89.101 0.000 -1.440 -1.378\n",
"nb_tickets_internet 0.2857 0.013 22.475 0.000 0.261 0.311\n",
"is_email_true 0.4224 0.079 5.363 0.000 0.268 0.577\n",
"opt_in -1.9818 0.019 -106.856 0.000 -2.018 -1.945\n",
"gender_female 0.6553 0.024 27.835 0.000 0.609 0.701\n",
"gender_male 0.7578 0.024 31.663 0.000 0.711 0.805\n",
"nb_campaigns 0.2835 0.009 30.547 0.000 0.265 0.302\n",
"nb_campaigns_opened 0.2061 0.007 28.315 0.000 0.192 0.220\n",
"=======================================================================================\n"
]
}
@ -2345,15 +2345,153 @@
"# pas besoin de redefinir le modèle, il faut faire un fit_regularized\n",
"\n",
"# sans spécification, le alpha optimal est déterminé par cross validation\n",
"# remplacer alpha=10 par la valeur optimale trouvée par cross validation dans la pipeline avec .best_params\n",
"# remplacer alpha=32 par la valeur optimale trouvée par cross validation dans la pipeline avec .best_params\n",
"# attention, dans scikit learn, l'hyperparamètre est C = 1/alpha, pas oublier de prendre l'inverse de ce C optimal\n",
"\n",
"result = model_logit.fit_regularized(method='l1', alpha = 10)\n",
"result = model_logit.fit_regularized(method='l1', alpha = 32)\n",
"\n",
"print(result.pvalues)\n",
"print(result.summary())"
]
},
{
"cell_type": "markdown",
"id": "8c3dec50-7b9d-40f6-83b6-6cae26962cf8",
"metadata": {},
"source": [
"### Other method: take the class weights into account. Problem: with this method, no penalty is allowed"
]
},
{
"cell_type": "code",
"execution_count": 247,
"id": "2e3ca381-54e3-445b-bb37-d7ce953cb856",
"metadata": {},
"outputs": [],
"source": [
"# Helper that fits a (class-weighted) logistic regression and returns its summary.\n",
"# NOTE(review): this function reuses the name `model_logit`, which the regularized-fit cell\n",
"# earlier in the notebook uses as a model object (`model_logit.fit_regularized(...)`).\n",
"# Once this cell has run, that earlier cell can no longer be re-run; consider renaming one of them.\n",
"\n",
"def model_logit(X, y, weight_dict, add_constant=False):\n",
"    \"\"\"Fit a binomial GLM with per-class observation weights and return its summary.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    X : design matrix handed to statsmodels (presumably a pandas DataFrame; confirm).\n",
"    y : iterable of class labels; every label is looked up in `weight_dict`.\n",
"    weight_dict : mapping from class label to the weight of each observation of that class.\n",
"    add_constant : if True, an intercept column is added to `X` with `sm.add_constant`;\n",
"        otherwise `X` is used unchanged.\n",
"\n",
"    Returns\n",
"    -------\n",
"    The statsmodels summary object of the fitted model. The fit applies no penalty.\n",
"\n",
"    Raises\n",
"    ------\n",
"    KeyError if `weight_dict` is a dict and a label of `y` is missing from it.\n",
"\n",
"    Warns\n",
"    -----\n",
"    RuntimeWarning if the fitted result reports that it did not converge.\n",
"    \"\"\"\n",
"    import warnings\n",
"\n",
"    # One weight per observation, looked up from the class of that observation\n",
"    sample_weights = np.array([weight_dict[class_] for class_ in y])\n",
"\n",
"    X_const = sm.add_constant(X) if add_constant else X\n",
"\n",
"    # GLM with a Binomial family is used here as the logistic regression\n",
"    # NOTE(review): the class weights are passed as freq_weights, i.e. as observation counts;\n",
"    # confirm that the reported standard errors and p-values are meant to be read that way.\n",
"    model = sm.GLM(y, X_const, family=sm.families.Binomial(), freq_weights=sample_weights)\n",
"\n",
"    # fit without penalty\n",
"    result = model.fit()\n",
"\n",
"    # The weighted run stored in this notebook stopped at 100 iterations with no visible\n",
"    # warning, so make a non-converged fit explicit instead of returning it silently\n",
"    if not getattr(result, \"converged\", True):\n",
"        warnings.warn(\n",
"            \"model_logit: GLM fit did not converge; coefficients and p-values may be unreliable\",\n",
"            RuntimeWarning,\n",
"            stacklevel=2,\n",
"        )\n",
"\n",
"    return result.summary()"
]
},
{
"cell_type": "code",
"execution_count": 248,
"id": "4cd424a0-7c55-47ff-840e-1354e8dcf863",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Generalized Linear Model Regression Results \n",
"==============================================================================\n",
"Dep. Variable: y No. Observations: 354365\n",
"Model: GLM Df Residuals: 354350\n",
"Model Family: Binomial Df Model: 14\n",
"Link Function: Logit Scale: 1.0000\n",
"Method: IRLS Log-Likelihood: -1.8693e+05\n",
"Date: Thu, 21 Mar 2024 Deviance: 3.7387e+05\n",
"Time: 13:19:33 Pearson chi2: 1.97e+16\n",
"No. Iterations: 100 Pseudo R-squ. (CS): 0.2820\n",
"Covariance Type: nonrobust \n",
"=======================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"---------------------------------------------------------------------------------------\n",
"const -1.3943 0.062 -22.456 0.000 -1.516 -1.273\n",
"nb_tickets -0.3312 0.016 -20.967 0.000 -0.362 -0.300\n",
"nb_purchases 0.9258 0.098 9.491 0.000 0.735 1.117\n",
"total_amount 0.8922 0.042 21.393 0.000 0.810 0.974\n",
"nb_suppliers 0.2238 0.007 32.137 0.000 0.210 0.237\n",
"vente_internet_max -0.7453 0.007 -100.473 0.000 -0.760 -0.731\n",
"purchase_date_min 0.7123 0.015 46.063 0.000 0.682 0.743\n",
"purchase_date_max -1.3328 0.017 -79.297 0.000 -1.366 -1.300\n",
"nb_tickets_internet 0.1784 0.011 16.366 0.000 0.157 0.200\n",
"is_email_true 0.8635 0.061 14.086 0.000 0.743 0.984\n",
"opt_in -1.7487 0.010 -174.737 0.000 -1.768 -1.729\n",
"gender_female 0.8084 0.013 60.803 0.000 0.782 0.835\n",
"gender_male 0.8731 0.014 64.332 0.000 0.846 0.900\n",
"nb_campaigns 0.1751 0.006 31.101 0.000 0.164 0.186\n",
"nb_campaigns_opened 0.2962 0.005 54.145 0.000 0.285 0.307\n",
"=======================================================================================\n"
]
}
],
"source": [
"# Fit through the model_logit helper\n",
"\n",
"# 1. Logit with class weights: X is passed unchanged (add_constant keeps its default, False)\n",
"results_logit_weight = model_logit(X, y, weight_dict=weight_dict)\n",
"print(results_logit_weight)"
]
},
{
"cell_type": "code",
"execution_count": 252,
"id": "84dd6242-a9c3-4dee-a58b-abc5f1c6f8fa",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Generalized Linear Model Regression Results \n",
"==============================================================================\n",
"Dep. Variable: y No. Observations: 354365\n",
"Model: GLM Df Residuals: 354350\n",
"Model Family: Binomial Df Model: 14\n",
"Link Function: Logit Scale: 1.0000\n",
"Method: IRLS Log-Likelihood: -83141.\n",
"Date: Thu, 21 Mar 2024 Deviance: 1.6628e+05\n",
"Time: 13:20:06 Pearson chi2: 4.52e+15\n",
"No. Iterations: 8 Pseudo R-squ. (CS): 0.1180\n",
"Covariance Type: nonrobust \n",
"=======================================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"---------------------------------------------------------------------------------------\n",
"const -3.6025 0.091 -39.755 0.000 -3.780 -3.425\n",
"nb_tickets -0.0230 0.010 -2.191 0.028 -0.044 -0.002\n",
"nb_purchases -0.0519 0.014 -3.609 0.000 -0.080 -0.024\n",
"total_amount 0.0799 0.021 3.841 0.000 0.039 0.121\n",
"nb_suppliers 0.1694 0.010 17.662 0.000 0.151 0.188\n",
"vente_internet_max -0.8764 0.011 -82.965 0.000 -0.897 -0.856\n",
"purchase_date_min 0.5881 0.015 39.936 0.000 0.559 0.617\n",
"purchase_date_max -1.4197 0.016 -89.592 0.000 -1.451 -1.389\n",
"nb_tickets_internet 0.2895 0.013 22.652 0.000 0.264 0.315\n",
"is_email_true 0.8651 0.088 9.797 0.000 0.692 1.038\n",
"opt_in -1.9976 0.019 -107.305 0.000 -2.034 -1.961\n",
"gender_female 0.7032 0.024 29.395 0.000 0.656 0.750\n",
"gender_male 0.8071 0.024 33.201 0.000 0.759 0.855\n",
"nb_campaigns 0.2850 0.009 30.633 0.000 0.267 0.303\n",
"nb_campaigns_opened 0.2061 0.007 28.245 0.000 0.192 0.220\n",
"=======================================================================================\n"
]
}
],
"source": [
"# 2. Logit without weights: both classes get weight 1\n",
"# The existing `const` column is dropped and rebuilt through add_constant=True\n",
"\n",
"results_logit = model_logit(\n",
"    X.drop(columns=\"const\"),\n",
"    y,\n",
"    weight_dict={0: 1, 1: 1},\n",
"    add_constant=True,\n",
")\n",
"print(results_logit)"
]
},
{
"cell_type": "markdown",
"id": "36c5e770-72b3-4482-ad61-45b511a11f06",