diff --git a/Spectacle/2_bis_logit_baseline_statsmodels.ipynb b/Spectacle/2_bis_logit_baseline_statsmodels.ipynb index a240fee..b7d337e 100644 --- a/Spectacle/2_bis_logit_baseline_statsmodels.ipynb +++ b/Spectacle/2_bis_logit_baseline_statsmodels.ipynb @@ -2280,7 +2280,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 48, "id": "13cc3362-7bb2-46fa-8bd8-e5a8e53260b8", "metadata": {}, "outputs": [ @@ -2289,53 +2289,53 @@ "output_type": "stream", "text": [ "Optimization terminated successfully (Exit mode 0)\n", - " Current function value: 0.2349337504783076\n", - " Iterations: 268\n", - " Function evaluations: 269\n", - " Gradient evaluations: 268\n", + " Current function value: 0.23562928627877766\n", + " Iterations: 240\n", + " Function evaluations: 243\n", + " Gradient evaluations: 240\n", "const 0.000000e+00\n", - "nb_tickets 6.284219e-02\n", - "nb_purchases 5.681771e-04\n", - "total_amount 2.423593e-04\n", - "nb_suppliers 1.972437e-68\n", + "nb_tickets 2.477006e-01\n", + "nb_purchases 1.636902e-03\n", + "total_amount 8.839088e-04\n", + "nb_suppliers 1.906550e-65\n", "vente_internet_max 0.000000e+00\n", "purchase_date_min 0.000000e+00\n", "purchase_date_max 0.000000e+00\n", - "nb_tickets_internet 4.072862e-113\n", - "is_email_true 3.163634e-17\n", + "nb_tickets_internet 7.232680e-112\n", + "is_email_true 8.202187e-08\n", "opt_in 0.000000e+00\n", - "gender_female 9.730017e-184\n", - "gender_male 8.018545e-235\n", - "nb_campaigns 9.847753e-206\n", - "nb_campaigns_opened 8.789998e-176\n", + "gender_female 1.624424e-170\n", + "gender_male 4.961315e-220\n", + "nb_campaigns 6.276733e-205\n", + "nb_campaigns_opened 2.228531e-176\n", "dtype: float64\n", " Logit Regression Results \n", "==============================================================================\n", "Dep. Variable: y No. 
Observations: 354365\n", "Model: Logit Df Residuals: 354350\n", "Method: MLE Df Model: 14\n", - "Date: Thu, 21 Mar 2024 Pseudo R-squ.: 0.2112\n", - "Time: 08:15:18 Log-Likelihood: -83136.\n", + "Date: Thu, 21 Mar 2024 Pseudo R-squ.: 0.2111\n", + "Time: 10:45:37 Log-Likelihood: -83152.\n", "converged: True LL-Null: -1.0540e+05\n", "Covariance Type: nonrobust LLR p-value: 0.000\n", "=======================================================================================\n", " coef std err z P>|z| [0.025 0.975]\n", "---------------------------------------------------------------------------------------\n", - "const -3.4386 0.087 -39.483 0.000 -3.609 -3.268\n", - "nb_tickets -0.0198 0.011 -1.860 0.063 -0.041 0.001\n", - "nb_purchases -0.0471 0.014 -3.446 0.001 -0.074 -0.020\n", - "total_amount 0.0724 0.020 3.670 0.000 0.034 0.111\n", - "nb_suppliers 0.1676 0.010 17.482 0.000 0.149 0.186\n", - "vente_internet_max -0.8729 0.011 -82.724 0.000 -0.894 -0.852\n", - "purchase_date_min 0.5852 0.015 39.762 0.000 0.556 0.614\n", - "purchase_date_max -1.4162 0.016 -89.437 0.000 -1.447 -1.385\n", - "nb_tickets_internet 0.2884 0.013 22.603 0.000 0.263 0.313\n", - "is_email_true 0.7152 0.085 8.440 0.000 0.549 0.881\n", - "opt_in -1.9927 0.019 -107.165 0.000 -2.029 -1.956\n", - "gender_female 0.6880 0.024 28.907 0.000 0.641 0.735\n", - "gender_male 0.7914 0.024 32.720 0.000 0.744 0.839\n", - "nb_campaigns 0.2845 0.009 30.607 0.000 0.266 0.303\n", - "nb_campaigns_opened 0.2061 0.007 28.267 0.000 0.192 0.220\n", + "const -3.1162 0.081 -38.383 0.000 -3.275 -2.957\n", + "nb_tickets -0.0136 0.012 -1.156 0.248 -0.037 0.009\n", + "nb_purchases -0.0385 0.012 -3.149 0.002 -0.063 -0.015\n", + "total_amount 0.0588 0.018 3.325 0.001 0.024 0.094\n", + "nb_suppliers 0.1638 0.010 17.085 0.000 0.145 0.183\n", + "vente_internet_max -0.8651 0.011 -82.182 0.000 -0.886 -0.844\n", + "purchase_date_min 0.5790 0.015 39.391 0.000 0.550 0.608\n", + "purchase_date_max -1.4088 0.016 -89.101 0.000 -1.440 
# No need to redefine the model: call fit_regularized on the existing one.
#
# alpha is the weight of the L1 penalty. Replace alpha=32 with the optimal
# value found by cross-validation in the scikit-learn pipeline (.best_params).
# Careful: scikit-learn's hyperparameter is C = 1/alpha, so remember to take
# the inverse of the optimal C.
# NOTE(review): the original comment stated that, when alpha is not specified,
# the optimal alpha is determined by cross-validation. statsmodels documents a
# default of alpha=0 for fit_regularized -- confirm before relying on it.

result = model_logit.fit_regularized(method="l1", alpha=32)

# p-values first, then the full regression table, one after the other.
print(result.pvalues, result.summary(), sep="\n")
# Helper that fits a class-weighted logistic regression and returns its summary.
#
# NOTE(review): this `def` rebinds the notebook-level name `model_logit`, which
# an earlier cell uses as a model object (`model_logit.fit_regularized(...)`).
# After this cell has run, re-running that earlier cell will fail unless the
# model is rebuilt first -- consider renaming one of the two.

def model_logit(X, y, weight_dict, add_constant=False, maxiter=100):
    """Fit an unpenalised, class-weighted logit via GLM and return its summary.

    Parameters
    ----------
    X : array-like or DataFrame
        Design matrix. It is used as-is unless ``add_constant`` is True.
    y : array-like, Series or single-column DataFrame
        Class label of each observation. Every label must be a key of
        ``weight_dict``.
    weight_dict : dict
        Maps each class label to the weight given to observations of that class.
    add_constant : bool, default False
        If True, ``sm.add_constant`` is applied to ``X`` before fitting.
    maxiter : int, default 100
        Maximum number of fitting iterations, forwarded to ``GLM.fit``. The
        default equals the statsmodels default, so existing calls are unchanged.

    Returns
    -------
    The object returned by ``result.summary()`` for the fitted GLM.

    Raises
    ------
    KeyError
        If a label found in ``y`` is missing from ``weight_dict``.

    Notes
    -----
    The weights are passed as ``freq_weights``, which statsmodels documents as
    frequency weights (each row counted ``w`` times). With class-balancing
    weights the reported standard errors and p-values therefore presumably
    reflect a re-weighted sample size -- interpret them with care.
    """
    import warnings  # local import: the notebook's import cell is not visible here

    # Flatten y before iterating so that Series, arrays and single-column
    # DataFrames all yield label *values* (iterating a DataFrame directly
    # yields its column names instead).
    labels = np.asarray(y).ravel().tolist()
    sample_weights = np.array([weight_dict[label] for label in labels])

    design = sm.add_constant(X) if add_constant else X

    # GLM with a Binomial family (default logit link) is a logistic regression
    # that accepts per-observation weights.
    model = sm.GLM(y, design, family=sm.families.Binomial(), freq_weights=sample_weights)

    # Plain fit: no penalty is applied with this approach.
    result = model.fit(maxiter=maxiter)

    # The saved output of the weighted fit reports "No. Iterations: 100", i.e.
    # the default cap was reached; surface that instead of returning silently.
    if not getattr(result, "converged", True):
        warnings.warn(
            f"GLM fit did not converge within maxiter={maxiter}; "
            "coefficients and standard errors may be unreliable.",
            RuntimeWarning,
            stacklevel=2,
        )

    return result.summary()
# Using the helper function.

# 1. Logit with class weights.
# X is passed unchanged (add_constant stays at its default, False); it appears
# to already carry a "const" column, since the following cell drops one.
results_logit_weight = model_logit(
    X,
    y,
    weight_dict=weight_dict,
    add_constant=False,
)
print(results_logit_weight)
# 2. Logit without weights.
# Both classes get weight 1, so the fit is effectively unweighted. The existing
# "const" column is dropped and the helper adds the constant back itself.
results_logit = model_logit(
    X.drop(columns="const"),
    y,
    weight_dict=dict.fromkeys((0, 1), 1),
    add_constant=True,
)
print(results_logit)