diff --git a/Sport/Modelization/2_Modelization_sport.ipynb b/Sport/Modelization/2_Modelization_sport.ipynb index f335ac3..b3ff399 100644 --- a/Sport/Modelization/2_Modelization_sport.ipynb +++ b/Sport/Modelization/2_Modelization_sport.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 1, "id": "f271eb45-1470-4764-8c2e-31374efa1fe5", "metadata": {}, "outputs": [], @@ -44,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 2, "id": "3fecb606-22e5-4dee-8efa-f8dff0832299", "metadata": {}, "outputs": [], @@ -64,7 +64,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 3, "id": "59dd4694-a812-4923-b995-a2ee86c74f85", "metadata": {}, "outputs": [], @@ -76,39 +76,109 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 4, "id": "017f7e9a-3ba0-40fa-bdc8-51b98cc1fdb3", "metadata": {}, "outputs": [], "source": [ "def load_train_test():\n", " BUCKET = \"projet-bdc2324-team1/Generalization/sport\"\n", - " File_path_train = BUCKET + \"/Train_set/\" + \"dataset_train5.csv\"\n", - " File_path_test = BUCKET + \"/Test_set/\" + \"dataset_test5.csv\"\n", + " File_path_train = BUCKET + \"/Train_set.csv\"\n", + " File_path_test = BUCKET + \"/Test_set.csv\"\n", " \n", " with fs.open( File_path_train, mode=\"rb\") as file_in:\n", " dataset_train = pd.read_csv(file_in, sep=\",\")\n", - " dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)\n", + " # dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)\n", "\n", " with fs.open(File_path_test, mode=\"rb\") as file_in:\n", " dataset_test = pd.read_csv(file_in, sep=\",\")\n", - " dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)\n", + " # dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)\n", " \n", " return dataset_train, dataset_test" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 5, + "id": "c479b230-b4bd-4cfb-b76b-d9faf6d95772", + "metadata": {}, + "outputs": [], + "source": [ + "dataset_train, dataset_test = load_train_test()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "c24c446d-4e1c-4ac1-a048-f0b8d8559f36", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "customer_id 0\n", + "nb_tickets 0\n", + "nb_purchases 0\n", + "total_amount 0\n", + "nb_suppliers 0\n", + "vente_internet_max 0\n", + "purchase_date_min 0\n", + "purchase_date_max 0\n", + "time_between_purchase 0\n", + "nb_tickets_internet 0\n", + "street_id 0\n", + "structure_id 222825\n", + "mcp_contact_id 70874\n", + "fidelity 0\n", + "tenant_id 0\n", + "is_partner 0\n", + "deleted_at 224213\n", + "gender 0\n", + "is_email_true 0\n", + "opt_in 0\n", + "last_buying_date 66139\n", + "max_price 66139\n", + "ticket_sum 0\n", + "average_price 66023\n", + "average_purchase_delay 66139\n", + "average_price_basket 66139\n", + "average_ticket_basket 66139\n", + "total_price 116\n", + "purchase_count 0\n", + "first_buying_date 66139\n", + "country 23159\n", + "gender_label 0\n", + "gender_female 0\n", + "gender_male 0\n", + "gender_other 0\n", + "country_fr 23159\n", + "nb_campaigns 0\n", + "nb_campaigns_opened 0\n", + "time_to_open 123159\n", + "y_has_purchased 0\n", + "dtype: int64" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset_train.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, "id": "825d14a3-6967-4733-bfd4-64bf61c2bd43", "metadata": {}, "outputs": [], "source": [ "def features_target_split(dataset_train, dataset_test):\n", - " features_l = ['nb_tickets', 'nb_purchases', 'total_amount',\n", - " 'nb_suppliers', 'nb_tickets_internet',\n", - " 'opt_in',\n", - " 'nb_campaigns', 'nb_campaigns_opened']\n", + " features_l = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max', \n", + " 'time_between_purchase', 'nb_tickets_internet', 'fidelity', 'is_email_true', 'opt_in', #'is_partner',\n", + " 'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']\n", " X_train = dataset_train[features_l]\n", " y_train = dataset_train[['y_has_purchased']]\n", "\n", @@ -119,17 +189,7 @@ }, { "cell_type": "code", - "execution_count": 27, - "id": "c479b230-b4bd-4cfb-b76b-d9faf6d95772", - "metadata": {}, - "outputs": [], - "source": [ - "dataset_train, dataset_test = load_train_test()" - ] - }, - { - "cell_type": "code", - "execution_count": 28, + "execution_count": 8, "id": "69eaec12-b30f-4d30-a461-ea520d5cbf77", "metadata": {}, "outputs": [], @@ -139,7 +199,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 9, "id": "d039f31d-0093-46c6-9743-ddec1381f758", "metadata": {}, "outputs": [ @@ -147,8 +207,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Shape train : (330117, 8)\n", - "Shape test : (141480, 8)\n" + "Shape train : (224213, 17)\n", + "Shape test : (96096, 17)\n" ] } ], @@ -167,17 +227,17 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 10, "id": "b808da43-c444-4e94-995a-7ec6ccd01e2d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{0.0: 0.5381774965030861, 1.0: 7.048360235716116}" + "{0.0: 0.5837086520288036, 1.0: 3.486549107420539}" ] }, - "execution_count": 30, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -193,13 +253,14 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 11, "id": "b32a79ea-907f-4dfc-9832-6c74bef3200c", "metadata": {}, "outputs": [], "source": [ - "numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',\n", - " 'nb_tickets_internet', 'nb_campaigns', 'nb_campaigns_opened']\n", + "numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers', 'vente_internet_max', 'purchase_date_min', 'purchase_date_max', \n", + " 'time_between_purchase', 'nb_tickets_internet', 'fidelity', 'is_email_true', 'opt_in', #'is_partner',\n", + " 'gender_female', 'gender_male', 'gender_other', 'nb_campaigns', 'nb_campaigns_opened']\n", "\n", "numeric_transformer = Pipeline(steps=[\n", " #(\"imputer\", SimpleImputer(strategy=\"mean\")), \n", @@ -224,20 +285,19 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 12, "id": "9809a688-bfbc-4685-a77f-17a8b2b79ab3", "metadata": {}, "outputs": [], "source": [ "# Set loss\n", - "\n", "balanced_scorer = make_scorer(balanced_accuracy_score)\n", "recall_scorer = make_scorer(recall_score)" ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 13, "id": "4f9b2bbf-5f8a-4ac1-8e6c-51bd0dd8ac85", "metadata": {}, "outputs": [], @@ -273,13 +333,47 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 14, + "id": "cf400c70-0192-42cc-9919-f61bae8382b0", + "metadata": {}, + "outputs": [], + "source": [ + "def draw_features_importance(pipeline, model):\n", + " coefficients = pipeline.named_steps['logreg'].coef_[0]\n", + " feature_names = pipeline.named_steps['logreg'].feature_names_in_\n", + " \n", + " # Tracer l'importance des caractéristiques\n", + " plt.figure(figsize=(10, 6))\n", + " plt.barh(feature_names, coefficients, color='skyblue')\n", + " plt.xlabel('Importance des caractéristiques')\n", + " plt.ylabel('Caractéristiques')\n", + " plt.title('Importance des caractéristiques dans le modèle de régression logistique')\n", + " plt.grid(True)\n", + " plt.show()\n", + "\n", + "def draw_prob_distribution(X_test):\n", + " y_pred_prob = pipeline.predict_proba(X_test)[:, 1]\n", + " plt.figure(figsize=(8, 6))\n", + " plt.hist(y_pred_prob, bins=10, range=(0, 1), color='blue', alpha=0.7)\n", + " \n", + " plt.xlim(0, 1)\n", + " plt.ylim(0, None)\n", + " \n", + " plt.title('Histogramme des probabilités pour la classe 1')\n", + " plt.xlabel('Probabilité')\n", + " plt.ylabel('Fréquence')\n", + " plt.grid(True)\n", + " plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, "id": "206d9a95-7c37-4506-949b-e77d225e42c5", "metadata": {}, "outputs": [], "source": [ "# Hyperparameter\n", - "\n", "param_grid = {'logreg__C': np.logspace(-10, 6, 17, base=2),\n", " 'logreg__penalty': ['l1', 'l2'],\n", " 'logreg__class_weight': ['balanced', weight_dict]} " @@ -287,14 +381,14 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 16, "id": "7ff2f7bd-efc1-4f7c-a3c9-caa916aa2f2b", "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
Pipeline(steps=[('preprocessor',\n",
+       "
Pipeline(steps=[('preprocessor',\n",
        "                 ColumnTransformer(transformers=[('num',\n",
        "                                                  Pipeline(steps=[('scaler',\n",
        "                                                                   StandardScaler())]),\n",
        "                                                  ['nb_tickets', 'nb_purchases',\n",
        "                                                   'total_amount',\n",
        "                                                   'nb_suppliers',\n",
+       "                                                   'vente_internet_max',\n",
+       "                                                   'purchase_date_min',\n",
+       "                                                   'purchase_date_max',\n",
+       "                                                   'time_between_purchase',\n",
        "                                                   'nb_tickets_internet',\n",
+       "                                                   'fidelity', 'is_email_true',\n",
+       "                                                   'opt_in', 'gender_female',\n",
+       "                                                   'gender_male',\n",
+       "                                                   'gender_other',\n",
        "                                                   'nb_campaigns',\n",
        "                                                   'nb_campaigns_opened']),\n",
        "                                                 ('cat',\n",
@@ -714,16 +816,24 @@
        "                                                                                 sparse_output=False))]),\n",
        "                                                  ['opt_in'])])),\n",
        "                ('logreg',\n",
-       "                 LogisticRegression(class_weight={0.0: 0.5381774965030861,\n",
-       "                                                  1.0: 7.048360235716116},\n",
-       "                                    max_iter=5000, solver='saga'))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.