From 58c7cac17f35421186affc8b66da7dbdac14760f Mon Sep 17 00:00:00 2001
From: arevelle-ensae
Date: Sun, 10 Mar 2024 10:09:53 +0000
Subject: [PATCH] work on pipeline

---
 Sport/Modelization/2_Modelization_sport.ipynb | 203 ++++++++++++++++--
 1 file changed, 184 insertions(+), 19 deletions(-)

diff --git a/Sport/Modelization/2_Modelization_sport.ipynb b/Sport/Modelization/2_Modelization_sport.ipynb
index ece151c..a3d0476 100644
--- a/Sport/Modelization/2_Modelization_sport.ipynb
+++ b/Sport/Modelization/2_Modelization_sport.ipynb
@@ -10,7 +10,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 2,
+ "execution_count": 106,
 "id": "f271eb45-1470-4764-8c2e-31374efa1fe5",
 "metadata": {},
 "outputs": [],
 "source": [
@@ -23,6 +23,7 @@
 "from sklearn.linear_model import LogisticRegression\n",
 "from sklearn.ensemble import RandomForestClassifier\n",
 "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n",
+ "from sklearn.utils import class_weight\n",
 "from sklearn.neighbors import KNeighborsClassifier\n",
 "from sklearn.pipeline import Pipeline\n",
 "from sklearn.compose import ColumnTransformer\n",
@@ -34,10 +35,25 @@
 "import seaborn as sns\n",
 "import matplotlib.pyplot as plt\n",
 "from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score\n",
+ "from sklearn.exceptions import ConvergenceWarning, DataConversionWarning\n",
+ "\n",
 "import pickle\n",
+ "import warnings\n",
 "#import scikitplot as skplt"
 ]
 },
+ {
+ "cell_type": "code",
+ "execution_count": 107,
+ "id": "3fecb606-22e5-4dee-8efa-f8dff0832299",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "warnings.filterwarnings('ignore')\n",
+ "warnings.filterwarnings(\"ignore\", category=ConvergenceWarning)\n",
+ "warnings.filterwarnings(\"ignore\", category=DataConversionWarning)"
+ ]
+ },
 {
 "cell_type": "markdown",
 "id": "ae591854-3003-4c75-a0c7-5abf04246e81",
 "metadata": {},
@@ -48,7 +64,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": null,
+ "execution_count": 108,
 "id": "59dd4694-a812-4923-b995-a2ee86c74f85",
 "metadata": {},
 "outputs": [],
 "source": [
@@ -60,21 +76,21 @@
 },
 {
 "cell_type": "code",
- "execution_count": 3,
+ "execution_count": 109,
 "id": "017f7e9a-3ba0-40fa-bdc8-51b98cc1fdb3",
 "metadata": {},
 "outputs": [],
 "source": [
 "def load_train_test():\n",
- "    BUCKET = \"projet-bdc2324-team1/Generalization/sport/\"\n",
- "    File_path_train = BUCKET + \"/\" + \"dataset_train.csv\"\n",
- "    File_path_test = BUCKET + \"/\" + \"dataset_train.csv\"\n",
+ "    BUCKET = \"projet-bdc2324-team1/Generalization/sport\"\n",
+ "    File_path_train = BUCKET + \"/\" + \"Train_set.csv\"\n",
+ "    File_path_test = BUCKET + \"/\" + \"Test_set.csv\"\n",
 "    \n",
- "    with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
+ "    with fs.open(File_path_train, mode=\"rb\") as file_in:\n",
 "        dataset_train = pd.read_csv(file_in, sep=\",\")\n",
 "        dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)\n",
 "\n",
- "    with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
+ "    with fs.open(File_path_test, mode=\"rb\") as file_in:\n",
 "        dataset_test = pd.read_csv(file_in, sep=\",\")\n",
 "        dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)\n",
 "    \n",
@@ -83,20 +99,44 @@
 },
 {
 "cell_type": "code",
- "execution_count": null,
+ "execution_count": 110,
 "id": "825d14a3-6967-4733-bfd4-64bf61c2bd43",
 "metadata": {},
 "outputs": [],
 "source": [
 "def features_target_split(dataset_train, dataset_test):\n",
- "    X_train = dataset_train[]\n",
- "    y_train = dataset_train['y_has_purchased']\n",
+ "    features_l = ['nb_tickets', 'nb_purchases', 'total_amount',\n",
+ "                  'nb_suppliers', 'nb_tickets_internet',\n",
+ "                  'opt_in',\n",
+ "                  'nb_campaigns', 'nb_campaigns_opened']\n",
+ "    X_train = dataset_train[features_l]\n",
+ "    y_train = dataset_train[['y_has_purchased']]\n",
 "\n",
- "    X_test = dataset_test[]\n",
- "    y_test = dataset_test['y_has_purchased']\n",
+ "    X_test = dataset_test[features_l]\n",
+ "    y_test = dataset_test[['y_has_purchased']]\n",
 "    return X_train, X_test, y_train, y_test"
 ]
 },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c479b230-b4bd-4cfb-b76b-d9faf6d95772",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dataset_train, dataset_test = load_train_test()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "69eaec12-b30f-4d30-a461-ea520d5cbf77",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)"
+ ]
+ },
 {
 "cell_type": "markdown",
 "id": "a1d6de94-4e11-481a-a0ce-412bf29f692c",
@@ -105,6 +145,21 @@
 "metadata": {},
 "source": [
 "### Prepare preprocessing and Hyperparameters"
 ]
 },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b808da43-c444-4e94-995a-7ec6ccd01e2d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Compute Weights\n",
+ "weights = class_weight.compute_class_weight(class_weight='balanced', classes=np.unique(y_train['y_has_purchased']),\n",
+ "                                            y=y_train['y_has_purchased'])\n",
+ "\n",
+ "weight_dict = {np.unique(y_train['y_has_purchased'])[i]: weights[i] for i in range(len(np.unique(y_train['y_has_purchased'])))}\n",
+ "weight_dict"
+ ]
+ },
 {
 "cell_type": "code",
 "execution_count": null,
@@ -113,13 +168,27 @@
 "outputs": [],
 "source": [
 "numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',\n",
- "                    'nb_tickets_internet', 'fidelity', 'nb_campaigns', 'nb_campaigns_opened']\n",
+ "                    'nb_tickets_internet', 'nb_campaigns', 'nb_campaigns_opened']\n",
 "\n",
 "numeric_transformer = Pipeline(steps=[\n",
- "    # (\"imputer\", SimpleImputer(strategy=\"mean\")), # NaN replaced by the mean, though it hardly matters since we dropped the missing values\n",
- "    (\"scaler\", StandardScaler())])\n",
+ "    #(\"imputer\", SimpleImputer(strategy=\"mean\")),\n",
+ "    (\"scaler\", StandardScaler())\n",
+ "])\n",
 "\n",
- "preproc = ColumnTransformer(transformers=[(\"num\", numeric_transformer, numeric_features)])"
+ "categorical_features = ['opt_in']\n",
+ "\n",
+ "# Transformer for the categorical features\n",
+ "categorical_transformer = Pipeline(steps=[\n",
+ "    #(\"imputer\", SimpleImputer(strategy=\"most_frequent\")), # Impute missing values with the most frequent\n",
+ "    (\"onehot\", OneHotEncoder(handle_unknown='ignore', sparse_output=False))\n",
+ "])\n",
+ "\n",
+ "preproc = ColumnTransformer(\n",
+ "    transformers=[\n",
+ "        (\"num\", numeric_transformer, numeric_features),\n",
+ "        (\"cat\", categorical_transformer, categorical_features)\n",
+ "    ]\n",
+ ")"
 ]
 },
 {
 "cell_type": "code",
 "execution_count": null,
@@ -144,9 +213,105 @@
 "source": [
 "# Hyperparameter\n",
 "\n",
- "parameters4 = {'logreg__C': np.logspace(-10, 6, 17, base=2),\n",
- "               'logreg__class_weight': ['balanced']} "
+ "param_grid = {'logreg__C': np.logspace(-10, 6, 17, base=2),\n",
+ "              'logreg__penalty': ['l2', 'l1'],\n",
+ "              'logreg__class_weight': ['balanced', weight_dict]} "
 ]
 },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7ff2f7bd-efc1-4f7c-a3c9-caa916aa2f2b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Pipeline\n",
+ "\n",
+ "pipeline = Pipeline(steps=[\n",
+ "    ('preprocessor', preproc),\n",
+ "    ('logreg', LogisticRegression(solver='saga', max_iter=1000))\n",
+ "])\n",
+ "\n",
+ "pipeline.set_output(transform=\"pandas\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2b467511-2ae5-4a16-a502-397c3460471d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pipeline.fit(X_train, y_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6356e870-0dfc-4e60-9e48-e2de5e7f9f87",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "y_pred = pipeline.predict(X_test)\n",
+ "\n",
+ "# Calculate the F1 score\n",
+ "f1 = f1_score(y_test, y_pred)\n",
+ "print(f\"F1 Score: {f1}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "09387a09-0d53-4c54-baac-f3c2a57a629a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "conf_matrix = confusion_matrix(y_test, y_pred)\n",
+ "sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])\n",
+ "plt.xlabel('Predicted')\n",
+ "plt.ylabel('Actual')\n",
+ "plt.title('Confusion Matrix')\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ae8e9bd3-0f6a-4f82-bb4c-470cbdc8d6bb",
+ "metadata": {},
+ "source": [
+ "## Cross Validation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f7fca463-d7d6-493b-8329-fdfa92457f78",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Cross validation\n",
+ "y_train = y_train['y_has_purchased']\n",
+ "grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring=f1_scorer, error_score='raise',\n",
+ "                           n_jobs=-1)\n",
+ "\n",
+ "grid_search.fit(X_train, y_train)\n",
+ "\n",
+ "# Print the best parameters and the best score\n",
+ "print(\"Best parameters found: \", grid_search.best_params_)\n",
+ "print(\"Best cross-validation score: {:.2f}\".format(grid_search.best_score_))\n",
+ "\n",
+ "# Evaluate the best model on the test set\n",
+ "test_score = grid_search.score(X_test, y_test)\n",
+ "print(\"Test set score: {:.2f}\".format(test_score))"
+ ]
+ },
 {
 "cell_type": "code",
 "execution_count": null,
 "id": "56bd7828-4de1-4166-bea0-5d5e152b9d38",
 "metadata": {},
 "outputs": [],
 "source": []
 }
 ],
 "metadata": {
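
Note: the added cells call f1_score and pass scoring=f1_scorer to GridSearchCV, but neither the import nor the scorer definition appears in the hunks above; they presumably live in notebook cells outside this diff. A minimal sketch of the assumed definitions (the name f1_scorer and both imports are assumptions, not part of this patch):

    # Assumed to exist elsewhere in the notebook (hypothetical reconstruction):
    from sklearn.metrics import f1_score, make_scorer
    from sklearn.model_selection import GridSearchCV

    # make_scorer wraps f1_score so GridSearchCV maximizes F1 across folds
    f1_scorer = make_scorer(f1_score)

With these in place, the grid search tunes C, penalty, and class_weight against F1. Note that the 'l1' penalty is only valid with a solver that supports it, which is presumably why the pipeline pins solver='saga'.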