work on pipeline
This commit is contained in:
parent
198ef45247
commit
58c7cac17f
|
@ -10,7 +10,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 2,
|
"execution_count": 106,
|
||||||
"id": "f271eb45-1470-4764-8c2e-31374efa1fe5",
|
"id": "f271eb45-1470-4764-8c2e-31374efa1fe5",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
|
@ -23,6 +23,7 @@
|
||||||
"from sklearn.linear_model import LogisticRegression\n",
|
"from sklearn.linear_model import LogisticRegression\n",
|
||||||
"from sklearn.ensemble import RandomForestClassifier\n",
|
"from sklearn.ensemble import RandomForestClassifier\n",
|
||||||
"from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n",
|
"from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\n",
|
||||||
|
"from sklearn.utils import class_weight\n",
|
||||||
"from sklearn.neighbors import KNeighborsClassifier\n",
|
"from sklearn.neighbors import KNeighborsClassifier\n",
|
||||||
"from sklearn.pipeline import Pipeline\n",
|
"from sklearn.pipeline import Pipeline\n",
|
||||||
"from sklearn.compose import ColumnTransformer\n",
|
"from sklearn.compose import ColumnTransformer\n",
|
||||||
|
@ -34,10 +35,25 @@
|
||||||
"import seaborn as sns\n",
|
"import seaborn as sns\n",
|
||||||
"import matplotlib.pyplot as plt\n",
|
"import matplotlib.pyplot as plt\n",
|
||||||
"from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score\n",
|
"from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score\n",
|
||||||
|
"from sklearn.exceptions import ConvergenceWarning, DataConversionWarning\n",
|
||||||
|
"\n",
|
||||||
"import pickle\n",
|
"import pickle\n",
|
||||||
|
"import warnings\n",
|
||||||
"#import scikitplot as skplt"
|
"#import scikitplot as skplt"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 107,
|
||||||
|
"id": "3fecb606-22e5-4dee-8efa-f8dff0832299",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"warnings.filterwarnings('ignore')\n",
|
||||||
|
"warnings.filterwarnings(\"ignore\", category=ConvergenceWarning)\n",
|
||||||
|
"warnings.filterwarnings(\"ignore\", category=DataConversionWarning)"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "ae591854-3003-4c75-a0c7-5abf04246e81",
|
"id": "ae591854-3003-4c75-a0c7-5abf04246e81",
|
||||||
|
@ -48,7 +64,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 108,
|
||||||
"id": "59dd4694-a812-4923-b995-a2ee86c74f85",
|
"id": "59dd4694-a812-4923-b995-a2ee86c74f85",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
|
@ -60,21 +76,21 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 3,
|
"execution_count": 109,
|
||||||
"id": "017f7e9a-3ba0-40fa-bdc8-51b98cc1fdb3",
|
"id": "017f7e9a-3ba0-40fa-bdc8-51b98cc1fdb3",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def load_train_test():\n",
|
"def load_train_test():\n",
|
||||||
" BUCKET = \"projet-bdc2324-team1/Generalization/sport/\"\n",
|
" BUCKET = \"projet-bdc2324-team1/Generalization/sport\"\n",
|
||||||
" File_path_train = BUCKET + \"/\" + \"dataset_train.csv\"\n",
|
" File_path_train = BUCKET + \"/\" + \"Train_set.csv\"\n",
|
||||||
" File_path_test = BUCKET + \"/\" + \"dataset_train.csv\"\n",
|
" File_path_test = BUCKET + \"/\" + \"Test_set.csv\"\n",
|
||||||
" \n",
|
" \n",
|
||||||
" with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
" with fs.open( File_path_train, mode=\"rb\") as file_in:\n",
|
||||||
" dataset_train = pd.read_csv(file_in, sep=\",\")\n",
|
" dataset_train = pd.read_csv(file_in, sep=\",\")\n",
|
||||||
" dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)\n",
|
" dataset_train['y_has_purchased'] = dataset_train['y_has_purchased'].fillna(0)\n",
|
||||||
"\n",
|
"\n",
|
||||||
" with fs.open(FILE_PATH_S3, mode=\"rb\") as file_in:\n",
|
" with fs.open(File_path_test, mode=\"rb\") as file_in:\n",
|
||||||
" dataset_test = pd.read_csv(file_in, sep=\",\")\n",
|
" dataset_test = pd.read_csv(file_in, sep=\",\")\n",
|
||||||
" dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)\n",
|
" dataset_test['y_has_purchased'] = dataset_test['y_has_purchased'].fillna(0)\n",
|
||||||
" \n",
|
" \n",
|
||||||
|
@ -83,20 +99,44 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 110,
|
||||||
"id": "825d14a3-6967-4733-bfd4-64bf61c2bd43",
|
"id": "825d14a3-6967-4733-bfd4-64bf61c2bd43",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def features_target_split(dataset_train, dataset_test):\n",
|
"def features_target_split(dataset_train, dataset_test):\n",
|
||||||
" X_train = dataset_train[]\n",
|
" features_l = ['nb_tickets', 'nb_purchases', 'total_amount',\n",
|
||||||
" y_train = dataset_train['y_has_purchased']\n",
|
" 'nb_suppliers', 'nb_tickets_internet',\n",
|
||||||
|
" 'opt_in',\n",
|
||||||
|
" 'nb_campaigns', 'nb_campaigns_opened']\n",
|
||||||
|
" X_train = dataset_train[features_l]\n",
|
||||||
|
" y_train = dataset_train[['y_has_purchased']]\n",
|
||||||
"\n",
|
"\n",
|
||||||
" X_test = dataset_test[]\n",
|
" X_test = dataset_test[features_l]\n",
|
||||||
" y_test = dataset_test['y_has_purchased']\n",
|
" y_test = dataset_test[['y_has_purchased']]\n",
|
||||||
" return X_train, X_test, y_train, y_test"
|
" return X_train, X_test, y_train, y_test"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "c479b230-b4bd-4cfb-b76b-d9faf6d95772",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"dataset_train, dataset_test = load_train_test()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "69eaec12-b30f-4d30-a461-ea520d5cbf77",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"X_train, X_test, y_train, y_test = features_target_split(dataset_train, dataset_test)"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "a1d6de94-4e11-481a-a0ce-412bf29f692c",
|
"id": "a1d6de94-4e11-481a-a0ce-412bf29f692c",
|
||||||
|
@ -105,6 +145,21 @@
|
||||||
"### Prepare preprocessing and Hyperparameters"
|
"### Prepare preprocessing and Hyperparameters"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "b808da43-c444-4e94-995a-7ec6ccd01e2d",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Compute Weights\n",
|
||||||
|
"weights = class_weight.compute_class_weight(class_weight = 'balanced', classes = np.unique(y_train['y_has_purchased']),\n",
|
||||||
|
" y = y_train['y_has_purchased'])\n",
|
||||||
|
"\n",
|
||||||
|
"weight_dict = {np.unique(y_train['y_has_purchased'])[i]: weights[i] for i in range(len(np.unique(y_train['y_has_purchased'])))}\n",
|
||||||
|
"weight_dict"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
|
@ -113,13 +168,27 @@
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',\n",
|
"numeric_features = ['nb_tickets', 'nb_purchases', 'total_amount', 'nb_suppliers',\n",
|
||||||
" 'nb_tickets_internet', 'fidelity', 'nb_campaigns', 'nb_campaigns_opened']\n",
|
" 'nb_tickets_internet', 'nb_campaigns', 'nb_campaigns_opened']\n",
|
||||||
"\n",
|
"\n",
|
||||||
"numeric_transformer = Pipeline(steps=[\n",
|
"numeric_transformer = Pipeline(steps=[\n",
|
||||||
" # (\"imputer\", SimpleImputer(strategy=\"mean\")), # NaN remplacés par la moyenne, mais peu importe car on a supprimé les valeurs manquantes\n",
|
" #(\"imputer\", SimpleImputer(strategy=\"mean\")), \n",
|
||||||
" (\"scaler\", StandardScaler())])\n",
|
" (\"scaler\", StandardScaler()) \n",
|
||||||
|
"])\n",
|
||||||
"\n",
|
"\n",
|
||||||
"preproc = ColumnTransformer(transformers=[(\"num\", numeric_transformer, numeric_features)])"
|
"categorical_features = ['opt_in'] \n",
|
||||||
|
"\n",
|
||||||
|
"# Transformer for the categorical features\n",
|
||||||
|
"categorical_transformer = Pipeline(steps=[\n",
|
||||||
|
" #(\"imputer\", SimpleImputer(strategy=\"most_frequent\")), # Impute missing values with the most frequent\n",
|
||||||
|
" (\"onehot\", OneHotEncoder(handle_unknown='ignore', sparse_output=False))\n",
|
||||||
|
"])\n",
|
||||||
|
"\n",
|
||||||
|
"preproc = ColumnTransformer(\n",
|
||||||
|
" transformers=[\n",
|
||||||
|
" (\"num\", numeric_transformer, numeric_features),\n",
|
||||||
|
" (\"cat\", categorical_transformer, categorical_features)\n",
|
||||||
|
" ]\n",
|
||||||
|
")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -144,9 +213,105 @@
|
||||||
"source": [
|
"source": [
|
||||||
"# Hyperparameter\n",
|
"# Hyperparameter\n",
|
||||||
"\n",
|
"\n",
|
||||||
"parameters4 = {'logreg__C': np.logspace(-10, 6, 17, base=2),\n",
|
"param_grid = {'logreg__C': np.logspace(-10, 6, 17, base=2),\n",
|
||||||
" 'logreg__class_weight': ['balanced']} "
|
" 'logreg__penalty': ['l2', 'L1'],\n",
|
||||||
|
" 'logreg__class_weight': ['balanced', weight_dict]} "
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "7ff2f7bd-efc1-4f7c-a3c9-caa916aa2f2b",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Pipeline\n",
|
||||||
|
"\n",
|
||||||
|
"pipeline = Pipeline(steps=[\n",
|
||||||
|
" ('preprocessor', preproc),\n",
|
||||||
|
" ('logreg', LogisticRegression(solver='saga', max_iter=1000)) \n",
|
||||||
|
"])\n",
|
||||||
|
"\n",
|
||||||
|
"pipeline.set_output(transform=\"pandas\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "2b467511-2ae5-4a16-a502-397c3460471d",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"pipeline.fit(X_train, y_train)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "6356e870-0dfc-4e60-9e48-e2de5e7f9f87",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"y_pred = pipeline.predict(X_test)\n",
|
||||||
|
"\n",
|
||||||
|
"# Calculate the F1 score\n",
|
||||||
|
"f1 = f1_score(y_test, y_pred)\n",
|
||||||
|
"print(f\"F1 Score: {f1}\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "09387a09-0d53-4c54-baac-f3c2a57a629a",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"conf_matrix = confusion_matrix(y_test, y_pred)\n",
|
||||||
|
"sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])\n",
|
||||||
|
"plt.xlabel('Predicted')\n",
|
||||||
|
"plt.ylabel('Actual')\n",
|
||||||
|
"plt.title('Confusion Matrix')\n",
|
||||||
|
"plt.show()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "ae8e9bd3-0f6a-4f82-bb4c-470cbdc8d6bb",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Cross Validation"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "f7fca463-d7d6-493b-8329-fdfa92457f78",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Cross validation\n",
|
||||||
|
"y_train = y_train['y_has_purchased']\n",
|
||||||
|
"grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring=f1_scorer, error_score='raise',\n",
|
||||||
|
" n_jobs=-1)\n",
|
||||||
|
"\n",
|
||||||
|
"grid_search.fit(X_train, y_train)\n",
|
||||||
|
"\n",
|
||||||
|
"# Print the best parameters and the best score\n",
|
||||||
|
"print(\"Best parameters found: \", grid_search.best_params_)\n",
|
||||||
|
"print(\"Best cross-validation score: {:.2f}\".format(grid_search.best_score_))\n",
|
||||||
|
"\n",
|
||||||
|
"# Evaluate the best model on the test set\n",
|
||||||
|
"test_score = grid_search.score(X_test, y_test)\n",
|
||||||
|
"print(\"Test set score: {:.2f}\".format(test_score))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "56bd7828-4de1-4166-bea0-5d5e152b9d38",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
|
Loading…
Reference in New Issue
Block a user