{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "314bf34b-1f6d-4a99-8f82-aa71ebacdabc", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import os\n", "import s3fs\n", "import warnings\n", "from datetime import date, timedelta, datetime\n", "import numpy as np\n", "\n", "exec(open('../0_KPI_functions.py').read())" ] }, { "cell_type": "code", "execution_count": 2, "id": "a276822a-c389-429e-b249-8a9e47758bfc", "metadata": {}, "outputs": [], "source": [ "# Ignore warning\n", "warnings.filterwarnings('ignore')" ] }, { "cell_type": "code", "execution_count": 31, "id": "f62b996c-4e17-40ea-83ba-f0cb60be7671", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['bdc2324-data/1',\n", " 'bdc2324-data/10',\n", " 'bdc2324-data/101',\n", " 'bdc2324-data/11',\n", " 'bdc2324-data/12',\n", " 'bdc2324-data/13',\n", " 'bdc2324-data/14',\n", " 'bdc2324-data/2',\n", " 'bdc2324-data/3',\n", " 'bdc2324-data/4',\n", " 'bdc2324-data/5',\n", " 'bdc2324-data/6',\n", " 'bdc2324-data/7',\n", " 'bdc2324-data/8',\n", " 'bdc2324-data/9']" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Create filesystem object\n", "S3_ENDPOINT_URL = \"https://\" + os.environ[\"AWS_S3_ENDPOINT\"]\n", "fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})\n", "\n", "BUCKET = \"bdc2324-data\"\n", "fs.ls(BUCKET)" ] }, { "cell_type": "markdown", "id": "2c829aa8-2006-4e72-889b-7096dd55718b", "metadata": {}, "source": [ "## Look at the time sequence of each company and compute inter time coverage" ] }, { "cell_type": "code", "execution_count": 73, "id": "e86864b7-4852-449a-8680-638559d56080", "metadata": {}, "outputs": [], "source": [ "sport = ['5', '6', '7', '8', '9']" ] }, { "cell_type": "code", "execution_count": 90, "id": "7634ec57-4891-4684-8638-1e1643baca28", "metadata": {}, "outputs": [], "source": [ "def display_covering_time(df, company, datecover):\n", " \"\"\"\n", " This function draws the time coverage of each company\n", " \"\"\"\n", " min_date = df['purchase_date'].min().strftime(\"%Y-%m-%d\")\n", " max_date = df['purchase_date'].max().strftime(\"%Y-%m-%d\")\n", " datecover[company] = [datetime.strptime(min_date, \"%Y-%m-%d\") + timedelta(days=x) for x in range((datetime.strptime(max_date, \"%Y-%m-%d\") - datetime.strptime(min_date, \"%Y-%m-%d\")).days)]\n", " print(f'Couverture Company {company} : {min_date} - {max_date}')\n", " return datecover" ] }, { "cell_type": "code", "execution_count": 91, "id": "53c83f51-822c-4e05-8c7c-89aa327603c6", "metadata": {}, "outputs": [], "source": [ "def compute_time_intersection(datecover):\n", " timestamps_sets = [set(timestamps) for timestamps in datecover.values()]\n", " intersection = set.intersection(*timestamps_sets)\n", " intersection_list = list(intersection)\n", " formated_dates = [dt.strftime(\"%Y-%m-%d\") for dt in intersection_list]\n", " return sorted(formated_dates)" ] }, { "cell_type": "code", "execution_count": 93, "id": "eec152de-078e-44c4-ad6e-74ae6ba5c65a", "metadata": {}, "outputs": [], "source": [ "def df_coverage_modelization(sport, coverage_train = 0.7):\n", " \"\"\"\n", " This function returns start_date, end_of_features and final dates\n", " that help to construct train and test datasets\n", " \"\"\"\n", " datecover = {}\n", " for company in sport:\n", " df_products_purchased_reduced = display_databases(company, file_name = \"products_purchased_reduced\",\n", " datetime_col = ['purchase_date'])\n", " datecover = display_covering_time(df_products_purchased_reduced, company, datecover)\n", " #print(datecover.keys())\n", " dt_coverage = compute_time_intersection(datecover)\n", " start_date = dt_coverage[0]\n", " end_of_features = dt_coverage[int(0.7 * len(dt_coverage))]\n", " final_date = dt_coverage[-1]\n", " return start_date, end_of_features, final_date\n", " " ] }, { "cell_type": "code", "execution_count": 94, "id": "348f246a-bc2d-4bbc-ba05-aa825da15a69", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "File path : projet-bdc2324-team1/0_Input/Company_5/products_purchased_reduced.csv\n", "Couverture Company 5 : 2019-04-15 - 2023-11-09\n", "File path : projet-bdc2324-team1/0_Input/Company_6/products_purchased_reduced.csv\n", "Couverture Company 6 : 2018-06-28 - 2023-11-08\n", "File path : projet-bdc2324-team1/0_Input/Company_7/products_purchased_reduced.csv\n", "Couverture Company 7 : 2015-02-10 - 2023-11-08\n", "File path : projet-bdc2324-team1/0_Input/Company_8/products_purchased_reduced.csv\n", "Couverture Company 8 : 2010-09-28 - 2023-11-08\n", "File path : projet-bdc2324-team1/0_Input/Company_9/products_purchased_reduced.csv\n", "Couverture Company 9 : 2014-09-22 - 2023-10-24\n", "dict_keys(['5', '6', '7', '8', '9'])\n", "2019-04-15 2022-06-15 2023-10-23\n" ] } ], "source": [ "start_date, end_of_features, final_date = df_coverage_modelization(sport, coverage_train = 0.7)\n", "print(start_date, end_of_features, final_date )" ] }, { "cell_type": "markdown", "id": "34ddc267-4daa-4926-9d54-5b13d4212eaa", "metadata": {}, "source": [ "## Look at common database between Sport companies" ] }, { "cell_type": "code", "execution_count": 101, "id": "389387fa-2046-4811-b8dd-6d524e91fe2e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['bdc2324-data/5',\n", " 'bdc2324-data/6',\n", " 'bdc2324-data/7',\n", " 'bdc2324-data/8',\n", " 'bdc2324-data/9']" ] }, "execution_count": 101, "metadata": {}, "output_type": "execute_result" } ], "source": [ "companies = fs.ls(BUCKET)\n", "companies = [company for company in companies if any(company.endswith(end) for end in sport)]\n", "companies" ] }, { "cell_type": "code", "execution_count": 107, "id": "895fc2b3-c768-454d-bedb-54994e4d211a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of databases : 30\n", "Number of common databases : 23\n" ] } ], "source": [ "companies_database = {}\n", "\n", "for company in companies:\n", " companies_database[company.split('/')[-1]] = [file.split('/')[-1].replace(company.split('/')[-1], '') for file in fs.ls(company)] \n", "\n", "all_database = companies_database[max(companies_database, key=lambda x: len(companies_database[x]))]\n", "print(\"Number of databases : \",len(all_database))\n", "\n", "data_in_common = set(all_database)\n", "\n", "for key in companies_database:\n", " diff_database = data_in_common.symmetric_difference(companies_database[key])\n", " data_in_common = data_in_common - diff_database\n", "\n", "print(\"Number of common databases : \",len(data_in_common))" ] }, { "cell_type": "code", "execution_count": 121, "id": "0c06517d-f5b7-4104-94fa-0e3f843c5881", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'campaign_stats.csv',\n", " 'campaigns.csv',\n", " 'categories.csv',\n", " 'countries.csv',\n", " 'currencies.csv',\n", " 'customer_target_mappings.csv',\n", " 'customersplus.csv',\n", " 'event_types.csv',\n", " 'events.csv',\n", " 'facilities.csv',\n", " 'link_stats.csv',\n", " 'pricing_formulas.csv',\n", " 'product_packs.csv',\n", " 'products.csv',\n", " 'products_groups.csv',\n", " 'purchases.csv',\n", " 'representation_category_capacities.csv',\n", " 'representations.csv',\n", " 'seasons.csv',\n", " 'suppliers.csv',\n", " 'target_types.csv',\n", " 'targets.csv',\n", " 'tickets.csv'}" ] }, "execution_count": 121, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_in_common" ] }, { "cell_type": "markdown", "id": "1af245aa-44a7-453b-90f9-0c4bcc415cd0", "metadata": {}, "source": [ "## Investigate errors from data construction for company 6" ] }, { "cell_type": "code", "execution_count": 108, "id": "538a5ca2-a50d-4726-93eb-c2b0d0ab8400", "metadata": {}, "outputs": [], "source": [ "directory_path = '6'" ] }, { "cell_type": "code", "execution_count": 143, "id": "1ca3fb71-930a-441c-b35b-b98bca780606", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "File path : projet-bdc2324-team1/0_Input/Company_6/customerplus_cleaned.csv\n", "File path : projet-bdc2324-team1/0_Input/Company_6/campaigns_information.csv\n", "File path : projet-bdc2324-team1/0_Input/Company_6/products_purchased_reduced.csv\n" ] } ], "source": [ "df_customerplus_clean = display_databases(directory_path, file_name = \"customerplus_cleaned\")\n", "df_campaigns_information = display_databases(directory_path, file_name = \"campaigns_information\", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])\n", "df_products_purchased_reduced = display_databases(directory_path, file_name = \"products_purchased_reduced\", datetime_col = ['purchase_date'])" ] }, { "cell_type": "code", "execution_count": 144, "id": "2ad3052c-e9e6-4ef9-abe2-4b8b2306a2b9", "metadata": {}, "outputs": [], "source": [ "max_date = pd.to_datetime(final_date, utc = True, format = 'ISO8601') \n", "end_features_date = pd.to_datetime(end_of_features, utc = True, format = 'ISO8601')\n", "min_date = pd.to_datetime(start_date, utc = True, format = 'ISO8601')" ] }, { "cell_type": "code", "execution_count": 128, "id": "146999f2-ab92-4b7c-8c57-2e3ac8c4dd88", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "File path : projet-bdc2324-team1/0_Input/Company_6/campaigns_information.csv\n" ] } ], "source": [ "df_campaigns_information = display_databases(directory_path, file_name = \"campaigns_information\", datetime_col = ['opened_at', 'sent_at', 'campaign_sent_at'])" ] }, { "cell_type": "code", "execution_count": 133, "id": "7448a7b9-3edf-4177-9df2-a260ebbee45e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Timestamp('2022-06-15 00:00:00+0000', tz='UTC')" ] }, "execution_count": 133, "metadata": {}, "output_type": "execute_result" } ], "source": [ "end_features_date" ] }, { "cell_type": "code", "execution_count": 136, "id": "d8e954ab-65d4-4f36-8410-69bf664773a7", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Shape campaigns_information : (1333010, 8)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idcustomer_idopened_atsent_atdelivered_atcampaign_namecampaign_service_idcampaign_sent_at
0138NaT2022-08-02 18:31:33+00:00NaNAdhérents non ré-engagés152022-08-02 18:31:36+00:00
1226135NaT2022-08-02 18:31:34+00:00NaNAdhérents non ré-engagés152022-08-02 18:31:36+00:00
233876NaT2022-08-02 18:31:35+00:00NaNAdhérents non ré-engagés152022-08-02 18:31:36+00:00
3426226NaT2022-08-02 18:31:35+00:00NaNAdhérents non ré-engagés152022-08-02 18:31:36+00:00
4525349NaT2022-08-02 18:31:34+00:00NaNAdhérents non ré-engagés152022-08-02 18:31:36+00:00
\n", "
" ], "text/plain": [ " id customer_id opened_at sent_at delivered_at \\\n", "0 1 38 NaT 2022-08-02 18:31:33+00:00 NaN \n", "1 2 26135 NaT 2022-08-02 18:31:34+00:00 NaN \n", "2 3 3876 NaT 2022-08-02 18:31:35+00:00 NaN \n", "3 4 26226 NaT 2022-08-02 18:31:35+00:00 NaN \n", "4 5 25349 NaT 2022-08-02 18:31:34+00:00 NaN \n", "\n", " campaign_name campaign_service_id campaign_sent_at \n", "0 Adhérents non ré-engagés 15 2022-08-02 18:31:36+00:00 \n", "1 Adhérents non ré-engagés 15 2022-08-02 18:31:36+00:00 \n", "2 Adhérents non ré-engagés 15 2022-08-02 18:31:36+00:00 \n", "3 Adhérents non ré-engagés 15 2022-08-02 18:31:36+00:00 \n", "4 Adhérents non ré-engagés 15 2022-08-02 18:31:36+00:00 " ] }, "execution_count": 136, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(\"Shape campaigns_information : \", df_campaigns_information.shape)\n", "df_campaigns_information.head()" ] }, { "cell_type": "code", "execution_count": 134, "id": "93eceaf1-ce4c-4dfa-9c51-4fd016d09fc5", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Timestamp('2022-08-02 18:31:33+0000', tz='UTC')" ] }, "execution_count": 134, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_campaigns_information['sent_at'].min()" ] }, { "cell_type": "code", "execution_count": 137, "id": "ea50cab4-1dae-4efe-ae3c-22b6f9ad1d26", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Timestamp('2023-11-07 10:08:16+0000', tz='UTC')" ] }, "execution_count": 137, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_campaigns_information['sent_at'].max()" ] }, { "cell_type": "code", "execution_count": 127, "id": "dcb87bc9-caf5-4655-9cfa-4a3dad504bac", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idcustomer_idopened_atsent_atdelivered_atcampaign_namecampaign_service_idcampaign_sent_at
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [id, customer_id, opened_at, sent_at, delivered_at, campaign_name, campaign_service_id, campaign_sent_at]\n", "Index: []" ] }, "execution_count": 127, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Filtre de la base df_campaigns_information\n", "df_campaigns_information = df_campaigns_information[(df_campaigns_information['sent_at'] <= end_features_date) & (df_campaigns_information['sent_at'] >= min_date)]\n", "df_campaigns_information" ] }, { "cell_type": "code", "execution_count": 145, "id": "abe22e09-a041-4349-be8f-b0784f2f0a98", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ticket_idcustomer_idpurchase_idevent_type_idsupplier_namepurchase_dateamountis_full_pricename_event_typesname_facilitiesname_categoriesname_eventsname_seasons
49914011083921259025.04caisse2022-02-27 13:44:10.690000+00:000.0Falseligue 1 uber eatsstade de l'aubehonneur basseolympique de marseillesaison 2021-2022
11753552731304136629.04adhésion2022-04-28 15:47:52.790000+00:000.0Falseligue 1 uber eatsstade de l'aubehonneur basseac ajacciosaison 2022-2023
274547400192140477.04adhésion2022-04-28 15:47:54.053000+00:000.0Falseligue 1 uber eatsstade de l'aubehonneur basserc strasbourgsaison 2022-2023
304844133138820259.04adhésion2021-08-03 13:45:01.603000+00:000.0Falseligue 1 uber eatsstade de l'aubevitoux hauteolympique de marseillesaison 2021-2022
311407271326590527.04web [adhésion]2022-05-26 09:15:40.993000+00:000.0Falseligue 1 uber eatsstade de l'aubechampagne bassestade brestois 29saison 2022-2023
\n", "
" ], "text/plain": [ " ticket_id customer_id purchase_id event_type_id supplier_name \\\n", "49 91401 108392 1259025.0 4 caisse \n", "117 535527 31304 136629.0 4 adhésion \n", "274 547400 192 140477.0 4 adhésion \n", "304 84413 31388 20259.0 4 adhésion \n", "311 407271 3265 90527.0 4 web [adhésion] \n", "\n", " purchase_date amount is_full_price \\\n", "49 2022-02-27 13:44:10.690000+00:00 0.0 False \n", "117 2022-04-28 15:47:52.790000+00:00 0.0 False \n", "274 2022-04-28 15:47:54.053000+00:00 0.0 False \n", "304 2021-08-03 13:45:01.603000+00:00 0.0 False \n", "311 2022-05-26 09:15:40.993000+00:00 0.0 False \n", "\n", " name_event_types name_facilities name_categories \\\n", "49 ligue 1 uber eats stade de l'aube honneur basse \n", "117 ligue 1 uber eats stade de l'aube honneur basse \n", "274 ligue 1 uber eats stade de l'aube honneur basse \n", "304 ligue 1 uber eats stade de l'aube vitoux haute \n", "311 ligue 1 uber eats stade de l'aube champagne basse \n", "\n", " name_events name_seasons \n", "49 olympique de marseille saison 2021-2022 \n", "117 ac ajaccio saison 2022-2023 \n", "274 rc strasbourg saison 2022-2023 \n", "304 olympique de marseille saison 2021-2022 \n", "311 stade brestois 29 saison 2022-2023 " ] }, "execution_count": 145, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Filtre de la base df_products_purchased_reduced\n", "df_products_purchased_reduced = df_products_purchased_reduced[(df_products_purchased_reduced['purchase_date'] <= end_features_date) & (df_products_purchased_reduced['purchase_date'] >= min_date)]\n", "df_products_purchased_reduced.head()" ] }, { "cell_type": "code", "execution_count": 150, "id": "ae7ef3a6-5b42-4a3c-a108-fec9f2ec4d32", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['caisse', 'adhésion', 'web [adhésion]', 'web [grand public]',\n", " 'itr ticketmaster', 'itr fnac', nan, 'decathlon', 'boutique web',\n", " 'boutique officielle'], dtype=object)" ] }, "execution_count": 150, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_products_purchased_reduced[\"supplier_name\"].unique()" ] }, { "cell_type": "code", "execution_count": 151, "id": "942f58a5-8ed4-4b18-a7a2-bd296447fa6a", "metadata": {}, "outputs": [], "source": [ "# KPI sur le comportement d'achat\n", "tickets_information_copy = df_products_purchased_reduced.copy()\n", "# Dummy : Canal de vente en ligne\n", "liste_mots = ['en ligne', 'internet', 'web', 'net', 'vad', 'online'] # vad = vente à distance\n", "tickets_information_copy['vente_internet'] = tickets_information_copy['supplier_name'].fillna('').str.contains('|'.join(liste_mots), case=False).astype(int)" ] }, { "cell_type": "markdown", "id": "658b57cd-4fb8-4552-a582-972144b2af1c", "metadata": {}, "source": [ "tickets_information_copy['vente_internet'] corrected by handling na" ] }, { "cell_type": "markdown", "id": "99a75c34-f393-433a-b3c2-dc3f6f2f3e7e", "metadata": {}, "source": [ "## Investigate train and test" ] }, { "cell_type": "code", "execution_count": 13, "id": "970302f5-4de2-46b4-a1ce-a5396f5330ab", "metadata": {}, "outputs": [], "source": [ "def display_databases(directory_path, file_name):\n", " \"\"\"\n", " This function returns the file from s3 storage \n", " \"\"\"\n", " file_path = \"projet-bdc2324-team1\" + \"/Generalization/\" + directory_path + \"/\" + file_name + \".csv\"\n", " print(\"File path : \", file_path)\n", " with fs.open(file_path, mode=\"rb\") as file_in:\n", " df = pd.read_csv(file_in, sep=\",\") \n", " return df " ] }, { "cell_type": "code", "execution_count": 50, "id": "f5bfae82-04aa-44e1-9869-3f4fd5736b41", "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "File path : projet-bdc2324-team1/Generalization/sport/Train_set.csv\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
c
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [c]\n", "Index: []" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_sport = display_databases('sport', 'Train_set')\n", "train_sport.head()" ] }, { "cell_type": "code", "execution_count": 51, "id": "56d5b12e-45e8-4312-869d-bde4d24900b6", "metadata": {}, "outputs": [ { "ename": "KeyError", "evalue": "'y_has_purchased'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", "File \u001b[0;32m/opt/mamba/lib/python3.11/site-packages/pandas/core/indexes/base.py:3802\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3801\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3802\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3803\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n", "File \u001b[0;32mindex.pyx:153\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", "File \u001b[0;32mindex.pyx:182\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7081\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7089\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", "\u001b[0;31mKeyError\u001b[0m: 'y_has_purchased'", "\nThe above exception was the direct cause of the following exception:\n", "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[51], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mtrain_sport\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43my_has_purchased\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241m.\u001b[39munique()\n", "File \u001b[0;32m/opt/mamba/lib/python3.11/site-packages/pandas/core/frame.py:4090\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 4088\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mnlevels \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 4089\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_multilevel(key)\n\u001b[0;32m-> 4090\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4091\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[1;32m 4092\u001b[0m indexer \u001b[38;5;241m=\u001b[39m [indexer]\n", "File \u001b[0;32m/opt/mamba/lib/python3.11/site-packages/pandas/core/indexes/base.py:3809\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3804\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[1;32m 3805\u001b[0m \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[1;32m 3806\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[1;32m 3807\u001b[0m ):\n\u001b[1;32m 3808\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[0;32m-> 3809\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m 3810\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 3811\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m 3812\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m 3813\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[1;32m 3814\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n", "\u001b[0;31mKeyError\u001b[0m: 'y_has_purchased'" ] } ], "source": [ "train_sport['y_has_purchased'].unique()" ] }, { "cell_type": "raw", "id": "bd8019ae-8d7b-4dfe-be93-abf80a497e13", "metadata": {}, "source": [ "projet-bdc2324-team1/Generalization/sport/Train_set/dataset_train5.csv" ] }, { "cell_type": "code", "execution_count": null, "id": "d056c7b3-0e8c-485c-b2f3-4681077f1c2e", "metadata": {}, "outputs": [], "source": [ "fs.ls('projet-bdc2324-team1/Generalization/sport')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.6" } }, "nbformat": 4, "nbformat_minor": 5 }