From 05051824ba767848808682c6af2e6d0084f4be6b Mon Sep 17 00:00:00 2001 From: arevelle-ensae Date: Mon, 5 Feb 2024 08:02:26 +0000 Subject: [PATCH] clean products --- Notebook_AR.ipynb | 809 ++++++++++++++++++++++------------------------ 1 file changed, 387 insertions(+), 422 deletions(-) diff --git a/Notebook_AR.ipynb b/Notebook_AR.ipynb index f9fab02..9107796 100644 --- a/Notebook_AR.ipynb +++ b/Notebook_AR.ipynb @@ -624,7 +624,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_2473/2210053343.py:5: DtypeWarning: Columns (20) have mixed types. Specify dtype option on import or set low_memory=False.\n", + "/tmp/ipykernel_513/2210053343.py:5: DtypeWarning: Columns (20) have mixed types. Specify dtype option on import or set low_memory=False.\n", " customersplus = pd.read_csv(file_in, sep=\",\")\n" ] }, @@ -5485,7 +5485,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 82, "id": "43576244-c8cf-4ca0-b056-7aea1fbf0bc7", "metadata": {}, "outputs": [], @@ -5500,7 +5500,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 83, "id": "0fad097e-474c-4af7-b1e1-7d8dda3f09ea", "metadata": {}, "outputs": [], @@ -5526,7 +5526,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 84, "id": "6213b1eb-c5f8-49dd-ab69-366542380e80", "metadata": {}, "outputs": [], @@ -5536,9 +5536,15 @@ " print(\"first merge products and categories\")\n", " products = load_dataset(\"1products.csv\")\n", " categories = load_dataset(\"1categories.csv\")\n", + " # Drop useless columns\n", + " products = products.drop(columns = ['apply_price', 'extra_field', 'amount_consumption'])\n", + " categories = categories.drop(columns = ['extra_field', 'quota'])\n", + "\n", + " #Merge\n", " products_theme = products.merge(categories, how = 'left', left_on = 'category_id',\n", " right_on = 'id', suffixes=('_products', '_categories'))\n", " products_theme = products_theme.rename(columns = {\"name\" : \"name_categories\"})\n", + " \n", " # Second merge products_theme and type of categories\n", " print(\"Second merge products_theme and type of categories\")\n", " type_of_categories = load_dataset(\"1type_of_categories.csv\")\n", @@ -5550,12 +5556,14 @@ " products_theme = products_theme.drop(columns = ['id_categories'])\n", " products_theme = order_columns_id(products_theme)\n", "\n", + " \n", + "\n", " return products_theme" ] }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 85, "id": "b853e020-f73d-44e8-b086-e5548ce21011", "metadata": {}, "outputs": [ @@ -5612,12 +5620,7 @@ " type_of_id\n", " amount\n", " is_full_price\n", - " apply_price\n", - " extra_field_products\n", - " amount_consumption\n", " name_categories\n", - " extra_field_categories\n", - " quota\n", " \n", " \n", " \n", @@ -5632,12 +5635,7 @@ " NaN\n", " 9.0\n", " False\n", - " 0.0\n", - " NaN\n", - " NaN\n", " indiv activité tr\n", - " NaN\n", - " NaN\n", " \n", " \n", " 1\n", @@ -5650,12 +5648,7 @@ " 12.0\n", " 9.5\n", " False\n", - " 0.0\n", - " NaN\n", - " NaN\n", " indiv entrées tp\n", - " NaN\n", - " NaN\n", " \n", " \n", " 2\n", @@ -5668,12 +5661,7 @@ " 12.0\n", " 11.5\n", " False\n", - " 0.0\n", - " NaN\n", - " NaN\n", " indiv entrées tp\n", - " NaN\n", - " NaN\n", " \n", " \n", " 3\n", @@ -5686,12 +5674,7 @@ " NaN\n", " 8.0\n", " False\n", - " 0.0\n", - " NaN\n", - " NaN\n", " indiv entrées tr\n", - " NaN\n", - " NaN\n", " \n", " \n", " 4\n", @@ -5704,12 +5687,7 @@ " 12.0\n", " 8.5\n", " False\n", - " 0.0\n", - " NaN\n", - " NaN\n", " indiv entrées tp\n", - " NaN\n", - " NaN\n", " \n", " \n", "\n", @@ -5730,22 +5708,15 @@ "3 156773 1 NaN 8.0 False \n", "4 1175 1 12.0 8.5 False \n", "\n", - " apply_price extra_field_products amount_consumption name_categories \\\n", - "0 0.0 NaN NaN indiv activité tr \n", - "1 0.0 NaN NaN indiv entrées tp \n", - "2 0.0 NaN NaN indiv entrées tp \n", - "3 0.0 NaN NaN indiv entrées tr \n", - "4 0.0 NaN NaN indiv entrées tp \n", - "\n", - " extra_field_categories quota \n", - "0 NaN NaN \n", - "1 NaN NaN \n", - "2 NaN NaN \n", - "3 NaN NaN \n", - "4 NaN NaN " + " name_categories \n", + "0 indiv activité tr \n", + "1 indiv entrées tp \n", + "2 indiv entrées tp \n", + "3 indiv entrées tr \n", + "4 indiv entrées tp " ] }, - "execution_count": 65, + "execution_count": 85, "metadata": {}, "output_type": "execute_result" } @@ -5765,7 +5736,7 @@ }, { "cell_type": "code", - "execution_count": 103, + "execution_count": 86, "id": "6ed0ad20-8315-4112-9a85-10e5f04ef852", "metadata": {}, "outputs": [], @@ -5775,12 +5746,18 @@ " print(\"first merge events and seasons : \")\n", " events = load_dataset(\"1events.csv\")\n", " seasons = load_dataset(\"1seasons.csv\")\n", + "\n", + " # Drop useless columns\n", + " events = events.drop(columns = ['manual_added', 'is_display'])\n", + " seasons = seasons.drop(columns = ['start_date_time'])\n", + " \n", " events_theme = events.merge(seasons, how = 'left', left_on = 'season_id', right_on = 'id', suffixes=('_events', '_seasons'))\n", "\n", " # Secondly merge events_theme and event_types\n", " print(\"Secondly merge events_theme and event_types : \")\n", " event_types = load_dataset(\"1event_types.csv\")\n", - "\n", + " event_types = event_types.drop(columns = ['fidelity_delay'])\n", + " \n", " events_theme = events_theme.merge(event_types, how = 'left', left_on = 'event_type_id', right_on = 'id', suffixes=('_events', '_event_type'))\n", " events_theme = events_theme.rename(columns = {\"name\" : \"name_event_types\"})\n", " events_theme = events_theme.drop(columns = 'id')\n", @@ -5788,8 +5765,10 @@ " # thirdly merge events_theme and facilities\n", " print(\"thirdly merge events_theme and facilities : \")\n", " facilities = load_dataset(\"1facilities.csv\")\n", + " facilities = facilities.drop(columns = ['fixed_capacity'])\n", + " \n", " events_theme = events_theme.merge(facilities, how = 'left', left_on = 'facility_id', right_on = 'id', suffixes=('_events', '_facility'))\n", - " events_theme = events_theme.rename(columns = {\"name\" : \"name_facilties\", \"id_events\" : \"event_id\"})\n", + " events_theme = events_theme.rename(columns = {\"name\" : \"name_facilities\", \"id_events\" : \"event_id\"})\n", " events_theme = events_theme.drop(columns = 'id')\n", "\n", " # Index cleaning\n", @@ -5800,7 +5779,7 @@ }, { "cell_type": "code", - "execution_count": 104, + "execution_count": 87, "id": "98ef0636-8c45-4a23-a62a-1fbe1544f8ce", "metadata": {}, "outputs": [ @@ -5858,16 +5837,11 @@ " event_type_id\n", " event_type_key_id\n", " facility_key_id\n", - " fidelity_delay\n", " street_id\n", " name_events\n", - " manual_added\n", - " is_display\n", " name_seasons\n", - " start_date_time\n", " name_event_types\n", - " name_facilties\n", - " fixed_capacity\n", + " name_facilities\n", " \n", " \n", " \n", @@ -5879,16 +5853,11 @@ " 4\n", " 4\n", " 1\n", - " 36\n", " 1\n", " frontières\n", - " False\n", - " True\n", " 2018\n", - " NaN\n", " spectacle vivant\n", " mucem\n", - " NaN\n", " \n", " \n", " 1\n", @@ -5898,16 +5867,11 @@ " 5\n", " 5\n", " 1\n", - " 36\n", " 1\n", " visite guidée une autre histoire du monde (1h00)\n", - " False\n", - " True\n", " 2023\n", - " NaN\n", " offre muséale groupe\n", " mucem\n", - " NaN\n", " \n", " \n", " 2\n", @@ -5917,16 +5881,11 @@ " 2\n", " 2\n", " 1\n", - " 36\n", " 1\n", " visite contée les chercheurs d'or indiv\n", - " False\n", - " True\n", " 2018\n", - " NaN\n", " offre muséale individuel\n", " mucem\n", - " NaN\n", " \n", " \n", " 3\n", @@ -5936,16 +5895,11 @@ " 4\n", " 4\n", " 1\n", - " 36\n", " 1\n", " we dreamt of utopia and we woke up screaming.\n", - " False\n", - " True\n", " 2021\n", - " NaN\n", " spectacle vivant\n", " mucem\n", - " NaN\n", " \n", " \n", " 4\n", @@ -5955,16 +5909,11 @@ " 4\n", " 4\n", " 1\n", - " 36\n", " 1\n", " jeff koons épisodes 4\n", - " False\n", - " True\n", " 2021\n", - " NaN\n", " spectacle vivant\n", " mucem\n", - " NaN\n", " \n", " \n", "\n", @@ -5978,36 +5927,29 @@ "3 5957 582 1 4 4 \n", "4 8337 582 1 4 4 \n", "\n", - " facility_key_id fidelity_delay street_id \\\n", - "0 1 36 1 \n", - "1 1 36 1 \n", - "2 1 36 1 \n", - "3 1 36 1 \n", - "4 1 36 1 \n", + " facility_key_id street_id \\\n", + "0 1 1 \n", + "1 1 1 \n", + "2 1 1 \n", + "3 1 1 \n", + "4 1 1 \n", "\n", - " name_events manual_added is_display \\\n", - "0 frontières False True \n", - "1 visite guidée une autre histoire du monde (1h00) False True \n", - "2 visite contée les chercheurs d'or indiv False True \n", - "3 we dreamt of utopia and we woke up screaming. False True \n", - "4 jeff koons épisodes 4 False True \n", + " name_events name_seasons \\\n", + "0 frontières 2018 \n", + "1 visite guidée une autre histoire du monde (1h00) 2023 \n", + "2 visite contée les chercheurs d'or indiv 2018 \n", + "3 we dreamt of utopia and we woke up screaming. 2021 \n", + "4 jeff koons épisodes 4 2021 \n", "\n", - " name_seasons start_date_time name_event_types name_facilties \\\n", - "0 2018 NaN spectacle vivant mucem \n", - "1 2023 NaN offre muséale groupe mucem \n", - "2 2018 NaN offre muséale individuel mucem \n", - "3 2021 NaN spectacle vivant mucem \n", - "4 2021 NaN spectacle vivant mucem \n", - "\n", - " fixed_capacity \n", - "0 NaN \n", - "1 NaN \n", - "2 NaN \n", - "3 NaN \n", - "4 NaN " + " name_event_types name_facilities \n", + "0 spectacle vivant mucem \n", + "1 offre muséale groupe mucem \n", + "2 offre muséale individuel mucem \n", + "3 spectacle vivant mucem \n", + "4 spectacle vivant mucem " ] }, - "execution_count": 104, + "execution_count": 87, "metadata": {}, "output_type": "execute_result" } @@ -6027,14 +5969,19 @@ }, { "cell_type": "code", - "execution_count": 105, + "execution_count": 96, "id": "481dddd6-80a8-4b9e-a05e-ed06fa3ed7a6", "metadata": {}, "outputs": [], "source": [ "def create_representations_table():\n", " representations = load_dataset(\"1representations.csv\")\n", + " representations = representations.drop(columns = ['serial', 'open', 'satisfaction', 'is_display', 'expected_filling',\n", + " 'max_filling', 'extra_field', 'start_date_time', 'end_date_time', 'name',\n", + " 'representation_type_id'])\n", + " \n", " representations_capacity = load_dataset(\"1representation_category_capacities.csv\")\n", + " representations_capacity = representations_capacity.drop(columns = ['expected_filling', 'max_filling'])\n", "\n", " representations_theme = representations.merge(representations_capacity, how='left',\n", " left_on='id', right_on='representation_id',\n", @@ -6047,7 +5994,7 @@ }, { "cell_type": "code", - "execution_count": 106, + "execution_count": 97, "id": "677f4ed8-ef58-45f2-9056-ede0898c6a64", "metadata": {}, "outputs": [ @@ -6092,167 +6039,61 @@ " \n", " \n", " event_id\n", - " representation_type_id\n", " id_representation_cap\n", " representation_id\n", " category_id\n", - " serial\n", - " start_date_time\n", - " open\n", - " satisfaction\n", - " end_date_time\n", - " name\n", - " is_display\n", - " expected_filling_representation\n", - " max_filling_representation\n", - " extra_field\n", - " expected_filling_representation_cap\n", - " max_filling_representation_cap\n", " \n", " \n", " \n", " \n", " 0\n", " 12384\n", - " NaN\n", " 123058\n", " 84820\n", " 2\n", - " NaN\n", - " 2018-09-26 15:15:00+02:00\n", - " True\n", - " NaN\n", - " 1901-01-01 00:09:21+00:09\n", - " NaN\n", - " True\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", " \n", " \n", " 1\n", " 37\n", - " NaN\n", " 2514\n", " 269\n", " 2\n", - " NaN\n", - " 2016-04-27 17:00:00+02:00\n", - " True\n", - " NaN\n", - " 2016-04-27 18:00:00+02:00\n", - " NaN\n", - " True\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", " \n", " \n", " 2\n", " 37\n", - " NaN\n", " 384\n", " 269\n", " 5\n", - " NaN\n", - " 2016-04-27 17:00:00+02:00\n", - " True\n", - " NaN\n", - " 2016-04-27 18:00:00+02:00\n", - " NaN\n", - " True\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", " \n", " \n", " 3\n", " 37\n", - " NaN\n", " 2515\n", " 269\n", " 10\n", - " NaN\n", - " 2016-04-27 17:00:00+02:00\n", - " True\n", - " NaN\n", - " 2016-04-27 18:00:00+02:00\n", - " NaN\n", - " True\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", " \n", " \n", " 4\n", " 37\n", - " NaN\n", " 383\n", " 269\n", " 1\n", - " NaN\n", - " 2016-04-27 17:00:00+02:00\n", - " True\n", - " NaN\n", - " 2016-04-27 18:00:00+02:00\n", - " NaN\n", - " True\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", " \n", " \n", "\n", "" ], "text/plain": [ - " event_id representation_type_id id_representation_cap representation_id \\\n", - "0 12384 NaN 123058 84820 \n", - "1 37 NaN 2514 269 \n", - "2 37 NaN 384 269 \n", - "3 37 NaN 2515 269 \n", - "4 37 NaN 383 269 \n", - "\n", - " category_id serial start_date_time open satisfaction \\\n", - "0 2 NaN 2018-09-26 15:15:00+02:00 True NaN \n", - "1 2 NaN 2016-04-27 17:00:00+02:00 True NaN \n", - "2 5 NaN 2016-04-27 17:00:00+02:00 True NaN \n", - "3 10 NaN 2016-04-27 17:00:00+02:00 True NaN \n", - "4 1 NaN 2016-04-27 17:00:00+02:00 True NaN \n", - "\n", - " end_date_time name is_display \\\n", - "0 1901-01-01 00:09:21+00:09 NaN True \n", - "1 2016-04-27 18:00:00+02:00 NaN True \n", - "2 2016-04-27 18:00:00+02:00 NaN True \n", - "3 2016-04-27 18:00:00+02:00 NaN True \n", - "4 2016-04-27 18:00:00+02:00 NaN True \n", - "\n", - " expected_filling_representation max_filling_representation extra_field \\\n", - "0 NaN NaN NaN \n", - "1 NaN NaN NaN \n", - "2 NaN NaN NaN \n", - "3 NaN NaN NaN \n", - "4 NaN NaN NaN \n", - "\n", - " expected_filling_representation_cap max_filling_representation_cap \n", - "0 NaN NaN \n", - "1 NaN NaN \n", - "2 NaN NaN \n", - "3 NaN NaN \n", - "4 NaN NaN " + " event_id id_representation_cap representation_id category_id\n", + "0 12384 123058 84820 2\n", + "1 37 2514 269 2\n", + "2 37 384 269 5\n", + "3 37 2515 269 10\n", + "4 37 383 269 1" ] }, - "execution_count": 106, + "execution_count": 97, "metadata": {}, "output_type": "execute_result" } @@ -6272,7 +6113,7 @@ }, { "cell_type": "code", - "execution_count": 107, + "execution_count": 98, "id": "b26f4e7e-134d-4e32-a615-4b0e6bb80b25", "metadata": {}, "outputs": [ @@ -6282,24 +6123,16 @@ "text": [ "Products theme columns : Index(['id_products', 'representation_id', 'pricing_formula_id', 'category_id',\n", " 'products_group_id', 'product_pack_id', 'type_of_id', 'amount',\n", - " 'is_full_price', 'apply_price', 'extra_field_products',\n", - " 'amount_consumption', 'name_categories', 'extra_field_categories',\n", - " 'quota'],\n", + " 'is_full_price', 'name_categories'],\n", " dtype='object')\n", "\n", - " Representation theme columns : Index(['event_id', 'representation_type_id', 'id_representation_cap',\n", - " 'representation_id', 'category_id', 'serial', 'start_date_time', 'open',\n", - " 'satisfaction', 'end_date_time', 'name', 'is_display',\n", - " 'expected_filling_representation', 'max_filling_representation',\n", - " 'extra_field', 'expected_filling_representation_cap',\n", - " 'max_filling_representation_cap'],\n", + " Representation theme columns : Index(['event_id', 'id_representation_cap', 'representation_id',\n", + " 'category_id'],\n", " dtype='object')\n", "\n", " Events theme columns : Index(['event_id', 'season_id', 'facility_id', 'event_type_id',\n", - " 'event_type_key_id', 'facility_key_id', 'fidelity_delay', 'street_id',\n", - " 'name_events', 'manual_added', 'is_display', 'name_seasons',\n", - " 'start_date_time', 'name_event_types', 'name_facilties',\n", - " 'fixed_capacity'],\n", + " 'event_type_key_id', 'facility_key_id', 'street_id', 'name_events',\n", + " 'name_seasons', 'name_event_types', 'name_facilities'],\n", " dtype='object')\n" ] } @@ -6312,7 +6145,7 @@ }, { "cell_type": "code", - "execution_count": 115, + "execution_count": 99, "id": "d40b1e3b-b1f3-4915-8ebc-6bb7856da42a", "metadata": {}, "outputs": [ @@ -6346,18 +6179,9 @@ " type_of_id\n", " amount\n", " is_full_price\n", - " apply_price\n", - " ...\n", - " open\n", - " satisfaction\n", - " end_date_time\n", - " name\n", - " is_display\n", - " expected_filling_representation\n", - " max_filling_representation\n", - " extra_field\n", - " expected_filling_representation_cap\n", - " max_filling_representation_cap\n", + " name_categories\n", + " event_id\n", + " id_representation_cap\n", " \n", " \n", " \n", @@ -6372,18 +6196,9 @@ " NaN\n", " 9.0\n", " False\n", - " 0.0\n", - " ...\n", - " True\n", - " NaN\n", - " 2017-11-19 16:30:00+01:00\n", - " NaN\n", - " True\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", + " indiv activité tr\n", + " 132\n", + " 8789\n", " \n", " \n", " 1\n", @@ -6396,18 +6211,9 @@ " 12.0\n", " 9.5\n", " False\n", - " 0.0\n", - " ...\n", - " True\n", - " NaN\n", - " 2016-04-28 16:00:00+02:00\n", - " NaN\n", - " True\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", + " indiv entrées tp\n", + " 37\n", + " 390\n", " \n", " \n", " 2\n", @@ -6420,18 +6226,9 @@ " 12.0\n", " 11.5\n", " False\n", - " 0.0\n", - " ...\n", - " True\n", - " NaN\n", - " 2016-04-28 14:00:00+02:00\n", - " NaN\n", - " True\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", + " indiv entrées tp\n", + " 37\n", + " 395\n", " \n", " \n", " 3\n", @@ -6444,18 +6241,9 @@ " NaN\n", " 8.0\n", " False\n", - " 0.0\n", - " ...\n", - " True\n", - " NaN\n", - " 1901-01-01 00:09:21+00:09\n", - " NaN\n", - " True\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", + " indiv entrées tr\n", + " 12365\n", + " 120199\n", " \n", " \n", " 4\n", @@ -6468,22 +6256,12 @@ " 12.0\n", " 8.5\n", " False\n", - " 0.0\n", - " ...\n", - " True\n", - " NaN\n", - " 1901-01-01 00:09:21+00:09\n", - " NaN\n", - " True\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", + " indiv entrées tp\n", + " 8\n", + " 21\n", " \n", " \n", "\n", - "

5 rows × 30 columns

\n", "" ], "text/plain": [ @@ -6501,38 +6279,15 @@ "3 156773 1 NaN 8.0 False \n", "4 1175 1 12.0 8.5 False \n", "\n", - " apply_price ... open satisfaction end_date_time name \\\n", - "0 0.0 ... True NaN 2017-11-19 16:30:00+01:00 NaN \n", - "1 0.0 ... True NaN 2016-04-28 16:00:00+02:00 NaN \n", - "2 0.0 ... True NaN 2016-04-28 14:00:00+02:00 NaN \n", - "3 0.0 ... True NaN 1901-01-01 00:09:21+00:09 NaN \n", - "4 0.0 ... True NaN 1901-01-01 00:09:21+00:09 NaN \n", - "\n", - " is_display expected_filling_representation max_filling_representation \\\n", - "0 True NaN NaN \n", - "1 True NaN NaN \n", - "2 True NaN NaN \n", - "3 True NaN NaN \n", - "4 True NaN NaN \n", - "\n", - " extra_field expected_filling_representation_cap \\\n", - "0 NaN NaN \n", - "1 NaN NaN \n", - "2 NaN NaN \n", - "3 NaN NaN \n", - "4 NaN NaN \n", - "\n", - " max_filling_representation_cap \n", - "0 NaN \n", - "1 NaN \n", - "2 NaN \n", - "3 NaN \n", - "4 NaN \n", - "\n", - "[5 rows x 30 columns]" + " name_categories event_id id_representation_cap \n", + "0 indiv activité tr 132 8789 \n", + "1 indiv entrées tp 37 390 \n", + "2 indiv entrées tp 37 395 \n", + "3 indiv entrées tr 12365 120199 \n", + "4 indiv entrées tp 8 21 " ] }, - "execution_count": 115, + "execution_count": 99, "metadata": {}, "output_type": "execute_result" } @@ -6547,7 +6302,7 @@ }, { "cell_type": "code", - "execution_count": 116, + "execution_count": 100, "id": "78d75a08-e959-429c-847a-7d70a2804806", "metadata": {}, "outputs": [ @@ -6580,19 +6335,19 @@ " product_pack_id\n", " type_of_id\n", " event_id\n", - " representation_type_id\n", " id_representation_cap\n", + " season_id\n", " ...\n", - " expected_filling_representation_cap\n", - " max_filling_representation_cap\n", + " event_type_key_id\n", + " facility_key_id\n", + " street_id\n", + " amount\n", + " is_full_price\n", + " name_categories\n", " name_events\n", - " manual_added\n", - " is_display_event\n", " name_seasons\n", - " start_date_time_event\n", " name_event_types\n", - " name_facilties\n", - " fixed_capacity\n", + " name_facilities\n", " \n", " \n", " \n", @@ -6606,19 +6361,19 @@ " 1\n", " NaN\n", " 132\n", - " NaN\n", " 8789\n", + " 4\n", " ...\n", - " NaN\n", - " NaN\n", - " visite-jeu \"le classico des minots\" (1h30)\n", + " 5\n", + " 1\n", + " 1\n", + " 9.0\n", " False\n", - " True\n", + " indiv activité tr\n", + " visite-jeu \"le classico des minots\" (1h30)\n", " 2017\n", - " NaN\n", " offre muséale individuel\n", " mucem\n", - " NaN\n", " \n", " \n", " 1\n", @@ -6630,19 +6385,19 @@ " 1\n", " 12.0\n", " 37\n", - " NaN\n", " 390\n", + " 2\n", " ...\n", - " NaN\n", - " NaN\n", - " billet mucem picasso\n", + " 2\n", + " 1\n", + " 1\n", + " 9.5\n", " False\n", - " True\n", + " indiv entrées tp\n", + " billet mucem picasso\n", " 2016\n", - " NaN\n", " offre muséale individuel\n", " mucem\n", - " NaN\n", " \n", " \n", " 2\n", @@ -6654,19 +6409,19 @@ " 1\n", " 12.0\n", " 37\n", - " NaN\n", " 395\n", + " 2\n", " ...\n", - " NaN\n", - " NaN\n", - " billet mucem picasso\n", + " 2\n", + " 1\n", + " 1\n", + " 11.5\n", " False\n", - " True\n", + " indiv entrées tp\n", + " billet mucem picasso\n", " 2016\n", - " NaN\n", " offre muséale individuel\n", " mucem\n", - " NaN\n", " \n", " \n", " 3\n", @@ -6678,19 +6433,19 @@ " 1\n", " NaN\n", " 12365\n", - " NaN\n", " 120199\n", + " 1754\n", " ...\n", - " NaN\n", - " NaN\n", - " NaN\n", - " False\n", + " 4\n", + " 1\n", + " 1\n", + " 8.0\n", " False\n", + " indiv entrées tr\n", " NaN\n", " NaN\n", " offre muséale individuel\n", " mucem\n", - " NaN\n", " \n", " \n", " 4\n", @@ -6702,23 +6457,23 @@ " 1\n", " 12.0\n", " 8\n", - " NaN\n", " 21\n", + " 4\n", " ...\n", - " NaN\n", - " NaN\n", - " non défini\n", + " 6\n", + " 1\n", + " 1\n", + " 8.5\n", " False\n", - " True\n", + " indiv entrées tp\n", + " non défini\n", " 2017\n", - " NaN\n", " non défini\n", " mucem\n", - " NaN\n", " \n", " \n", "\n", - "

5 rows × 45 columns

\n", + "

5 rows × 22 columns

\n", "" ], "text/plain": [ @@ -6736,45 +6491,38 @@ "3 156773 1 NaN 12365 \n", "4 1175 1 12.0 8 \n", "\n", - " representation_type_id id_representation_cap ... \\\n", - "0 NaN 8789 ... \n", - "1 NaN 390 ... \n", - "2 NaN 395 ... \n", - "3 NaN 120199 ... \n", - "4 NaN 21 ... \n", + " id_representation_cap season_id ... event_type_key_id facility_key_id \\\n", + "0 8789 4 ... 5 1 \n", + "1 390 2 ... 2 1 \n", + "2 395 2 ... 2 1 \n", + "3 120199 1754 ... 4 1 \n", + "4 21 4 ... 6 1 \n", "\n", - " expected_filling_representation_cap max_filling_representation_cap \\\n", - "0 NaN NaN \n", - "1 NaN NaN \n", - "2 NaN NaN \n", - "3 NaN NaN \n", - "4 NaN NaN \n", + " street_id amount is_full_price name_categories \\\n", + "0 1 9.0 False indiv activité tr \n", + "1 1 9.5 False indiv entrées tp \n", + "2 1 11.5 False indiv entrées tp \n", + "3 1 8.0 False indiv entrées tr \n", + "4 1 8.5 False indiv entrées tp \n", "\n", - " name_events manual_added is_display_event \\\n", - "0 visite-jeu \"le classico des minots\" (1h30) False True \n", - "1 billet mucem picasso False True \n", - "2 billet mucem picasso False True \n", - "3 NaN False False \n", - "4 non défini False True \n", + " name_events name_seasons \\\n", + "0 visite-jeu \"le classico des minots\" (1h30) 2017 \n", + "1 billet mucem picasso 2016 \n", + "2 billet mucem picasso 2016 \n", + "3 NaN NaN \n", + "4 non défini 2017 \n", "\n", - " name_seasons start_date_time_event name_event_types \\\n", - "0 2017 NaN offre muséale individuel \n", - "1 2016 NaN offre muséale individuel \n", - "2 2016 NaN offre muséale individuel \n", - "3 NaN NaN offre muséale individuel \n", - "4 2017 NaN non défini \n", + " name_event_types name_facilities \n", + "0 offre muséale individuel mucem \n", + "1 offre muséale individuel mucem \n", + "2 offre muséale individuel mucem \n", + "3 offre muséale individuel mucem \n", + "4 non défini mucem \n", "\n", - " name_facilties fixed_capacity \n", - "0 mucem NaN \n", - "1 mucem NaN \n", - "2 mucem NaN \n", - "3 mucem NaN \n", - "4 mucem NaN \n", - "\n", - "[5 rows x 45 columns]" + "[5 rows x 22 columns]" ] }, - "execution_count": 116, + "execution_count": 100, "metadata": {}, "output_type": "execute_result" } @@ -6788,11 +6536,228 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 101, + "id": "4a6950e8-4818-4df2-afa9-562e0921698c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['id_products', 'representation_id', 'pricing_formula_id', 'category_id',\n", + " 'products_group_id', 'product_pack_id', 'type_of_id', 'event_id',\n", + " 'id_representation_cap', 'season_id', 'facility_id', 'event_type_id',\n", + " 'event_type_key_id', 'facility_key_id', 'street_id', 'amount',\n", + " 'is_full_price', 'name_categories', 'name_events', 'name_seasons',\n", + " 'name_event_types', 'name_facilities'],\n", + " dtype='object')" + ] + }, + "execution_count": 101, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "products_global.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 102, "id": "b18f6428-90e0-4b1b-9b8d-bad995fb6c98", "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(94803, 22)" + ] + }, + "execution_count": 102, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "products_global.shape" + ] + }, + { + "cell_type": "markdown", + "id": "c3caf2fd-178e-48e9-b95f-5798bd576f5d", + "metadata": {}, + "source": [ + "## Analysis of Products_global" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "id": "33ee07a2-d871-4436-9860-9be389bc4902", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id_products 0\n", + "representation_id 0\n", + "pricing_formula_id 0\n", + "category_id 0\n", + "products_group_id 0\n", + "product_pack_id 0\n", + "type_of_id 67589\n", + "event_id 0\n", + "id_representation_cap 0\n", + "season_id 0\n", + "facility_id 0\n", + "event_type_id 0\n", + "event_type_key_id 0\n", + "facility_key_id 0\n", + "street_id 0\n", + "amount 0\n", + "is_full_price 0\n", + "name_categories 3991\n", + "name_events 46657\n", + "name_seasons 30663\n", + "name_event_types 0\n", + "name_facilities 0\n", + "dtype: int64" + ] + }, + "execution_count": 103, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "products_global.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "id": "557fc475-4417-4d9f-8d4e-8c49bc42367f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['offre muséale individuel', 'non défini', 'spectacle vivant',\n", + " 'offre muséale groupe', 'formule adhésion'], dtype=object)" + ] + }, + "execution_count": 105, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# how many event types ?\n", + "\n", + "products_global['name_event_types'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "id": "a9b9a23c-b0de-4685-97e5-d52dd78349f5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "644" + ] + }, + "execution_count": 107, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# how many events ?\n", + "\n", + "len(products_global['name_events'].unique())" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "id": "fb374c72-58ca-404d-a86b-e834a2fc4a34", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['indiv activité tr', 'indiv entrées tp', 'indiv entrées tr',\n", + " 'indiv prog enfant', 'indiv activité gr', 'indiv prog gr',\n", + " 'indiv activité tp', 'indiv activité enfant', 'indiv entrées gr',\n", + " 'groupe forfait entrées tr', 'groupe autonome adulte',\n", + " 'indiv prog tp', 'indiv prog tr', 'indiv entrées fa',\n", + " 'groupe forfait scolaire', 'en nb entrées tr', 'non défini', nan,\n", + " 'en nb entrées gr', 'groupe autonome entrées gr',\n", + " 'groupe forfait entrées gr', 'groupe autonome entrées tr',\n", + " 'en nb entrées tp', 'groupe autonome gr',\n", + " 'groupe autonome entrées tp', 'groupe forfait adulte',\n", + " 'groupe forfait etudiant'], dtype=object)" + ] + }, + "execution_count": 108, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# how many categories ?\n", + "products_global['name_categories'].unique()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "id": "11f89771-8d50-4ef4-b34e-53e4f6b419bb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "27" + ] + }, + "execution_count": 109, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(products_global['category_id'].unique())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8add1ff2-b7e8-4381-90d8-d18d8660ed39", + "metadata": {}, "outputs": [], - "source": [] + "source": [ + "def uniform_product_df():\n", + " print(\"Products theme columns : \", products_theme.columns)\n", + " print(\"\\n Representation theme columns : \", representation_theme.columns)\n", + " print(\"\\n Events theme columns : \", events_theme.columns)\n", + "\n", + " products_global = products_theme.merge(representation_theme, how='left',\n", + " on= [\"representation_id\", \"category_id\"])\n", + " \n", + " products_global = products_global.merge(events_theme, how='left', on='event_id',\n", + " suffixes = (\"_representation\", \"_event\"))\n", + " \n", + " products_global = order_columns_id(products_global)\n", + "\n", + " # remove useless columns \n", + " products_global = products_global.drop(columns = ['type_of_id', 'name_events', 'name_seasons', 'name_categories'])\n", + " return products_global\n", + " " + ] } ], "metadata": {