diff --git a/Exploration_billet_AJ.ipynb b/Exploration_billet_AJ.ipynb index d5db714..14c1b9a 100644 --- a/Exploration_billet_AJ.ipynb +++ b/Exploration_billet_AJ.ipynb @@ -103,7 +103,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_15896/4135596479.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n", + "/tmp/ipykernel_864/4135596479.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n", " df = pd.read_csv(file_in)\n" ] } @@ -474,7 +474,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_15896/232847087.py:3: SettingWithCopyWarning: \n", + "/tmp/ipykernel_864/232847087.py:3: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", @@ -756,7 +756,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 13, "id": "3c645ab7-16bf-4054-9ae2-15a8c32e29c6", "metadata": {}, "outputs": [ @@ -764,7 +764,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_15896/302783287.py:3: SettingWithCopyWarning: \n", + "/tmp/ipykernel_864/302783287.py:3: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", @@ -780,7 +780,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 14, "id": "4de7e2e2-6da4-4618-8444-b524399c5493", "metadata": {}, "outputs": [ @@ -872,7 +872,7 @@ "8 7 dab" ] }, - "execution_count": 35, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -1108,7 +1108,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 17, "id": "cbb5e614-1fe5-4da0-bca0-8a242e0885da", "metadata": {}, "outputs": [ @@ -1116,7 +1116,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_15896/81842251.py:3: SettingWithCopyWarning: \n", + "/tmp/ipykernel_864/81842251.py:3: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", @@ -1140,7 +1140,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 18, "id": "f8d36b72-f8e7-45e5-b4fa-e0803493fd3c", "metadata": { "scrolled": true @@ -1335,7 +1335,7 @@ "[742250 rows x 7 columns]" ] }, - "execution_count": 20, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1346,7 +1346,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 19, "id": "3f266a9d-6eee-4b27-b6cc-d401bc2fa0b8", "metadata": {}, "outputs": [ @@ -1377,7 +1377,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 20, "id": "8b24ccbc-ccf0-4722-8cd9-8ee8aa90d1fd", "metadata": {}, "outputs": [], @@ -1389,7 +1389,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 21, "id": "27d18584-228f-4698-85d6-4d23151ea5ed", "metadata": {}, "outputs": [ @@ -1420,7 +1420,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 22, "id": "ea22e3a2-2b25-481d-8ebc-194e11a06cd9", "metadata": {}, "outputs": [], @@ -1439,7 +1439,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 23, "id": "e0b8b47a-b321-4a79-823c-36a131a78ac7", "metadata": {}, "outputs": [], @@ -1459,9 +1459,11 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 24, "id": "7572e6e7-f28d-43ba-b045-b9fa09e68e1d", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [ { "data": { @@ -1664,7 +1666,7 @@ "[1826672 rows x 8 columns]" ] }, - "execution_count": 39, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -1673,13 +1675,479 @@ "df1_ticket_information" ] }, + { + "cell_type": "markdown", + "id": "ad2d0059-76d3-44b9-b0eb-0b0ca4d4ba75", + "metadata": {}, + "source": [ + "# Utilisation de fonctions" + ] + }, + { + "cell_type": "markdown", + "id": "27ecf058-23eb-4018-abbd-68c4ebe7c786", + "metadata": {}, + "source": [ + "## Nettoyage, selection et fusion" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "id": "b95464b1-26bc-4aac-84b4-45da83b92251", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# Fonction de nettoyage et selection\n", + "def preprocessing_tickets_area(tickets = None, purchases = None, suppliers = None, type_ofs = None):\n", + " # Base des tickets\n", + " tickets = tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]\n", + " tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n", + "\n", + " # Base des fournisseurs\n", + " suppliers = suppliers[['id', 'name']]\n", + " suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n", + "\n", + " # Base des types de billets\n", + " type_ofs = type_ofs[['id', 'name', 'children']]\n", + " type_ofs.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)\n", + "\n", + " # Base des achats\n", + " # Nettoyage de la date d'achat\n", + " purchases['purchase_date'] = pd.to_datetime(purchases['purchase_date'], utc = True)\n", + " purchases['purchase_date'] = pd.to_datetime(purchases['purchase_date'], format = 'ISO8601')\n", + " # Selection des variables\n", + " purchases = purchases[['id', 'purchase_date', 'customer_id']]\n", + "\n", + " # Fusions \n", + " # Fusion avec fournisseurs\n", + " ticket_information = pd.merge(tickets, suppliers, left_on = 'supplier_id', right_on = 'id', how = 'inner')\n", + " ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)\n", + " \n", + " # Fusion avec type de tickets\n", + " ticket_information = pd.merge(ticket_information, type_ofs, left_on = 'type_of', right_on = 'id', how = 'inner')\n", + " ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)\n", + " \n", + " # Fusion avec achats\n", + " ticket_information = pd.merge(ticket_information, purchases, left_on = 'purchase_id', right_on = 'id', how = 'inner')\n", + " ticket_information.drop(['purchase_id', 'id'], axis = 1, inplace=True)\n", + "\n", + " return ticket_information" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "3e1d2ba7-ff4f-48eb-93a8-2bb648c70396", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_864/2452826288.py:5: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " tickets_clean.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n", + "/tmp/ipykernel_864/2452826288.py:9: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " suppliers_clean.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n", + "/tmp/ipykernel_864/2452826288.py:13: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " type_ofs_clean.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)\n" + ] + } + ], + "source": [ + "df1_ticket_information_test = preprocessing_tickets_area(tickets = df1_tickets, purchases = df1_purchases, suppliers = df1_suppliers, type_ofs = df1_type_ofs)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "2877f3de-55d6-42d6-ad94-352d3e107862", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | ticket_id | \n", + "product_id | \n", + "is_from_subscription | \n", + "supplier_name | \n", + "type_of_ticket_name | \n", + "children | \n", + "purchase_date | \n", + "customer_id | \n", + "
---|---|---|---|---|---|---|---|---|
0 | \n", + "13070859 | \n", + "225251 | \n", + "False | \n", + "vente en ligne | \n", + "Atelier | \n", + "pricing_formula | \n", + "2018-12-28 14:47:50+00:00 | \n", + "48187 | \n", + "
1 | \n", + "13070860 | \n", + "224914 | \n", + "False | \n", + "vente en ligne | \n", + "Atelier | \n", + "pricing_formula | \n", + "2018-12-28 14:47:50+00:00 | \n", + "48187 | \n", + "
2 | \n", + "13070861 | \n", + "224914 | \n", + "False | \n", + "vente en ligne | \n", + "Atelier | \n", + "pricing_formula | \n", + "2018-12-28 14:47:50+00:00 | \n", + "48187 | \n", + "
3 | \n", + "13070862 | \n", + "224914 | \n", + "False | \n", + "vente en ligne | \n", + "Atelier | \n", + "pricing_formula | \n", + "2018-12-28 14:47:50+00:00 | \n", + "48187 | \n", + "
4 | \n", + "13070863 | \n", + "224914 | \n", + "False | \n", + "vente en ligne | \n", + "Atelier | \n", + "pricing_formula | \n", + "2018-12-28 14:47:50+00:00 | \n", + "48187 | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
1826667 | \n", + "18643847 | \n", + "350454 | \n", + "False | \n", + "vad | \n", + "Billet en nombre | \n", + "pricing_formula | \n", + "2022-08-02 08:59:17+00:00 | \n", + "41 | \n", + "
1826668 | \n", + "19853111 | \n", + "383564 | \n", + "False | \n", + "vad | \n", + "Billet en nombre | \n", + "pricing_formula | \n", + "2022-11-04 14:25:42+00:00 | \n", + "62763 | \n", + "
1826669 | \n", + "19860514 | \n", + "383751 | \n", + "False | \n", + "vad | \n", + "Billet en nombre | \n", + "pricing_formula | \n", + "2022-11-18 10:47:26+00:00 | \n", + "1195566 | \n", + "
1826670 | \n", + "19860515 | \n", + "383751 | \n", + "False | \n", + "vad | \n", + "Billet en nombre | \n", + "pricing_formula | \n", + "2022-11-18 10:47:26+00:00 | \n", + "1195566 | \n", + "
1826671 | \n", + "19860516 | \n", + "383751 | \n", + "False | \n", + "vad | \n", + "Billet en nombre | \n", + "pricing_formula | \n", + "2022-11-18 10:47:26+00:00 | \n", + "1195566 | \n", + "
1826672 rows × 8 columns
\n", + "\n", + " | nb_suppliers | \n", + "label_na | \n", + "itr_na | \n", + "commission_na | \n", + "
---|---|---|---|---|
0 | \n", + "9 | \n", + "100.0 | \n", + "100.0 | \n", + "100.0 | \n", + "