diff --git a/Exploration_billet_AJ.ipynb b/Exploration_billet_AJ.ipynb index d5db714..14c1b9a 100644 --- a/Exploration_billet_AJ.ipynb +++ b/Exploration_billet_AJ.ipynb @@ -103,7 +103,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_15896/4135596479.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n", + "/tmp/ipykernel_864/4135596479.py:10: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.\n", " df = pd.read_csv(file_in)\n" ] } @@ -474,7 +474,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_15896/232847087.py:3: SettingWithCopyWarning: \n", + "/tmp/ipykernel_864/232847087.py:3: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", @@ -756,7 +756,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 13, "id": "3c645ab7-16bf-4054-9ae2-15a8c32e29c6", "metadata": {}, "outputs": [ @@ -764,7 +764,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_15896/302783287.py:3: SettingWithCopyWarning: \n", + "/tmp/ipykernel_864/302783287.py:3: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", @@ -780,7 +780,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 14, "id": "4de7e2e2-6da4-4618-8444-b524399c5493", "metadata": {}, "outputs": [ @@ -872,7 +872,7 @@ "8 7 dab" ] }, - "execution_count": 35, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -1108,7 +1108,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 
17, "id": "cbb5e614-1fe5-4da0-bca0-8a242e0885da", "metadata": {}, "outputs": [ @@ -1116,7 +1116,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_15896/81842251.py:3: SettingWithCopyWarning: \n", + "/tmp/ipykernel_864/81842251.py:3: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", @@ -1140,7 +1140,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 18, "id": "f8d36b72-f8e7-45e5-b4fa-e0803493fd3c", "metadata": { "scrolled": true @@ -1335,7 +1335,7 @@ "[742250 rows x 7 columns]" ] }, - "execution_count": 20, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1346,7 +1346,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 19, "id": "3f266a9d-6eee-4b27-b6cc-d401bc2fa0b8", "metadata": {}, "outputs": [ @@ -1377,7 +1377,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 20, "id": "8b24ccbc-ccf0-4722-8cd9-8ee8aa90d1fd", "metadata": {}, "outputs": [], @@ -1389,7 +1389,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 21, "id": "27d18584-228f-4698-85d6-4d23151ea5ed", "metadata": {}, "outputs": [ @@ -1420,7 +1420,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 22, "id": "ea22e3a2-2b25-481d-8ebc-194e11a06cd9", "metadata": {}, "outputs": [], @@ -1439,7 +1439,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 23, "id": "e0b8b47a-b321-4a79-823c-36a131a78ac7", "metadata": {}, "outputs": [], @@ -1459,9 +1459,11 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 24, "id": "7572e6e7-f28d-43ba-b045-b9fa09e68e1d", - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [ { "data": { @@ -1664,7 +1666,7 @@ "[1826672 rows x 8 columns]" ] }, - 
"execution_count": 39, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -1673,13 +1675,479 @@ "df1_ticket_information" ] }, + { + "cell_type": "markdown", + "id": "ad2d0059-76d3-44b9-b0eb-0b0ca4d4ba75", + "metadata": {}, + "source": [ + "# Utilisation de fonctions" + ] + }, + { + "cell_type": "markdown", + "id": "27ecf058-23eb-4018-abbd-68c4ebe7c786", + "metadata": {}, + "source": [ + "## Nettoyage, selection et fusion" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "id": "b95464b1-26bc-4aac-84b4-45da83b92251", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# Fonction de nettoyage et selection\n", + "def preprocessing_tickets_area(tickets = None, purchases = None, suppliers = None, type_ofs = None):\n", + " # Base des tickets\n", + " tickets = tickets[['id', 'purchase_id', 'product_id', 'is_from_subscription', 'type_of', 'supplier_id']]\n", + " tickets.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n", + "\n", + " # Base des fournisseurs\n", + " suppliers = suppliers[['id', 'name']]\n", + " suppliers.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n", + "\n", + " # Base des types de billets\n", + " type_ofs = type_ofs[['id', 'name', 'children']]\n", + " type_ofs.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)\n", + "\n", + " # Base des achats\n", + " # Nettoyage de la date d'achat\n", + " purchases['purchase_date'] = pd.to_datetime(purchases['purchase_date'], utc = True)\n", + " purchases['purchase_date'] = pd.to_datetime(purchases['purchase_date'], format = 'ISO8601')\n", + " # Selection des variables\n", + " purchases = purchases[['id', 'purchase_date', 'customer_id']]\n", + "\n", + " # Fusions \n", + " # Fusion avec fournisseurs\n", + " ticket_information = pd.merge(tickets, suppliers, left_on = 'supplier_id', right_on = 'id', how = 'inner')\n", + " ticket_information.drop(['supplier_id', 'id'], axis = 1, inplace=True)\n", + " \n", + " # 
Fusion avec type de tickets\n", + " ticket_information = pd.merge(ticket_information, type_ofs, left_on = 'type_of', right_on = 'id', how = 'inner')\n", + " ticket_information.drop(['type_of', 'id'], axis = 1, inplace=True)\n", + " \n", + " # Fusion avec achats\n", + " ticket_information = pd.merge(ticket_information, purchases, left_on = 'purchase_id', right_on = 'id', how = 'inner')\n", + " ticket_information.drop(['purchase_id', 'id'], axis = 1, inplace=True)\n", + "\n", + " return ticket_information" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "3e1d2ba7-ff4f-48eb-93a8-2bb648c70396", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_864/2452826288.py:5: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " tickets_clean.rename(columns = {'id' : 'ticket_id'}, inplace = True)\n", + "/tmp/ipykernel_864/2452826288.py:9: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " suppliers_clean.rename(columns = {'name' : 'supplier_name'}, inplace = True)\n", + "/tmp/ipykernel_864/2452826288.py:13: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " type_ofs_clean.rename(columns = {'name' : 'type_of_ticket_name'}, inplace = True)\n" + ] + } + ], + "source": [ + "df1_ticket_information_test = preprocessing_tickets_area(tickets = df1_tickets, purchases = df1_purchases, suppliers 
= df1_suppliers, type_ofs = df1_type_ofs)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "2877f3de-55d6-42d6-ad94-352d3e107862", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ticket_idproduct_idis_from_subscriptionsupplier_nametype_of_ticket_namechildrenpurchase_datecustomer_id
013070859225251Falsevente en ligneAtelierpricing_formula2018-12-28 14:47:50+00:0048187
113070860224914Falsevente en ligneAtelierpricing_formula2018-12-28 14:47:50+00:0048187
213070861224914Falsevente en ligneAtelierpricing_formula2018-12-28 14:47:50+00:0048187
313070862224914Falsevente en ligneAtelierpricing_formula2018-12-28 14:47:50+00:0048187
413070863224914Falsevente en ligneAtelierpricing_formula2018-12-28 14:47:50+00:0048187
...........................
182666718643847350454FalsevadBillet en nombrepricing_formula2022-08-02 08:59:17+00:0041
182666819853111383564FalsevadBillet en nombrepricing_formula2022-11-04 14:25:42+00:0062763
182666919860514383751FalsevadBillet en nombrepricing_formula2022-11-18 10:47:26+00:001195566
182667019860515383751FalsevadBillet en nombrepricing_formula2022-11-18 10:47:26+00:001195566
182667119860516383751FalsevadBillet en nombrepricing_formula2022-11-18 10:47:26+00:001195566
\n", + "

1826672 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " ticket_id product_id is_from_subscription supplier_name \\\n", + "0 13070859 225251 False vente en ligne \n", + "1 13070860 224914 False vente en ligne \n", + "2 13070861 224914 False vente en ligne \n", + "3 13070862 224914 False vente en ligne \n", + "4 13070863 224914 False vente en ligne \n", + "... ... ... ... ... \n", + "1826667 18643847 350454 False vad \n", + "1826668 19853111 383564 False vad \n", + "1826669 19860514 383751 False vad \n", + "1826670 19860515 383751 False vad \n", + "1826671 19860516 383751 False vad \n", + "\n", + " type_of_ticket_name children purchase_date \\\n", + "0 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n", + "1 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n", + "2 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n", + "3 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n", + "4 Atelier pricing_formula 2018-12-28 14:47:50+00:00 \n", + "... ... ... ... \n", + "1826667 Billet en nombre pricing_formula 2022-08-02 08:59:17+00:00 \n", + "1826668 Billet en nombre pricing_formula 2022-11-04 14:25:42+00:00 \n", + "1826669 Billet en nombre pricing_formula 2022-11-18 10:47:26+00:00 \n", + "1826670 Billet en nombre pricing_formula 2022-11-18 10:47:26+00:00 \n", + "1826671 Billet en nombre pricing_formula 2022-11-18 10:47:26+00:00 \n", + "\n", + " customer_id \n", + "0 48187 \n", + "1 48187 \n", + "2 48187 \n", + "3 48187 \n", + "4 48187 \n", + "... ... 
def suppliers_exploration(suppliers = None, columns = ('label', 'itr', 'commission')):
    """Summarise a suppliers table: distinct supplier names and NaN rates.

    Parameters
    ----------
    suppliers : pandas.DataFrame
        Needs a 'name' column plus every column listed in ``columns``.
    columns : iterable of str, optional
        Columns whose share of missing values is reported. Defaults to
        'label', 'itr' and 'commission'.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with 'nb_suppliers' (number of distinct values in
        'name') followed by one '<column>_na' entry per audited column,
        holding the percentage (0-100) of missing values. For an empty
        table the rates are NaN.
    """
    summary = {'nb_suppliers' : [suppliers['name'].nunique()]}

    # The mean of the boolean NaN mask is the fraction of missing values.
    for column in columns:
        summary[f'{column}_na'] = [suppliers[column].isna().mean() * 100]

    return pd.DataFrame(summary)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nb_supplierslabel_naitr_nacommission_na
09100.0100.0100.0
\n", + "
" + ], + "text/plain": [ + " nb_suppliers label_na itr_na commission_na\n", + "0 9 100.0 100.0 100.0" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_suppliers_desc" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "0030fd02-09e3-42f5-9c83-290458a38c29", + "metadata": {}, + "outputs": [], + "source": [ + "BUCKET = \"bdc2324-data\"\n", + "liste_folders = fs.ls(BUCKET)\n", + "\n", + "liste_files = []\n", + "for company_folder in liste_folders : \n", + " liste_files.extend(fs.ls(company_folder))" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "6b1736d1-8fd7-4fcc-9431-b8bf0c7b4f2b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['bdc2324-data/1/1suppliers.csv', 'bdc2324-data/10/10suppliers.csv', 'bdc2324-data/101/101suppliers.csv', 'bdc2324-data/11/11suppliers.csv', 'bdc2324-data/12/12suppliers.csv', 'bdc2324-data/13/13suppliers.csv', 'bdc2324-data/14/14suppliers.csv', 'bdc2324-data/2/2suppliers.csv', 'bdc2324-data/3/3suppliers.csv', 'bdc2324-data/4/4suppliers.csv', 'bdc2324-data/5/5suppliers.csv', 'bdc2324-data/6/6suppliers.csv', 'bdc2324-data/7/7suppliers.csv', 'bdc2324-data/8/8suppliers.csv', 'bdc2324-data/9/9suppliers.csv']\n" + ] + } + ], + "source": [ + "liste_database_select = ['suppliers']\n", + "\n", + "# Filtrer la liste pour les éléments contenant au moins un élément de la liste à tester\n", + "liste_suppliers = [element for element in liste_files if any(element_part in element for element_part in liste_database_select)]\n", + "\n", + "# Afficher le résultat\n", + "print(liste_suppliers)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "226b694b-0b00-4167-b69f-3178902254eb", + "metadata": {}, + "outputs": [], + "source": [ + "# loop to create dataframes from file 2\n", + "def database_loading(database_name = None):\n", + " files_path = database_name\n", + " \n", + " 
client_number = files_path[0].split(\"/\")[1]\n", + " df_prefix = \"df\" + str(client_number) + \"_\"\n", + " \n", + " for i in range(len(files_path)) :\n", + " current_path = files_path[i]\n", + " with fs.open(current_path, mode=\"rb\") as file_in:\n", + " df = pd.read_csv(file_in)\n", + " # the pattern of the name is df1xxx\n", + " nom_dataframe = df_prefix + re.search(r'\\/(\\d+)\\/(\\d+)([a-zA-Z_]+)\\.csv$', current_path).group(3)\n", + " globals()[nom_dataframe] = df\n", + "\n", + " " + ] } ], "metadata": {