diff --git a/0_Cleaning_and_merge.ipynb b/0_Cleaning_and_merge.ipynb index 68efb55..be348ed 100644 --- a/0_Cleaning_and_merge.ipynb +++ b/0_Cleaning_and_merge.ipynb @@ -1759,6 +1759,772 @@ "# Construction des variables explicatives" ] }, + { + "cell_type": "markdown", + "id": "b09c2964-bef9-489e-ad71-84959054531b", + "metadata": {}, + "source": [ + "## Alexis' work" + ] + }, + { + "cell_type": "code", + "execution_count": 142, + "id": "4ab1c0d2-0097-4669-b984-b6822c976740", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
event_type_idavg_amount
026.150659
147.762474
254.452618
366.439463
\n", + "
" + ], + "text/plain": [ + " event_type_id avg_amount\n", + "0 2 6.150659\n", + "1 4 7.762474\n", + "2 5 4.452618\n", + "3 6 6.439463" + ] + }, + "execution_count": 142, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "avg_amount = (df1_products_purchased_reduced.groupby([\"event_type_id\"])\n", + " .agg({\"amount\" : \"mean\"}).reset_index()\n", + " .rename(columns = {'amount' : 'avg_amount'}))\n", + "\n", + "avg_amount" + ] + }, + { + "cell_type": "code", + "execution_count": 143, + "id": "a9c62b39-389e-4dac-89a6-ac8a59fea58a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idevent_type_idnb_ticketsavg_amount
0123842266.150659
1144532427.762474
2152017504.452618
3162173566.439463
4221436.150659
\n", + "
" + ], + "text/plain": [ + " customer_id event_type_id nb_tickets avg_amount\n", + "0 1 2 384226 6.150659\n", + "1 1 4 453242 7.762474\n", + "2 1 5 201750 4.452618\n", + "3 1 6 217356 6.439463\n", + "4 2 2 143 6.150659" + ] + }, + "execution_count": 143, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nb_tickets = (df1_products_purchased_reduced.groupby([\"customer_id\", \"event_type_id\"])\n", + " .agg({\"ticket_id\" : \"count\"}).reset_index()\n", + " .rename(columns = {'ticket_id' : 'nb_tickets'})\n", + " .merge(avg_amount, how='left', on='event_type_id'))\n", + "nb_tickets.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 144, + "id": "8710611c-7eb8-45ca-bdcc-009f4081f9e2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idbirthdatestreet_idis_partnergenderis_email_trueopt_instructure_idprofessionlanguagemcp_contact_idlast_buying_datemax_priceticket_sumaverage_pricefidelityaverage_purchase_delayaverage_price_basketaverage_ticket_baskettotal_pricepurchase_countfirst_buying_datecountryagetenant_idnb_campaignsnb_campaigns_openedtime_to_open
012751NaN2False1TrueTrueNaNNaNNaNNaNNaNNaN00.00NaNNaNNaNNaN0NaTfrNaN1311NaNNaNNaT
112825NaN2False2TrueTrueNaNNaNNaNNaNNaNNaN00.00NaNNaNNaNNaN0NaTfrNaN1311NaNNaNNaT
211261NaN2False1TrueTrueNaNNaNNaNNaNNaNNaN00.00NaNNaNNaNNaN0NaTfrNaN1311NaNNaNNaT
313071NaN2False2TrueTrueNaNNaNNaNNaNNaNNaN00.00NaNNaNNaNNaN0NaTfrNaN1311NaNNaNNaT
4653061NaN10False2TrueFalseNaNNaNNaNNaNNaNNaN00.00NaNNaNNaNNaN0NaTNaNNaN131180.02.00 days 19:53:02.500000
\n", + "
" + ], + "text/plain": [ + " customer_id birthdate street_id is_partner gender is_email_true \\\n", + "0 12751 NaN 2 False 1 True \n", + "1 12825 NaN 2 False 2 True \n", + "2 11261 NaN 2 False 1 True \n", + "3 13071 NaN 2 False 2 True \n", + "4 653061 NaN 10 False 2 True \n", + "\n", + " opt_in structure_id profession language mcp_contact_id last_buying_date \\\n", + "0 True NaN NaN NaN NaN NaN \n", + "1 True NaN NaN NaN NaN NaN \n", + "2 True NaN NaN NaN NaN NaN \n", + "3 True NaN NaN NaN NaN NaN \n", + "4 False NaN NaN NaN NaN NaN \n", + "\n", + " max_price ticket_sum average_price fidelity average_purchase_delay \\\n", + "0 NaN 0 0.0 0 NaN \n", + "1 NaN 0 0.0 0 NaN \n", + "2 NaN 0 0.0 0 NaN \n", + "3 NaN 0 0.0 0 NaN \n", + "4 NaN 0 0.0 0 NaN \n", + "\n", + " average_price_basket average_ticket_basket total_price purchase_count \\\n", + "0 NaN NaN NaN 0 \n", + "1 NaN NaN NaN 0 \n", + "2 NaN NaN NaN 0 \n", + "3 NaN NaN NaN 0 \n", + "4 NaN NaN NaN 0 \n", + "\n", + " first_buying_date country age tenant_id nb_campaigns \\\n", + "0 NaT fr NaN 1311 NaN \n", + "1 NaT fr NaN 1311 NaN \n", + "2 NaT fr NaN 1311 NaN \n", + "3 NaT fr NaN 1311 NaN \n", + "4 NaT NaN NaN 1311 80.0 \n", + "\n", + " nb_campaigns_opened time_to_open \n", + "0 NaN NaT \n", + "1 NaN NaT \n", + "2 NaN NaT \n", + "3 NaN NaT \n", + "4 2.0 0 days 19:53:02.500000 " + ] + }, + "execution_count": 144, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Fusion avec KPI campaigns liés au customer\n", + "df1_customer = pd.merge(df1_customerplus_clean, df1_campaigns_kpi, on = 'customer_id', how = 'left')\n", + "df1_customer.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 146, + "id": "a89fad43-ee68-4081-9384-3e9f08ec6a59", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape : (156289, 31)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idbirthdatestreet_idis_partnergenderis_email_trueopt_instructure_idprofessionlanguagemcp_contact_idlast_buying_datemax_priceticket_sumaverage_pricefidelityaverage_purchase_delayaverage_price_basketaverage_ticket_baskettotal_pricepurchase_countfirst_buying_datecountryagetenant_idnb_campaignsnb_campaigns_openedtime_to_openevent_type_idnb_ticketsavg_amount
012751NaN2False1TrueTrueNaNNaNNaNNaNNaNNaN00.00NaNNaNNaNNaN0NaTfrNaN1311NaNNaNNaTNaNNaNNaN
112825NaN2False2TrueTrueNaNNaNNaNNaNNaNNaN00.00NaNNaNNaNNaN0NaTfrNaN1311NaNNaNNaTNaNNaNNaN
211261NaN2False1TrueTrueNaNNaNNaNNaNNaNNaN00.00NaNNaNNaNNaN0NaTfrNaN1311NaNNaNNaTNaNNaNNaN
313071NaN2False2TrueTrueNaNNaNNaNNaNNaNNaN00.00NaNNaNNaNNaN0NaTfrNaN1311NaNNaNNaTNaNNaNNaN
4653061NaN10False2TrueFalseNaNNaNNaNNaNNaNNaN00.00NaNNaNNaNNaN0NaTNaNNaN131180.02.00 days 19:53:02.500000NaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " customer_id birthdate street_id is_partner gender is_email_true \\\n", + "0 12751 NaN 2 False 1 True \n", + "1 12825 NaN 2 False 2 True \n", + "2 11261 NaN 2 False 1 True \n", + "3 13071 NaN 2 False 2 True \n", + "4 653061 NaN 10 False 2 True \n", + "\n", + " opt_in structure_id profession language mcp_contact_id last_buying_date \\\n", + "0 True NaN NaN NaN NaN NaN \n", + "1 True NaN NaN NaN NaN NaN \n", + "2 True NaN NaN NaN NaN NaN \n", + "3 True NaN NaN NaN NaN NaN \n", + "4 False NaN NaN NaN NaN NaN \n", + "\n", + " max_price ticket_sum average_price fidelity average_purchase_delay \\\n", + "0 NaN 0 0.0 0 NaN \n", + "1 NaN 0 0.0 0 NaN \n", + "2 NaN 0 0.0 0 NaN \n", + "3 NaN 0 0.0 0 NaN \n", + "4 NaN 0 0.0 0 NaN \n", + "\n", + " average_price_basket average_ticket_basket total_price purchase_count \\\n", + "0 NaN NaN NaN 0 \n", + "1 NaN NaN NaN 0 \n", + "2 NaN NaN NaN 0 \n", + "3 NaN NaN NaN 0 \n", + "4 NaN NaN NaN 0 \n", + "\n", + " first_buying_date country age tenant_id nb_campaigns \\\n", + "0 NaT fr NaN 1311 NaN \n", + "1 NaT fr NaN 1311 NaN \n", + "2 NaT fr NaN 1311 NaN \n", + "3 NaT fr NaN 1311 NaN \n", + "4 NaT NaN NaN 1311 80.0 \n", + "\n", + " nb_campaigns_opened time_to_open event_type_id nb_tickets \\\n", + "0 NaN NaT NaN NaN \n", + "1 NaN NaT NaN NaN \n", + "2 NaN NaT NaN NaN \n", + "3 NaN NaT NaN NaN \n", + "4 2.0 0 days 19:53:02.500000 NaN NaN \n", + "\n", + " avg_amount \n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN " + ] + }, + "execution_count": 146, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1_customer_product = pd.merge(df1_customer, nb_tickets, on = 'customer_id', how = 'left')\n", + "print(\"shape : \", df1_customer_product.shape)\n", + "df1_customer_product.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 147, + "id": "a19fec00-4ece-400c-937c-ce5cd8daccfd", + "metadata": {}, + "outputs": [], + "source": [ + 
"df1_customer_product.to_csv(\"customer_product.csv\", index = False)" + ] + }, { "cell_type": "markdown", "id": "314f1b7f-ae48-4c6f-8469-9ce879043243",