Ajout travail Alexis

This commit is contained in:
Antoine JOUBREL 2024-02-10 18:53:59 +00:00
parent 071b9fda36
commit 5f02915ae9

View File

@ -1759,6 +1759,772 @@
"# Construction des variables explicatives"
]
},
{
"cell_type": "markdown",
"id": "b09c2964-bef9-489e-ad71-84959054531b",
"metadata": {},
"source": [
"## Alexis' work"
]
},
{
"cell_type": "code",
"execution_count": 142,
"id": "4ab1c0d2-0097-4669-b984-b6822c976740",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>event_type_id</th>\n",
" <th>avg_amount</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2</td>\n",
" <td>6.150659</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>4</td>\n",
" <td>7.762474</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>5</td>\n",
" <td>4.452618</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>6</td>\n",
" <td>6.439463</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" event_type_id avg_amount\n",
"0 2 6.150659\n",
"1 4 7.762474\n",
"2 5 4.452618\n",
"3 6 6.439463"
]
},
"execution_count": 142,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"avg_amount = (df1_products_purchased_reduced.groupby([\"event_type_id\"])\n",
" .agg({\"amount\" : \"mean\"}).reset_index()\n",
" .rename(columns = {'amount' : 'avg_amount'}))\n",
"\n",
"avg_amount"
]
},
{
"cell_type": "code",
"execution_count": 143,
"id": "a9c62b39-389e-4dac-89a6-ac8a59fea58a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>customer_id</th>\n",
" <th>event_type_id</th>\n",
" <th>nb_tickets</th>\n",
" <th>avg_amount</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>384226</td>\n",
" <td>6.150659</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>453242</td>\n",
" <td>7.762474</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" <td>201750</td>\n",
" <td>4.452618</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>217356</td>\n",
" <td>6.439463</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>143</td>\n",
" <td>6.150659</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" customer_id event_type_id nb_tickets avg_amount\n",
"0 1 2 384226 6.150659\n",
"1 1 4 453242 7.762474\n",
"2 1 5 201750 4.452618\n",
"3 1 6 217356 6.439463\n",
"4 2 2 143 6.150659"
]
},
"execution_count": 143,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nb_tickets = (df1_products_purchased_reduced.groupby([\"customer_id\", \"event_type_id\"])\n",
" .agg({\"ticket_id\" : \"count\"}).reset_index()\n",
" .rename(columns = {'ticket_id' : 'nb_tickets'})\n",
" .merge(avg_amount, how='left', on='event_type_id'))\n",
"nb_tickets.head()"
]
},
{
"cell_type": "code",
"execution_count": 144,
"id": "8710611c-7eb8-45ca-bdcc-009f4081f9e2",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>customer_id</th>\n",
" <th>birthdate</th>\n",
" <th>street_id</th>\n",
" <th>is_partner</th>\n",
" <th>gender</th>\n",
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
" <th>structure_id</th>\n",
" <th>profession</th>\n",
" <th>language</th>\n",
" <th>mcp_contact_id</th>\n",
" <th>last_buying_date</th>\n",
" <th>max_price</th>\n",
" <th>ticket_sum</th>\n",
" <th>average_price</th>\n",
" <th>fidelity</th>\n",
" <th>average_purchase_delay</th>\n",
" <th>average_price_basket</th>\n",
" <th>average_ticket_basket</th>\n",
" <th>total_price</th>\n",
" <th>purchase_count</th>\n",
" <th>first_buying_date</th>\n",
" <th>country</th>\n",
" <th>age</th>\n",
" <th>tenant_id</th>\n",
" <th>nb_campaigns</th>\n",
" <th>nb_campaigns_opened</th>\n",
" <th>time_to_open</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>12751</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaT</td>\n",
" <td>fr</td>\n",
" <td>NaN</td>\n",
" <td>1311</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>12825</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>False</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaT</td>\n",
" <td>fr</td>\n",
" <td>NaN</td>\n",
" <td>1311</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>11261</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaT</td>\n",
" <td>fr</td>\n",
" <td>NaN</td>\n",
" <td>1311</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>13071</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>False</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaT</td>\n",
" <td>fr</td>\n",
" <td>NaN</td>\n",
" <td>1311</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>653061</td>\n",
" <td>NaN</td>\n",
" <td>10</td>\n",
" <td>False</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1311</td>\n",
" <td>80.0</td>\n",
" <td>2.0</td>\n",
" <td>0 days 19:53:02.500000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" customer_id birthdate street_id is_partner gender is_email_true \\\n",
"0 12751 NaN 2 False 1 True \n",
"1 12825 NaN 2 False 2 True \n",
"2 11261 NaN 2 False 1 True \n",
"3 13071 NaN 2 False 2 True \n",
"4 653061 NaN 10 False 2 True \n",
"\n",
" opt_in structure_id profession language mcp_contact_id last_buying_date \\\n",
"0 True NaN NaN NaN NaN NaN \n",
"1 True NaN NaN NaN NaN NaN \n",
"2 True NaN NaN NaN NaN NaN \n",
"3 True NaN NaN NaN NaN NaN \n",
"4 False NaN NaN NaN NaN NaN \n",
"\n",
" max_price ticket_sum average_price fidelity average_purchase_delay \\\n",
"0 NaN 0 0.0 0 NaN \n",
"1 NaN 0 0.0 0 NaN \n",
"2 NaN 0 0.0 0 NaN \n",
"3 NaN 0 0.0 0 NaN \n",
"4 NaN 0 0.0 0 NaN \n",
"\n",
" average_price_basket average_ticket_basket total_price purchase_count \\\n",
"0 NaN NaN NaN 0 \n",
"1 NaN NaN NaN 0 \n",
"2 NaN NaN NaN 0 \n",
"3 NaN NaN NaN 0 \n",
"4 NaN NaN NaN 0 \n",
"\n",
" first_buying_date country age tenant_id nb_campaigns \\\n",
"0 NaT fr NaN 1311 NaN \n",
"1 NaT fr NaN 1311 NaN \n",
"2 NaT fr NaN 1311 NaN \n",
"3 NaT fr NaN 1311 NaN \n",
"4 NaT NaN NaN 1311 80.0 \n",
"\n",
" nb_campaigns_opened time_to_open \n",
"0 NaN NaT \n",
"1 NaN NaT \n",
"2 NaN NaT \n",
"3 NaN NaT \n",
"4 2.0 0 days 19:53:02.500000 "
]
},
"execution_count": 144,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fusion avec KPI campaigns liés au customer\n",
"df1_customer = pd.merge(df1_customerplus_clean, df1_campaigns_kpi, on = 'customer_id', how = 'left')\n",
"df1_customer.head()"
]
},
{
"cell_type": "code",
"execution_count": 146,
"id": "a89fad43-ee68-4081-9384-3e9f08ec6a59",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"shape : (156289, 31)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>customer_id</th>\n",
" <th>birthdate</th>\n",
" <th>street_id</th>\n",
" <th>is_partner</th>\n",
" <th>gender</th>\n",
" <th>is_email_true</th>\n",
" <th>opt_in</th>\n",
" <th>structure_id</th>\n",
" <th>profession</th>\n",
" <th>language</th>\n",
" <th>mcp_contact_id</th>\n",
" <th>last_buying_date</th>\n",
" <th>max_price</th>\n",
" <th>ticket_sum</th>\n",
" <th>average_price</th>\n",
" <th>fidelity</th>\n",
" <th>average_purchase_delay</th>\n",
" <th>average_price_basket</th>\n",
" <th>average_ticket_basket</th>\n",
" <th>total_price</th>\n",
" <th>purchase_count</th>\n",
" <th>first_buying_date</th>\n",
" <th>country</th>\n",
" <th>age</th>\n",
" <th>tenant_id</th>\n",
" <th>nb_campaigns</th>\n",
" <th>nb_campaigns_opened</th>\n",
" <th>time_to_open</th>\n",
" <th>event_type_id</th>\n",
" <th>nb_tickets</th>\n",
" <th>avg_amount</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>12751</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaT</td>\n",
" <td>fr</td>\n",
" <td>NaN</td>\n",
" <td>1311</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>12825</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>False</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaT</td>\n",
" <td>fr</td>\n",
" <td>NaN</td>\n",
" <td>1311</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>11261</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>False</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaT</td>\n",
" <td>fr</td>\n",
" <td>NaN</td>\n",
" <td>1311</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>13071</td>\n",
" <td>NaN</td>\n",
" <td>2</td>\n",
" <td>False</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaT</td>\n",
" <td>fr</td>\n",
" <td>NaN</td>\n",
" <td>1311</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>653061</td>\n",
" <td>NaN</td>\n",
" <td>10</td>\n",
" <td>False</td>\n",
" <td>2</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaT</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1311</td>\n",
" <td>80.0</td>\n",
" <td>2.0</td>\n",
" <td>0 days 19:53:02.500000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" customer_id birthdate street_id is_partner gender is_email_true \\\n",
"0 12751 NaN 2 False 1 True \n",
"1 12825 NaN 2 False 2 True \n",
"2 11261 NaN 2 False 1 True \n",
"3 13071 NaN 2 False 2 True \n",
"4 653061 NaN 10 False 2 True \n",
"\n",
" opt_in structure_id profession language mcp_contact_id last_buying_date \\\n",
"0 True NaN NaN NaN NaN NaN \n",
"1 True NaN NaN NaN NaN NaN \n",
"2 True NaN NaN NaN NaN NaN \n",
"3 True NaN NaN NaN NaN NaN \n",
"4 False NaN NaN NaN NaN NaN \n",
"\n",
" max_price ticket_sum average_price fidelity average_purchase_delay \\\n",
"0 NaN 0 0.0 0 NaN \n",
"1 NaN 0 0.0 0 NaN \n",
"2 NaN 0 0.0 0 NaN \n",
"3 NaN 0 0.0 0 NaN \n",
"4 NaN 0 0.0 0 NaN \n",
"\n",
" average_price_basket average_ticket_basket total_price purchase_count \\\n",
"0 NaN NaN NaN 0 \n",
"1 NaN NaN NaN 0 \n",
"2 NaN NaN NaN 0 \n",
"3 NaN NaN NaN 0 \n",
"4 NaN NaN NaN 0 \n",
"\n",
" first_buying_date country age tenant_id nb_campaigns \\\n",
"0 NaT fr NaN 1311 NaN \n",
"1 NaT fr NaN 1311 NaN \n",
"2 NaT fr NaN 1311 NaN \n",
"3 NaT fr NaN 1311 NaN \n",
"4 NaT NaN NaN 1311 80.0 \n",
"\n",
" nb_campaigns_opened time_to_open event_type_id nb_tickets \\\n",
"0 NaN NaT NaN NaN \n",
"1 NaN NaT NaN NaN \n",
"2 NaN NaT NaN NaN \n",
"3 NaN NaT NaN NaN \n",
"4 2.0 0 days 19:53:02.500000 NaN NaN \n",
"\n",
" avg_amount \n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN "
]
},
"execution_count": 146,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1_customer_product = pd.merge(df1_customer, nb_tickets, on = 'customer_id', how = 'left')\n",
"print(\"shape : \", df1_customer_product.shape)\n",
"df1_customer_product.head()"
]
},
{
"cell_type": "code",
"execution_count": 147,
"id": "a19fec00-4ece-400c-937c-ce5cd8daccfd",
"metadata": {},
"outputs": [],
"source": [
"df1_customer_product.to_csv(\"customer_product.csv\", index = False)"
]
},
{
"cell_type": "markdown",
"id": "314f1b7f-ae48-4c6f-8469-9ce879043243",