# Day-resolution format shared by the coverage helpers below.
_DATE_FORMAT = "%Y-%m-%d"


def display_covering_time(df, company, datecover):
    """Record and print the purchase-date coverage of one company.

    Args:
        df: Table exposing a ``'purchase_date'`` column whose ``min()`` and
            ``max()`` return datetime-like values (anything with ``strftime``).
        company: Key under which the coverage is stored in ``datecover``.
        datecover: Dict mapping company -> list of covered days; it is
            updated in place.

    Returns:
        The same ``datecover`` dict, with ``datecover[company]`` set to one
        midnight ``datetime`` per day, starting at the first purchase day and
        stopping one day before the last purchase day.
    """
    purchase_dates = df['purchase_date']
    min_date = purchase_dates.min().strftime(_DATE_FORMAT)
    max_date = purchase_dates.max().strftime(_DATE_FORMAT)

    # Re-parse the day strings so both bounds are plain datetimes at midnight,
    # whatever time of day the original purchases carried.
    first_day = datetime.strptime(min_date, _DATE_FORMAT)
    last_day = datetime.strptime(max_date, _DATE_FORMAT)
    n_days = (last_day - first_day).days

    # NOTE(review): range(n_days) excludes last_day itself, so the printed
    # upper bound is not part of the stored coverage. Kept as-is because the
    # downstream split dates depend on it -- confirm whether this is intended.
    datecover[company] = [
        first_day + timedelta(days=offset) for offset in range(n_days)
    ]
    print(f'Couverture Company {company} : {min_date} - {max_date}')
    return datecover


def compute_time_intersection(datecover):
    """Return the days covered by every company, as sorted date strings.

    Args:
        datecover: Dict mapping company -> iterable of ``datetime`` days, as
            filled by ``display_covering_time``.

    Returns:
        A chronologically sorted list of ``'YYYY-MM-DD'`` strings present in
        every company's coverage. Empty when ``datecover`` is empty or when
        the companies share no day.
    """
    # set.intersection() needs at least one operand; no company, no coverage.
    if not datecover:
        return []
    day_sets = [set(days) for days in datecover.values()]
    common_days = set.intersection(*day_sets)
    # ISO-style day strings sort lexicographically in chronological order.
    return sorted(day.strftime(_DATE_FORMAT) for day in common_days)


def split_coverage_dates(dt_coverage, coverage_features=0.7):
    """Pick the start, end-of-features and final dates from a sorted coverage.

    Args:
        dt_coverage: Non-empty, sorted sequence of dates.
        coverage_features: Fraction (between 0 and 1) of the coverage that
            precedes the end-of-features date.

    Returns:
        Tuple ``(start_date, end_of_features, final_date)``.

    Raises:
        ValueError: If ``coverage_features`` is outside ``[0, 1]``.
        IndexError: If ``dt_coverage`` is empty.
    """
    if not 0 <= coverage_features <= 1:
        raise ValueError(
            f"coverage_features must be between 0 and 1, got {coverage_features!r}"
        )
    if not dt_coverage:
        # IndexError is what indexing the empty list would raise anyway;
        # the type is kept and only the message is made explicit.
        raise IndexError("no date is covered by all companies")
    # Clamp so that coverage_features == 1 maps to the last date instead of
    # running one past the end of the list.
    cut = min(int(coverage_features * len(dt_coverage)), len(dt_coverage) - 1)
    return dt_coverage[0], dt_coverage[cut], dt_coverage[-1]


def df_coverage_modelization(sport, coverage_features = 0.7):
    """Compute the dates used to build the train and test datasets.

    For each company in ``sport`` the reduced purchase table is loaded and its
    date coverage recorded; the dates common to all companies are then split
    according to ``coverage_features``.

    Args:
        sport: Iterable of company identifiers.
        coverage_features: Fraction of the common coverage that precedes the
            end-of-features date.

    Returns:
        Tuple ``(start_date, end_of_features, final_date)`` of
        ``'YYYY-MM-DD'`` strings.

    Raises:
        ValueError: If ``coverage_features`` is outside ``[0, 1]``.
        IndexError: If the companies have no covered date in common.
    """
    datecover = {}
    for company in sport:
        purchases = display_input_databases(
            company,
            file_name="products_purchased_reduced",
            datetime_col=['purchase_date'],
        )
        datecover = display_covering_time(purchases, company, datecover)
    dt_coverage = compute_time_intersection(datecover)
    # Honour the caller's ratio; it used to be ignored in favour of a fixed 0.7.
    return split_coverage_dates(dt_coverage, coverage_features)