210 lines
8.5 KiB
Python
210 lines
8.5 KiB
Python
|
|
import pandas as pd
|
||
|
|
import numpy as np
|
||
|
|
from sklearn.metrics.pairwise import cosine_similarity
|
||
|
|
from datetime import datetime, timedelta
|
||
|
|
from load import load
|
||
|
|
|
||
|
|
class AssetReconciler:
    """Link account IDs between two AUM snapshots by comparing holdings.

    For each account present at a past date, the reconciler rebuilds the
    holdings it is expected to have at a later date (past quantities plus the
    net flows booked in between) and picks, among current accounts of the same
    country, the one whose holdings vector has the highest cosine similarity.
    """

    def __init__(self, df_aum, df_flows):
        """Store private copies of the raw frames and parse their dates.

        Args:
            df_aum: Raw AUM rows. Needs 'Centralisation Date' plus the columns
                named by ``col_id``, ``col_isin``, ``col_qty_aum`` and
                ``col_block`` below.
            df_flows: Raw flow rows. Needs 'Centralisation Date' plus the
                columns named by ``col_id``, ``col_isin`` and ``col_qty_flow``.
        """
        # Work on copies so the caller's frames are never modified.
        self.df_aum = df_aum.copy()
        self.df_flows = df_flows.copy()

        # Basic cleaning: values that cannot be parsed become NaT.
        print("Parsing dates...")
        self.df_aum['Centralisation Date'] = pd.to_datetime(
            self.df_aum['Centralisation Date'], errors='coerce'
        )
        self.df_flows['Centralisation Date'] = pd.to_datetime(
            self.df_flows['Centralisation Date'], errors='coerce'
        )

        # Standardize column names for internal use
        self.col_id = 'Registrar Account - ID'
        self.col_isin = 'Product - Isin'
        self.col_qty_aum = 'Quantity - AUM'
        self.col_qty_flow = 'Quantity - NetFlows'
        self.col_block = 'Registrar Account - Country'

    def _pivot_quantities(self, subset, value_col):
        """Pivot rows into an account x ISIN matrix of summed ``value_col``."""
        return subset.pivot_table(
            index=self.col_id,
            columns=self.col_isin,
            values=value_col,
            aggfunc='sum',
        ).fillna(0)

    def _get_portfolio_matrix(self, df, date_filter):
        """Build the holdings matrix of ``df`` for one date.

        Args:
            df: Frame with a parsed 'Centralisation Date' column.
            date_filter: Date whose rows are kept (exact equality).

        Returns:
            DataFrame indexed by account ID with one column per ISIN and the
            summed AUM quantity as values (0 where an account has no row for
            an ISIN). An empty DataFrame when no row matches the date.
        """
        subset = df[df['Centralisation Date'] == date_filter]
        if subset.empty:
            print(f"Warning: No data found for date {date_filter}")
            return pd.DataFrame()

        # Pivot: Rows are Clients, Columns are ISINs
        return self._pivot_quantities(subset, self.col_qty_aum)

    def _get_aggregated_flows(self, start_date, end_date):
        """Sum net flows per account/ISIN over ``(start_date, end_date]``.

        The start bound is exclusive and the end bound inclusive.

        Returns:
            DataFrame indexed by account ID with one column per ISIN, or an
            empty DataFrame when no flow falls in the window.
        """
        flow_dates = self.df_flows['Centralisation Date']
        subset = self.df_flows[(flow_dates > start_date) & (flow_dates <= end_date)]
        if subset.empty:
            return pd.DataFrame()

        return self._pivot_quantities(subset, self.col_qty_flow)

    def _country_by_account(self, date):
        """Map account ID -> country using the AUM rows of ``date``."""
        snapshot = self.df_aum[self.df_aum['Centralisation Date'] == date]
        return snapshot.set_index(self.col_id)[self.col_block].to_dict()

    def _expected_state(self, block_past, mat_flows):
        """Return past holdings plus the flows booked on the same accounts.

        Only flows of accounts present in ``block_past`` are used. Flow ISINs
        that are not columns of ``block_past`` are kept as new columns, so a
        position opened after the past snapshot is part of the expected state.
        """
        if mat_flows.empty:
            return block_past.copy()

        # Accounts without any flow get an all-zero row.
        block_flows = mat_flows.reindex(index=block_past.index).fillna(0)
        return block_past.add(block_flows, fill_value=0)

    @staticmethod
    def _magnitude_ratio(total_expected, total_actual):
        """Return expected/actual total quantity, or 0 when actual is 0."""
        if total_actual == 0:
            return 0
        return total_expected / total_actual

    def match_accounts(self, date_past, date_current, similarity_threshold=0.95, magnitude_tolerance=0.2):
        """Link each past account to its most similar current account.

        Args:
            date_past: Snapshot date of the accounts to be linked. Any value
                accepted by ``pd.Timestamp`` (Timestamp, datetime,
                numpy.datetime64, ISO string).
            date_current: Snapshot date of the candidate accounts.
            similarity_threshold: Minimum cosine similarity for a
                "High Confidence Match".
            magnitude_tolerance: Allowed relative deviation of the ratio
                expected total / actual total from 1 (0.2 -> 0.8 to 1.2).

        Returns:
            DataFrame with one row per past account that has at least one
            current candidate in its country, with columns 'Past_ID',
            'Predicted_Current_ID', 'Similarity_Score', 'Magnitude_Ratio',
            'Match_Status' and 'Country'. An empty DataFrame when either
            snapshot has no data or no country has accounts on both dates.
        """
        # Normalise the inputs so .date() below works for every supported type.
        date_past = pd.Timestamp(date_past)
        date_current = pd.Timestamp(date_current)

        print("--- Running Matching Algorithm ---")
        print(f"Comparing State: {date_past.date()} -> {date_current.date()}")

        # 1. Get AUM Snapshots
        mat_past = self._get_portfolio_matrix(self.df_aum, date_past)
        mat_curr = self._get_portfolio_matrix(self.df_aum, date_current)
        if mat_past.empty or mat_curr.empty:
            return pd.DataFrame()

        # 2. Get Flows
        mat_flows = self._get_aggregated_flows(date_past, date_current)

        # 3. Blocking Strategy
        # Accounts are only compared within the same country. Both lookups are
        # built once, outside the per-country loop.
        past_countries = self._country_by_account(date_past)
        curr_countries = self._country_by_account(date_current)

        results = []

        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # output row order is stable from run to run.
        # NOTE: a NaN country never equals itself, so accounts without a
        # country end up in no block and produce no result row.
        for country in dict.fromkeys(past_countries.values()):
            ids_in_country_past = [i for i in mat_past.index if past_countries.get(i) == country]
            ids_in_country_curr = [i for i in mat_curr.index if curr_countries.get(i) == country]
            if not ids_in_country_past or not ids_in_country_curr:
                continue

            # Slice the matrices
            block_past = mat_past.loc[ids_in_country_past]
            block_curr = mat_curr.loc[ids_in_country_curr]

            # 4. Reconstruction: Expected State = Past + Flows
            expected_state = self._expected_state(block_past, mat_flows)

            # 5. Align Dimensions (Union of ISINs)
            all_isins = expected_state.columns.union(block_curr.columns)
            vec_expected = expected_state.reindex(columns=all_isins, fill_value=0)
            vec_actual = block_curr.reindex(columns=all_isins, fill_value=0)

            # 6. Calculate Cosine Similarity
            # Result is a matrix: Rows=PastIDs, Cols=CurrentIDs
            sim_matrix = cosine_similarity(vec_expected, vec_actual)

            # Row-wise aggregates computed once per block.
            best_positions = sim_matrix.argmax(axis=1)
            totals_expected = vec_expected.sum(axis=1).to_numpy()
            totals_actual = vec_actual.sum(axis=1).to_numpy()

            # 7. Find Best Matches
            for row, past_id in enumerate(vec_expected.index):
                best_idx = best_positions[row]
                best_score = sim_matrix[row][best_idx]
                curr_id = vec_actual.index[best_idx]

                # 8. Magnitude Check
                # Cosine similarity ignores scale; this keeps a small account
                # from being matched to a much larger one with the same mix.
                ratio = self._magnitude_ratio(totals_expected[row], totals_actual[best_idx])
                is_magnitude_ok = (1 - magnitude_tolerance) <= ratio <= (1 + magnitude_tolerance)

                if best_score >= similarity_threshold and is_magnitude_ok:
                    match_status = "High Confidence Match"
                elif past_id == curr_id:
                    # Best candidate carries the same ID but failed the
                    # similarity or magnitude test.
                    match_status = "Same ID (Retained)"
                else:
                    match_status = "Unmatched"

                results.append({
                    'Past_ID': past_id,
                    'Predicted_Current_ID': curr_id,
                    'Similarity_Score': round(best_score, 4),
                    'Magnitude_Ratio': round(ratio, 4),
                    'Match_Status': match_status,
                    'Country': country,
                })

        return pd.DataFrame(results)
|
||
|
|
|
||
|
|
# ==========================================
# MAIN EXECUTION
# ==========================================
def main():
    """Load the data, link the earliest snapshot to the latest, print a KPI.

    The analysis period is auto-detected as the first and last valid dates
    found in the AUM data. Nothing is returned; results go to stdout.
    """
    # 1. Load Data
    df_aum, df_flows = load()

    # 2. Initialize Logic
    reconciler = AssetReconciler(df_aum, df_flows)

    # 3. AUTO-DETECT DATES
    # Unparseable dates are NaT after parsing; NaT does not order reliably and
    # must never be picked as an analysis date, so it is dropped first. Values
    # are wrapped in pd.Timestamp because unique() may yield numpy.datetime64,
    # which lacks the .date() method used by match_accounts.
    date_values = reconciler.df_aum['Centralisation Date'].dropna().unique()
    available_dates = sorted(pd.Timestamp(value) for value in date_values)

    if len(available_dates) < 2:
        print("\nERROR: Not enough dates found in data to perform comparison.")
        print(f"Dates found: {[str(d) for d in available_dates]}")
        return

    # Automatically pick the First and Last date found in the file
    date_past = available_dates[0]
    date_current = available_dates[-1]

    print("\nAuto-Detected Analysis Period:")
    print(f"Start (Past): {date_past}")
    print(f"End (Current): {date_current}")

    # 4. Run Linkage
    results = reconciler.match_accounts(date_past, date_current)

    # 5. Calculate KPI
    if results.empty:
        print("No matches found.")
        return

    # Share of result rows whose status counts as a successful link.
    total_past = len(results)
    linked_statuses = ['High Confidence Match', 'Same ID (Retained)']
    matches = int(results['Match_Status'].isin(linked_statuses).sum())
    kpi_percentage = (matches / total_past) * 100

    print("\n" + "="*40)
    print("FINAL RESULTS")
    print("="*40)
    print(results[['Past_ID', 'Predicted_Current_ID', 'Similarity_Score', 'Magnitude_Ratio', 'Match_Status']])
    print("-" * 40)
    print(f"KPI: {kpi_percentage:.1f}% of Past Client Codes successfully linked to Current Codes.")
    print("="*40)


if __name__ == "__main__":
    main()
|