You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
319 lines
8.9 KiB
319 lines
8.9 KiB
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# CTA 1D Label Analysis\n",
|
|
"\n",
|
|
"Explore label distributions and compare different normalization blending strategies.\n",
|
|
"\n",
|
|
"**Purpose**: Understand how different normalization methods affect label distributions and identify optimal blending."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"import seaborn as sns\n",
|
|
"\n",
|
|
"from qshare.data.pandas.cta_1d.dataset import load_features, load_label\n",
|
|
"from qshare.data.pandas.cta_1d.label import normalize_label_dual, normalize_label\n",
|
|
"from qshare.io.ddb.cta import load_cta_returns\n",
|
|
"\n",
|
|
"import sys\n",
|
|
"sys.path.insert(0, '../')\n",
|
|
"from common.plotting import setup_plot_style\n",
|
|
"from src.labels import BLEND_CONFIGS, get_blend_weights, describe_blend_config\n",
|
|
"\n",
|
|
"setup_plot_style()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 1. Configuration"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"CONFIG = {\n",
|
|
" 'dt_range': ['2020-01-01', '2024-12-31'],\n",
|
|
" 'fit_range': ['2020-01-01', '2021-12-31'], # For zscore normalization\n",
|
|
" 'return_type': 'o2c_twap1min',\n",
|
|
"}"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 2. Load Raw Returns\n",
|
|
"\n",
|
|
"Load the raw return series before any normalization."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Load returns\n",
|
|
"print(\"Loading raw returns...\")\n",
|
|
"df_returns = load_cta_returns(\n",
|
|
" since_date=CONFIG['dt_range'][0],\n",
|
|
" end_date=CONFIG['dt_range'][1],\n",
|
|
")\n",
|
|
"\n",
|
|
"return_col = CONFIG['return_type']\n",
|
|
"raw_returns = df_returns[return_col].copy()\n",
|
|
"\n",
|
|
"print(f\"\\nRaw {return_col} returns:\")\n",
|
|
"print(raw_returns.describe())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Plot raw return distribution\n",
|
|
"fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n",
|
|
"\n",
|
|
"# Histogram\n",
|
|
"raw_returns.hist(bins=100, ax=axes[0], edgecolor='black')\n",
|
|
"axes[0].set_title(f'Raw {return_col} Distribution')\n",
|
|
"axes[0].axvline(x=0, color='red', linestyle='--')\n",
|
|
"\n",
|
|
"# Time series\n",
|
|
"daily_mean = raw_returns.groupby(level=0).mean()\n",
|
|
"axes[1].plot(daily_mean.index, daily_mean.values)\n",
|
|
"axes[1].set_title('Daily Mean Return')\n",
|
|
"axes[1].axhline(y=0, color='red', linestyle='--')\n",
|
|
"\n",
|
|
"plt.tight_layout()\n",
|
|
"plt.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 3. Compare Normalization Methods\n",
|
|
"\n",
|
|
"Apply each normalization method individually and compare."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Load dominant contract mapping for proper label construction\n",
|
|
"from qshare.io.ddb.cta import load_cta_dominant_contracts\n",
|
|
"\n",
|
|
"print(\"Loading dominant contract mapping...\")\n",
|
|
"df_dominant = load_cta_dominant_contracts(\n",
|
|
" since_date=CONFIG['dt_range'][0],\n",
|
|
" end_date=CONFIG['dt_range'][1],\n",
|
|
")\n",
|
|
"\n",
|
|
"# Merge returns with dominant mapping\n",
|
|
"df_merged = df_dominant.join(raw_returns, how='left')\n",
|
|
"\n",
|
|
"# Calculate different normalization methods\n",
|
|
"print(\"\\nApplying normalization methods...\")\n",
|
|
"\n",
|
|
"norm_results = {}\n",
|
|
"\n",
|
|
"# zscore (fit-time)\n",
|
|
"norm_results['zscore'] = normalize_label(\n",
|
|
" df_merged[return_col],\n",
|
|
" method='zscore',\n",
|
|
" fit_range=CONFIG['fit_range']\n",
|
|
")\n",
|
|
"\n",
|
|
"# cs_zscore (cross-sectional): standardize within each level-0 group.\n",
|
|
"# transform() (not apply()) keeps the result on df_merged's original index;\n",
|
|
"# on pandas >= 2.0 apply() prepends the group key as an extra index level,\n",
|
|
"# which breaks alignment with the other normalized series when blending.\n",
|
|
"norm_results['cs_zscore'] = df_merged.groupby(level=0)[return_col].transform(\n",
|
|
"    lambda x: (x - x.mean()) / (x.std() + 1e-8)\n",
|
|
")\n",
|
|
"\n",
|
|
"# rolling_20\n",
|
|
"norm_results['rolling_20'] = normalize_label(\n",
|
|
" df_merged[return_col],\n",
|
|
" method='rolling',\n",
|
|
" window=20\n",
|
|
")\n",
|
|
"\n",
|
|
"# rolling_60\n",
|
|
"norm_results['rolling_60'] = normalize_label(\n",
|
|
" df_merged[return_col],\n",
|
|
" method='rolling',\n",
|
|
" window=60\n",
|
|
")\n",
|
|
"\n",
|
|
"print(\"Done!\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Compare distributions\n",
|
|
"fig, axes = plt.subplots(2, 2, figsize=(14, 10))\n",
|
|
"axes = axes.flatten()\n",
|
|
"\n",
|
|
"for i, (method, series) in enumerate(norm_results.items()):\n",
|
|
" ax = axes[i]\n",
|
|
" series.dropna().hist(bins=100, ax=ax, edgecolor='black', alpha=0.7)\n",
|
|
" ax.set_title(f'{method}\\nmean={series.mean():.3f}, std={series.std():.3f}')\n",
|
|
" ax.axvline(x=0, color='red', linestyle='--')\n",
|
|
" ax.set_xlim(-5, 5) # Focus on main distribution\n",
|
|
"\n",
|
|
"plt.tight_layout()\n",
|
|
"plt.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 4. Compare Blend Configurations\n",
|
|
"\n",
|
|
"Compare different blending strategies."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Apply each blend configuration\n",
|
|
"blend_results = {}\n",
|
|
"\n",
|
|
"for name in BLEND_CONFIGS.keys():\n",
|
|
" weights = get_blend_weights(name)\n",
|
|
" print(f\"\\nProcessing {name}: {weights}\")\n",
|
|
" \n",
|
|
" # Calculate blended label\n",
|
|
" blended = (\n",
|
|
" weights[0] * norm_results['zscore'] +\n",
|
|
" weights[1] * norm_results['cs_zscore'] +\n",
|
|
" weights[2] * norm_results['rolling_20'] +\n",
|
|
" weights[3] * norm_results['rolling_60']\n",
|
|
" )\n",
|
|
" \n",
|
|
" blend_results[name] = blended\n",
|
|
" print(f\" Mean: {blended.mean():.4f}, Std: {blended.std():.4f}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Visualize all blend distributions.\n",
|
|
"# Size the grid from the number of blend configs so that more than six\n",
|
|
"# configs cannot index past the end of `axes` (5 configs -> 2x3, as before).\n",
|
|
"n_cols = 3\n",
|
|
"n_rows = max(1, -(-len(blend_results) // n_cols))  # ceiling division\n",
|
|
"fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)\n",
|
|
"axes = axes.flatten()\n",
|
|
"\n",
|
|
"for ax, (name, series) in zip(axes, blend_results.items()):\n",
|
|
"    series.dropna().hist(bins=100, ax=ax, edgecolor='black', alpha=0.7)\n",
|
|
"    weights = get_blend_weights(name)\n",
|
|
"    ax.set_title(f'{name}\\nweights={weights}\\nmean={series.mean():.3f}, std={series.std():.3f}')\n",
|
|
"    ax.axvline(x=0, color='red', linestyle='--')\n",
|
|
"    ax.set_xlim(-5, 5)\n",
|
|
"\n",
|
|
"# Hide every subplot that did not receive a histogram (not only the last one)\n",
|
|
"for unused_ax in axes[len(blend_results):]:\n",
|
|
"    unused_ax.axis('off')\n",
|
|
"\n",
|
|
"plt.tight_layout()\n",
|
|
"plt.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 5. Correlation Analysis\n",
|
|
"\n",
|
|
"Check correlations between different normalization methods."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Create comparison DataFrame\n",
|
|
"comparison_df = pd.DataFrame(norm_results)\n",
|
|
"\n",
|
|
"# Add raw returns\n",
|
|
"comparison_df['raw'] = df_merged[return_col]\n",
|
|
"\n",
|
|
"# Calculate correlation matrix\n",
|
|
"corr = comparison_df.corr()\n",
|
|
"\n",
|
|
"# Plot heatmap\n",
|
|
"fig, ax = plt.subplots(figsize=(8, 6))\n",
|
|
"sns.heatmap(corr, annot=True, cmap='RdBu_r', center=0,\n",
|
|
" vmin=-1, vmax=1, ax=ax)\n",
|
|
"ax.set_title('Correlation: Normalization Methods')\n",
|
|
"plt.tight_layout()\n",
|
|
"plt.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Rolling correlation analysis\n",
|
|
"window = 60\n",
|
|
"\n",
|
|
"# Calculate rolling correlation between zscore and cs_zscore.\n",
|
|
"# NOTE(review): an integer rolling window counts consecutive rows of the\n",
|
|
"# series, not calendar days; if there are several rows per level-0 key the\n",
|
|
"# 'd' in the plot title is only approximate - confirm the intended window.\n",
|
|
"rolling_corr = norm_results['zscore'].rolling(window).corr(norm_results['cs_zscore'])\n",
|
|
"\n",
|
|
"# Average within each level-0 key once and plot that series against its own\n",
|
|
"# index, so x and y cannot fall out of step when the index is not sorted\n",
|
|
"# (unique() keeps first-seen order while groupby() sorts its keys).\n",
|
|
"daily_corr = rolling_corr.groupby(level=0).mean()\n",
|
|
"\n",
|
|
"fig, ax = plt.subplots(figsize=(14, 4))\n",
|
|
"ax.plot(daily_corr.index, daily_corr.values)\n",
|
|
"ax.set_title(f'Rolling Correlation: zscore vs cs_zscore ({window}d window)')\n",
|
|
"ax.axhline(y=0.5, color='red', linestyle='--', alpha=0.5)\n",
|
|
"ax.set_ylim(-1, 1)\n",
|
|
"plt.tight_layout()\n",
|
|
"plt.show()"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"name": "python",
|
|
"version": "3.8.0"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
} |