Initial alpha_lab structure

- Notebook-centric experiment framework
- CTA 1D and Stock 15m tasks
- Minimal common utilities
- Manual experiment tracking

master
guofu 3 weeks ago
commit cdf6373325

@ -0,0 +1,14 @@
# Alpha Lab Environment Configuration
# Copy this file to .env and fill in your values
# DolphinDB Configuration
DDB_HOST=192.168.1.146
DDB_PORT=8848
DDB_USERNAME=
DDB_PASSWORD=
# Data Paths
DATA_ROOT=/data/parquet
# Experiment Output
RESULTS_ROOT=/home/guofu/Workspaces/alpha_lab/results

51
.gitignore vendored

@ -0,0 +1,51 @@
# Environment
.env
.venv/
env/
venv/
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
# Jupyter
.ipynb_checkpoints/
*.ipynb_checkpoints
# Results and data
results/*
!results/*/
!results/*/.gitkeep
!results/*/README.md
*.parquet
*.pkl
*.h5
*.feather
# IDE
.vscode/
.idea/
*.swp
*.swo
*~
# OS
.DS_Store
Thumbs.db

@ -0,0 +1,87 @@
# Alpha Lab
Quantitative research experiments for qshare library. This repository contains Jupyter notebooks and analysis scripts for exploring trading strategies and machine learning models.
## Philosophy
- **Notebook-centric**: Experiments are interactive notebooks, not rigid scripts
- **Minimal abstraction**: Simple functions over complex class hierarchies
- **Self-contained**: Each task directory is independent
- **Ad-hoc friendly**: Easy to modify for exploration
## Structure
```
alpha_lab/
├── common/ # Shared utilities (keep minimal!)
│ ├── paths.py # Path management
│ └── plotting.py # Common plotting functions
├── cta_1d/ # CTA 1-day return prediction
│ ├── 01_data_check.ipynb
│ ├── 02_label_analysis.ipynb
│ ├── 03_baseline_xgb.ipynb
│ ├── 04_blend_comparison.ipynb
│ └── src/ # Task-specific helpers
├── stock_15m/ # Stock 15-minute return prediction
│ ├── 01_data_exploration.ipynb
│ ├── 02_baseline_model.ipynb
│ └── src/
└── results/ # Output directory (gitignored)
├── cta_1d/
└── stock_15m/
```
## Setup
```bash
# Install dependencies
pip install -r requirements.txt
# Create environment file
cp .env.template .env
# Edit .env with your settings
```
## Usage
Start Jupyter and run notebooks interactively:
```bash
jupyter notebook
```
Each task directory contains numbered notebooks:
- `01_*.ipynb` - Data loading and exploration
- `02_*.ipynb` - Analysis and baseline models
- `03_*.ipynb` - Advanced experiments
- `04_*.ipynb` - Comparisons and ablations
## Experiment Tracking
Experiments are tracked manually in `results/{task}/README.md`:
```markdown
## 2025-01-15: Baseline XGB
- Notebook: `cta_1d/03_baseline_xgb.ipynb` (cells 1-50)
- Config: eta=0.5, lambda=0.1
- Train IC: 0.042
- Test IC: 0.038
- Notes: Dual normalization, 4 trades/day
```
## Adding a New Task
1. Create directory: `mkdir my_task`
2. Add `src/` subdirectory for helpers
3. Create numbered notebooks
4. Add entry to `results/my_task/README.md`
## Best Practices
1. **Keep it simple**: Only promote code into `common/` once the same snippet has been copied into 3+ notebooks or tasks
2. **Notebook configs**: Define CONFIG dict in first cell for easy modification
3. **Document results**: Update results README after significant runs
4. **Git discipline**: Don't commit large files, results, or credentials

@ -0,0 +1,13 @@
"""Common utilities for alpha_lab experiments."""
from .paths import ensure_dir, get_results_dir, get_task_results_dir
from .plotting import setup_plot_style, plot_ic_series, plot_cumulative_returns
__all__ = [
'ensure_dir',
'get_results_dir',
'get_task_results_dir',
'setup_plot_style',
'plot_ic_series',
'plot_cumulative_returns',
]

@ -0,0 +1,42 @@
"""Path utilities for experiment outputs."""
import os
from pathlib import Path
from datetime import datetime
# Base directories
BASE_DIR = Path(__file__).parent.parent

# Results live under the repo by default; a RESULTS_ROOT entry in the
# environment (see .env.template, which documents RESULTS_ROOT) overrides
# the location. Previously the env var was documented but never read.
_results_root = os.environ.get("RESULTS_ROOT")
RESULTS_DIR = Path(_results_root) if _results_root else BASE_DIR / "results"
def ensure_dir(path: Path) -> Path:
    """Guarantee that *path* exists as a directory and return it unchanged."""
    if not path.is_dir():
        # Create the whole chain of missing parents in one call.
        path.mkdir(parents=True, exist_ok=True)
    return path
def get_results_dir() -> Path:
    """Return the top-level results directory, creating it on first use."""
    base = RESULTS_DIR
    return ensure_dir(base)
def get_task_results_dir(task_name: str) -> Path:
    """Return the per-task results directory, creating it if needed."""
    task_dir = RESULTS_DIR / task_name
    return ensure_dir(task_dir)
def create_experiment_dir(task_name: str, experiment_name: str | None = None) -> Path:
    """Create and return a per-experiment output directory.

    Args:
        task_name: Name of the task (e.g., 'cta_1d', 'stock_15m')
        experiment_name: Optional experiment name (default: timestamp)

    Returns:
        Path to the created directory
    """
    # Fall back to a sortable timestamp when no explicit name was given.
    name = (
        experiment_name
        if experiment_name is not None
        else datetime.now().strftime('%Y%m%d_%H%M%S')
    )
    return ensure_dir(RESULTS_DIR / task_name / name)

@ -0,0 +1,119 @@
"""Common plotting utilities for experiments."""
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
def setup_plot_style():
    """Apply the lab's shared matplotlib/seaborn plotting defaults."""
    plt.style.use('seaborn-v0_8-whitegrid')
    sns.set_palette("husl")
    # Batch the rcParams changes instead of assigning them one by one.
    plt.rcParams.update({'figure.figsize': (12, 6), 'font.size': 10})
def plot_ic_series(ic_by_date: pd.Series, title: str = "IC Over Time",
                   figsize: tuple = (14, 4)) -> plt.Figure:
    """Draw the daily IC series together with its 20-day moving average.

    Args:
        ic_by_date: Series with datetime index and IC values
        title: Plot title
        figsize: Figure size

    Returns:
        Matplotlib figure
    """
    fig, ax = plt.subplots(figsize=figsize)
    # Raw daily values in light gray so the smoothed line stands out.
    ax.plot(ic_by_date.index, ic_by_date.values, alpha=0.5, color='gray', label='Daily IC')
    # 20-day rolling mean (needs at least 5 observations per window).
    smoothed = ic_by_date.rolling(20, min_periods=5).mean()
    ax.plot(smoothed.index, smoothed.values, color='blue', linewidth=2, label='20-day MA')
    # Horizontal reference lines: overall mean and zero.
    overall_mean = ic_by_date.mean()
    ax.axhline(y=overall_mean, color='red', linestyle='--',
               label=f'Mean IC: {overall_mean:.4f}')
    ax.axhline(y=0, color='black', linestyle='-', alpha=0.3)
    ax.set_title(title)
    ax.set_xlabel('Date')
    ax.set_ylabel('Information Coefficient')
    ax.legend(loc='upper right')
    plt.tight_layout()
    return fig
def plot_cumulative_returns(returns: pd.Series, title: str = "Cumulative Returns",
                            figsize: tuple = (12, 6)) -> plt.Figure:
    """Plot compounded cumulative returns on a log scale.

    Args:
        returns: Series with datetime index and daily simple returns
        title: Plot title
        figsize: Figure size

    Returns:
        Matplotlib figure
    """
    fig, ax = plt.subplots(figsize=figsize)
    cumulative = (1 + returns).cumprod()
    ax.plot(cumulative.index, cumulative.values, linewidth=1.5)
    ax.set_title(title)
    ax.set_xlabel('Date')
    ax.set_ylabel('Cumulative Return')
    # NOTE(review): log scale will break if the compounded equity ever
    # reaches <= 0 (i.e., a -100% or worse period) — confirm inputs.
    ax.set_yscale('log')
    # Annotate the final return next to the last point. Guarded so an
    # empty input series no longer raises IndexError on iloc[-1].
    if len(cumulative) > 0:
        final_return = cumulative.iloc[-1] - 1
        ax.annotate(f'{final_return:.2%}',
                    xy=(cumulative.index[-1], cumulative.iloc[-1]),
                    xytext=(10, 0), textcoords='offset points',
                    fontsize=10, color='green' if final_return > 0 else 'red')
    plt.tight_layout()
    return fig
def plot_factor_distribution(factor: pd.Series, title: str = "Factor Distribution",
                             figsize: tuple = (10, 6)) -> plt.Figure:
    """Show a factor's histogram, normal Q-Q plot, and moment statistics.

    Args:
        factor: Series of factor values (NaNs are dropped before plotting)
        title: Plot title
        figsize: Figure size

    Returns:
        Matplotlib figure
    """
    from scipy import stats

    fig, (hist_ax, qq_ax) = plt.subplots(1, 2, figsize=figsize)
    clean = factor.dropna()

    # Left panel: histogram of the non-null factor values.
    hist_ax.hist(clean, bins=100, alpha=0.7, edgecolor='black')
    hist_ax.set_title(f'{title} - Distribution')
    hist_ax.set_xlabel('Value')
    hist_ax.set_ylabel('Frequency')

    # Right panel: Q-Q plot against a normal distribution.
    stats.probplot(clean, dist="norm", plot=qq_ax)
    qq_ax.set_title(f'{title} - Q-Q Plot')

    # Overlay the first four moments on the histogram panel.
    summary = (
        f"Mean: {factor.mean():.4f}\nStd: {factor.std():.4f}\n"
        f"Skew: {factor.skew():.4f}\nKurt: {factor.kurtosis():.4f}"
    )
    hist_ax.text(0.95, 0.95, summary, transform=hist_ax.transAxes,
                 verticalalignment='top', horizontalalignment='right',
                 bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
    plt.tight_layout()
    return fig

@ -0,0 +1,277 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# CTA 1D Data Check\n",
"\n",
"Load and validate CTA futures data.\n",
"\n",
"**Purpose**: Verify data availability, check basic statistics, and understand data structure before modeling."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"\n",
"from qshare.data.pandas.cta_1d import load_dataset\n",
"from qshare.io.ddb.cta import load_cta_alpha158, load_cta_hffactors, load_cta_returns\n",
"\n",
"import sys\n",
"sys.path.insert(0, '../')\n",
"from common.plotting import setup_plot_style\n",
"\n",
"setup_plot_style()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Configuration\n",
"\n",
"Modify these parameters as needed for your data check."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"CONFIG = {\n",
" 'dt_range': ['2020-01-01', '2024-12-31'],\n",
" 'feature_sets': ['alpha158', 'hffactor'],\n",
" 'return_type': 'o2c_twap1min', # or 'o2o_twap1min'\n",
" 'normalization': 'dual',\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Load Features Separately\n",
"\n",
"Check each feature set independently."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load alpha158 features\n",
"print(\"Loading alpha158 features...\")\n",
"df_alpha158 = load_cta_alpha158(\n",
" since_date=CONFIG['dt_range'][0],\n",
" end_date=CONFIG['dt_range'][1],\n",
")\n",
"print(f\"alpha158 shape: {df_alpha158.shape}\")\n",
"print(f\"\")\n",
"print(f\"Columns: {list(df_alpha158.columns[:10])}...\") # First 10 columns\n",
"print(f\"\")\n",
"print(f\"Date range: {df_alpha158.index.get_level_values(0).min()} to {df_alpha158.index.get_level_values(0).max()}\")\n",
"print(f\"Instruments: {df_alpha158.index.get_level_values(1).nunique()}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load HF factors\n",
"print(\"Loading hffactor features...\")\n",
"df_hf = load_cta_hffactors(\n",
" since_date=CONFIG['dt_range'][0],\n",
" end_date=CONFIG['dt_range'][1],\n",
")\n",
"print(f\"hffactor shape: {df_hf.shape}\")\n",
"print(f\"\")\n",
"print(f\"Columns: {list(df_hf.columns[:10])}...\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Load Returns (Labels)\n",
"\n",
"Check return indicators that will be used as prediction targets."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load return indicators\n",
"print(\"Loading return indicators...\")\n",
"df_returns = load_cta_returns(\n",
" since_date=CONFIG['dt_range'][0],\n",
" end_date=CONFIG['dt_range'][1],\n",
")\n",
"print(f\"Returns shape: {df_returns.shape}\")\n",
"print(f\"\")\n",
"print(f\"Available return types:\")\n",
"for col in df_returns.columns:\n",
" print(f\" - {col}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check specific return type\n",
"return_col = CONFIG['return_type']\n",
"if return_col in df_returns.columns:\n",
" print(f\"\\n{return_col} statistics:\")\n",
" print(df_returns[return_col].describe())\n",
" \n",
" # Plot distribution\n",
" fig, ax = plt.subplots(figsize=(10, 4))\n",
" df_returns[return_col].hist(bins=100, ax=ax, edgecolor='black')\n",
" ax.set_title(f'{return_col} Distribution')\n",
" ax.axvline(x=0, color='red', linestyle='--')\n",
" plt.show()\n",
"else:\n",
" print(f\"Warning: {return_col} not found in returns data\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. Load Full Dataset\n",
"\n",
"Load the complete training dataset with features and labels."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load full dataset\n",
"print(\"Loading full dataset...\")\n",
"df_full = load_dataset(\n",
" dt_range=CONFIG['dt_range'],\n",
" return_type=CONFIG['return_type'],\n",
" normalization=CONFIG['normalization'],\n",
" feature_sets=CONFIG['feature_sets'],\n",
")\n",
"\n",
"print(f\"\\nFull dataset shape: {df_full.shape}\")\n",
"print(f\"\")\n",
"print(f\"Columns: {len(df_full.columns)} total\")\n",
"print(f\" - Features: {len([c for c in df_full.columns if c.startswith(('alpha158_', 'hf_'))])}\")\n",
"print(f\" - Label: 'label'\")\n",
"print(f\" - Weight: 'weight'\")\n",
"print(f\" - Return: 'return'\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check for missing values\n",
"missing = df_full.isnull().sum()\n",
"missing_cols = missing[missing > 0]\n",
"\n",
"if len(missing_cols) > 0:\n",
" print(f\"\\nColumns with missing values:\")\n",
" print(missing_cols.head(10))\n",
"else:\n",
" print(\"\\nNo missing values found!\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Label statistics\n",
"print(\"\\nLabel statistics:\")\n",
"print(df_full['label'].describe())\n",
"\n",
"fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n",
"\n",
"# Distribution\n",
"df_full['label'].hist(bins=100, ax=axes[0], edgecolor='black')\n",
"axes[0].set_title('Label Distribution')\n",
"axes[0].axvline(x=0, color='red', linestyle='--')\n",
"\n",
"# Time series of mean label by date\n",
"label_by_date = df_full.groupby(level=0)['label'].mean()\n",
"axes[1].plot(label_by_date.index, label_by_date.values)\n",
"axes[1].set_title('Mean Label by Date')\n",
"axes[1].axhline(y=0, color='red', linestyle='--')\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5. Summary\n",
"\n",
"Check data availability by instrument and date."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Data availability heatmap\n",
"available = df_full.groupby([df_full.index.get_level_values(0).date, df_full.index.get_level_values(1)]).size().unstack(fill_value=0)\n",
"available = (available > 0).astype(int)\n",
"\n",
"print(f\"Data availability: {available.sum().sum()} instrument-date pairs\")\n",
"print(f\"Instruments: {len(available.columns)}\")\n",
"print(f\"Dates: {len(available.index)}\")\n",
"\n",
"# Plot coverage\n",
"fig, ax = plt.subplots(figsize=(14, 6))\n",
"im = ax.imshow(available.T.values, aspect='auto', cmap='RdYlGn', interpolation='nearest')\n",
"ax.set_title('Data Availability (Green=Available, Red=Missing)')\n",
"ax.set_xlabel('Time')\n",
"ax.set_ylabel('Instrument')\n",
"plt.colorbar(im, ax=ax)\n",
"plt.tight_layout()\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

@ -0,0 +1,319 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# CTA 1D Label Analysis\n",
"\n",
"Explore label distributions and compare different normalization blending strategies.\n",
"\n",
"**Purpose**: Understand how different normalization methods affect label distributions and identify optimal blending."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"from qshare.data.pandas.cta_1d.dataset import load_features, load_label\n",
"from qshare.data.pandas.cta_1d.label import normalize_label_dual, normalize_label\n",
"from qshare.io.ddb.cta import load_cta_returns\n",
"\n",
"import sys\n",
"sys.path.insert(0, '../')\n",
"from common.plotting import setup_plot_style\n",
"from src.labels import BLEND_CONFIGS, get_blend_weights, describe_blend_config\n",
"\n",
"setup_plot_style()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Configuration"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"CONFIG = {\n",
" 'dt_range': ['2020-01-01', '2024-12-31'],\n",
" 'fit_range': ['2020-01-01', '2021-12-31'], # For zscore normalization\n",
" 'return_type': 'o2c_twap1min',\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Load Raw Returns\n",
"\n",
"Load the raw return series before any normalization."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load returns\n",
"print(\"Loading raw returns...\")\n",
"df_returns = load_cta_returns(\n",
" since_date=CONFIG['dt_range'][0],\n",
" end_date=CONFIG['dt_range'][1],\n",
")\n",
"\n",
"return_col = CONFIG['return_type']\n",
"raw_returns = df_returns[return_col].copy()\n",
"\n",
"print(f\"\\nRaw {return_col} returns:\")\n",
"print(raw_returns.describe())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Plot raw return distribution\n",
"fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n",
"\n",
"# Histogram\n",
"raw_returns.hist(bins=100, ax=axes[0], edgecolor='black')\n",
"axes[0].set_title(f'Raw {return_col} Distribution')\n",
"axes[0].axvline(x=0, color='red', linestyle='--')\n",
"\n",
"# Time series\n",
"daily_mean = raw_returns.groupby(level=0).mean()\n",
"axes[1].plot(daily_mean.index, daily_mean.values)\n",
"axes[1].set_title('Daily Mean Return')\n",
"axes[1].axhline(y=0, color='red', linestyle='--')\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Compare Normalization Methods\n",
"\n",
"Apply each normalization method individually and compare."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load dominant contract mapping for proper label construction\n",
"from qshare.io.ddb.cta import load_cta_dominant_contracts\n",
"\n",
"print(\"Loading dominant contract mapping...\")\n",
"df_dominant = load_cta_dominant_contracts(\n",
" since_date=CONFIG['dt_range'][0],\n",
" end_date=CONFIG['dt_range'][1],\n",
")\n",
"\n",
"# Merge returns with dominant mapping\n",
"df_merged = df_dominant.join(raw_returns, how='left')\n",
"\n",
"# Calculate different normalization methods\n",
"print(\"\\nApplying normalization methods...\")\n",
"\n",
"norm_results = {}\n",
"\n",
"# zscore (fit-time)\n",
"norm_results['zscore'] = normalize_label(\n",
" df_merged[return_col],\n",
" method='zscore',\n",
" fit_range=CONFIG['fit_range']\n",
")\n",
"\n",
"# cs_zscore (cross-sectional)\n",
"norm_results['cs_zscore'] = df_merged.groupby(level=0)[return_col].apply(\n",
" lambda x: (x - x.mean()) / (x.std() + 1e-8)\n",
")\n",
"\n",
"# rolling_20\n",
"norm_results['rolling_20'] = normalize_label(\n",
" df_merged[return_col],\n",
" method='rolling',\n",
" window=20\n",
")\n",
"\n",
"# rolling_60\n",
"norm_results['rolling_60'] = normalize_label(\n",
" df_merged[return_col],\n",
" method='rolling',\n",
" window=60\n",
")\n",
"\n",
"print(\"Done!\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Compare distributions\n",
"fig, axes = plt.subplots(2, 2, figsize=(14, 10))\n",
"axes = axes.flatten()\n",
"\n",
"for i, (method, series) in enumerate(norm_results.items()):\n",
" ax = axes[i]\n",
" series.dropna().hist(bins=100, ax=ax, edgecolor='black', alpha=0.7)\n",
" ax.set_title(f'{method}\\nmean={series.mean():.3f}, std={series.std():.3f}')\n",
" ax.axvline(x=0, color='red', linestyle='--')\n",
" ax.set_xlim(-5, 5) # Focus on main distribution\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. Compare Blend Configurations\n",
"\n",
"Compare different blending strategies."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Apply each blend configuration\n",
"blend_results = {}\n",
"\n",
"for name in BLEND_CONFIGS.keys():\n",
" weights = get_blend_weights(name)\n",
" print(f\"\\nProcessing {name}: {weights}\")\n",
" \n",
" # Calculate blended label\n",
" blended = (\n",
" weights[0] * norm_results['zscore'] +\n",
" weights[1] * norm_results['cs_zscore'] +\n",
" weights[2] * norm_results['rolling_20'] +\n",
" weights[3] * norm_results['rolling_60']\n",
" )\n",
" \n",
" blend_results[name] = blended\n",
" print(f\" Mean: {blended.mean():.4f}, Std: {blended.std():.4f}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Visualize all blend distributions\n",
"fig, axes = plt.subplots(2, 3, figsize=(15, 10))\n",
"axes = axes.flatten()\n",
"\n",
"for i, (name, series) in enumerate(blend_results.items()):\n",
" ax = axes[i]\n",
" series.dropna().hist(bins=100, ax=ax, edgecolor='black', alpha=0.7)\n",
" weights = get_blend_weights(name)\n",
" ax.set_title(f'{name}\\nweights={weights}\\nmean={series.mean():.3f}, std={series.std():.3f}')\n",
" ax.axvline(x=0, color='red', linestyle='--')\n",
" ax.set_xlim(-5, 5)\n",
"\n",
"# Hide last subplot if not used\n",
"if len(blend_results) < 6:\n",
" axes[-1].axis('off')\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5. Correlation Analysis\n",
"\n",
"Check correlations between different normalization methods."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create comparison DataFrame\n",
"comparison_df = pd.DataFrame(norm_results)\n",
"\n",
"# Add raw returns\n",
"comparison_df['raw'] = df_merged[return_col]\n",
"\n",
"# Calculate correlation matrix\n",
"corr = comparison_df.corr()\n",
"\n",
"# Plot heatmap\n",
"fig, ax = plt.subplots(figsize=(8, 6))\n",
"sns.heatmap(corr, annot=True, cmap='RdBu_r', center=0,\n",
" vmin=-1, vmax=1, ax=ax)\n",
"ax.set_title('Correlation: Normalization Methods')\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Rolling correlation analysis\n",
"window = 60\n",
"\n",
"# Calculate rolling correlation between zscore and cs_zscore\n",
"rolling_corr = norm_results['zscore'].rolling(window).corr(norm_results['cs_zscore'])\n",
"\n",
"fig, ax = plt.subplots(figsize=(14, 4))\n",
"ax.plot(rolling_corr.index.get_level_values(0).unique(), rolling_corr.groupby(level=0).mean())\n",
"ax.set_title(f'Rolling Correlation: zscore vs cs_zscore ({window}d window)')\n",
"ax.axhline(y=0.5, color='red', linestyle='--', alpha=0.5)\n",
"ax.set_ylim(-1, 1)\n",
"plt.tight_layout()\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

@ -0,0 +1,356 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# CTA 1D Baseline XGBoost Model\n",
"\n",
"Train and evaluate a baseline XGBoost model for CTA 1-day return prediction.\n",
"\n",
"**Purpose**: Establish a baseline performance benchmark with standard configuration."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import json\n",
"from datetime import datetime\n",
"\n",
"from qshare.data.pandas.cta_1d import load_dataset\n",
"from qshare.algo.learning.cta_trainer import CTAXGBTrainer\n",
"from qshare.eval.cta.backtest import CTABacktester\n",
"\n",
"import sys\n",
"sys.path.insert(0, '../')\n",
"from common.plotting import setup_plot_style, plot_ic_series, plot_cumulative_returns\n",
"from common.paths import create_experiment_dir\n",
"from src.labels import get_blend_weights, describe_blend_config\n",
"\n",
"setup_plot_style()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Configuration\n",
"\n",
"Edit this cell to modify experiment parameters."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"CONFIG = {\n",
" # Experiment\n",
" 'experiment_name': 'baseline_xgb', # Will be appended with timestamp\n",
" \n",
" # Date ranges\n",
" 'dt_range': ['2020-01-01', '2024-12-31'],\n",
" 'train_range': ['2020-01-01', '2022-12-31'],\n",
" 'test_range': ['2023-01-01', '2024-12-31'],\n",
" 'fit_range': ['2020-01-01', '2021-06-30'], # For normalization fitting\n",
" \n",
" # Data\n",
" 'feature_sets': ['alpha158', 'hffactor'],\n",
" 'return_type': 'o2c_twap1min',\n",
" 'normalization': 'dual',\n",
" 'blend_weights': None, # Use default [0.2, 0.1, 0.3, 0.4] or specify name/list\n",
" 'weight_factors': {'positive': 1.0, 'negative': 2.0},\n",
" \n",
" # Model\n",
" 'xgb_params': {\n",
" 'booster': 'gblinear',\n",
" 'eta': 0.5,\n",
" 'lambda_reg': 0.1,\n",
" 'num_round': 20,\n",
" },\n",
" \n",
" # Backtest\n",
" 'backtest_params': {\n",
" 'num_trades': 4,\n",
" 'signal_dist': 'normal',\n",
" 'pos_weight': True,\n",
" },\n",
" \n",
" # Output\n",
" 'save_results': True,\n",
"}\n",
"\n",
"print(\"Configuration:\")\n",
"print(f\" Experiment: {CONFIG['experiment_name']}\")\n",
"print(f\" Train: {CONFIG['train_range'][0]} to {CONFIG['train_range'][1]}\")\n",
"print(f\" Test: {CONFIG['test_range'][0]} to {CONFIG['test_range'][1]}\")\n",
"print(f\" Blend: {describe_blend_config(CONFIG['blend_weights'] or 'default')}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Load Dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"Loading dataset...\")\n",
"df_full = load_dataset(\n",
" dt_range=CONFIG['dt_range'],\n",
" return_type=CONFIG['return_type'],\n",
" normalization=CONFIG['normalization'],\n",
" feature_sets=CONFIG['feature_sets'],\n",
" fit_range=CONFIG['fit_range'],\n",
" weight_factors=CONFIG['weight_factors'],\n",
" blend_weights=CONFIG['blend_weights'],\n",
")\n",
"\n",
"print(f\"\\nDataset shape: {df_full.shape}\")\n",
"print(f\"Columns: {len(df_full.columns)}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Split train/test\n",
"df_train = df_full.loc[CONFIG['train_range'][0]:CONFIG['train_range'][1]]\n",
"df_test = df_full.loc[CONFIG['test_range'][0]:CONFIG['test_range'][1]]\n",
"\n",
"print(f\"Train: {df_train.shape}\")\n",
"print(f\"Test: {df_test.shape}\")\n",
"\n",
"# Get feature columns\n",
"feature_cols = [c for c in df_train.columns\n",
" if c.startswith(('alpha158_', 'hf_', 'f_'))]\n",
"print(f\"\\nFeatures: {len(feature_cols)}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Train Model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"Training XGBoost model...\")\n",
"print(f\" Params: {CONFIG['xgb_params']}\")\n",
"\n",
"trainer = CTAXGBTrainer(**CONFIG['xgb_params'])\n",
"\n",
"trainer.fit(\n",
" df_train,\n",
" feature_cols=feature_cols,\n",
" target_col='label',\n",
" weight_col='weight'\n",
")\n",
"\n",
"print(\"\\nTraining complete!\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Feature importance\n",
"importance = trainer.get_feature_importance()\n",
"print(\"\\nTop 10 Features:\")\n",
"print(importance.head(10))\n",
"\n",
"# Plot\n",
"fig, ax = plt.subplots(figsize=(10, 6))\n",
"importance.head(20).plot(kind='barh', ax=ax)\n",
"ax.set_title('Top 20 Feature Importance')\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. Generate Predictions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"Generating predictions on test set...\")\n",
"df_signal = trainer.predict(df_test)\n",
"\n",
"print(f\"\\nSignal statistics:\")\n",
"print(df_signal.describe())\n",
"\n",
"# Plot signal distribution\n",
"fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n",
"\n",
"df_signal.hist(bins=100, ax=axes[0], edgecolor='black')\n",
"axes[0].set_title('Signal Distribution')\n",
"axes[0].axvline(x=0, color='red', linestyle='--')\n",
"\n",
"signal_by_date = df_signal.groupby(level=0).mean()\n",
"axes[1].plot(signal_by_date.index, signal_by_date.values)\n",
"axes[1].set_title('Mean Signal by Date')\n",
"axes[1].axhline(y=0, color='red', linestyle='--')\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5. Evaluate with Backtest"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"Running backtest...\")\n",
"\n",
"returns = df_test['return'] if 'return' in df_test.columns else df_test['label']\n",
"\n",
"backtester = CTABacktester(**CONFIG['backtest_params'])\n",
"results = backtester.run(returns, df_signal)\n",
"\n",
"summary = backtester.summary()\n",
"print(\"\\nBacktest Summary:\")\n",
"for key, value in summary.items():\n",
" if isinstance(value, float):\n",
" print(f\" {key}: {value:.4f}\")\n",
" else:\n",
" print(f\" {key}: {value}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# IC Analysis\n",
"ic_by_date = results.groupby(results.index.get_level_values(0))['ic'].mean()\n",
"\n",
"fig = plot_ic_series(ic_by_date, title=\"IC Over Time (Test Set)\")\n",
"plt.show()\n",
"\n",
"print(f\"\\nIC Statistics:\")\n",
"print(f\" Mean: {ic_by_date.mean():.4f}\")\n",
"print(f\" Std: {ic_by_date.std():.4f}\")\n",
"print(f\" IR: {ic_by_date.mean() / ic_by_date.std():.4f}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Cumulative returns\n",
"daily_returns = results.groupby(results.index.get_level_values(0))['pos_return'].mean()\n",
"\n",
"fig = plot_cumulative_returns(daily_returns, title=\"Cumulative Strategy Returns\")\n",
"plt.show()\n",
"\n",
"total_return = (1 + daily_returns).prod() - 1\n",
"annual_return = (1 + total_return) ** (252 / len(daily_returns)) - 1\n",
"sharpe = daily_returns.mean() / daily_returns.std() * np.sqrt(252)\n",
"\n",
"print(f\"\\nReturn Statistics:\")\n",
"print(f\" Total Return: {total_return:.2%}\")\n",
"print(f\" Annual Return: {annual_return:.2%}\")\n",
"print(f\" Sharpe Ratio: {sharpe:.2f}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 6. Save Results\n",
"\n",
"Save model, predictions, and metrics for later analysis."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"if CONFIG['save_results']:\n",
" # Create output directory\n",
" output_dir = create_experiment_dir('cta_1d', CONFIG['experiment_name'])\n",
" print(f\"Saving results to: {output_dir}\")\n",
" \n",
" # Save config\n",
" with open(output_dir / 'config.json', 'w') as f:\n",
" json.dump(CONFIG, f, indent=2, default=str)\n",
" \n",
" # Save model\n",
" trainer.save_model(str(output_dir / 'model.pkl'))\n",
" \n",
" # Save feature importance\n",
" importance.to_csv(output_dir / 'feature_importance.csv')\n",
" \n",
" # Save predictions\n",
" df_signal.to_csv(output_dir / 'predictions.csv')\n",
" \n",
" # Save backtest results\n",
" results.to_csv(output_dir / 'backtest_results.csv')\n",
" \n",
" # Save summary\n",
" with open(output_dir / 'summary.json', 'w') as f:\n",
" json.dump(summary, f, indent=2, default=str)\n",
" \n",
" print(\"\\nFiles saved:\")\n",
" for f in output_dir.iterdir():\n",
" print(f\" - {f.name}\")\n",
"else:\n",
" print(\"Results not saved (save_results=False)\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

@ -0,0 +1,439 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# CTA 1D Blend Comparison\n",
"\n",
"Compare model performance across different label blending configurations.\n",
"\n",
"**Purpose**: Identify the optimal normalization blend for the CTA 1-day prediction task."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"from qshare.data.pandas.cta_1d import load_dataset\n",
"from qshare.algo.learning.cta_trainer import CTAXGBTrainer\n",
"from qshare.eval.cta.backtest import CTABacktester\n",
"\n",
"import sys\n",
"sys.path.insert(0, '../')\n",
"from common.plotting import setup_plot_style, plot_ic_series\n",
"from src.labels import BLEND_CONFIGS, get_blend_weights\n",
"\n",
"setup_plot_style()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Configuration\n",
"\n",
"Define base configuration shared across all blend experiments."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"BASE_CONFIG = {\n",
" # Date ranges\n",
" 'dt_range': ['2020-01-01', '2024-12-31'],\n",
" 'train_range': ['2020-01-01', '2022-12-31'],\n",
" 'test_range': ['2023-01-01', '2024-12-31'],\n",
" 'fit_range': ['2020-01-01', '2021-06-30'],\n",
" \n",
" # Data\n",
" 'feature_sets': ['alpha158', 'hffactor'],\n",
" 'return_type': 'o2c_twap1min',\n",
" 'normalization': 'dual',\n",
" 'weight_factors': {'positive': 1.0, 'negative': 2.0},\n",
" \n",
" # Model (fixed for fair comparison)\n",
" 'xgb_params': {\n",
" 'booster': 'gblinear',\n",
" 'eta': 0.5,\n",
" 'lambda_reg': 0.1,\n",
" 'num_round': 20,\n",
" },\n",
" \n",
" # Backtest\n",
" 'backtest_params': {\n",
" 'num_trades': 4,\n",
" 'signal_dist': 'normal',\n",
" 'pos_weight': True,\n",
" },\n",
"}\n",
"\n",
"print(\"Blend configurations to compare:\")\n",
"for name, weights in BLEND_CONFIGS.items():\n",
" print(f\" {name}: {weights}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Run Experiments\n",
"\n",
"Train and evaluate a model for each blend configuration."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def run_single_experiment(blend_name, blend_weights):\n",
" \"\"\"Run experiment with specific blend configuration.\"\"\"\n",
" print(f\"\\n{'='*60}\")\n",
" print(f\"Running: {blend_name}\")\n",
" print(f\"Weights: {blend_weights}\")\n",
" print(f\"{'='*60}\")\n",
" \n",
" # Load data\n",
" df_full = load_dataset(\n",
" dt_range=BASE_CONFIG['dt_range'],\n",
" return_type=BASE_CONFIG['return_type'],\n",
" normalization=BASE_CONFIG['normalization'],\n",
" feature_sets=BASE_CONFIG['feature_sets'],\n",
" fit_range=BASE_CONFIG['fit_range'],\n",
" weight_factors=BASE_CONFIG['weight_factors'],\n",
" blend_weights=blend_weights,\n",
" )\n",
" \n",
" # Split\n",
" df_train = df_full.loc[BASE_CONFIG['train_range'][0]:BASE_CONFIG['train_range'][1]]\n",
" df_test = df_full.loc[BASE_CONFIG['test_range'][0]:BASE_CONFIG['test_range'][1]]\n",
" \n",
" # Features\n",
" feature_cols = [c for c in df_train.columns\n",
" if c.startswith(('alpha158_', 'hf_', 'f_'))]\n",
" \n",
" # Train\n",
" trainer = CTAXGBTrainer(**BASE_CONFIG['xgb_params'])\n",
" trainer.fit(\n",
" df_train,\n",
" feature_cols=feature_cols,\n",
" target_col='label',\n",
" weight_col='weight'\n",
" )\n",
" \n",
" # Predict\n",
" df_signal = trainer.predict(df_test)\n",
" \n",
" # Backtest\n",
" returns = df_test['return'] if 'return' in df_test.columns else df_test['label']\n",
" backtester = CTABacktester(**BASE_CONFIG['backtest_params'])\n",
" results = backtester.run(returns, df_signal)\n",
" \n",
" # Metrics\n",
" summary = backtester.summary()\n",
" ic_by_date = results.groupby(results.index.get_level_values(0))['ic'].mean()\n",
" \n",
" return {\n",
" 'name': blend_name,\n",
" 'weights': blend_weights,\n",
" 'summary': summary,\n",
" 'ic_by_date': ic_by_date,\n",
" 'results': results,\n",
" 'importance': trainer.get_feature_importance(),\n",
" }\n",
"\n",
"# Run all experiments\n",
"all_results = []\n",
"for name, weights in BLEND_CONFIGS.items():\n",
"    result = run_single_experiment(name, weights)\n",
" all_results.append(result)\n",
" \n",
"print(\"\\n\\nAll experiments complete!\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Results Summary"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create comparison table\n",
"comparison_data = []\n",
"for r in all_results:\n",
" ic_mean = r['ic_by_date'].mean()\n",
" ic_std = r['ic_by_date'].std()\n",
" comparison_data.append({\n",
" 'Blend': r['name'],\n",
" 'Weights': str(r['weights']),\n",
" 'IC Mean': ic_mean,\n",
" 'IC Std': ic_std,\n",
" 'IR': ic_mean / ic_std if ic_std > 0 else 0,\n",
" 'Return': r['summary'].get('return', np.nan),\n",
" 'Sharpe': r['summary'].get('sharpe', np.nan),\n",
" 'Turnover': r['summary'].get('turnover', np.nan),\n",
" })\n",
"\n",
"df_comparison = pd.DataFrame(comparison_data)\n",
"\n",
"# Sort by IC Mean\n",
"df_comparison = df_comparison.sort_values('IC Mean', ascending=False)\n",
"\n",
"print(\"Comparison Summary (sorted by IC Mean):\")\n",
"print(df_comparison.to_string(index=False))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Visual comparison\n",
"fig, axes = plt.subplots(2, 2, figsize=(14, 10))\n",
"\n",
"# IC Mean\n",
"axes[0, 0].barh(df_comparison['Blend'], df_comparison['IC Mean'])\n",
"axes[0, 0].set_title('IC Mean')\n",
"axes[0, 0].axvline(x=0, color='red', linestyle='--', alpha=0.5)\n",
"\n",
"# Information Ratio\n",
"axes[0, 1].barh(df_comparison['Blend'], df_comparison['IR'])\n",
"axes[0, 1].set_title('Information Ratio')\n",
"axes[0, 1].axvline(x=0, color='red', linestyle='--', alpha=0.5)\n",
"\n",
"# Return\n",
"axes[1, 0].barh(df_comparison['Blend'], df_comparison['Return'])\n",
"axes[1, 0].set_title('Return')\n",
"axes[1, 0].axvline(x=0, color='red', linestyle='--', alpha=0.5)\n",
"\n",
"# Sharpe\n",
"axes[1, 1].barh(df_comparison['Blend'], df_comparison['Sharpe'])\n",
"axes[1, 1].set_title('Sharpe Ratio')\n",
"axes[1, 1].axvline(x=0, color='red', linestyle='--', alpha=0.5)\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. IC Time Series Comparison"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Plot IC series for all configurations\n",
"fig, ax = plt.subplots(figsize=(16, 6))\n",
"\n",
"for r in all_results:\n",
" ic_rolling = r['ic_by_date'].rolling(20, min_periods=5).mean()\n",
" ax.plot(ic_rolling.index, ic_rolling.values, label=r['name'], alpha=0.8)\n",
"\n",
"ax.axhline(y=0, color='black', linestyle='-', alpha=0.3)\n",
"ax.set_title('Rolling IC Comparison (20-day MA)')\n",
"ax.set_xlabel('Date')\n",
"ax.set_ylabel('Information Coefficient')\n",
"ax.legend(loc='upper right')\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5. Feature Importance Comparison"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Get top features from each blend\n",
"n_top = 10\n",
"top_features_by_blend = {}\n",
"\n",
"for r in all_results:\n",
" top_features_by_blend[r['name']] = set(r['importance'].head(n_top).index)\n",
"\n",
"# Find common features across all blends\n",
"common_features = set.intersection(*top_features_by_blend.values())\n",
"print(f\"\\nCommon top-{n_top} features across all blends:\")\n",
"for f in sorted(common_features):\n",
" print(f\" - {f}\")\n",
"\n",
"# Find unique features per blend\n",
"print(\"\\nUnique top features by blend:\")\n",
"for name, features in top_features_by_blend.items():\n",
"    unique = features - set().union(*(v for k, v in top_features_by_blend.items() if k != name))\n",
" if unique:\n",
" print(f\"\\n {name}:\")\n",
" for f in sorted(unique):\n",
" print(f\" - {f}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Heatmap of top feature importance across blends\n",
"all_top_features = set.union(*top_features_by_blend.values())\n",
"\n",
"importance_matrix = []\n",
"for r in all_results:\n",
" row = []\n",
" for f in sorted(all_top_features):\n",
" if f in r['importance'].index:\n",
" row.append(r['importance'].loc[f, 'importance'])\n",
" else:\n",
" row.append(0)\n",
" importance_matrix.append(row)\n",
"\n",
"df_importance = pd.DataFrame(\n",
" importance_matrix,\n",
" index=[r['name'] for r in all_results],\n",
" columns=sorted(all_top_features)\n",
")\n",
"\n",
"fig, ax = plt.subplots(figsize=(14, 6))\n",
"sns.heatmap(df_importance, cmap='YlOrRd', ax=ax, cbar_kws={'label': 'Importance'})\n",
"ax.set_title('Feature Importance Comparison Across Blends')\n",
"ax.set_xlabel('Features')\n",
"ax.set_ylabel('Blend Configuration')\n",
"plt.xticks(rotation=45, ha='right')\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 6. Custom Weight Exploration\n",
"\n",
"Test custom blend weights."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Define custom weights to test\n",
"CUSTOM_WEIGHTS = [\n",
" [0.0, 0.0, 0.5, 0.5], # Only rolling\n",
" [0.3, 0.3, 0.2, 0.2], # Fit-time heavy\n",
" [0.1, 0.4, 0.25, 0.25], # CS heavy + balanced rolling\n",
"]\n",
"\n",
"custom_results = []\n",
"for i, weights in enumerate(CUSTOM_WEIGHTS):\n",
" result = run_single_experiment(f'custom_{i+1}', weights)\n",
" custom_results.append(result)\n",
"\n",
"print(\"\\n\\nCustom weights experiments complete!\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Compare custom with standard\n",
"all_comparison_data = comparison_data.copy()\n",
"\n",
"for r in custom_results:\n",
" ic_mean = r['ic_by_date'].mean()\n",
" ic_std = r['ic_by_date'].std()\n",
" all_comparison_data.append({\n",
" 'Blend': r['name'],\n",
" 'Weights': str(r['weights']),\n",
" 'IC Mean': ic_mean,\n",
" 'IC Std': ic_std,\n",
" 'IR': ic_mean / ic_std if ic_std > 0 else 0,\n",
" 'Return': r['summary'].get('return', np.nan),\n",
" 'Sharpe': r['summary'].get('sharpe', np.nan),\n",
" 'Turnover': r['summary'].get('turnover', np.nan),\n",
" })\n",
"\n",
"df_all = pd.DataFrame(all_comparison_data)\n",
"df_all = df_all.sort_values('IC Mean', ascending=False)\n",
"\n",
"print(\"All Results (standard + custom):\")\n",
"print(df_all.to_string(index=False))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 7. Conclusion\n",
"\n",
"Summarize findings and recommend best blend configuration."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Best configuration\n",
"best = df_comparison.iloc[0]\n",
"print(\"Recommended Blend Configuration:\")\n",
"print(f\" Name: {best['Blend']}\")\n",
"print(f\" Weights: {best['Weights']}\")\n",
"print(f\"\\nPerformance:\")\n",
"print(f\" IC Mean: {best['IC Mean']:.4f}\")\n",
"print(f\" IC Std: {best['IC Std']:.4f}\")\n",
"print(f\" IR: {best['IR']:.4f}\")\n",
"print(f\" Return: {best['Return']:.4f}\")\n",
"print(f\" Sharpe: {best['Sharpe']:.4f}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

@ -0,0 +1,36 @@
# CTA 1-Day Return Prediction
Experiments for predicting CTA (Commodity Trading Advisor) futures 1-day returns.
## Data
- **Features**: alpha158, hffactor
- **Labels**: Return indicators (o2c_twap1min, o2o_twap1min, etc.)
- **Normalization**: dual (blend of zscore, cs_zscore, rolling_20, rolling_60)
## Notebooks
| Notebook | Purpose |
|----------|---------|
| `01_data_check.ipynb` | Load and validate CTA data |
| `02_label_analysis.ipynb` | Explore label distributions and blending |
| `03_baseline_xgb.ipynb` | Train baseline XGBoost model |
| `04_blend_comparison.ipynb` | Compare different normalization blends |
## Blend Configurations
The label blending combines 4 normalization methods:
- **zscore**: Fit-time mean/std normalization
- **cs_zscore**: Cross-sectional z-score per datetime
- **rolling_20**: 20-day rolling window normalization
- **rolling_60**: 60-day rolling window normalization
Predefined weights (from qshare.config.research.cta.labels):
- `equal`: [0.25, 0.25, 0.25, 0.25]
- `zscore_heavy`: [0.5, 0.2, 0.15, 0.15]
- `rolling_heavy`: [0.1, 0.1, 0.3, 0.5]
- `cs_heavy`: [0.2, 0.5, 0.15, 0.15]
- `short_term`: [0.1, 0.1, 0.4, 0.4]
- `long_term`: [0.4, 0.2, 0.2, 0.2]
Default: [0.2, 0.1, 0.3, 0.4]

@ -0,0 +1,5 @@
"""CTA 1-day task-specific utilities."""
from .labels import get_blend_weights, describe_blend_config
__all__ = ['get_blend_weights', 'describe_blend_config']

@ -0,0 +1,63 @@
"""Label blending utilities for CTA experiments."""
from typing import Union, List
# Predefined blend configurations: weights over the four normalization
# channels, in the order [zscore, cs_zscore, rolling_20, rolling_60].
BLEND_CONFIGS = {
    'equal': [0.25, 0.25, 0.25, 0.25],
    'zscore_heavy': [0.5, 0.2, 0.15, 0.15],
    'rolling_heavy': [0.1, 0.1, 0.3, 0.5],
    'cs_heavy': [0.2, 0.5, 0.15, 0.15],
    'short_term': [0.1, 0.1, 0.4, 0.4],
    'long_term': [0.4, 0.2, 0.2, 0.2],
}

DEFAULT_BLEND = [0.2, 0.1, 0.3, 0.4]  # [zscore, cs_zscore, roll20, roll60]


def get_blend_weights(weights: Union[str, List[float], None]) -> List[float]:
    """Resolve blend weights from a config name, an explicit sequence, or None.

    Args:
        weights: Name of a predefined config in ``BLEND_CONFIGS``, a
            list/tuple of 4 floats summing to 1.0, or None for the default.

    Returns:
        A new list of 4 weights summing to 1.0.  Always a fresh copy, so
        callers may mutate the result without corrupting the shared
        module-level configuration tables.

    Raises:
        ValueError: If the name is unknown, the sequence has the wrong
            length, the weights do not sum to 1.0, or the type is
            unsupported.
    """
    if weights is None:
        # Copy: returning DEFAULT_BLEND itself would let callers mutate it.
        return list(DEFAULT_BLEND)
    if isinstance(weights, str):
        if weights not in BLEND_CONFIGS:
            raise ValueError(f"Unknown blend config: {weights}. "
                             f"Available: {list(BLEND_CONFIGS.keys())}")
        # Copy for the same reason as above.
        return list(BLEND_CONFIGS[weights])
    if isinstance(weights, (list, tuple)):
        if len(weights) != 4:
            raise ValueError(f"Blend weights must have 4 values, got {len(weights)}")
        if abs(sum(weights) - 1.0) > 1e-6:
            raise ValueError(f"Blend weights must sum to 1.0, got {sum(weights)}")
        return list(weights)
    raise ValueError(f"Invalid blend weights type: {type(weights)}")
def describe_blend_config(weights: Union[str, List[float]]) -> str:
    """Get a human-readable description of a blend config.

    Args:
        weights: Predefined config name or an explicit list of 4 weights.

    Returns:
        Description string mapping each normalization channel to its weight.

    Raises:
        ValueError: If the name or weight list is invalid (propagated from
            ``get_blend_weights``).
    """
    names = ['zscore', 'cs_zscore', 'rolling_20', 'rolling_60']
    # Resolve every input through get_blend_weights so explicit lists are
    # validated too; previously a short list was zipped silently, dropping
    # channels from the description without any error.
    resolved = get_blend_weights(weights)
    label = weights if isinstance(weights, str) else 'custom'
    return f"{label}: {dict(zip(names, resolved))}"

@ -0,0 +1,22 @@
# Alpha Lab - Experiment dependencies
# The qshare library is already installed in the virtual environment
# Jupyter and visualization
jupyter>=1.0.0
matplotlib>=3.7.0
seaborn>=0.12.0
plotly>=5.18.0
# Data processing
pandas>=2.0.0
numpy>=1.24.0
polars>=0.20.0
pyarrow>=14.0.0
# Machine learning
xgboost>=2.0.0
scikit-learn>=1.3.0
# Utilities
tqdm>=4.65.0
python-dotenv>=1.0.0

@ -0,0 +1,810 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Stock 15m Data Exploration\n",
"\n",
"Load and explore 15-minute return prediction data.\n",
"\n",
"**Purpose**: Understand data structure, check data quality, and visualize key statistics."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import polars as pl\n",
"import matplotlib.pyplot as plt\n",
"\n",
"from qshare.data.polars.ret15m import load_dataset, calculate_weights\n",
"from qshare.io.polars import load_from_pq\n",
"\n",
"import sys\n",
"sys.path.insert(0, '../')\n",
"from common.plotting import setup_plot_style\n",
"\n",
"setup_plot_style()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Configuration\n",
"\n",
"Define data paths and parameters."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"CONFIG = {\n",
" # Data paths (adjust as needed)\n",
" 'path_a158': '/data/parquet/stock_1min_alpha158',\n",
" 'path_kline': '/data/parquet/stock_1min',\n",
" 'path_kline_daily': '/data/parquet/stock_1day',\n",
" 'path_industry': '/data/parquet/industry_idx',\n",
" \n",
" # Date range\n",
" 'dt_range': ['2022-01-01', '2024-12-31'],\n",
" \n",
" # Normalization mode\n",
" 'normalization_mode': 'dual', # 'industry', 'cs_zscore', or 'dual'\n",
" \n",
" # Sample weights\n",
" 'positive_factor': 1.0,\n",
" 'negative_factor': 2.0,\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Load Raw Data\n",
"\n",
"Load data as Polars lazy frames first."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load data sources\n",
"print(\"Loading data sources...\")\n",
"\n",
"pl_ldf_a158 = load_from_pq(\n",
" path=CONFIG['path_a158'],\n",
" table_alias=\"a158\",\n",
" start_time=CONFIG['dt_range'][0],\n",
" as_struct=True\n",
")\n",
"\n",
"pl_ldf_kline = load_from_pq(\n",
" path=CONFIG['path_kline'],\n",
" table_alias=\"kline_1min\",\n",
" start_time=CONFIG['dt_range'][0],\n",
" as_struct=True\n",
")\n",
"\n",
"pl_ldf_kline_daily = load_from_pq(\n",
" path=CONFIG['path_kline_daily'],\n",
" table_alias=\"kline_1day\",\n",
" start_time=CONFIG['dt_range'][0],\n",
")\n",
"\n",
"pl_ldf_industry = load_from_pq(\n",
" path=CONFIG['path_industry'],\n",
" table_alias=\"indus_idx\",\n",
" start_time=CONFIG['dt_range'][0],\n",
")\n",
"\n",
"print(\"Data sources loaded as lazy frames\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check schemas\n",
"print(\"Alpha158 schema:\")\n",
"print(pl_ldf_a158.schema)\n",
"\n",
"print(\"\\nKline 1min schema:\")\n",
"print(pl_ldf_kline.schema)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Load Training Dataset\n",
"\n",
"Use qshare's load_dataset to construct the full training data."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"Loading training dataset...\")\n",
"print(f\" Date range: {CONFIG['dt_range']}\")\n",
"print(f\" Normalization: {CONFIG['normalization_mode']}\")\n",
"\n",
"pl_df_train = load_dataset(\n",
" pl_ldf_a158_1min=pl_ldf_a158,\n",
" pl_ldf_kline_1min=pl_ldf_kline,\n",
" pl_ldf_kline_1day=pl_ldf_kline_daily,\n",
" pl_ldf_indus_idx=pl_ldf_industry,\n",
" dt_range=CONFIG['dt_range'],\n",
" normalization_mode=CONFIG['normalization_mode'],\n",
" negative_factor=CONFIG['negative_factor'],\n",
" positive_factor=CONFIG['positive_factor'],\n",
")\n",
"\n",
"# Convert to pandas for easier exploration\n",
"df_train = pl_df_train.to_pandas()\n",
"\n",
"print(f\"\\nDataset shape: {df_train.shape}\")\n",
"print(f\"Columns: {len(df_train.columns)}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check column types\n",
"feature_cols = [c for c in df_train.columns if c.startswith('alpha158_')]\n",
"print(f\"\\nAlpha158 features: {len(feature_cols)}\")\n",
"print(f\" Example: {feature_cols[:5]}\")\n",
"\n",
"print(f\"\\nTarget column: {[c for c in df_train.columns if 'return' in c.lower()]}\")\n",
"print(f\"Weight column: {[c for c in df_train.columns if 'weight' in c.lower()]}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. Data Quality Check"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Missing values\n",
"missing = df_train.isnull().sum()\n",
"missing_pct = missing / len(df_train) * 100\n",
"\n",
"print(\"Missing values:\")\n",
"print(f\" Columns with missing: {(missing > 0).sum()}\")\n",
"if (missing > 0).sum() > 0:\n",
" print(\"\\nTop columns by missing %:\")\n",
" print(missing_pct[missing_pct > 0].sort_values(ascending=False).head(10))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Data coverage by date\n",
"df_train['datetime'] = pd.to_datetime(df_train.index.get_level_values(0))\n",
"df_train['instrument'] = df_train.index.get_level_values(1)\n",
"\n",
"daily_counts = df_train.groupby('datetime')['instrument'].nunique()\n",
"\n",
"fig, ax = plt.subplots(figsize=(14, 4))\n",
"daily_counts.plot(ax=ax)\n",
"ax.set_title('Number of Instruments per Day')\n",
"ax.set_xlabel('Date')\n",
"ax.set_ylabel('Instrument Count')\n",
"plt.tight_layout()\n",
"plt.show()\n",
"\n",
"print(f\"\\nInstruments per day: {daily_counts.mean():.0f} avg, {daily_counts.min()}-{daily_counts.max()} range\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5. Target Analysis\n",
"\n",
"Analyze the 15-minute return target distribution."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Identify target column\n",
"target_col = [c for c in df_train.columns if 'return' in c.lower()][0]\n",
"print(f\"Target column: {target_col}\")\n",
"\n",
"# Target statistics\n",
"print(f\"\\nTarget statistics:\")\n",
"print(df_train[target_col].describe())\n",
"\n",
"print(f\"\\nSkewness: {df_train[target_col].skew():.3f}\")\n",
"print(f\"Kurtosis: {df_train[target_col].kurtosis():.3f}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Target distribution\n",
"fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n",
"\n",
"# Histogram\n",
"df_train[target_col].hist(bins=100, ax=axes[0], edgecolor='black', alpha=0.7)\n",
"axes[0].set_title(f'{target_col} Distribution')\n",
"axes[0].axvline(x=0, color='red', linestyle='--')\n",
"axes[0].set_xlim(-0.05, 0.05) # Focus on main distribution\n",
"\n",
"# Time series of daily mean target\n",
"daily_mean_target = df_train.groupby('datetime')[target_col].mean()\n",
"axes[1].plot(daily_mean_target.index, daily_mean_target.values)\n",
"axes[1].set_title('Daily Mean Target')\n",
"axes[1].axhline(y=0, color='red', linestyle='--')\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 6. Feature Analysis"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Feature statistics\n",
"feature_stats = df_train[feature_cols].describe().T\n",
"\n",
"print(\"Feature statistics summary:\")\n",
"print(f\" Mean range: [{feature_stats['mean'].min():.4f}, {feature_stats['mean'].max():.4f}]\")\n",
"print(f\" Std range: [{feature_stats['std'].min():.4f}, {feature_stats['std'].max():.4f}]\")\n",
"\n",
"# Check for features with extreme values\n",
"extreme_features = feature_stats[\n",
" (feature_stats['mean'].abs() > 10) | (feature_stats['std'] > 100)\n",
"]\n",
"if len(extreme_features) > 0:\n",
" print(f\"\\nFeatures with extreme values: {len(extreme_features)}\")\n",
" print(extreme_features.head())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sample a few features for visualization\n",
"sample_features = feature_cols[:4]\n",
"\n",
"fig, axes = plt.subplots(2, 2, figsize=(12, 8))\n",
"axes = axes.flatten()\n",
"\n",
"for i, feat in enumerate(sample_features):\n",
" df_train[feat].hist(bins=100, ax=axes[i], edgecolor='black', alpha=0.7)\n",
" axes[i].set_title(feat)\n",
" axes[i].axvline(x=0, color='red', linestyle='--')\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 7. Sample Weights Analysis"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check weights if available\n",
"weight_cols = [c for c in df_train.columns if 'weight' in c.lower()]\n",
"if weight_cols:\n",
" weight_col = weight_cols[0]\n",
" print(f\"Weight column: {weight_col}\")\n",
" print(f\"\\nWeight statistics:\")\n",
" print(df_train[weight_col].describe())\n",
" \n",
" # Plot weight distribution by target sign\n",
" fig, ax = plt.subplots(figsize=(10, 4))\n",
" \n",
" positive_mask = df_train[target_col] > 0\n",
" df_train.loc[positive_mask, weight_col].hist(\n",
" bins=50, alpha=0.5, label='Positive target', ax=ax\n",
" )\n",
" df_train.loc[~positive_mask, weight_col].hist(\n",
" bins=50, alpha=0.5, label='Negative target', ax=ax\n",
" )\n",
" ax.set_title('Weight Distribution by Target Sign')\n",
" ax.legend()\n",
" plt.tight_layout()\n",
" plt.show()\n",
"else:\n",
" print(\"No weight column found\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Stock 15m Baseline Model\n",
"\n",
"Train and evaluate a baseline XGBoost model for 15-minute return prediction.\n",
"\n",
"**Purpose**: Establish baseline performance with standard configuration."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import polars as pl\n",
"import matplotlib.pyplot as plt\n",
"import xgboost as xgb\n",
"from sklearn.metrics import r2_score\n",
"\n",
"from qshare.data.polars.ret15m import load_dataset\n",
"from qshare.io.polars import load_from_pq\n",
"\n",
"import sys\n",
"sys.path.insert(0, '../')\n",
"from common.plotting import setup_plot_style\n",
"from common.paths import create_experiment_dir\n",
"\n",
"setup_plot_style()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Configuration"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"CONFIG = {\n",
" # Experiment\n",
" 'experiment_name': 'baseline_xgb',\n",
" 'save_results': True,\n",
" \n",
" # Data paths\n",
" 'path_a158': '/data/parquet/stock_1min_alpha158',\n",
" 'path_kline': '/data/parquet/stock_1min',\n",
" 'path_kline_daily': '/data/parquet/stock_1day',\n",
" 'path_industry': '/data/parquet/industry_idx',\n",
" \n",
" # Date ranges\n",
" 'dt_range': ['2022-01-01', '2024-12-31'],\n",
" 'train_range': ['2022-01-01', '2023-12-31'],\n",
" 'test_range': ['2024-01-01', '2024-12-31'],\n",
" \n",
" # Data config\n",
" 'normalization_mode': 'dual',\n",
" 'positive_factor': 1.0,\n",
" 'negative_factor': 2.0,\n",
" \n",
" # Model\n",
" 'model_params': {\n",
" 'objective': 'reg:squarederror',\n",
" 'eval_metric': 'rmse',\n",
" 'max_depth': 6,\n",
" 'learning_rate': 0.1,\n",
" 'n_estimators': 100,\n",
" 'subsample': 0.8,\n",
" 'colsample_bytree': 0.8,\n",
" 'random_state': 42,\n",
" },\n",
"}\n",
"\n",
"print(\"Configuration:\")\n",
"for key, value in CONFIG.items():\n",
" if not isinstance(value, dict):\n",
" print(f\" {key}: {value}\")\n",
"print(f\"\\nModel params: {CONFIG['model_params']}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Load Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"Loading data sources...\")\n",
"\n",
"pl_ldf_a158 = load_from_pq(\n",
" path=CONFIG['path_a158'],\n",
" table_alias=\"a158\",\n",
" start_time=CONFIG['dt_range'][0],\n",
" as_struct=True\n",
")\n",
"\n",
"pl_ldf_kline = load_from_pq(\n",
" path=CONFIG['path_kline'],\n",
" table_alias=\"kline_1min\",\n",
" start_time=CONFIG['dt_range'][0],\n",
" as_struct=True\n",
")\n",
"\n",
"pl_ldf_kline_daily = load_from_pq(\n",
" path=CONFIG['path_kline_daily'],\n",
" table_alias=\"kline_1day\",\n",
" start_time=CONFIG['dt_range'][0],\n",
")\n",
"\n",
"pl_ldf_industry = load_from_pq(\n",
" path=CONFIG['path_industry'],\n",
" table_alias=\"indus_idx\",\n",
" start_time=CONFIG['dt_range'][0],\n",
")\n",
"\n",
"print(\"Loading dataset...\")\n",
"pl_df = load_dataset(\n",
" pl_ldf_a158_1min=pl_ldf_a158,\n",
" pl_ldf_kline_1min=pl_ldf_kline,\n",
" pl_ldf_kline_1day=pl_ldf_kline_daily,\n",
" pl_ldf_indus_idx=pl_ldf_industry,\n",
" dt_range=CONFIG['dt_range'],\n",
" normalization_mode=CONFIG['normalization_mode'],\n",
" negative_factor=CONFIG['negative_factor'],\n",
" positive_factor=CONFIG['positive_factor'],\n",
")\n",
"\n",
"# Convert to pandas\n",
"df_full = pl_df.to_pandas()\n",
"print(f\"\\nFull dataset shape: {df_full.shape}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Prepare Train/Test Split"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Identify columns\n",
"feature_cols = [c for c in df_full.columns if c.startswith('alpha158_')]\n",
"target_cols = [c for c in df_full.columns if 'return' in c.lower()]\n",
"weight_cols = [c for c in df_full.columns if 'weight' in c.lower()]\n",
"\n",
"print(f\"Features: {len(feature_cols)}\")\n",
"print(f\"Targets: {target_cols}\")\n",
"print(f\"Weights: {weight_cols}\")\n",
"\n",
"# Select target\n",
"target_col = target_cols[0]\n",
"weight_col = weight_cols[0] if weight_cols else None\n",
"\n",
"# Split by date\n",
"df_train = df_full.loc[CONFIG['train_range'][0]:CONFIG['train_range'][1]]\n",
"df_test = df_full.loc[CONFIG['test_range'][0]:CONFIG['test_range'][1]]\n",
"\n",
"print(f\"\\nTrain: {df_train.shape}\")\n",
"print(f\"Test: {df_test.shape}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. Train Model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Prepare data\n",
"X_train = df_train[feature_cols]\n",
"y_train = df_train[target_col]\n",
"w_train = df_train[weight_col] if weight_col else None\n",
"\n",
"X_test = df_test[feature_cols]\n",
"y_test = df_test[target_col]\n",
"\n",
"# Handle missing values: compute the train median once, before filling,\n",
"# so the test set is imputed with the true training statistics\n",
"train_median = X_train.median()\n",
"X_train = X_train.fillna(train_median)\n",
"X_test = X_test.fillna(train_median)\n",
"\n",
"print(\"Training XGBoost model...\")\n",
"print(f\" X shape: {X_train.shape}\")\n",
"print(f\" y mean: {y_train.mean():.6f}, std: {y_train.std():.6f}\")\n",
"\n",
"model = xgb.XGBRegressor(**CONFIG['model_params'])\n",
"\n",
"model.fit(\n",
" X_train, y_train,\n",
" sample_weight=w_train,\n",
" eval_set=[(X_test, y_test)],\n",
" verbose=False\n",
")\n",
"\n",
"print(\"Training complete!\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Feature importance\n",
"importance = pd.DataFrame({\n",
" 'feature': feature_cols,\n",
" 'importance': model.feature_importances_\n",
"}).sort_values('importance', ascending=False)\n",
"\n",
"print(\"\\nTop 10 Features:\")\n",
"print(importance.head(10))\n",
"\n",
"# Plot\n",
"fig, ax = plt.subplots(figsize=(10, 6))\n",
"importance.head(20).plot(x='feature', y='importance', kind='barh', ax=ax)\n",
"ax.set_title('Top 20 Feature Importance')\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5. Evaluate"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Generate predictions\n",
"y_pred_train = model.predict(X_train)\n",
"y_pred_test = model.predict(X_test)\n",
"\n",
"# Calculate metrics\n",
"train_r2 = r2_score(y_train, y_pred_train)\n",
"test_r2 = r2_score(y_test, y_pred_test)\n",
"\n",
"# IC (Information Coefficient)\n",
"train_ic = np.corrcoef(y_train, y_pred_train)[0, 1]\n",
"test_ic = np.corrcoef(y_test, y_pred_test)[0, 1]\n",
"\n",
"print(\"Performance Metrics:\")\n",
"print(f\" Train R2: {train_r2:.4f}\")\n",
"print(f\" Test R2: {test_r2:.4f}\")\n",
"print(f\" Train IC: {train_ic:.4f}\")\n",
"print(f\" Test IC: {test_ic:.4f}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Daily IC analysis\n",
"df_test_eval = df_test.copy()\n",
"df_test_eval['pred'] = y_pred_test\n",
"df_test_eval['target'] = y_test\n",
"\n",
"df_test_eval['datetime'] = df_test_eval.index.get_level_values(0)\n",
"\n",
"# Calculate daily IC\n",
"daily_ic = df_test_eval.groupby('datetime').apply(\n",
" lambda x: x['target'].corr(x['pred'])\n",
")\n",
"\n",
"print(\"\\nDaily IC Statistics:\")\n",
"print(f\" Mean: {daily_ic.mean():.4f}\")\n",
"print(f\" Std: {daily_ic.std():.4f}\")\n",
"print(f\" IR: {daily_ic.mean() / daily_ic.std():.4f}\")\n",
"print(f\" >0: {(daily_ic > 0).mean():.1%}\")\n",
"\n",
"# Plot\n",
"fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n",
"\n",
"# IC distribution\n",
"daily_ic.hist(bins=50, ax=axes[0], edgecolor='black')\n",
"axes[0].axvline(x=0, color='red', linestyle='--')\n",
"axes[0].axvline(x=daily_ic.mean(), color='green', linestyle='--', label=f'Mean: {daily_ic.mean():.3f}')\n",
"axes[0].set_title('Daily IC Distribution')\n",
"axes[0].legend()\n",
"\n",
"# IC time series\n",
"daily_ic.rolling(20, min_periods=5).mean().plot(ax=axes[1])\n",
"axes[1].axhline(y=0, color='red', linestyle='--')\n",
"axes[1].set_title('Rolling IC (20-day)')\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Prediction vs Actual scatter\n",
"fig, ax = plt.subplots(figsize=(8, 8))\n",
"\n",
"# Sample for plotting\n",
"sample_idx = np.random.choice(len(y_test), size=min(10000, len(y_test)), replace=False)\n",
"ax.scatter(y_test.iloc[sample_idx], y_pred_test[sample_idx], alpha=0.3, s=1)\n",
"\n",
"# Perfect prediction line\n",
"lims = [min(y_test.min(), y_pred_test.min()), max(y_test.max(), y_pred_test.max())]\n",
"ax.plot(lims, lims, 'r--', alpha=0.5)\n",
"\n",
"ax.set_xlabel('Actual')\n",
"ax.set_ylabel('Predicted')\n",
"ax.set_title(f'Prediction vs Actual (IC={test_ic:.3f})')\n",
"ax.grid(True, alpha=0.3)\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 6. Save Results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"if CONFIG['save_results']:\n",
" import pickle\n",
" import json\n",
" \n",
" output_dir = create_experiment_dir('stock_15m', CONFIG['experiment_name'])\n",
" print(f\"Saving results to: {output_dir}\")\n",
" \n",
" # Save config\n",
" with open(output_dir / 'config.json', 'w') as f:\n",
" json.dump(CONFIG, f, indent=2, default=str)\n",
" \n",
" # Save model\n",
" with open(output_dir / 'model.pkl', 'wb') as f:\n",
" pickle.dump(model, f)\n",
" \n",
" # Save importance\n",
" importance.to_csv(output_dir / 'feature_importance.csv', index=False)\n",
" \n",
" # Save predictions\n",
" predictions = pd.DataFrame({\n",
" 'actual': y_test,\n",
" 'predicted': y_pred_test\n",
" }, index=df_test.index)\n",
" predictions.to_csv(output_dir / 'predictions.csv')\n",
" \n",
" # Save metrics\n",
" metrics = {\n",
" 'train_r2': float(train_r2),\n",
" 'test_r2': float(test_r2),\n",
" 'train_ic': float(train_ic),\n",
" 'test_ic': float(test_ic),\n",
" 'daily_ic_mean': float(daily_ic.mean()),\n",
" 'daily_ic_std': float(daily_ic.std()),\n",
" 'daily_ir': float(daily_ic.mean() / daily_ic.std()),\n",
" }\n",
" with open(output_dir / 'metrics.json', 'w') as f:\n",
" json.dump(metrics, f, indent=2)\n",
" \n",
" print(\"\\nFiles saved:\")\n",
" for f in output_dir.iterdir():\n",
" print(f\" - {f.name}\")\n",
"else:\n",
" print(\"Results not saved\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

@ -0,0 +1,257 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Stock 15m Baseline Model\n",
"\n",
"Train and evaluate a baseline XGBoost model for 15-minute return prediction.\n",
"\n",
"**Purpose**: Establish baseline performance with standard configuration."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import polars as pl\n",
"import matplotlib.pyplot as plt\n",
"import xgboost as xgb\n",
"from sklearn.metrics import r2_score\n",
"\n",
"from qshare.data.polars.ret15m import load_dataset\n",
"from qshare.io.polars import load_from_pq\n",
"\n",
"import sys\n",
"sys.path.insert(0, '../')\n",
"from common.plotting import setup_plot_style\n",
"from common.paths import create_experiment_dir\n",
"\n",
"setup_plot_style()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Configuration"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"CONFIG = {\n",
" 'experiment_name': 'baseline_xgb',\n",
" 'save_results': True,\n",
" 'path_a158': '/data/parquet/stock_1min_alpha158',\n",
" 'path_kline': '/data/parquet/stock_1min',\n",
" 'path_kline_daily': '/data/parquet/stock_1day',\n",
" 'path_industry': '/data/parquet/industry_idx',\n",
" 'dt_range': ['2022-01-01', '2024-12-31'],\n",
" 'train_range': ['2022-01-01', '2023-12-31'],\n",
" 'test_range': ['2024-01-01', '2024-12-31'],\n",
" 'normalization_mode': 'dual',\n",
" 'positive_factor': 1.0,\n",
" 'negative_factor': 2.0,\n",
" 'model_params': {\n",
" 'objective': 'reg:squarederror',\n",
" 'eval_metric': 'rmse',\n",
" 'max_depth': 6,\n",
" 'learning_rate': 0.1,\n",
" 'n_estimators': 100,\n",
" 'subsample': 0.8,\n",
" 'colsample_bytree': 0.8,\n",
" 'random_state': 42,\n",
" },\n",
"}\n",
"\n",
"print('Configuration:')\n",
"for key, value in CONFIG.items():\n",
" if not isinstance(value, dict):\n",
" print(f' {key}: {value}')\n",
"print(f\"Model params: {CONFIG['model_params']}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Load Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print('Loading data sources...')\n",
"\n",
"pl_ldf_a158 = load_from_pq(\n",
" path=CONFIG['path_a158'],\n",
" table_alias='a158',\n",
" start_time=CONFIG['dt_range'][0],\n",
" as_struct=True\n",
")\n",
"\n",
"pl_ldf_kline = load_from_pq(\n",
" path=CONFIG['path_kline'],\n",
" table_alias='kline_1min',\n",
" start_time=CONFIG['dt_range'][0],\n",
" as_struct=True\n",
")\n",
"\n",
"pl_ldf_kline_daily = load_from_pq(\n",
" path=CONFIG['path_kline_daily'],\n",
" table_alias='kline_1day',\n",
" start_time=CONFIG['dt_range'][0],\n",
")\n",
"\n",
"pl_ldf_industry = load_from_pq(\n",
" path=CONFIG['path_industry'],\n",
" table_alias='indus_idx',\n",
" start_time=CONFIG['dt_range'][0],\n",
")\n",
"\n",
"print('Loading dataset...')\n",
"pl_df = load_dataset(\n",
" pl_ldf_a158_1min=pl_ldf_a158,\n",
" pl_ldf_kline_1min=pl_ldf_kline,\n",
" pl_ldf_kline_1day=pl_ldf_kline_daily,\n",
" pl_ldf_indus_idx=pl_ldf_industry,\n",
" dt_range=CONFIG['dt_range'],\n",
" normalization_mode=CONFIG['normalization_mode'],\n",
" negative_factor=CONFIG['negative_factor'],\n",
" positive_factor=CONFIG['positive_factor'],\n",
")\n",
"\n",
"df_full = pl_df.to_pandas()\n",
"print(f'Full dataset shape: {df_full.shape}')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Train/Test Split and Model Training"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"feature_cols = [c for c in df_full.columns if c.startswith('alpha158_')]\n",
"target_cols = [c for c in df_full.columns if 'return' in c.lower()]\n",
"weight_cols = [c for c in df_full.columns if 'weight' in c.lower()]\n",
"\n",
"target_col = target_cols[0]\n",
"weight_col = weight_cols[0] if weight_cols else None\n",
"\n",
"df_train = df_full.loc[CONFIG['train_range'][0]:CONFIG['train_range'][1]]\n",
"df_test = df_full.loc[CONFIG['test_range'][0]:CONFIG['test_range'][1]]\n",
"\n",
"print(f'Train: {df_train.shape}, Test: {df_test.shape}')\n",
"print(f'Features: {len(feature_cols)}, Target: {target_col}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X_train = df_train[feature_cols].fillna(df_train[feature_cols].median())\n",
"y_train = df_train[target_col]\n",
"w_train = df_train[weight_col] if weight_col else None\n",
"\n",
"X_test = df_test[feature_cols].fillna(df_train[feature_cols].median())\n",
"y_test = df_test[target_col]\n",
"\n",
"print('Training XGBoost...')\n",
"model = xgb.XGBRegressor(**CONFIG['model_params'])\n",
"model.fit(X_train, y_train, sample_weight=w_train, verbose=False)\n",
"print('Training complete!')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. Evaluation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"y_pred_test = model.predict(X_test)\n",
"\n",
"test_r2 = r2_score(y_test, y_pred_test)\n",
"test_ic = np.corrcoef(y_test, y_pred_test)[0, 1]\n",
"\n",
"print(f'Test R2: {test_r2:.4f}')\n",
"print(f'Test IC: {test_ic:.4f}')\n",
"\n",
"# Daily IC\n",
"df_test_eval = df_test.copy()\n",
"df_test_eval['pred'] = y_pred_test\n",
"df_test_eval['target'] = y_test\n",
"df_test_eval['datetime'] = df_test_eval.index.get_level_values(0)\n",
"\n",
"daily_ic = df_test_eval.groupby('datetime').apply(\n",
" lambda x: x['target'].corr(x['pred'])\n",
")\n",
"\n",
"print(f'Daily IC Mean: {daily_ic.mean():.4f}')\n",
"print(f'Daily IC Std: {daily_ic.std():.4f}')\n",
"print(f'IR: {daily_ic.mean() / daily_ic.std():.4f}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Plot daily IC\n",
"fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n",
"\n",
"daily_ic.hist(bins=50, ax=axes[0], edgecolor='black')\n",
"axes[0].axvline(x=daily_ic.mean(), color='green', linestyle='--')\n",
"axes[0].set_title('Daily IC Distribution')\n",
"\n",
"daily_ic.rolling(20, min_periods=5).mean().plot(ax=axes[1])\n",
"axes[1].axhline(y=0, color='red', linestyle='--')\n",
"axes[1].set_title('Rolling IC (20-day)')\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

@ -0,0 +1,25 @@
# Stock 15-Minute Return Prediction
Experiments for predicting stock 15-minute returns using high-frequency features.
## Data
- **Features**: alpha158 computed on 1-minute data
- **Target**: 15-minute forward returns, `close[t+16]/close[t+1] - 1` — entered at the next 1-minute bar's close and held for 15 bars, so the entry bar itself is never used for the label
- **Normalization**: industry, cs_zscore, or dual
## Notebooks
| Notebook | Purpose |
|----------|---------|
| `01_data_exploration.ipynb` | Load and explore 15m data structure |
| `02_baseline_model.ipynb` | Train baseline XGBoost model |
## Methodology
1. Load 1-minute kline data via Polars lazy frames
2. Compute/retrieve alpha158 features
3. Calculate 15-minute forward returns
4. Apply normalization (industry-neutralized or cross-sectional z-score)
5. Train gradient boosting models
6. Evaluate with the information coefficient (IC: correlation between predictions and realized returns, per day and overall) and backtests

@ -0,0 +1,3 @@
"""Stock 15m task-specific utilities."""
# Add task-specific functions here as needed
Loading…
Cancel
Save