commit cdf63733256bfe51fd52bf907785e9fd4b0b2fab
Author: guofu
Date: Fri Feb 13 11:17:45 2026 +0800
Initial alpha_lab structure

- Notebook-centric experiment framework
- CTA 1D and Stock 15m tasks
- Minimal common utilities
- Manual experiment tracking
diff --git a/.env.template b/.env.template
new file mode 100644
index 0000000..89ce364
--- /dev/null
+++ b/.env.template
@@ -0,0 +1,14 @@
+# Alpha Lab Environment Configuration
+# Copy this file to .env and fill in your values
+
+# DolphinDB Configuration
+DDB_HOST=192.168.1.146
+DDB_PORT=8848
+DDB_USERNAME=
+DDB_PASSWORD=
+
+# Data Paths
+DATA_ROOT=/data/parquet
+
+# Experiment Output
+RESULTS_ROOT=/home/guofu/Workspaces/alpha_lab/results
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..dd4c38b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,51 @@
+# Environment
+.env
+.venv/
+env/
+venv/
+
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Jupyter
+.ipynb_checkpoints/
+*.ipynb_checkpoints
+
+# Results and data
+results/*
+!results/*/.gitkeep
+!results/*/README.md
+*.parquet
+*.pkl
+*.h5
+*.feather
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# OS
+.DS_Store
+Thumbs.db
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..8deffe4
--- /dev/null
+++ b/README.md
@@ -0,0 +1,87 @@
+# Alpha Lab
+
+Quantitative research experiments for qshare library. This repository contains Jupyter notebooks and analysis scripts for exploring trading strategies and machine learning models.
+
+## Philosophy
+
+- **Notebook-centric**: Experiments are interactive notebooks, not rigid scripts
+- **Minimal abstraction**: Simple functions over complex class hierarchies
+- **Self-contained**: Each task directory is independent
+- **Ad-hoc friendly**: Easy to modify for exploration
+
+## Structure
+
+```
+alpha_lab/
+├── common/ # Shared utilities (keep minimal!)
+│ ├── paths.py # Path management
+│ └── plotting.py # Common plotting functions
+│
+├── cta_1d/ # CTA 1-day return prediction
+│ ├── 01_data_check.ipynb
+│ ├── 02_label_analysis.ipynb
+│ ├── 03_baseline_xgb.ipynb
+│ ├── 04_blend_comparison.ipynb
+│ └── src/ # Task-specific helpers
+│
+├── stock_15m/ # Stock 15-minute return prediction
+│ ├── 01_data_exploration.ipynb
+│ ├── 02_baseline_model.ipynb
+│ └── src/
+│
+└── results/ # Output directory (gitignored)
+ ├── cta_1d/
+ └── stock_15m/
+```
+
+## Setup
+
+```bash
+# Install dependencies
+pip install -r requirements.txt
+
+# Create environment file
+cp .env.template .env
+# Edit .env with your settings
+```
+
+## Usage
+
+Start Jupyter and run notebooks interactively:
+
+```bash
+jupyter notebook
+```
+
+Each task directory contains numbered notebooks:
+- `01_*.ipynb` - Data loading and exploration
+- `02_*.ipynb` - Analysis and baseline models
+- `03_*.ipynb` - Advanced experiments
+- `04_*.ipynb` - Comparisons and ablations
+
+## Experiment Tracking
+
+Experiments are tracked manually in `results/{task}/README.md`:
+
+```markdown
+## 2025-01-15: Baseline XGB
+- Notebook: `cta_1d/03_baseline_xgb.ipynb` (cells 1-50)
+- Config: eta=0.5, lambda=0.1
+- Train IC: 0.042
+- Test IC: 0.038
+- Notes: Dual normalization, 4 trades/day
+```
+
+## Adding a New Task
+
+1. Create directory: `mkdir my_task`
+2. Add `src/` subdirectory for helpers
+3. Create numbered notebooks
+4. Add entry to `results/my_task/README.md`
+
+## Best Practices
+
+1. **Keep it simple**: Only add to `common/` after 3+ copies
+2. **Notebook configs**: Define CONFIG dict in first cell for easy modification
+3. **Document results**: Update results README after significant runs
+4. **Git discipline**: Don't commit large files, results, or credentials
diff --git a/common/__init__.py b/common/__init__.py
new file mode 100644
index 0000000..c749aa7
--- /dev/null
+++ b/common/__init__.py
@@ -0,0 +1,13 @@
"""Common utilities for alpha_lab experiments."""

# Re-export the full public surface of the helper modules so notebooks can
# simply `from common import ...`. `create_experiment_dir` and
# `plot_factor_distribution` are part of this commit's public API (the
# baseline notebook uses `create_experiment_dir`), so they are exported too.
from .paths import create_experiment_dir, ensure_dir, get_results_dir, get_task_results_dir
from .plotting import (
    plot_cumulative_returns,
    plot_factor_distribution,
    plot_ic_series,
    setup_plot_style,
)

__all__ = [
    'create_experiment_dir',
    'ensure_dir',
    'get_results_dir',
    'get_task_results_dir',
    'setup_plot_style',
    'plot_ic_series',
    'plot_cumulative_returns',
    'plot_factor_distribution',
]
diff --git a/common/paths.py b/common/paths.py
new file mode 100644
index 0000000..63d2cd7
--- /dev/null
+++ b/common/paths.py
@@ -0,0 +1,42 @@
+"""Path utilities for experiment outputs."""
+
+import os
+from pathlib import Path
+from datetime import datetime
+
+# Base directories
+BASE_DIR = Path(__file__).parent.parent
+RESULTS_DIR = BASE_DIR / "results"
+
+
def ensure_dir(path: Path) -> Path:
    """Idempotently create *path* as a directory and return it.

    Parent directories are created as needed; calling this on an existing
    directory is a no-op.
    """
    path.mkdir(exist_ok=True, parents=True)
    return path
+
+
def get_results_dir() -> Path:
    """Return the repository-level results directory, creating it if absent."""
    # Delegates creation to ensure_dir so callers can use the path immediately.
    return ensure_dir(RESULTS_DIR)
+
+
def get_task_results_dir(task_name: str) -> Path:
    """Return (and create if needed) the results directory for one task."""
    task_dir = RESULTS_DIR / task_name
    return ensure_dir(task_dir)
+
+
def create_experiment_dir(task_name: str, experiment_name: str | None = None) -> Path:
    """Create a timestamped directory for an experiment.

    Args:
        task_name: Name of the task (e.g., 'cta_1d', 'stock_15m')
        experiment_name: Optional experiment name (default: timestamp).
            An empty string is treated like None — previously `Path / ""`
            silently resolved to the task directory itself, so an empty
            name would collide with the task root.

    Returns:
        Path to the created directory
    """
    # Falsy covers both None and "": fall back to a run timestamp.
    if not experiment_name:
        experiment_name = datetime.now().strftime('%Y%m%d_%H%M%S')

    exp_dir = RESULTS_DIR / task_name / experiment_name
    return ensure_dir(exp_dir)
diff --git a/common/plotting.py b/common/plotting.py
new file mode 100644
index 0000000..36e468a
--- /dev/null
+++ b/common/plotting.py
@@ -0,0 +1,119 @@
+"""Common plotting utilities for experiments."""
+
+import matplotlib.pyplot as plt
+import seaborn as sns
+import numpy as np
+import pandas as pd
+
+
def setup_plot_style():
    """Apply the lab-wide default matplotlib/seaborn styling."""
    plt.style.use('seaborn-v0_8-whitegrid')
    sns.set_palette("husl")
    # Shared figure defaults for all notebooks.
    plt.rcParams.update({
        'figure.figsize': (12, 6),
        'font.size': 10,
    })
+
+
def plot_ic_series(ic_by_date: pd.Series, title: str = "IC Over Time",
                   figsize: tuple = (14, 4)) -> plt.Figure:
    """Plot a daily IC series together with its 20-day moving average.

    Args:
        ic_by_date: Series with datetime index and IC values
        title: Plot title
        figsize: Figure size

    Returns:
        Matplotlib figure
    """
    fig, ax = plt.subplots(figsize=figsize)

    # Raw daily values in the background (faint gray).
    ax.plot(ic_by_date.index, ic_by_date.values, alpha=0.5, color='gray', label='Daily IC')

    # Smoothed trend: 20-day window, at least 5 observations required.
    smoothed = ic_by_date.rolling(20, min_periods=5).mean()
    ax.plot(smoothed.index, smoothed.values, color='blue', linewidth=2, label='20-day MA')

    # Reference lines: the series mean and the zero axis.
    mean_ic = ic_by_date.mean()
    ax.axhline(y=mean_ic, color='red', linestyle='--',
               label=f'Mean IC: {mean_ic:.4f}')
    ax.axhline(y=0, color='black', linestyle='-', alpha=0.3)

    ax.set(title=title, xlabel='Date', ylabel='Information Coefficient')
    ax.legend(loc='upper right')

    plt.tight_layout()
    return fig
+
+
def plot_cumulative_returns(returns: pd.Series, title: str = "Cumulative Returns",
                            figsize: tuple = (12, 6)) -> plt.Figure:
    """Plot compounded cumulative returns on a log scale.

    Args:
        returns: Series with datetime index and daily returns
        title: Plot title
        figsize: Figure size

    Returns:
        Matplotlib figure

    Raises:
        ValueError: If `returns` is empty — previously the final-return
            annotation crashed with an opaque IndexError on `iloc[-1]`.
    """
    if returns.empty:
        raise ValueError("plot_cumulative_returns: 'returns' series is empty")

    fig, ax = plt.subplots(figsize=figsize)

    # Compound the daily returns into an equity curve.
    cumulative = (1 + returns).cumprod()
    ax.plot(cumulative.index, cumulative.values, linewidth=1.5)

    ax.set_title(title)
    ax.set_xlabel('Date')
    ax.set_ylabel('Cumulative Return')
    ax.set_yscale('log')

    # Annotate the final total return next to the last point
    # (green for gains, red otherwise).
    final_return = cumulative.iloc[-1] - 1
    ax.annotate(f'{final_return:.2%}',
                xy=(cumulative.index[-1], cumulative.iloc[-1]),
                xytext=(10, 0), textcoords='offset points',
                fontsize=10, color='green' if final_return > 0 else 'red')

    plt.tight_layout()
    return fig
+
+
def plot_factor_distribution(factor: pd.Series, title: str = "Factor Distribution",
                             figsize: tuple = (10, 6)) -> plt.Figure:
    """Show a factor's histogram (with moment statistics) and a normal Q-Q plot.

    Args:
        factor: Series of factor values
        title: Plot title
        figsize: Figure size

    Returns:
        Matplotlib figure
    """
    from scipy import stats

    fig, (hist_ax, qq_ax) = plt.subplots(1, 2, figsize=figsize)

    # Drop NaNs once and reuse for both panels.
    clean = factor.dropna()

    # Left panel: raw value histogram.
    hist_ax.hist(clean, bins=100, alpha=0.7, edgecolor='black')
    hist_ax.set_title(f'{title} - Distribution')
    hist_ax.set_xlabel('Value')
    hist_ax.set_ylabel('Frequency')

    # Right panel: Q-Q plot against a normal distribution.
    stats.probplot(clean, dist="norm", plot=qq_ax)
    qq_ax.set_title(f'{title} - Q-Q Plot')

    # Overlay the first four moments on the histogram panel.
    stats_text = f"Mean: {factor.mean():.4f}\nStd: {factor.std():.4f}\n"
    stats_text += f"Skew: {factor.skew():.4f}\nKurt: {factor.kurtosis():.4f}"
    hist_ax.text(0.95, 0.95, stats_text, transform=hist_ax.transAxes,
                 verticalalignment='top', horizontalalignment='right',
                 bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

    plt.tight_layout()
    return fig
diff --git a/cta_1d/01_data_check.ipynb b/cta_1d/01_data_check.ipynb
new file mode 100644
index 0000000..6e42050
--- /dev/null
+++ b/cta_1d/01_data_check.ipynb
@@ -0,0 +1,277 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# CTA 1D Data Check\n",
+ "\n",
+ "Load and validate CTA futures data.\n",
+ "\n",
+ "**Purpose**: Verify data availability, check basic statistics, and understand data structure before modeling."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "from qshare.data.pandas.cta_1d import load_dataset\n",
+ "from qshare.io.ddb.cta import load_cta_alpha158, load_cta_hffactors, load_cta_returns\n",
+ "\n",
+ "import sys\n",
+ "sys.path.insert(0, '../')\n",
+ "from common.plotting import setup_plot_style\n",
+ "\n",
+ "setup_plot_style()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1. Configuration\n",
+ "\n",
+ "Modify these parameters as needed for your data check."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "CONFIG = {\n",
+ " 'dt_range': ['2020-01-01', '2024-12-31'],\n",
+ " 'feature_sets': ['alpha158', 'hffactor'],\n",
+ " 'return_type': 'o2c_twap1min', # or 'o2o_twap1min'\n",
+ " 'normalization': 'dual',\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2. Load Features Separately\n",
+ "\n",
+ "Check each feature set independently."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load alpha158 features\n",
+ "print(\"Loading alpha158 features...\")\n",
+ "df_alpha158 = load_cta_alpha158(\n",
+ " since_date=CONFIG['dt_range'][0],\n",
+ " end_date=CONFIG['dt_range'][1],\n",
+ ")\n",
+ "print(f\"alpha158 shape: {df_alpha158.shape}\")\n",
+ "print(f\"\")\n",
+ "print(f\"Columns: {list(df_alpha158.columns[:10])}...\") # First 10 columns\n",
+ "print(f\"\")\n",
+ "print(f\"Date range: {df_alpha158.index.get_level_values(0).min()} to {df_alpha158.index.get_level_values(0).max()}\")\n",
+ "print(f\"Instruments: {df_alpha158.index.get_level_values(1).nunique()}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load HF factors\n",
+ "print(\"Loading hffactor features...\")\n",
+ "df_hf = load_cta_hffactors(\n",
+ " since_date=CONFIG['dt_range'][0],\n",
+ " end_date=CONFIG['dt_range'][1],\n",
+ ")\n",
+ "print(f\"hffactor shape: {df_hf.shape}\")\n",
+ "print(f\"\")\n",
+ "print(f\"Columns: {list(df_hf.columns[:10])}...\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 3. Load Returns (Labels)\n",
+ "\n",
+ "Check return indicators that will be used as prediction targets."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load return indicators\n",
+ "print(\"Loading return indicators...\")\n",
+ "df_returns = load_cta_returns(\n",
+ " since_date=CONFIG['dt_range'][0],\n",
+ " end_date=CONFIG['dt_range'][1],\n",
+ ")\n",
+ "print(f\"Returns shape: {df_returns.shape}\")\n",
+ "print(f\"\")\n",
+ "print(f\"Available return types:\")\n",
+ "for col in df_returns.columns:\n",
+ " print(f\" - {col}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Check specific return type\n",
+ "return_col = CONFIG['return_type']\n",
+ "if return_col in df_returns.columns:\n",
+ " print(f\"\\n{return_col} statistics:\")\n",
+ " print(df_returns[return_col].describe())\n",
+ " \n",
+ " # Plot distribution\n",
+ " fig, ax = plt.subplots(figsize=(10, 4))\n",
+ " df_returns[return_col].hist(bins=100, ax=ax, edgecolor='black')\n",
+ " ax.set_title(f'{return_col} Distribution')\n",
+ " ax.axvline(x=0, color='red', linestyle='--')\n",
+ " plt.show()\n",
+ "else:\n",
+ " print(f\"Warning: {return_col} not found in returns data\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 4. Load Full Dataset\n",
+ "\n",
+ "Load the complete training dataset with features and labels."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load full dataset\n",
+ "print(\"Loading full dataset...\")\n",
+ "df_full = load_dataset(\n",
+ " dt_range=CONFIG['dt_range'],\n",
+ " return_type=CONFIG['return_type'],\n",
+ " normalization=CONFIG['normalization'],\n",
+ " feature_sets=CONFIG['feature_sets'],\n",
+ ")\n",
+ "\n",
+ "print(f\"\\nFull dataset shape: {df_full.shape}\")\n",
+ "print(f\"\")\n",
+ "print(f\"Columns: {len(df_full.columns)} total\")\n",
+ "print(f\" - Features: {len([c for c in df_full.columns if c.startswith(('alpha158_', 'hf_'))])}\")\n",
+ "print(f\" - Label: 'label'\")\n",
+ "print(f\" - Weight: 'weight'\")\n",
+ "print(f\" - Return: 'return'\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Check for missing values\n",
+ "missing = df_full.isnull().sum()\n",
+ "missing_cols = missing[missing > 0]\n",
+ "\n",
+ "if len(missing_cols) > 0:\n",
+ " print(f\"\\nColumns with missing values:\")\n",
+ " print(missing_cols.head(10))\n",
+ "else:\n",
+ " print(\"\\nNo missing values found!\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+ "# Label statistics\n",
+ "print(\"\\nLabel statistics:\")\n",
+ "print(df_full['label'].describe())\n",
+ "\n",
+ "fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n",
+ "\n",
+ "# Distribution\n",
+ "df_full['label'].hist(bins=100, ax=axes[0], edgecolor='black')\n",
+ "axes[0].set_title('Label Distribution')\n",
+ "axes[0].axvline(x=0, color='red', linestyle='--')\n",
+ "\n",
+ "# Time series of mean label by date\n",
+ "label_by_date = df_full.groupby(level=0)['label'].mean()\n",
+ "axes[1].plot(label_by_date.index, label_by_date.values)\n",
+ "axes[1].set_title('Mean Label by Date')\n",
+ "axes[1].axhline(y=0, color='red', linestyle='--')\n",
+ "\n",
+ "plt.tight_layout()\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 5. Summary\n",
+ "\n",
+ "Check data availability by instrument and date."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Data availability heatmap\n",
+ "available = df_full.groupby([df_full.index.get_level_values(0).date, df_full.index.get_level_values(1)]).size().unstack(fill_value=0)\n",
+ "available = (available > 0).astype(int)\n",
+ "\n",
+ "print(f\"Data availability: {available.sum().sum()} instrument-date pairs\")\n",
+ "print(f\"Instruments: {len(available.columns)}\")\n",
+ "print(f\"Dates: {len(available.index)}\")\n",
+ "\n",
+ "# Plot coverage\n",
+ "fig, ax = plt.subplots(figsize=(14, 6))\n",
+ "im = ax.imshow(available.T.values, aspect='auto', cmap='RdYlGn', interpolation='nearest')\n",
+ "ax.set_title('Data Availability (Green=Available, Red=Missing)')\n",
+ "ax.set_xlabel('Time')\n",
+ "ax.set_ylabel('Instrument')\n",
+ "plt.colorbar(im, ax=ax)\n",
+ "plt.tight_layout()\n",
+ "plt.show()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python",
+ "version": "3.8.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/cta_1d/02_label_analysis.ipynb b/cta_1d/02_label_analysis.ipynb
new file mode 100644
index 0000000..ff6ac29
--- /dev/null
+++ b/cta_1d/02_label_analysis.ipynb
@@ -0,0 +1,319 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# CTA 1D Label Analysis\n",
+ "\n",
+ "Explore label distributions and compare different normalization blending strategies.\n",
+ "\n",
+ "**Purpose**: Understand how different normalization methods affect label distributions and identify optimal blending."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "\n",
+ "from qshare.data.pandas.cta_1d.dataset import load_features, load_label\n",
+ "from qshare.data.pandas.cta_1d.label import normalize_label_dual, normalize_label\n",
+ "from qshare.io.ddb.cta import load_cta_returns\n",
+ "\n",
+ "import sys\n",
+ "sys.path.insert(0, '../')\n",
+ "from common.plotting import setup_plot_style\n",
+ "from src.labels import BLEND_CONFIGS, get_blend_weights, describe_blend_config\n",
+ "\n",
+ "setup_plot_style()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1. Configuration"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "CONFIG = {\n",
+ " 'dt_range': ['2020-01-01', '2024-12-31'],\n",
+ " 'fit_range': ['2020-01-01', '2021-12-31'], # For zscore normalization\n",
+ " 'return_type': 'o2c_twap1min',\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2. Load Raw Returns\n",
+ "\n",
+ "Load the raw return series before any normalization."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load returns\n",
+ "print(\"Loading raw returns...\")\n",
+ "df_returns = load_cta_returns(\n",
+ " since_date=CONFIG['dt_range'][0],\n",
+ " end_date=CONFIG['dt_range'][1],\n",
+ ")\n",
+ "\n",
+ "return_col = CONFIG['return_type']\n",
+ "raw_returns = df_returns[return_col].copy()\n",
+ "\n",
+ "print(f\"\\nRaw {return_col} returns:\")\n",
+ "print(raw_returns.describe())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Plot raw return distribution\n",
+ "fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n",
+ "\n",
+ "# Histogram\n",
+ "raw_returns.hist(bins=100, ax=axes[0], edgecolor='black')\n",
+ "axes[0].set_title(f'Raw {return_col} Distribution')\n",
+ "axes[0].axvline(x=0, color='red', linestyle='--')\n",
+ "\n",
+ "# Time series\n",
+ "daily_mean = raw_returns.groupby(level=0).mean()\n",
+ "axes[1].plot(daily_mean.index, daily_mean.values)\n",
+ "axes[1].set_title('Daily Mean Return')\n",
+ "axes[1].axhline(y=0, color='red', linestyle='--')\n",
+ "\n",
+ "plt.tight_layout()\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 3. Compare Normalization Methods\n",
+ "\n",
+ "Apply each normalization method individually and compare."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load dominant contract mapping for proper label construction\n",
+ "from qshare.io.ddb.cta import load_cta_dominant_contracts\n",
+ "\n",
+ "print(\"Loading dominant contract mapping...\")\n",
+ "df_dominant = load_cta_dominant_contracts(\n",
+ " since_date=CONFIG['dt_range'][0],\n",
+ " end_date=CONFIG['dt_range'][1],\n",
+ ")\n",
+ "\n",
+ "# Merge returns with dominant mapping\n",
+ "df_merged = df_dominant.join(raw_returns, how='left')\n",
+ "\n",
+ "# Calculate different normalization methods\n",
+ "print(\"\\nApplying normalization methods...\")\n",
+ "\n",
+ "norm_results = {}\n",
+ "\n",
+ "# zscore (fit-time)\n",
+ "norm_results['zscore'] = normalize_label(\n",
+ " df_merged[return_col],\n",
+ " method='zscore',\n",
+ " fit_range=CONFIG['fit_range']\n",
+ ")\n",
+ "\n",
+ "# cs_zscore (cross-sectional)\n",
+ "norm_results['cs_zscore'] = df_merged.groupby(level=0)[return_col].apply(\n",
+ " lambda x: (x - x.mean()) / (x.std() + 1e-8)\n",
+ ")\n",
+ "\n",
+ "# rolling_20\n",
+ "norm_results['rolling_20'] = normalize_label(\n",
+ " df_merged[return_col],\n",
+ " method='rolling',\n",
+ " window=20\n",
+ ")\n",
+ "\n",
+ "# rolling_60\n",
+ "norm_results['rolling_60'] = normalize_label(\n",
+ " df_merged[return_col],\n",
+ " method='rolling',\n",
+ " window=60\n",
+ ")\n",
+ "\n",
+ "print(\"Done!\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Compare distributions\n",
+ "fig, axes = plt.subplots(2, 2, figsize=(14, 10))\n",
+ "axes = axes.flatten()\n",
+ "\n",
+ "for i, (method, series) in enumerate(norm_results.items()):\n",
+ " ax = axes[i]\n",
+ " series.dropna().hist(bins=100, ax=ax, edgecolor='black', alpha=0.7)\n",
+ " ax.set_title(f'{method}\\nmean={series.mean():.3f}, std={series.std():.3f}')\n",
+ " ax.axvline(x=0, color='red', linestyle='--')\n",
+ " ax.set_xlim(-5, 5) # Focus on main distribution\n",
+ "\n",
+ "plt.tight_layout()\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 4. Compare Blend Configurations\n",
+ "\n",
+ "Compare different blending strategies."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Apply each blend configuration\n",
+ "blend_results = {}\n",
+ "\n",
+ "for name in BLEND_CONFIGS.keys():\n",
+ " weights = get_blend_weights(name)\n",
+ " print(f\"\\nProcessing {name}: {weights}\")\n",
+ " \n",
+ " # Calculate blended label\n",
+ " blended = (\n",
+ " weights[0] * norm_results['zscore'] +\n",
+ " weights[1] * norm_results['cs_zscore'] +\n",
+ " weights[2] * norm_results['rolling_20'] +\n",
+ " weights[3] * norm_results['rolling_60']\n",
+ " )\n",
+ " \n",
+ " blend_results[name] = blended\n",
+ " print(f\" Mean: {blended.mean():.4f}, Std: {blended.std():.4f}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Visualize all blend distributions\n",
+ "fig, axes = plt.subplots(2, 3, figsize=(15, 10))\n",
+ "axes = axes.flatten()\n",
+ "\n",
+ "for i, (name, series) in enumerate(blend_results.items()):\n",
+ " ax = axes[i]\n",
+ " series.dropna().hist(bins=100, ax=ax, edgecolor='black', alpha=0.7)\n",
+ " weights = get_blend_weights(name)\n",
+ " ax.set_title(f'{name}\\nweights={weights}\\nmean={series.mean():.3f}, std={series.std():.3f}')\n",
+ " ax.axvline(x=0, color='red', linestyle='--')\n",
+ " ax.set_xlim(-5, 5)\n",
+ "\n",
+ "# Hide last subplot if not used\n",
+ "if len(blend_results) < 6:\n",
+ " axes[-1].axis('off')\n",
+ "\n",
+ "plt.tight_layout()\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 5. Correlation Analysis\n",
+ "\n",
+ "Check correlations between different normalization methods."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create comparison DataFrame\n",
+ "comparison_df = pd.DataFrame(norm_results)\n",
+ "\n",
+ "# Add raw returns\n",
+ "comparison_df['raw'] = df_merged[return_col]\n",
+ "\n",
+ "# Calculate correlation matrix\n",
+ "corr = comparison_df.corr()\n",
+ "\n",
+ "# Plot heatmap\n",
+ "fig, ax = plt.subplots(figsize=(8, 6))\n",
+ "sns.heatmap(corr, annot=True, cmap='RdBu_r', center=0,\n",
+ " vmin=-1, vmax=1, ax=ax)\n",
+ "ax.set_title('Correlation: Normalization Methods')\n",
+ "plt.tight_layout()\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+ "# Rolling correlation analysis\n",
+ "window = 60\n",
+ "\n",
+ "# Calculate rolling correlation between zscore and cs_zscore\n",
+ "rolling_corr = norm_results['zscore'].rolling(window).corr(norm_results['cs_zscore'])\n",
+ "\n",
+ "fig, ax = plt.subplots(figsize=(14, 4))\n",
+ "ax.plot(rolling_corr.index.get_level_values(0).unique(), rolling_corr.groupby(level=0).mean())\n",
+ "ax.set_title(f'Rolling Correlation: zscore vs cs_zscore ({window}d window)')\n",
+ "ax.axhline(y=0.5, color='red', linestyle='--', alpha=0.5)\n",
+ "ax.set_ylim(-1, 1)\n",
+ "plt.tight_layout()\n",
+ "plt.show()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python",
+ "version": "3.8.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
\ No newline at end of file
diff --git a/cta_1d/03_baseline_xgb.ipynb b/cta_1d/03_baseline_xgb.ipynb
new file mode 100644
index 0000000..d77d3ca
--- /dev/null
+++ b/cta_1d/03_baseline_xgb.ipynb
@@ -0,0 +1,356 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# CTA 1D Baseline XGBoost Model\n",
+ "\n",
+ "Train and evaluate a baseline XGBoost model for CTA 1-day return prediction.\n",
+ "\n",
+ "**Purpose**: Establish a baseline performance benchmark with standard configuration."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "import json\n",
+ "from datetime import datetime\n",
+ "\n",
+ "from qshare.data.pandas.cta_1d import load_dataset\n",
+ "from qshare.algo.learning.cta_trainer import CTAXGBTrainer\n",
+ "from qshare.eval.cta.backtest import CTABacktester\n",
+ "\n",
+ "import sys\n",
+ "sys.path.insert(0, '../')\n",
+ "from common.plotting import setup_plot_style, plot_ic_series, plot_cumulative_returns\n",
+ "from common.paths import create_experiment_dir\n",
+ "from src.labels import get_blend_weights, describe_blend_config\n",
+ "\n",
+ "setup_plot_style()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1. Configuration\n",
+ "\n",
+ "Edit this cell to modify experiment parameters."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "CONFIG = {\n",
+ " # Experiment\n",
+ " 'experiment_name': 'baseline_xgb', # Will be appended with timestamp\n",
+ " \n",
+ " # Date ranges\n",
+ " 'dt_range': ['2020-01-01', '2024-12-31'],\n",
+ " 'train_range': ['2020-01-01', '2022-12-31'],\n",
+ " 'test_range': ['2023-01-01', '2024-12-31'],\n",
+ " 'fit_range': ['2020-01-01', '2021-06-30'], # For normalization fitting\n",
+ " \n",
+ " # Data\n",
+ " 'feature_sets': ['alpha158', 'hffactor'],\n",
+ " 'return_type': 'o2c_twap1min',\n",
+ " 'normalization': 'dual',\n",
+ " 'blend_weights': None, # Use default [0.2, 0.1, 0.3, 0.4] or specify name/list\n",
+ " 'weight_factors': {'positive': 1.0, 'negative': 2.0},\n",
+ " \n",
+ " # Model\n",
+ " 'xgb_params': {\n",
+ " 'booster': 'gblinear',\n",
+ " 'eta': 0.5,\n",
+ " 'lambda_reg': 0.1,\n",
+ " 'num_round': 20,\n",
+ " },\n",
+ " \n",
+ " # Backtest\n",
+ " 'backtest_params': {\n",
+ " 'num_trades': 4,\n",
+ " 'signal_dist': 'normal',\n",
+ " 'pos_weight': True,\n",
+ " },\n",
+ " \n",
+ " # Output\n",
+ " 'save_results': True,\n",
+ "}\n",
+ "\n",
+ "print(\"Configuration:\")\n",
+ "print(f\" Experiment: {CONFIG['experiment_name']}\")\n",
+ "print(f\" Train: {CONFIG['train_range'][0]} to {CONFIG['train_range'][1]}\")\n",
+ "print(f\" Test: {CONFIG['test_range'][0]} to {CONFIG['test_range'][1]}\")\n",
+ "print(f\" Blend: {describe_blend_config(CONFIG['blend_weights'] or 'default')}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2. Load Dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(\"Loading dataset...\")\n",
+ "df_full = load_dataset(\n",
+ " dt_range=CONFIG['dt_range'],\n",
+ " return_type=CONFIG['return_type'],\n",
+ " normalization=CONFIG['normalization'],\n",
+ " feature_sets=CONFIG['feature_sets'],\n",
+ " fit_range=CONFIG['fit_range'],\n",
+ " weight_factors=CONFIG['weight_factors'],\n",
+ " blend_weights=CONFIG['blend_weights'],\n",
+ ")\n",
+ "\n",
+ "print(f\"\\nDataset shape: {df_full.shape}\")\n",
+ "print(f\"Columns: {len(df_full.columns)}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Split train/test\n",
+ "df_train = df_full.loc[CONFIG['train_range'][0]:CONFIG['train_range'][1]]\n",
+ "df_test = df_full.loc[CONFIG['test_range'][0]:CONFIG['test_range'][1]]\n",
+ "\n",
+ "print(f\"Train: {df_train.shape}\")\n",
+ "print(f\"Test: {df_test.shape}\")\n",
+ "\n",
+ "# Get feature columns\n",
+ "feature_cols = [c for c in df_train.columns\n",
+ " if c.startswith(('alpha158_', 'hf_', 'f_'))]\n",
+ "print(f\"\\nFeatures: {len(feature_cols)}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 3. Train Model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(\"Training XGBoost model...\")\n",
+ "print(f\" Params: {CONFIG['xgb_params']}\")\n",
+ "\n",
+ "trainer = CTAXGBTrainer(**CONFIG['xgb_params'])\n",
+ "\n",
+ "trainer.fit(\n",
+ " df_train,\n",
+ " feature_cols=feature_cols,\n",
+ " target_col='label',\n",
+ " weight_col='weight'\n",
+ ")\n",
+ "\n",
+ "print(\"\\nTraining complete!\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Feature importance\n",
+ "importance = trainer.get_feature_importance()\n",
+ "print(\"\\nTop 10 Features:\")\n",
+ "print(importance.head(10))\n",
+ "\n",
+ "# Plot\n",
+ "fig, ax = plt.subplots(figsize=(10, 6))\n",
+ "importance.head(20).plot(kind='barh', ax=ax)\n",
+ "ax.set_title('Top 20 Feature Importance')\n",
+ "plt.tight_layout()\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 4. Generate Predictions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(\"Generating predictions on test set...\")\n",
+ "df_signal = trainer.predict(df_test)\n",
+ "\n",
+ "print(f\"\\nSignal statistics:\")\n",
+ "print(df_signal.describe())\n",
+ "\n",
+ "# Plot signal distribution\n",
+ "fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n",
+ "\n",
+ "df_signal.hist(bins=100, ax=axes[0], edgecolor='black')\n",
+ "axes[0].set_title('Signal Distribution')\n",
+ "axes[0].axvline(x=0, color='red', linestyle='--')\n",
+ "\n",
+ "signal_by_date = df_signal.groupby(level=0).mean()\n",
+ "axes[1].plot(signal_by_date.index, signal_by_date.values)\n",
+ "axes[1].set_title('Mean Signal by Date')\n",
+ "axes[1].axhline(y=0, color='red', linestyle='--')\n",
+ "\n",
+ "plt.tight_layout()\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 5. Evaluate with Backtest"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(\"Running backtest...\")\n",
+ "\n",
+ "returns = df_test['return'] if 'return' in df_test.columns else df_test['label']\n",
+ "\n",
+ "backtester = CTABacktester(**CONFIG['backtest_params'])\n",
+ "results = backtester.run(returns, df_signal)\n",
+ "\n",
+ "summary = backtester.summary()\n",
+ "print(\"\\nBacktest Summary:\")\n",
+ "for key, value in summary.items():\n",
+ " if isinstance(value, float):\n",
+ " print(f\" {key}: {value:.4f}\")\n",
+ " else:\n",
+ " print(f\" {key}: {value}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# IC Analysis\n",
+ "ic_by_date = results.groupby(results.index.get_level_values(0))['ic'].mean()\n",
+ "\n",
+ "fig = plot_ic_series(ic_by_date, title=\"IC Over Time (Test Set)\")\n",
+ "plt.show()\n",
+ "\n",
+ "print(f\"\\nIC Statistics:\")\n",
+ "print(f\" Mean: {ic_by_date.mean():.4f}\")\n",
+ "print(f\" Std: {ic_by_date.std():.4f}\")\n",
+ "print(f\" IR: {ic_by_date.mean() / ic_by_date.std():.4f}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Cumulative returns\n",
+ "daily_returns = results.groupby(results.index.get_level_values(0))['pos_return'].mean()\n",
+ "\n",
+ "fig = plot_cumulative_returns(daily_returns, title=\"Cumulative Strategy Returns\")\n",
+ "plt.show()\n",
+ "\n",
+ "total_return = (1 + daily_returns).prod() - 1\n",
+ "annual_return = (1 + total_return) ** (252 / len(daily_returns)) - 1\n",
+ "sharpe = daily_returns.mean() / daily_returns.std() * np.sqrt(252)\n",
+ "\n",
+ "print(f\"\\nReturn Statistics:\")\n",
+ "print(f\" Total Return: {total_return:.2%}\")\n",
+ "print(f\" Annual Return: {annual_return:.2%}\")\n",
+ "print(f\" Sharpe Ratio: {sharpe:.2f}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 6. Save Results\n",
+ "\n",
+ "Save model, predictions, and metrics for later analysis."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "if CONFIG['save_results']:\n",
+ " # Create output directory\n",
+ " output_dir = create_experiment_dir('cta_1d', CONFIG['experiment_name'])\n",
+ " print(f\"Saving results to: {output_dir}\")\n",
+ " \n",
+ " # Save config\n",
+ " with open(output_dir / 'config.json', 'w') as f:\n",
+ " json.dump(CONFIG, f, indent=2, default=str)\n",
+ " \n",
+ " # Save model\n",
+ " trainer.save_model(str(output_dir / 'model.pkl'))\n",
+ " \n",
+ " # Save feature importance\n",
+ " importance.to_csv(output_dir / 'feature_importance.csv')\n",
+ " \n",
+ " # Save predictions\n",
+ " df_signal.to_csv(output_dir / 'predictions.csv')\n",
+ " \n",
+ " # Save backtest results\n",
+ " results.to_csv(output_dir / 'backtest_results.csv')\n",
+ " \n",
+ " # Save summary\n",
+ " with open(output_dir / 'summary.json', 'w') as f:\n",
+ " json.dump(summary, f, indent=2, default=str)\n",
+ " \n",
+ " print(\"\\nFiles saved:\")\n",
+ " for f in output_dir.iterdir():\n",
+ " print(f\" - {f.name}\")\n",
+ "else:\n",
+ " print(\"Results not saved (save_results=False)\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python",
+ "version": "3.8.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
\ No newline at end of file
diff --git a/cta_1d/04_blend_comparison.ipynb b/cta_1d/04_blend_comparison.ipynb
new file mode 100644
index 0000000..220a535
--- /dev/null
+++ b/cta_1d/04_blend_comparison.ipynb
@@ -0,0 +1,439 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# CTA 1D Blend Comparison\n",
+ "\n",
+ "Compare model performance across different label blending configurations.\n",
+ "\n",
+ "**Purpose**: Identify the optimal normalization blend for the CTA 1-day prediction task."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "\n",
+ "from qshare.data.pandas.cta_1d import load_dataset\n",
+ "from qshare.algo.learning.cta_trainer import CTAXGBTrainer\n",
+ "from qshare.eval.cta.backtest import CTABacktester\n",
+ "\n",
+ "import sys\n",
+ "sys.path.insert(0, '../')\n",
+ "from common.plotting import setup_plot_style, plot_ic_series\n",
+ "from src.labels import BLEND_CONFIGS, get_blend_weights\n",
+ "\n",
+ "setup_plot_style()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1. Configuration\n",
+ "\n",
+ "Define base configuration shared across all blend experiments."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "BASE_CONFIG = {\n",
+ " # Date ranges\n",
+ " 'dt_range': ['2020-01-01', '2024-12-31'],\n",
+ " 'train_range': ['2020-01-01', '2022-12-31'],\n",
+ " 'test_range': ['2023-01-01', '2024-12-31'],\n",
+ " 'fit_range': ['2020-01-01', '2021-06-30'],\n",
+ " \n",
+ " # Data\n",
+ " 'feature_sets': ['alpha158', 'hffactor'],\n",
+ " 'return_type': 'o2c_twap1min',\n",
+ " 'normalization': 'dual',\n",
+ " 'weight_factors': {'positive': 1.0, 'negative': 2.0},\n",
+ " \n",
+ " # Model (fixed for fair comparison)\n",
+ " 'xgb_params': {\n",
+ " 'booster': 'gblinear',\n",
+ " 'eta': 0.5,\n",
+ " 'lambda_reg': 0.1,\n",
+ " 'num_round': 20,\n",
+ " },\n",
+ " \n",
+ " # Backtest\n",
+ " 'backtest_params': {\n",
+ " 'num_trades': 4,\n",
+ " 'signal_dist': 'normal',\n",
+ " 'pos_weight': True,\n",
+ " },\n",
+ "}\n",
+ "\n",
+ "print(\"Blend configurations to compare:\")\n",
+ "for name, weights in BLEND_CONFIGS.items():\n",
+ " print(f\" {name}: {weights}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2. Run Experiments\n",
+ "\n",
+ "Train and evaluate a model for each blend configuration."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def run_single_experiment(blend_name, blend_weights):\n",
+ " \"\"\"Run experiment with specific blend configuration.\"\"\"\n",
+ " print(f\"\\n{'='*60}\")\n",
+ " print(f\"Running: {blend_name}\")\n",
+ " print(f\"Weights: {blend_weights}\")\n",
+ " print(f\"{'='*60}\")\n",
+ " \n",
+ " # Load data\n",
+ " df_full = load_dataset(\n",
+ " dt_range=BASE_CONFIG['dt_range'],\n",
+ " return_type=BASE_CONFIG['return_type'],\n",
+ " normalization=BASE_CONFIG['normalization'],\n",
+ " feature_sets=BASE_CONFIG['feature_sets'],\n",
+ " fit_range=BASE_CONFIG['fit_range'],\n",
+ " weight_factors=BASE_CONFIG['weight_factors'],\n",
+ " blend_weights=blend_weights,\n",
+ " )\n",
+ " \n",
+ " # Split\n",
+ " df_train = df_full.loc[BASE_CONFIG['train_range'][0]:BASE_CONFIG['train_range'][1]]\n",
+ " df_test = df_full.loc[BASE_CONFIG['test_range'][0]:BASE_CONFIG['test_range'][1]]\n",
+ " \n",
+ " # Features\n",
+ " feature_cols = [c for c in df_train.columns\n",
+ " if c.startswith(('alpha158_', 'hf_', 'f_'))]\n",
+ " \n",
+ " # Train\n",
+ " trainer = CTAXGBTrainer(**BASE_CONFIG['xgb_params'])\n",
+ " trainer.fit(\n",
+ " df_train,\n",
+ " feature_cols=feature_cols,\n",
+ " target_col='label',\n",
+ " weight_col='weight'\n",
+ " )\n",
+ " \n",
+ " # Predict\n",
+ " df_signal = trainer.predict(df_test)\n",
+ " \n",
+ " # Backtest\n",
+ " returns = df_test['return'] if 'return' in df_test.columns else df_test['label']\n",
+ " backtester = CTABacktester(**BASE_CONFIG['backtest_params'])\n",
+ " results = backtester.run(returns, df_signal)\n",
+ " \n",
+ " # Metrics\n",
+ " summary = backtester.summary()\n",
+ " ic_by_date = results.groupby(results.index.get_level_values(0))['ic'].mean()\n",
+ " \n",
+ " return {\n",
+ " 'name': blend_name,\n",
+ " 'weights': blend_weights,\n",
+ " 'summary': summary,\n",
+ " 'ic_by_date': ic_by_date,\n",
+ " 'results': results,\n",
+ " 'importance': trainer.get_feature_importance(),\n",
+ " }\n",
+ "\n",
+ "# Run all experiments\n",
+ "all_results = []\n",
+ "for name, weights in BLEND_CONFIGS.items():\n",
+ "    result = run_single_experiment(name, weights)\n",
+ " all_results.append(result)\n",
+ " \n",
+ "print(\"\\n\\nAll experiments complete!\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 3. Results Summary"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create comparison table\n",
+ "comparison_data = []\n",
+ "for r in all_results:\n",
+ " ic_mean = r['ic_by_date'].mean()\n",
+ " ic_std = r['ic_by_date'].std()\n",
+ " comparison_data.append({\n",
+ " 'Blend': r['name'],\n",
+ " 'Weights': str(r['weights']),\n",
+ " 'IC Mean': ic_mean,\n",
+ " 'IC Std': ic_std,\n",
+ " 'IR': ic_mean / ic_std if ic_std > 0 else 0,\n",
+ " 'Return': r['summary'].get('return', np.nan),\n",
+ " 'Sharpe': r['summary'].get('sharpe', np.nan),\n",
+ " 'Turnover': r['summary'].get('turnover', np.nan),\n",
+ " })\n",
+ "\n",
+ "df_comparison = pd.DataFrame(comparison_data)\n",
+ "\n",
+ "# Sort by IC Mean\n",
+ "df_comparison = df_comparison.sort_values('IC Mean', ascending=False)\n",
+ "\n",
+ "print(\"Comparison Summary (sorted by IC Mean):\")\n",
+ "print(df_comparison.to_string(index=False))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Visual comparison\n",
+ "fig, axes = plt.subplots(2, 2, figsize=(14, 10))\n",
+ "\n",
+ "# IC Mean\n",
+ "axes[0, 0].barh(df_comparison['Blend'], df_comparison['IC Mean'])\n",
+ "axes[0, 0].set_title('IC Mean')\n",
+ "axes[0, 0].axvline(x=0, color='red', linestyle='--', alpha=0.5)\n",
+ "\n",
+ "# Information Ratio\n",
+ "axes[0, 1].barh(df_comparison['Blend'], df_comparison['IR'])\n",
+ "axes[0, 1].set_title('Information Ratio')\n",
+ "axes[0, 1].axvline(x=0, color='red', linestyle='--', alpha=0.5)\n",
+ "\n",
+ "# Return\n",
+ "axes[1, 0].barh(df_comparison['Blend'], df_comparison['Return'])\n",
+ "axes[1, 0].set_title('Return')\n",
+ "axes[1, 0].axvline(x=0, color='red', linestyle='--', alpha=0.5)\n",
+ "\n",
+ "# Sharpe\n",
+ "axes[1, 1].barh(df_comparison['Blend'], df_comparison['Sharpe'])\n",
+ "axes[1, 1].set_title('Sharpe Ratio')\n",
+ "axes[1, 1].axvline(x=0, color='red', linestyle='--', alpha=0.5)\n",
+ "\n",
+ "plt.tight_layout()\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 4. IC Time Series Comparison"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Plot IC series for all configurations\n",
+ "fig, ax = plt.subplots(figsize=(16, 6))\n",
+ "\n",
+ "for r in all_results:\n",
+ " ic_rolling = r['ic_by_date'].rolling(20, min_periods=5).mean()\n",
+ " ax.plot(ic_rolling.index, ic_rolling.values, label=r['name'], alpha=0.8)\n",
+ "\n",
+ "ax.axhline(y=0, color='black', linestyle='-', alpha=0.3)\n",
+ "ax.set_title('Rolling IC Comparison (20-day MA)')\n",
+ "ax.set_xlabel('Date')\n",
+ "ax.set_ylabel('Information Coefficient')\n",
+ "ax.legend(loc='upper right')\n",
+ "plt.tight_layout()\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 5. Feature Importance Comparison"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Get top features from each blend\n",
+ "n_top = 10\n",
+ "top_features_by_blend = {}\n",
+ "\n",
+ "for r in all_results:\n",
+ " top_features_by_blend[r['name']] = set(r['importance'].head(n_top).index)\n",
+ "\n",
+ "# Find common features across all blends\n",
+ "common_features = set.intersection(*top_features_by_blend.values())\n",
+ "print(f\"\\nCommon top-{n_top} features across all blends:\")\n",
+ "for f in sorted(common_features):\n",
+ " print(f\" - {f}\")\n",
+ "\n",
+ "# Find unique features per blend\n",
+ "print(\"\\nUnique top features by blend:\")\n",
+ "for name, features in top_features_by_blend.items():\n",
+ "    unique = features - set.union(*(v for k, v in top_features_by_blend.items() if k != name))\n",
+ " if unique:\n",
+ " print(f\"\\n {name}:\")\n",
+ " for f in sorted(unique):\n",
+ " print(f\" - {f}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Heatmap of top feature importance across blends\n",
+ "all_top_features = set.union(*top_features_by_blend.values())\n",
+ "\n",
+ "importance_matrix = []\n",
+ "for r in all_results:\n",
+ " row = []\n",
+ " for f in sorted(all_top_features):\n",
+ " if f in r['importance'].index:\n",
+ " row.append(r['importance'].loc[f, 'importance'])\n",
+ " else:\n",
+ " row.append(0)\n",
+ " importance_matrix.append(row)\n",
+ "\n",
+ "df_importance = pd.DataFrame(\n",
+ " importance_matrix,\n",
+ " index=[r['name'] for r in all_results],\n",
+ " columns=sorted(all_top_features)\n",
+ ")\n",
+ "\n",
+ "fig, ax = plt.subplots(figsize=(14, 6))\n",
+ "sns.heatmap(df_importance, cmap='YlOrRd', ax=ax, cbar_kws={'label': 'Importance'})\n",
+ "ax.set_title('Feature Importance Comparison Across Blends')\n",
+ "ax.set_xlabel('Features')\n",
+ "ax.set_ylabel('Blend Configuration')\n",
+ "plt.xticks(rotation=45, ha='right')\n",
+ "plt.tight_layout()\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 6. Custom Weight Exploration\n",
+ "\n",
+ "Test custom blend weights."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Define custom weights to test\n",
+ "CUSTOM_WEIGHTS = [\n",
+ " [0.0, 0.0, 0.5, 0.5], # Only rolling\n",
+ " [0.3, 0.3, 0.2, 0.2], # Fit-time heavy\n",
+ " [0.1, 0.4, 0.25, 0.25], # CS heavy + balanced rolling\n",
+ "]\n",
+ "\n",
+ "custom_results = []\n",
+ "for i, weights in enumerate(CUSTOM_WEIGHTS):\n",
+ " result = run_single_experiment(f'custom_{i+1}', weights)\n",
+ " custom_results.append(result)\n",
+ "\n",
+ "print(\"\\n\\nCustom weights experiments complete!\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Compare custom with standard\n",
+ "all_comparison_data = comparison_data.copy()\n",
+ "\n",
+ "for r in custom_results:\n",
+ " ic_mean = r['ic_by_date'].mean()\n",
+ " ic_std = r['ic_by_date'].std()\n",
+ " all_comparison_data.append({\n",
+ " 'Blend': r['name'],\n",
+ " 'Weights': str(r['weights']),\n",
+ " 'IC Mean': ic_mean,\n",
+ " 'IC Std': ic_std,\n",
+ " 'IR': ic_mean / ic_std if ic_std > 0 else 0,\n",
+ " 'Return': r['summary'].get('return', np.nan),\n",
+ " 'Sharpe': r['summary'].get('sharpe', np.nan),\n",
+ " 'Turnover': r['summary'].get('turnover', np.nan),\n",
+ " })\n",
+ "\n",
+ "df_all = pd.DataFrame(all_comparison_data)\n",
+ "df_all = df_all.sort_values('IC Mean', ascending=False)\n",
+ "\n",
+ "print(\"All Results (standard + custom):\")\n",
+ "print(df_all.to_string(index=False))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 7. Conclusion\n",
+ "\n",
+ "Summarize findings and recommend best blend configuration."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Best configuration\n",
+ "best = df_comparison.iloc[0]\n",
+ "print(\"Recommended Blend Configuration:\")\n",
+ "print(f\" Name: {best['Blend']}\")\n",
+ "print(f\" Weights: {best['Weights']}\")\n",
+ "print(f\"\\nPerformance:\")\n",
+ "print(f\" IC Mean: {best['IC Mean']:.4f}\")\n",
+ "print(f\" IC Std: {best['IC Std']:.4f}\")\n",
+ "print(f\" IR: {best['IR']:.4f}\")\n",
+ "print(f\" Return: {best['Return']:.4f}\")\n",
+ "print(f\" Sharpe: {best['Sharpe']:.4f}\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python",
+ "version": "3.8.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
\ No newline at end of file
diff --git a/cta_1d/README.md b/cta_1d/README.md
new file mode 100644
index 0000000..664105a
--- /dev/null
+++ b/cta_1d/README.md
@@ -0,0 +1,36 @@
+# CTA 1-Day Return Prediction
+
+Experiments for predicting CTA (Commodity Trading Advisor) futures 1-day returns.
+
+## Data
+
+- **Features**: alpha158, hffactor
+- **Labels**: Return indicators (o2c_twap1min, o2o_twap1min, etc.)
+- **Normalization**: dual (blend of zscore, cs_zscore, rolling_20, rolling_60)
+
+## Notebooks
+
+| Notebook | Purpose |
+|----------|---------|
+| `01_data_check.ipynb` | Load and validate CTA data |
+| `02_label_analysis.ipynb` | Explore label distributions and blending |
+| `03_baseline_xgb.ipynb` | Train baseline XGBoost model |
+| `04_blend_comparison.ipynb` | Compare different normalization blends |
+
+## Blend Configurations
+
+The label blending combines 4 normalization methods:
+- **zscore**: Fit-time mean/std normalization
+- **cs_zscore**: Cross-sectional z-score per datetime
+- **rolling_20**: 20-day rolling window normalization
+- **rolling_60**: 60-day rolling window normalization
+
+Predefined weights (from qshare.config.research.cta.labels):
+- `equal`: [0.25, 0.25, 0.25, 0.25]
+- `zscore_heavy`: [0.5, 0.2, 0.15, 0.15]
+- `rolling_heavy`: [0.1, 0.1, 0.3, 0.5]
+- `cs_heavy`: [0.2, 0.5, 0.15, 0.15]
+- `short_term`: [0.1, 0.1, 0.4, 0.4]
+- `long_term`: [0.4, 0.2, 0.2, 0.2]
+
+Default: [0.2, 0.1, 0.3, 0.4]
diff --git a/cta_1d/src/__init__.py b/cta_1d/src/__init__.py
new file mode 100644
index 0000000..615b18d
--- /dev/null
+++ b/cta_1d/src/__init__.py
@@ -0,0 +1,5 @@
+"""CTA 1-day task-specific utilities."""
+
+# Re-export the public label-blending helpers so notebooks can write
+# `from src.labels import ...` or `from src import ...` interchangeably.
+from .labels import get_blend_weights, describe_blend_config
+
+__all__ = ['get_blend_weights', 'describe_blend_config']
diff --git a/cta_1d/src/labels.py b/cta_1d/src/labels.py
new file mode 100644
index 0000000..f43a44e
--- /dev/null
+++ b/cta_1d/src/labels.py
@@ -0,0 +1,63 @@
+"""Label blending utilities for CTA experiments."""
+
+from typing import Union, List
+
+
+# Predefined blend configurations.
+# Each value is a list of 4 weights, in the order
+# [zscore, cs_zscore, rolling_20, rolling_60] (see `describe_blend_config`).
+BLEND_CONFIGS = {
+    'equal': [0.25, 0.25, 0.25, 0.25],
+    'zscore_heavy': [0.5, 0.2, 0.15, 0.15],
+    'rolling_heavy': [0.1, 0.1, 0.3, 0.5],
+    'cs_heavy': [0.2, 0.5, 0.15, 0.15],
+    'short_term': [0.1, 0.1, 0.4, 0.4],
+    'long_term': [0.4, 0.2, 0.2, 0.2],
+}
+
+# Blend used when the caller passes no explicit weights.
+DEFAULT_BLEND = [0.2, 0.1, 0.3, 0.4]  # [zscore, cs_zscore, roll20, roll60]
+
+
+def get_blend_weights(weights: Union[str, List[float], None]) -> List[float]:
+ """Resolve blend weights from string name or list.
+
+ Args:
+ weights: Config name, list of 4 floats, or None for default
+
+ Returns:
+ List of 4 weights summing to 1.0
+ """
+ if weights is None:
+ return DEFAULT_BLEND
+
+ if isinstance(weights, str):
+ if weights not in BLEND_CONFIGS:
+ raise ValueError(f"Unknown blend config: {weights}. "
+ f"Available: {list(BLEND_CONFIGS.keys())}")
+ return BLEND_CONFIGS[weights]
+
+ if isinstance(weights, (list, tuple)):
+ if len(weights) != 4:
+ raise ValueError(f"Blend weights must have 4 values, got {len(weights)}")
+ if abs(sum(weights) - 1.0) > 1e-6:
+ raise ValueError(f"Blend weights must sum to 1.0, got {sum(weights)}")
+ return list(weights)
+
+ raise ValueError(f"Invalid blend weights type: {type(weights)}")
+
+
+def describe_blend_config(weights: Union[str, List[float]]) -> str:
+ """Get human-readable description of blend config.
+
+ Args:
+ weights: Config name or list of weights
+
+ Returns:
+ Description string
+ """
+ names = ['zscore', 'cs_zscore', 'rolling_20', 'rolling_60']
+
+ if isinstance(weights, str):
+ w = get_blend_weights(weights)
+ return f"{weights}: {dict(zip(names, w))}"
+
+ w = weights
+ return f"custom: {dict(zip(names, w))}"
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..a518866
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,22 @@
+# Alpha Lab - Experiment dependencies
+# The qshare library is already installed in the virtual environment
+
+# Jupyter and visualization
+jupyter>=7.0.0
+matplotlib>=3.7.0
+seaborn>=0.12.0
+plotly>=5.18.0
+
+# Data processing
+pandas>=2.0.0
+numpy>=1.24.0
+polars>=0.20.0
+pyarrow>=14.0.0
+
+# Machine learning
+xgboost>=2.0.0
+scikit-learn>=1.3.0
+
+# Utilities
+tqdm>=4.65.0
+python-dotenv>=1.0.0
diff --git a/stock_15m/01_data_exploration.ipynb b/stock_15m/01_data_exploration.ipynb
new file mode 100644
index 0000000..c9726c5
--- /dev/null
+++ b/stock_15m/01_data_exploration.ipynb
@@ -0,0 +1,810 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Stock 15m Data Exploration\n",
+ "\n",
+ "Load and explore 15-minute return prediction data.\n",
+ "\n",
+ "**Purpose**: Understand data structure, check data quality, and visualize key statistics."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import polars as pl\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "from qshare.data.polars.ret15m import load_dataset, calculate_weights\n",
+ "from qshare.io.polars import load_from_pq\n",
+ "\n",
+ "import sys\n",
+ "sys.path.insert(0, '../')\n",
+ "from common.plotting import setup_plot_style\n",
+ "\n",
+ "setup_plot_style()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1. Configuration\n",
+ "\n",
+ "Define data paths and parameters."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "CONFIG = {\n",
+ " # Data paths (adjust as needed)\n",
+ " 'path_a158': '/data/parquet/stock_1min_alpha158',\n",
+ " 'path_kline': '/data/parquet/stock_1min',\n",
+ " 'path_kline_daily': '/data/parquet/stock_1day',\n",
+ " 'path_industry': '/data/parquet/industry_idx',\n",
+ " \n",
+ " # Date range\n",
+ " 'dt_range': ['2022-01-01', '2024-12-31'],\n",
+ " \n",
+ " # Normalization mode\n",
+ " 'normalization_mode': 'dual', # 'industry', 'cs_zscore', or 'dual'\n",
+ " \n",
+ " # Sample weights\n",
+ " 'positive_factor': 1.0,\n",
+ " 'negative_factor': 2.0,\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2. Load Raw Data\n",
+ "\n",
+ "Load data as Polars lazy frames first."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load data sources\n",
+ "print(\"Loading data sources...\")\n",
+ "\n",
+ "pl_ldf_a158 = load_from_pq(\n",
+ " path=CONFIG['path_a158'],\n",
+ " table_alias=\"a158\",\n",
+ " start_time=CONFIG['dt_range'][0],\n",
+ " as_struct=True\n",
+ ")\n",
+ "\n",
+ "pl_ldf_kline = load_from_pq(\n",
+ " path=CONFIG['path_kline'],\n",
+ " table_alias=\"kline_1min\",\n",
+ " start_time=CONFIG['dt_range'][0],\n",
+ " as_struct=True\n",
+ ")\n",
+ "\n",
+ "pl_ldf_kline_daily = load_from_pq(\n",
+ " path=CONFIG['path_kline_daily'],\n",
+ " table_alias=\"kline_1day\",\n",
+ " start_time=CONFIG['dt_range'][0],\n",
+ ")\n",
+ "\n",
+ "pl_ldf_industry = load_from_pq(\n",
+ " path=CONFIG['path_industry'],\n",
+ " table_alias=\"indus_idx\",\n",
+ " start_time=CONFIG['dt_range'][0],\n",
+ ")\n",
+ "\n",
+ "print(\"Data sources loaded as lazy frames\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Check schemas\n",
+ "print(\"Alpha158 schema:\")\n",
+ "print(pl_ldf_a158.schema)\n",
+ "\n",
+ "print(\"\\nKline 1min schema:\")\n",
+ "print(pl_ldf_kline.schema)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 3. Load Training Dataset\n",
+ "\n",
+ "Use qshare's load_dataset to construct the full training data."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(\"Loading training dataset...\")\n",
+ "print(f\" Date range: {CONFIG['dt_range']}\")\n",
+ "print(f\" Normalization: {CONFIG['normalization_mode']}\")\n",
+ "\n",
+ "pl_df_train = load_dataset(\n",
+ " pl_ldf_a158_1min=pl_ldf_a158,\n",
+ " pl_ldf_kline_1min=pl_ldf_kline,\n",
+ " pl_ldf_kline_1day=pl_ldf_kline_daily,\n",
+ " pl_ldf_indus_idx=pl_ldf_industry,\n",
+ " dt_range=CONFIG['dt_range'],\n",
+ " normalization_mode=CONFIG['normalization_mode'],\n",
+ " negative_factor=CONFIG['negative_factor'],\n",
+ " positive_factor=CONFIG['positive_factor'],\n",
+ ")\n",
+ "\n",
+ "# Convert to pandas for easier exploration\n",
+ "df_train = pl_df_train.to_pandas()\n",
+ "\n",
+ "print(f\"\\nDataset shape: {df_train.shape}\")\n",
+ "print(f\"Columns: {len(df_train.columns)}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Check column types\n",
+ "feature_cols = [c for c in df_train.columns if c.startswith('alpha158_')]\n",
+ "print(f\"\\nAlpha158 features: {len(feature_cols)}\")\n",
+ "print(f\" Example: {feature_cols[:5]}\")\n",
+ "\n",
+ "print(f\"\\nTarget column: {[c for c in df_train.columns if 'return' in c.lower()]}\")\n",
+ "print(f\"Weight column: {[c for c in df_train.columns if 'weight' in c.lower()]}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 4. Data Quality Check"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Missing values\n",
+ "missing = df_train.isnull().sum()\n",
+ "missing_pct = missing / len(df_train) * 100\n",
+ "\n",
+ "print(\"Missing values:\")\n",
+ "print(f\" Columns with missing: {(missing > 0).sum()}\")\n",
+ "if (missing > 0).sum() > 0:\n",
+ " print(\"\\nTop columns by missing %:\")\n",
+ " print(missing_pct[missing_pct > 0].sort_values(ascending=False).head(10))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Data coverage by date\n",
+ "df_train['datetime'] = pd.to_datetime(df_train.index.get_level_values(0))\n",
+ "df_train['instrument'] = df_train.index.get_level_values(1)\n",
+ "\n",
+ "daily_counts = df_train.groupby('datetime')['instrument'].nunique()\n",
+ "\n",
+ "fig, ax = plt.subplots(figsize=(14, 4))\n",
+ "daily_counts.plot(ax=ax)\n",
+ "ax.set_title('Number of Instruments per Day')\n",
+ "ax.set_xlabel('Date')\n",
+ "ax.set_ylabel('Instrument Count')\n",
+ "plt.tight_layout()\n",
+ "plt.show()\n",
+ "\n",
+ "print(f\"\\nInstruments per day: {daily_counts.mean():.0f} avg, {daily_counts.min()}-{daily_counts.max()} range\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 5. Target Analysis\n",
+ "\n",
+ "Analyze the 15-minute return target distribution."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Identify target column\n",
+ "target_col = [c for c in df_train.columns if 'return' in c.lower()][0]\n",
+ "print(f\"Target column: {target_col}\")\n",
+ "\n",
+ "# Target statistics\n",
+ "print(f\"\\nTarget statistics:\")\n",
+ "print(df_train[target_col].describe())\n",
+ "\n",
+ "print(f\"\\nSkewness: {df_train[target_col].skew():.3f}\")\n",
+ "print(f\"Kurtosis: {df_train[target_col].kurtosis():.3f}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Target distribution\n",
+ "fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n",
+ "\n",
+ "# Histogram\n",
+ "df_train[target_col].hist(bins=100, ax=axes[0], edgecolor='black', alpha=0.7)\n",
+ "axes[0].set_title(f'{target_col} Distribution')\n",
+ "axes[0].axvline(x=0, color='red', linestyle='--')\n",
+ "axes[0].set_xlim(-0.05, 0.05) # Focus on main distribution\n",
+ "\n",
+ "# Time series of daily mean target\n",
+ "daily_mean_target = df_train.groupby('datetime')[target_col].mean()\n",
+ "axes[1].plot(daily_mean_target.index, daily_mean_target.values)\n",
+ "axes[1].set_title('Daily Mean Target')\n",
+ "axes[1].axhline(y=0, color='red', linestyle='--')\n",
+ "\n",
+ "plt.tight_layout()\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 6. Feature Analysis"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Feature statistics\n",
+ "feature_stats = df_train[feature_cols].describe().T\n",
+ "\n",
+ "print(\"Feature statistics summary:\")\n",
+ "print(f\" Mean range: [{feature_stats['mean'].min():.4f}, {feature_stats['mean'].max():.4f}]\")\n",
+ "print(f\" Std range: [{feature_stats['std'].min():.4f}, {feature_stats['std'].max():.4f}]\")\n",
+ "\n",
+ "# Check for features with extreme values\n",
+ "extreme_features = feature_stats[\n",
+ " (feature_stats['mean'].abs() > 10) | (feature_stats['std'] > 100)\n",
+ "]\n",
+ "if len(extreme_features) > 0:\n",
+ " print(f\"\\nFeatures with extreme values: {len(extreme_features)}\")\n",
+ " print(extreme_features.head())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sample a few features for visualization\n",
+ "sample_features = feature_cols[:4]\n",
+ "\n",
+ "fig, axes = plt.subplots(2, 2, figsize=(12, 8))\n",
+ "axes = axes.flatten()\n",
+ "\n",
+ "for i, feat in enumerate(sample_features):\n",
+ " df_train[feat].hist(bins=100, ax=axes[i], edgecolor='black', alpha=0.7)\n",
+ " axes[i].set_title(feat)\n",
+ " axes[i].axvline(x=0, color='red', linestyle='--')\n",
+ "\n",
+ "plt.tight_layout()\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 7. Sample Weights Analysis"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Check weights if available\n",
+ "weight_cols = [c for c in df_train.columns if 'weight' in c.lower()]\n",
+ "if weight_cols:\n",
+ " weight_col = weight_cols[0]\n",
+ " print(f\"Weight column: {weight_col}\")\n",
+ " print(f\"\\nWeight statistics:\")\n",
+ " print(df_train[weight_col].describe())\n",
+ " \n",
+ " # Plot weight distribution by target sign\n",
+ " fig, ax = plt.subplots(figsize=(10, 4))\n",
+ " \n",
+ " positive_mask = df_train[target_col] > 0\n",
+ " df_train.loc[positive_mask, weight_col].hist(\n",
+ " bins=50, alpha=0.5, label='Positive target', ax=ax\n",
+ " )\n",
+ " df_train.loc[~positive_mask, weight_col].hist(\n",
+ " bins=50, alpha=0.5, label='Negative target', ax=ax\n",
+ " )\n",
+ " ax.set_title('Weight Distribution by Target Sign')\n",
+ " ax.legend()\n",
+ " plt.tight_layout()\n",
+ " plt.show()\n",
+ "else:\n",
+ " print(\"No weight column found\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python",
+ "version": "3.8.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
+
+
+
+
+
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Stock 15m Baseline Model\n",
+ "\n",
+ "Train and evaluate a baseline XGBoost model for 15-minute return prediction.\n",
+ "\n",
+ "**Purpose**: Establish baseline performance with standard configuration."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import polars as pl\n",
+ "import matplotlib.pyplot as plt\n",
+ "import xgboost as xgb\n",
+ "from sklearn.metrics import r2_score\n",
+ "\n",
+ "from qshare.data.polars.ret15m import load_dataset\n",
+ "from qshare.io.polars import load_from_pq\n",
+ "\n",
+ "import sys\n",
+ "sys.path.insert(0, '../')\n",
+ "from common.plotting import setup_plot_style\n",
+ "from common.paths import create_experiment_dir\n",
+ "\n",
+ "setup_plot_style()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1. Configuration"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "CONFIG = {\n",
+ " # Experiment\n",
+ " 'experiment_name': 'baseline_xgb',\n",
+ " 'save_results': True,\n",
+ " \n",
+ " # Data paths\n",
+ " 'path_a158': '/data/parquet/stock_1min_alpha158',\n",
+ " 'path_kline': '/data/parquet/stock_1min',\n",
+ " 'path_kline_daily': '/data/parquet/stock_1day',\n",
+ " 'path_industry': '/data/parquet/industry_idx',\n",
+ " \n",
+ " # Date ranges\n",
+ " 'dt_range': ['2022-01-01', '2024-12-31'],\n",
+ " 'train_range': ['2022-01-01', '2023-12-31'],\n",
+ " 'test_range': ['2024-01-01', '2024-12-31'],\n",
+ " \n",
+ " # Data config\n",
+ " 'normalization_mode': 'dual',\n",
+ " 'positive_factor': 1.0,\n",
+ " 'negative_factor': 2.0,\n",
+ " \n",
+ " # Model\n",
+ " 'model_params': {\n",
+ " 'objective': 'reg:squarederror',\n",
+ " 'eval_metric': 'rmse',\n",
+ " 'max_depth': 6,\n",
+ " 'learning_rate': 0.1,\n",
+ " 'n_estimators': 100,\n",
+ " 'subsample': 0.8,\n",
+ " 'colsample_bytree': 0.8,\n",
+ " 'random_state': 42,\n",
+ " },\n",
+ "}\n",
+ "\n",
+ "print(\"Configuration:\")\n",
+ "for key, value in CONFIG.items():\n",
+ " if not isinstance(value, dict):\n",
+ " print(f\" {key}: {value}\")\n",
+ "print(f\"\\nModel params: {CONFIG['model_params']}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2. Load Data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(\"Loading data sources...\")\n",
+ "\n",
+ "pl_ldf_a158 = load_from_pq(\n",
+ " path=CONFIG['path_a158'],\n",
+ " table_alias=\"a158\",\n",
+ " start_time=CONFIG['dt_range'][0],\n",
+ " as_struct=True\n",
+ ")\n",
+ "\n",
+ "pl_ldf_kline = load_from_pq(\n",
+ " path=CONFIG['path_kline'],\n",
+ " table_alias=\"kline_1min\",\n",
+ " start_time=CONFIG['dt_range'][0],\n",
+ " as_struct=True\n",
+ ")\n",
+ "\n",
+ "pl_ldf_kline_daily = load_from_pq(\n",
+ " path=CONFIG['path_kline_daily'],\n",
+ " table_alias=\"kline_1day\",\n",
+ " start_time=CONFIG['dt_range'][0],\n",
+ ")\n",
+ "\n",
+ "pl_ldf_industry = load_from_pq(\n",
+ " path=CONFIG['path_industry'],\n",
+ " table_alias=\"indus_idx\",\n",
+ " start_time=CONFIG['dt_range'][0],\n",
+ ")\n",
+ "\n",
+ "print(\"Loading dataset...\")\n",
+ "pl_df = load_dataset(\n",
+ " pl_ldf_a158_1min=pl_ldf_a158,\n",
+ " pl_ldf_kline_1min=pl_ldf_kline,\n",
+ " pl_ldf_kline_1day=pl_ldf_kline_daily,\n",
+ " pl_ldf_indus_idx=pl_ldf_industry,\n",
+ " dt_range=CONFIG['dt_range'],\n",
+ " normalization_mode=CONFIG['normalization_mode'],\n",
+ " negative_factor=CONFIG['negative_factor'],\n",
+ " positive_factor=CONFIG['positive_factor'],\n",
+ ")\n",
+ "\n",
+ "# Convert to pandas\n",
+ "df_full = pl_df.to_pandas()\n",
+ "print(f\"\\nFull dataset shape: {df_full.shape}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 3. Prepare Train/Test Split"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Identify columns\n",
+ "feature_cols = [c for c in df_full.columns if c.startswith('alpha158_')]\n",
+ "target_cols = [c for c in df_full.columns if 'return' in c.lower()]\n",
+ "weight_cols = [c for c in df_full.columns if 'weight' in c.lower()]\n",
+ "\n",
+ "print(f\"Features: {len(feature_cols)}\")\n",
+ "print(f\"Targets: {target_cols}\")\n",
+ "print(f\"Weights: {weight_cols}\")\n",
+ "\n",
+ "# Select target\n",
+ "target_col = target_cols[0]\n",
+ "weight_col = weight_cols[0] if weight_cols else None\n",
+ "\n",
+ "# Split by date\n",
+ "df_train = df_full.loc[CONFIG['train_range'][0]:CONFIG['train_range'][1]]\n",
+ "df_test = df_full.loc[CONFIG['test_range'][0]:CONFIG['test_range'][1]]\n",
+ "\n",
+ "print(f\"\\nTrain: {df_train.shape}\")\n",
+ "print(f\"Test: {df_test.shape}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 4. Train Model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Prepare data\n",
+ "X_train = df_train[feature_cols]\n",
+ "y_train = df_train[target_col]\n",
+ "w_train = df_train[weight_col] if weight_col else None\n",
+ "\n",
+ "X_test = df_test[feature_cols]\n",
+ "y_test = df_test[target_col]\n",
+ "\n",
+ "# Handle missing values\n",
+ "X_train = X_train.fillna(X_train.median())\n",
+ "X_test = X_test.fillna(X_train.median()) # Use train median\n",
+ "\n",
+ "print(\"Training XGBoost model...\")\n",
+ "print(f\" X shape: {X_train.shape}\")\n",
+ "print(f\" y mean: {y_train.mean():.6f}, std: {y_train.std():.6f}\")\n",
+ "\n",
+ "model = xgb.XGBRegressor(**CONFIG['model_params'])\n",
+ "\n",
+ "model.fit(\n",
+ " X_train, y_train,\n",
+ " sample_weight=w_train,\n",
+ " eval_set=[(X_test, y_test)],\n",
+ " verbose=False\n",
+ ")\n",
+ "\n",
+ "print(\"Training complete!\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Feature importance\n",
+ "importance = pd.DataFrame({\n",
+ " 'feature': feature_cols,\n",
+ " 'importance': model.feature_importances_\n",
+ "}).sort_values('importance', ascending=False)\n",
+ "\n",
+ "print(\"\\nTop 10 Features:\")\n",
+ "print(importance.head(10))\n",
+ "\n",
+ "# Plot\n",
+ "fig, ax = plt.subplots(figsize=(10, 6))\n",
+ "importance.head(20).plot(x='feature', y='importance', kind='barh', ax=ax)\n",
+ "ax.set_title('Top 20 Feature Importance')\n",
+ "plt.tight_layout()\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 5. Evaluate"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Generate predictions\n",
+ "y_pred_train = model.predict(X_train)\n",
+ "y_pred_test = model.predict(X_test)\n",
+ "\n",
+ "# Calculate metrics\n",
+ "train_r2 = r2_score(y_train, y_pred_train)\n",
+ "test_r2 = r2_score(y_test, y_pred_test)\n",
+ "\n",
+ "# IC (Information Coefficient)\n",
+ "train_ic = np.corrcoef(y_train, y_pred_train)[0, 1]\n",
+ "test_ic = np.corrcoef(y_test, y_pred_test)[0, 1]\n",
+ "\n",
+ "print(\"Performance Metrics:\")\n",
+ "print(f\" Train R2: {train_r2:.4f}\")\n",
+ "print(f\" Test R2: {test_r2:.4f}\")\n",
+ "print(f\" Train IC: {train_ic:.4f}\")\n",
+ "print(f\" Test IC: {test_ic:.4f}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Daily IC analysis\n",
+ "df_test_eval = df_test.copy()\n",
+ "df_test_eval['pred'] = y_pred_test\n",
+ "df_test_eval['target'] = y_test\n",
+ "\n",
+ "df_test_eval['datetime'] = df_test_eval.index.get_level_values(0)\n",
+ "\n",
+ "# Calculate per-timestamp IC (NOTE(review): groups by index level 0 -- if that level holds 15-min bar timestamps rather than dates, this is per-bar IC, not daily IC; confirm)\n",
+ "daily_ic = df_test_eval.groupby('datetime').apply(\n",
+ " lambda x: x['target'].corr(x['pred'])\n",
+ ")\n",
+ "\n",
+ "print(\"\\nDaily IC Statistics:\")\n",
+ "print(f\" Mean: {daily_ic.mean():.4f}\")\n",
+ "print(f\" Std: {daily_ic.std():.4f}\")\n",
+ "print(f\" IR: {daily_ic.mean() / daily_ic.std():.4f}\")\n",
+ "print(f\" >0: {(daily_ic > 0).mean():.1%}\")\n",
+ "\n",
+ "# Plot\n",
+ "fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n",
+ "\n",
+ "# IC distribution\n",
+ "daily_ic.hist(bins=50, ax=axes[0], edgecolor='black')\n",
+ "axes[0].axvline(x=0, color='red', linestyle='--')\n",
+ "axes[0].axvline(x=daily_ic.mean(), color='green', linestyle='--', label=f'Mean: {daily_ic.mean():.3f}')\n",
+ "axes[0].set_title('Daily IC Distribution')\n",
+ "axes[0].legend()\n",
+ "\n",
+ "# IC time series\n",
+ "daily_ic.rolling(20, min_periods=5).mean().plot(ax=axes[1])\n",
+ "axes[1].axhline(y=0, color='red', linestyle='--')\n",
+ "axes[1].set_title('Rolling IC (20-day)')\n",
+ "\n",
+ "plt.tight_layout()\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Prediction vs Actual scatter\n",
+ "fig, ax = plt.subplots(figsize=(8, 8))\n",
+ "\n",
+ "# Sample for plotting\n",
+ "sample_idx = np.random.choice(len(y_test), size=min(10000, len(y_test)), replace=False)\n",
+ "ax.scatter(y_test.iloc[sample_idx], y_pred_test[sample_idx], alpha=0.3, s=1)\n",
+ "\n",
+ "# Perfect prediction line\n",
+ "lims = [min(y_test.min(), y_pred_test.min()), max(y_test.max(), y_pred_test.max())]\n",
+ "ax.plot(lims, lims, 'r--', alpha=0.5)\n",
+ "\n",
+ "ax.set_xlabel('Actual')\n",
+ "ax.set_ylabel('Predicted')\n",
+ "ax.set_title(f'Prediction vs Actual (IC={test_ic:.3f})')\n",
+ "ax.grid(True, alpha=0.3)\n",
+ "\n",
+ "plt.tight_layout()\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 6. Save Results"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "if CONFIG['save_results']:\n",
+ " import pickle\n",
+ " import json\n",
+ " \n",
+ " output_dir = create_experiment_dir('stock_15m', CONFIG['experiment_name'])\n",
+ " print(f\"Saving results to: {output_dir}\")\n",
+ " \n",
+ " # Save config\n",
+ " with open(output_dir / 'config.json', 'w') as f:\n",
+ " json.dump(CONFIG, f, indent=2, default=str)\n",
+ " \n",
+ " # Save model\n",
+ " with open(output_dir / 'model.pkl', 'wb') as f:\n",
+ " pickle.dump(model, f)\n",
+ " \n",
+ " # Save importance\n",
+ " importance.to_csv(output_dir / 'feature_importance.csv', index=False)\n",
+ " \n",
+ " # Save predictions\n",
+ " predictions = pd.DataFrame({\n",
+ " 'actual': y_test,\n",
+ " 'predicted': y_pred_test\n",
+ " }, index=df_test.index)\n",
+ " predictions.to_csv(output_dir / 'predictions.csv')\n",
+ " \n",
+ " # Save metrics\n",
+ " metrics = {\n",
+ " 'train_r2': float(train_r2),\n",
+ " 'test_r2': float(test_r2),\n",
+ " 'train_ic': float(train_ic),\n",
+ " 'test_ic': float(test_ic),\n",
+ " 'daily_ic_mean': float(daily_ic.mean()),\n",
+ " 'daily_ic_std': float(daily_ic.std()),\n",
+ " 'daily_ir': float(daily_ic.mean() / daily_ic.std()),\n",
+ " }\n",
+ " with open(output_dir / 'metrics.json', 'w') as f:\n",
+ " json.dump(metrics, f, indent=2)\n",
+ " \n",
+ " print(\"\\nFiles saved:\")\n",
+ " for f in output_dir.iterdir():\n",
+ " print(f\" - {f.name}\")\n",
+ "else:\n",
+ " print(\"Results not saved\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python",
+ "version": "3.8.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
\ No newline at end of file
diff --git a/stock_15m/02_baseline_model.ipynb b/stock_15m/02_baseline_model.ipynb
new file mode 100644
index 0000000..6fbf850
--- /dev/null
+++ b/stock_15m/02_baseline_model.ipynb
@@ -0,0 +1,257 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Stock 15m Baseline Model\n",
+ "\n",
+ "Train and evaluate a baseline XGBoost model for 15-minute return prediction.\n",
+ "\n",
+ "**Purpose**: Establish baseline performance with standard configuration."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import polars as pl\n",
+ "import matplotlib.pyplot as plt\n",
+ "import xgboost as xgb\n",
+ "from sklearn.metrics import r2_score\n",
+ "\n",
+ "from qshare.data.polars.ret15m import load_dataset\n",
+ "from qshare.io.polars import load_from_pq\n",
+ "\n",
+ "import sys\n",
+ "sys.path.insert(0, '../')\n",
+ "from common.plotting import setup_plot_style\n",
+ "from common.paths import create_experiment_dir\n",
+ "\n",
+ "setup_plot_style()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1. Configuration"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "CONFIG = {\n",
+ " 'experiment_name': 'baseline_xgb',\n",
+ " 'save_results': True,\n",
+ " 'path_a158': '/data/parquet/stock_1min_alpha158',\n",
+ " 'path_kline': '/data/parquet/stock_1min',\n",
+ " 'path_kline_daily': '/data/parquet/stock_1day',\n",
+ " 'path_industry': '/data/parquet/industry_idx',\n",
+ " 'dt_range': ['2022-01-01', '2024-12-31'],\n",
+ " 'train_range': ['2022-01-01', '2023-12-31'],\n",
+ " 'test_range': ['2024-01-01', '2024-12-31'],\n",
+ " 'normalization_mode': 'dual',\n",
+ " 'positive_factor': 1.0,\n",
+ " 'negative_factor': 2.0,\n",
+ " 'model_params': {\n",
+ " 'objective': 'reg:squarederror',\n",
+ " 'eval_metric': 'rmse',\n",
+ " 'max_depth': 6,\n",
+ " 'learning_rate': 0.1,\n",
+ " 'n_estimators': 100,\n",
+ " 'subsample': 0.8,\n",
+ " 'colsample_bytree': 0.8,\n",
+ " 'random_state': 42,\n",
+ " },\n",
+ "}\n",
+ "\n",
+ "print('Configuration:')\n",
+ "for key, value in CONFIG.items():\n",
+ " if not isinstance(value, dict):\n",
+ " print(f' {key}: {value}')\n",
+ "print(f\"Model params: {CONFIG['model_params']}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2. Load Data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print('Loading data sources...')\n",
+ "\n",
+ "pl_ldf_a158 = load_from_pq(\n",
+ " path=CONFIG['path_a158'],\n",
+ " table_alias='a158',\n",
+ " start_time=CONFIG['dt_range'][0],\n",
+ " as_struct=True\n",
+ ")\n",
+ "\n",
+ "pl_ldf_kline = load_from_pq(\n",
+ " path=CONFIG['path_kline'],\n",
+ " table_alias='kline_1min',\n",
+ " start_time=CONFIG['dt_range'][0],\n",
+ " as_struct=True\n",
+ ")\n",
+ "\n",
+ "pl_ldf_kline_daily = load_from_pq(\n",
+ " path=CONFIG['path_kline_daily'],\n",
+ " table_alias='kline_1day',\n",
+ " start_time=CONFIG['dt_range'][0],\n",
+ ")\n",
+ "\n",
+ "pl_ldf_industry = load_from_pq(\n",
+ " path=CONFIG['path_industry'],\n",
+ " table_alias='indus_idx',\n",
+ " start_time=CONFIG['dt_range'][0],\n",
+ ")\n",
+ "\n",
+ "print('Loading dataset...')\n",
+ "pl_df = load_dataset(\n",
+ " pl_ldf_a158_1min=pl_ldf_a158,\n",
+ " pl_ldf_kline_1min=pl_ldf_kline,\n",
+ " pl_ldf_kline_1day=pl_ldf_kline_daily,\n",
+ " pl_ldf_indus_idx=pl_ldf_industry,\n",
+ " dt_range=CONFIG['dt_range'],\n",
+ " normalization_mode=CONFIG['normalization_mode'],\n",
+ " negative_factor=CONFIG['negative_factor'],\n",
+ " positive_factor=CONFIG['positive_factor'],\n",
+ ")\n",
+ "\n",
+ "df_full = pl_df.to_pandas()\n",
+ "print(f'Full dataset shape: {df_full.shape}')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 3. Train/Test Split and Model Training"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "feature_cols = [c for c in df_full.columns if c.startswith('alpha158_')]\n",
+ "target_cols = [c for c in df_full.columns if 'return' in c.lower()]\n",
+ "weight_cols = [c for c in df_full.columns if 'weight' in c.lower()]\n",
+ "\n",
+ "target_col = target_cols[0]\n",
+ "weight_col = weight_cols[0] if weight_cols else None\n",
+ "\n",
+ "df_train = df_full.loc[CONFIG['train_range'][0]:CONFIG['train_range'][1]]\n",
+ "df_test = df_full.loc[CONFIG['test_range'][0]:CONFIG['test_range'][1]]\n",
+ "\n",
+ "print(f'Train: {df_train.shape}, Test: {df_test.shape}')\n",
+ "print(f'Features: {len(feature_cols)}, Target: {target_col}')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X_train = df_train[feature_cols].fillna(df_train[feature_cols].median())\n",
+ "y_train = df_train[target_col]\n",
+ "w_train = df_train[weight_col] if weight_col else None\n",
+ "\n",
+ "X_test = df_test[feature_cols].fillna(df_train[feature_cols].median())\n",
+ "y_test = df_test[target_col]\n",
+ "\n",
+ "print('Training XGBoost...')\n",
+ "model = xgb.XGBRegressor(**CONFIG['model_params'])\n",
+ "model.fit(X_train, y_train, sample_weight=w_train, verbose=False)\n",
+ "print('Training complete!')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 4. Evaluation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "y_pred_test = model.predict(X_test)\n",
+ "\n",
+ "test_r2 = r2_score(y_test, y_pred_test)\n",
+ "test_ic = np.corrcoef(y_test, y_pred_test)[0, 1]\n",
+ "\n",
+ "print(f'Test R2: {test_r2:.4f}')\n",
+ "print(f'Test IC: {test_ic:.4f}')\n",
+ "\n",
+ "# Per-timestamp IC (NOTE(review): 'daily' assumes index level 0 is a date; for 15-min bars this is per-bar IC -- confirm)\n",
+ "df_test_eval = df_test.copy()\n",
+ "df_test_eval['pred'] = y_pred_test\n",
+ "df_test_eval['target'] = y_test\n",
+ "df_test_eval['datetime'] = df_test_eval.index.get_level_values(0)\n",
+ "\n",
+ "daily_ic = df_test_eval.groupby('datetime').apply(\n",
+ " lambda x: x['target'].corr(x['pred'])\n",
+ ")\n",
+ "\n",
+ "print(f'Daily IC Mean: {daily_ic.mean():.4f}')\n",
+ "print(f'Daily IC Std: {daily_ic.std():.4f}')\n",
+ "print(f'IR: {daily_ic.mean() / daily_ic.std():.4f}')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Plot daily IC\n",
+ "fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n",
+ "\n",
+ "daily_ic.hist(bins=50, ax=axes[0], edgecolor='black')\n",
+ "axes[0].axvline(x=daily_ic.mean(), color='green', linestyle='--')\n",
+ "axes[0].set_title('Daily IC Distribution')\n",
+ "\n",
+ "daily_ic.rolling(20, min_periods=5).mean().plot(ax=axes[1])\n",
+ "axes[1].axhline(y=0, color='red', linestyle='--')\n",
+ "axes[1].set_title('Rolling IC (20-day)')\n",
+ "\n",
+ "plt.tight_layout()\n",
+ "plt.show()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python",
+ "version": "3.8.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
\ No newline at end of file
diff --git a/stock_15m/README.md b/stock_15m/README.md
new file mode 100644
index 0000000..907eb83
--- /dev/null
+++ b/stock_15m/README.md
@@ -0,0 +1,25 @@
+# Stock 15-Minute Return Prediction
+
+Experiments for predicting stock 15-minute returns using high-frequency features.
+
+## Data
+
+- **Features**: alpha158 computed on 1-minute data
+- **Target**: 15-minute forward returns (close[t+16]/close[t+1]-1)
+- **Normalization**: industry, cs_zscore, or dual
+
+## Notebooks
+
+| Notebook | Purpose |
+|----------|---------|
+| `01_data_exploration.ipynb` | Load and explore 15m data structure |
+| `02_baseline_model.ipynb` | Train baseline XGBoost model |
+
+## Methodology
+
+1. Load 1-minute kline data via Polars lazy frames
+2. Compute/retrieve alpha158 features
+3. Calculate 15-minute forward returns
+4. Apply normalization (industry-neutralized or cross-sectional z-score)
+5. Train gradient boosting models
+6. Evaluate with IC and backtest
diff --git a/stock_15m/src/__init__.py b/stock_15m/src/__init__.py
new file mode 100644
index 0000000..7177588
--- /dev/null
+++ b/stock_15m/src/__init__.py
@@ -0,0 +1,3 @@
+"""Stock 15m task-specific utilities."""
+
+# Add task-specific functions here as needed