commit cdf63733256bfe51fd52bf907785e9fd4b0b2fab Author: guofu Date: Fri Feb 13 11:17:45 2026 +0800 Initial alpha_lab structure\n\n- Notebook-centric experiment framework\n- CTA 1D and Stock 15m tasks\n- Minimal common utilities\n- Manual experiment tracking diff --git a/.env.template b/.env.template new file mode 100644 index 0000000..89ce364 --- /dev/null +++ b/.env.template @@ -0,0 +1,14 @@ +# Alpha Lab Environment Configuration +# Copy this file to .env and fill in your values + +# DolphinDB Configuration +DDB_HOST=192.168.1.146 +DDB_PORT=8848 +DDB_USERNAME= +DDB_PASSWORD= + +# Data Paths +DATA_ROOT=/data/parquet + +# Experiment Output +RESULTS_ROOT=/home/guofu/Workspaces/alpha_lab/results diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..dd4c38b --- /dev/null +++ b/.gitignore @@ -0,0 +1,51 @@ +# Environment +.env +.venv/ +env/ +venv/ + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Jupyter +.ipynb_checkpoints/ +*.ipynb_checkpoints + +# Results and data +results/* +!results/*/.gitkeep +!results/*/README.md +*.parquet +*.pkl +*.h5 +*.feather + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS +.DS_Store +Thumbs.db diff --git a/README.md b/README.md new file mode 100644 index 0000000..8deffe4 --- /dev/null +++ b/README.md @@ -0,0 +1,87 @@ +# Alpha Lab + +Quantitative research experiments for qshare library. This repository contains Jupyter notebooks and analysis scripts for exploring trading strategies and machine learning models. 
+ +## Philosophy + +- **Notebook-centric**: Experiments are interactive notebooks, not rigid scripts +- **Minimal abstraction**: Simple functions over complex class hierarchies +- **Self-contained**: Each task directory is independent +- **Ad-hoc friendly**: Easy to modify for exploration + +## Structure + +``` +alpha_lab/ +├── common/ # Shared utilities (keep minimal!) +│ ├── paths.py # Path management +│ └── plotting.py # Common plotting functions +│ +├── cta_1d/ # CTA 1-day return prediction +│ ├── 01_data_check.ipynb +│ ├── 02_label_analysis.ipynb +│ ├── 03_baseline_xgb.ipynb +│ ├── 04_blend_comparison.ipynb +│ └── src/ # Task-specific helpers +│ +├── stock_15m/ # Stock 15-minute return prediction +│ ├── 01_data_exploration.ipynb +│ ├── 02_baseline_model.ipynb +│ └── src/ +│ +└── results/ # Output directory (gitignored) + ├── cta_1d/ + └── stock_15m/ +``` + +## Setup + +```bash +# Install dependencies +pip install -r requirements.txt + +# Create environment file +cp .env.template .env +# Edit .env with your settings +``` + +## Usage + +Start Jupyter and run notebooks interactively: + +```bash +jupyter notebook +``` + +Each task directory contains numbered notebooks: +- `01_*.ipynb` - Data loading and exploration +- `02_*.ipynb` - Analysis and baseline models +- `03_*.ipynb` - Advanced experiments +- `04_*.ipynb` - Comparisons and ablations + +## Experiment Tracking + +Experiments are tracked manually in `results/{task}/README.md`: + +```markdown +## 2025-01-15: Baseline XGB +- Notebook: `cta_1d/03_baseline_xgb.ipynb` (cells 1-50) +- Config: eta=0.5, lambda=0.1 +- Train IC: 0.042 +- Test IC: 0.038 +- Notes: Dual normalization, 4 trades/day +``` + +## Adding a New Task + +1. Create directory: `mkdir my_task` +2. Add `src/` subdirectory for helpers +3. Create numbered notebooks +4. Add entry to `results/my_task/README.md` + +## Best Practices + +1. **Keep it simple**: Only add to `common/` after 3+ copies +2. 
**Notebook configs**: Define CONFIG dict in first cell for easy modification +3. **Document results**: Update results README after significant runs +4. **Git discipline**: Don't commit large files, results, or credentials diff --git a/common/__init__.py b/common/__init__.py new file mode 100644 index 0000000..c749aa7 --- /dev/null +++ b/common/__init__.py @@ -0,0 +1,13 @@ +"""Common utilities for alpha_lab experiments.""" + +from .paths import ensure_dir, get_results_dir, get_task_results_dir +from .plotting import setup_plot_style, plot_ic_series, plot_cumulative_returns + +__all__ = [ + 'ensure_dir', + 'get_results_dir', + 'get_task_results_dir', + 'setup_plot_style', + 'plot_ic_series', + 'plot_cumulative_returns', +] diff --git a/common/paths.py b/common/paths.py new file mode 100644 index 0000000..63d2cd7 --- /dev/null +++ b/common/paths.py @@ -0,0 +1,42 @@ +"""Path utilities for experiment outputs.""" + +import os +from pathlib import Path +from datetime import datetime + +# Base directories +BASE_DIR = Path(__file__).parent.parent +RESULTS_DIR = BASE_DIR / "results" + + +def ensure_dir(path: Path) -> Path: + """Create directory if it doesn't exist.""" + path.mkdir(parents=True, exist_ok=True) + return path + + +def get_results_dir() -> Path: + """Get base results directory.""" + return ensure_dir(RESULTS_DIR) + + +def get_task_results_dir(task_name: str) -> Path: + """Get results directory for a specific task.""" + return ensure_dir(RESULTS_DIR / task_name) + + +def create_experiment_dir(task_name: str, experiment_name: str | None = None) -> Path: + """Create a timestamped directory for an experiment. 
+ + Args: + task_name: Name of the task (e.g., 'cta_1d', 'stock_15m') + experiment_name: Optional experiment name (default: timestamp) + + Returns: + Path to the created directory + """ + if experiment_name is None: + experiment_name = datetime.now().strftime('%Y%m%d_%H%M%S') + + exp_dir = RESULTS_DIR / task_name / experiment_name + return ensure_dir(exp_dir) diff --git a/common/plotting.py b/common/plotting.py new file mode 100644 index 0000000..36e468a --- /dev/null +++ b/common/plotting.py @@ -0,0 +1,119 @@ +"""Common plotting utilities for experiments.""" + +import matplotlib.pyplot as plt +import seaborn as sns +import numpy as np +import pandas as pd + + +def setup_plot_style(): + """Set up default plotting style.""" + plt.style.use('seaborn-v0_8-whitegrid') + sns.set_palette("husl") + plt.rcParams['figure.figsize'] = (12, 6) + plt.rcParams['font.size'] = 10 + + +def plot_ic_series(ic_by_date: pd.Series, title: str = "IC Over Time", + figsize: tuple = (14, 4)) -> plt.Figure: + """Plot IC time series with rolling mean. 
"""Common plotting utilities for experiments."""

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd


def setup_plot_style():
    """Apply the shared notebook plotting style (grid theme, palette, sizes)."""
    plt.style.use('seaborn-v0_8-whitegrid')
    sns.set_palette("husl")
    plt.rcParams['figure.figsize'] = (12, 6)
    plt.rcParams['font.size'] = 10


def plot_ic_series(ic_by_date: pd.Series, title: str = "IC Over Time",
                   figsize: tuple = (14, 4), rolling_window: int = 20,
                   min_periods: int = 5) -> plt.Figure:
    """Plot IC time series with rolling mean.

    Args:
        ic_by_date: Series with datetime index and IC values
        title: Plot title
        figsize: Figure size
        rolling_window: Window (in observations) of the moving average
        min_periods: Minimum observations required for a rolling value

    Returns:
        Matplotlib figure
    """
    fig, ax = plt.subplots(figsize=figsize)

    # Raw daily IC in light gray so the smoothed line stands out.
    ax.plot(ic_by_date.index, ic_by_date.values, alpha=0.5, color='gray',
            label='Daily IC')

    # Rolling mean highlights the trend; window is now configurable
    # (previously hard-coded to 20/5).
    rolling = ic_by_date.rolling(rolling_window, min_periods=min_periods).mean()
    ax.plot(rolling.index, rolling.values, color='blue', linewidth=2,
            label=f'{rolling_window}-day MA')

    # Reference lines: overall mean IC and zero.
    mean_ic = ic_by_date.mean()
    ax.axhline(y=mean_ic, color='red', linestyle='--',
               label=f'Mean IC: {mean_ic:.4f}')
    ax.axhline(y=0, color='black', linestyle='-', alpha=0.3)

    ax.set_title(title)
    ax.set_xlabel('Date')
    ax.set_ylabel('Information Coefficient')
    ax.legend(loc='upper right')

    plt.tight_layout()
    return fig


def plot_cumulative_returns(returns: pd.Series, title: str = "Cumulative Returns",
                            figsize: tuple = (12, 6)) -> plt.Figure:
    """Plot cumulative returns.

    Args:
        returns: Series with datetime index and daily returns
        title: Plot title
        figsize: Figure size

    Returns:
        Matplotlib figure
    """
    fig, ax = plt.subplots(figsize=figsize)

    cumulative = (1 + returns).cumprod()
    ax.plot(cumulative.index, cumulative.values, linewidth=1.5)

    ax.set_title(title)
    ax.set_xlabel('Date')
    ax.set_ylabel('Cumulative Return')
    ax.set_yscale('log')

    # Annotate the final total return; skip for an empty input series
    # (the previous version raised IndexError on iloc[-1] in that case).
    if len(cumulative):
        final_return = cumulative.iloc[-1] - 1
        ax.annotate(f'{final_return:.2%}',
                    xy=(cumulative.index[-1], cumulative.iloc[-1]),
                    xytext=(10, 0), textcoords='offset points',
                    fontsize=10, color='green' if final_return > 0 else 'red')

    plt.tight_layout()
    return fig


def plot_factor_distribution(factor: pd.Series, title: str = "Factor Distribution",
                             figsize: tuple = (10, 6)) -> plt.Figure:
    """Plot factor distribution with statistics.

    Args:
        factor: Series of factor values (NaNs are dropped before plotting)
        title: Plot title
        figsize: Figure size

    Returns:
        Matplotlib figure
    """
    fig, axes = plt.subplots(1, 2, figsize=figsize)

    # Histogram of the (non-null) factor values.
    axes[0].hist(factor.dropna(), bins=100, alpha=0.7, edgecolor='black')
    axes[0].set_title(f'{title} - Distribution')
    axes[0].set_xlabel('Value')
    axes[0].set_ylabel('Frequency')

    # Q-Q plot against a normal distribution; scipy imported lazily so the
    # module stays importable without scipy installed.
    from scipy import stats
    stats.probplot(factor.dropna(), dist="norm", plot=axes[1])
    axes[1].set_title(f'{title} - Q-Q Plot')

    # Overlay the first four moments on the histogram panel.
    stats_text = f"Mean: {factor.mean():.4f}\nStd: {factor.std():.4f}\n"
    stats_text += f"Skew: {factor.skew():.4f}\nKurt: {factor.kurtosis():.4f}"
    axes[0].text(0.95, 0.95, stats_text, transform=axes[0].transAxes,
                 verticalalignment='top', horizontalalignment='right',
                 bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

    plt.tight_layout()
    return fig
Configuration\n", + "\n", + "Modify these parameters as needed for your data check." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "CONFIG = {\n", + " 'dt_range': ['2020-01-01', '2024-12-31'],\n", + " 'feature_sets': ['alpha158', 'hffactor'],\n", + " 'return_type': 'o2c_twap1min', # or 'o2o_twap1min'\n", + " 'normalization': 'dual',\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Load Features Separately\n", + "\n", + "Check each feature set independently." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load alpha158 features\n", + "print(\"Loading alpha158 features...\")\n", + "df_alpha158 = load_cta_alpha158(\n", + " since_date=CONFIG['dt_range'][0],\n", + " end_date=CONFIG['dt_range'][1],\n", + ")\n", + "print(f\"alpha158 shape: {df_alpha158.shape}\")\n", + "print(f\"\")\n", + "print(f\"Columns: {list(df_alpha158.columns[:10])}...\") # First 10 columns\n", + "print(f\"\")\n", + "print(f\"Date range: {df_alpha158.index.get_level_values(0).min()} to {df_alpha158.index.get_level_values(0).max()}\")\n", + "print(f\"Instruments: {df_alpha158.index.get_level_values(1).nunique()}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load HF factors\n", + "print(\"Loading hffactor features...\")\n", + "df_hf = load_cta_hffactors(\n", + " since_date=CONFIG['dt_range'][0],\n", + " end_date=CONFIG['dt_range'][1],\n", + ")\n", + "print(f\"hffactor shape: {df_hf.shape}\")\n", + "print(f\"\")\n", + "print(f\"Columns: {list(df_hf.columns[:10])}...\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Load Returns (Labels)\n", + "\n", + "Check return indicators that will be used as prediction targets." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load return indicators\n", + "print(\"Loading return indicators...\")\n", + "df_returns = load_cta_returns(\n", + " since_date=CONFIG['dt_range'][0],\n", + " end_date=CONFIG['dt_range'][1],\n", + ")\n", + "print(f\"Returns shape: {df_returns.shape}\")\n", + "print(f\"\")\n", + "print(f\"Available return types:\")\n", + "for col in df_returns.columns:\n", + " print(f\" - {col}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check specific return type\n", + "return_col = CONFIG['return_type']\n", + "if return_col in df_returns.columns:\n", + " print(f\"\\n{return_col} statistics:\")\n", + " print(df_returns[return_col].describe())\n", + " \n", + " # Plot distribution\n", + " fig, ax = plt.subplots(figsize=(10, 4))\n", + " df_returns[return_col].hist(bins=100, ax=ax, edgecolor='black')\n", + " ax.set_title(f'{return_col} Distribution')\n", + " ax.axvline(x=0, color='red', linestyle='--')\n", + " plt.show()\n", + "else:\n", + " print(f\"Warning: {return_col} not found in returns data\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Load Full Dataset\n", + "\n", + "Load the complete training dataset with features and labels." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load full dataset\n", + "print(\"Loading full dataset...\")\n", + "df_full = load_dataset(\n", + " dt_range=CONFIG['dt_range'],\n", + " return_type=CONFIG['return_type'],\n", + " normalization=CONFIG['normalization'],\n", + " feature_sets=CONFIG['feature_sets'],\n", + ")\n", + "\n", + "print(f\"\\nFull dataset shape: {df_full.shape}\")\n", + "print(f\"\")\n", + "print(f\"Columns: {len(df_full.columns)} total\")\n", + "print(f\" - Features: {len([c for c in df_full.columns if c.startswith(('alpha158_', 'hf_'))])}\")\n", + "print(f\" - Label: 'label'\")\n", + "print(f\" - Weight: 'weight'\")\n", + "print(f\" - Return: 'return'\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check for missing values\n", + "missing = df_full.isnull().sum()\n", + "missing_cols = missing[missing > 0]\n", + "\n", + "if len(missing_cols) > 0:\n", + " print(f\"\\nColumns with missing values:\")\n", + " print(missing_cols.head(10))\n", + "else:\n", + " print(\"\\nNo missing values found!\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "source": [ + "# Label statistics\n", + "print(\"\\nLabel statistics:\")\n", + "print(df_full['label'].describe())\n", + "\n", + "fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n", + "\n", + "# Distribution\n", + "df_full['label'].hist(bins=100, ax=axes[0], edgecolor='black')\n", + "axes[0].set_title('Label Distribution')\n", + "axes[0].axvline(x=0, color='red', linestyle='--')\n", + "\n", + "# Time series of mean label by date\n", + "label_by_date = df_full.groupby(level=0)['label'].mean()\n", + "axes[1].plot(label_by_date.index, label_by_date.values)\n", + "axes[1].set_title('Mean Label by Date')\n", + "axes[1].axhline(y=0, color='red', linestyle='--')\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "## 5. Summary\n", + "\n", + "Check data availability by instrument and date." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Data availability heatmap\n", + "available = df_full.groupby([df_full.index.get_level_values(0).date, df_full.index.get_level_values(1)]).size().unstack(fill_value=0)\n", + "available = (available > 0).astype(int)\n", + "\n", + "print(f\"Data availability: {available.sum().sum()} instrument-date pairs\")\n", + "print(f\"Instruments: {len(available.columns)}\")\n", + "print(f\"Dates: {len(available.index)}\")\n", + "\n", + "# Plot coverage\n", + "fig, ax = plt.subplots(figsize=(14, 6))\n", + "im = ax.imshow(available.T.values, aspect='auto', cmap='RdYlGn', interpolation='nearest')\n", + "ax.set_title('Data Availability (Green=Available, Red=Missing)')\n", + "ax.set_xlabel('Time')\n", + "ax.set_ylabel('Instrument')\n", + "plt.colorbar(im, ax=ax)\n", + "plt.tight_layout()\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.8.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/cta_1d/02_label_analysis.ipynb b/cta_1d/02_label_analysis.ipynb new file mode 100644 index 0000000..ff6ac29 --- /dev/null +++ b/cta_1d/02_label_analysis.ipynb @@ -0,0 +1,319 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# CTA 1D Label Analysis\n", + "\n", + "Explore label distributions and compare different normalization blending strategies.\n", + "\n", + "**Purpose**: Understand how different normalization methods affect label distributions and identify optimal blending." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "from qshare.data.pandas.cta_1d.dataset import load_features, load_label\n", + "from qshare.data.pandas.cta_1d.label import normalize_label_dual, normalize_label\n", + "from qshare.io.ddb.cta import load_cta_returns\n", + "\n", + "import sys\n", + "sys.path.insert(0, '../')\n", + "from common.plotting import setup_plot_style\n", + "from src.labels import BLEND_CONFIGS, get_blend_weights, describe_blend_config\n", + "\n", + "setup_plot_style()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "CONFIG = {\n", + " 'dt_range': ['2020-01-01', '2024-12-31'],\n", + " 'fit_range': ['2020-01-01', '2021-12-31'], # For zscore normalization\n", + " 'return_type': 'o2c_twap1min',\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Load Raw Returns\n", + "\n", + "Load the raw return series before any normalization." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load returns\n", + "print(\"Loading raw returns...\")\n", + "df_returns = load_cta_returns(\n", + " since_date=CONFIG['dt_range'][0],\n", + " end_date=CONFIG['dt_range'][1],\n", + ")\n", + "\n", + "return_col = CONFIG['return_type']\n", + "raw_returns = df_returns[return_col].copy()\n", + "\n", + "print(f\"\\nRaw {return_col} returns:\")\n", + "print(raw_returns.describe())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Plot raw return distribution\n", + "fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n", + "\n", + "# Histogram\n", + "raw_returns.hist(bins=100, ax=axes[0], edgecolor='black')\n", + "axes[0].set_title(f'Raw {return_col} Distribution')\n", + "axes[0].axvline(x=0, color='red', linestyle='--')\n", + "\n", + "# Time series\n", + "daily_mean = raw_returns.groupby(level=0).mean()\n", + "axes[1].plot(daily_mean.index, daily_mean.values)\n", + "axes[1].set_title('Daily Mean Return')\n", + "axes[1].axhline(y=0, color='red', linestyle='--')\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Compare Normalization Methods\n", + "\n", + "Apply each normalization method individually and compare." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load dominant contract mapping for proper label construction\n", + "from qshare.io.ddb.cta import load_cta_dominant_contracts\n", + "\n", + "print(\"Loading dominant contract mapping...\")\n", + "df_dominant = load_cta_dominant_contracts(\n", + " since_date=CONFIG['dt_range'][0],\n", + " end_date=CONFIG['dt_range'][1],\n", + ")\n", + "\n", + "# Merge returns with dominant mapping\n", + "df_merged = df_dominant.join(raw_returns, how='left')\n", + "\n", + "# Calculate different normalization methods\n", + "print(\"\\nApplying normalization methods...\")\n", + "\n", + "norm_results = {}\n", + "\n", + "# zscore (fit-time)\n", + "norm_results['zscore'] = normalize_label(\n", + " df_merged[return_col],\n", + " method='zscore',\n", + " fit_range=CONFIG['fit_range']\n", + ")\n", + "\n", + "# cs_zscore (cross-sectional)\n", + "norm_results['cs_zscore'] = df_merged.groupby(level=0)[return_col].apply(\n", + " lambda x: (x - x.mean()) / (x.std() + 1e-8)\n", + ")\n", + "\n", + "# rolling_20\n", + "norm_results['rolling_20'] = normalize_label(\n", + " df_merged[return_col],\n", + " method='rolling',\n", + " window=20\n", + ")\n", + "\n", + "# rolling_60\n", + "norm_results['rolling_60'] = normalize_label(\n", + " df_merged[return_col],\n", + " method='rolling',\n", + " window=60\n", + ")\n", + "\n", + "print(\"Done!\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Compare distributions\n", + "fig, axes = plt.subplots(2, 2, figsize=(14, 10))\n", + "axes = axes.flatten()\n", + "\n", + "for i, (method, series) in enumerate(norm_results.items()):\n", + " ax = axes[i]\n", + " series.dropna().hist(bins=100, ax=ax, edgecolor='black', alpha=0.7)\n", + " ax.set_title(f'{method}\\nmean={series.mean():.3f}, std={series.std():.3f}')\n", + " ax.axvline(x=0, color='red', linestyle='--')\n", + " 
ax.set_xlim(-5, 5) # Focus on main distribution\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Compare Blend Configurations\n", + "\n", + "Compare different blending strategies." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Apply each blend configuration\n", + "blend_results = {}\n", + "\n", + "for name in BLEND_CONFIGS.keys():\n", + " weights = get_blend_weights(name)\n", + " print(f\"\\nProcessing {name}: {weights}\")\n", + " \n", + " # Calculate blended label\n", + " blended = (\n", + " weights[0] * norm_results['zscore'] +\n", + " weights[1] * norm_results['cs_zscore'] +\n", + " weights[2] * norm_results['rolling_20'] +\n", + " weights[3] * norm_results['rolling_60']\n", + " )\n", + " \n", + " blend_results[name] = blended\n", + " print(f\" Mean: {blended.mean():.4f}, Std: {blended.std():.4f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Visualize all blend distributions\n", + "fig, axes = plt.subplots(2, 3, figsize=(15, 10))\n", + "axes = axes.flatten()\n", + "\n", + "for i, (name, series) in enumerate(blend_results.items()):\n", + " ax = axes[i]\n", + " series.dropna().hist(bins=100, ax=ax, edgecolor='black', alpha=0.7)\n", + " weights = get_blend_weights(name)\n", + " ax.set_title(f'{name}\\nweights={weights}\\nmean={series.mean():.3f}, std={series.std():.3f}')\n", + " ax.axvline(x=0, color='red', linestyle='--')\n", + " ax.set_xlim(-5, 5)\n", + "\n", + "# Hide last subplot if not used\n", + "if len(blend_results) < 6:\n", + " axes[-1].axis('off')\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Correlation Analysis\n", + "\n", + "Check correlations between different normalization methods." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create comparison DataFrame\n", + "comparison_df = pd.DataFrame(norm_results)\n", + "\n", + "# Add raw returns\n", + "comparison_df['raw'] = df_merged[return_col]\n", + "\n", + "# Calculate correlation matrix\n", + "corr = comparison_df.corr()\n", + "\n", + "# Plot heatmap\n", + "fig, ax = plt.subplots(figsize=(8, 6))\n", + "sns.heatmap(corr, annot=True, cmap='RdBu_r', center=0,\n", + " vmin=-1, vmax=1, ax=ax)\n", + "ax.set_title('Correlation: Normalization Methods')\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "source": [ + "# Rolling correlation analysis\n", + "window = 60\n", + "\n", + "# Calculate rolling correlation between zscore and cs_zscore\n", + "rolling_corr = norm_results['zscore'].rolling(window).corr(norm_results['cs_zscore'])\n", + "\n", + "fig, ax = plt.subplots(figsize=(14, 4))\n", + "ax.plot(rolling_corr.index.get_level_values(0).unique(), rolling_corr.groupby(level=0).mean())\n", + "ax.set_title(f'Rolling Correlation: zscore vs cs_zscore ({window}d window)')\n", + "ax.axhline(y=0.5, color='red', linestyle='--', alpha=0.5)\n", + "ax.set_ylim(-1, 1)\n", + "plt.tight_layout()\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.8.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/cta_1d/03_baseline_xgb.ipynb b/cta_1d/03_baseline_xgb.ipynb new file mode 100644 index 0000000..d77d3ca --- /dev/null +++ b/cta_1d/03_baseline_xgb.ipynb @@ -0,0 +1,356 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# CTA 1D Baseline XGBoost Model\n", + "\n", + "Train and evaluate a baseline XGBoost model for CTA 1-day return prediction.\n", + "\n", + 
"**Purpose**: Establish a baseline performance benchmark with standard configuration." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import json\n", + "from datetime import datetime\n", + "\n", + "from qshare.data.pandas.cta_1d import load_dataset\n", + "from qshare.algo.learning.cta_trainer import CTAXGBTrainer\n", + "from qshare.eval.cta.backtest import CTABacktester\n", + "\n", + "import sys\n", + "sys.path.insert(0, '../')\n", + "from common.plotting import setup_plot_style, plot_ic_series, plot_cumulative_returns\n", + "from common.paths import create_experiment_dir\n", + "from src.labels import get_blend_weights, describe_blend_config\n", + "\n", + "setup_plot_style()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Configuration\n", + "\n", + "Edit this cell to modify experiment parameters." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "CONFIG = {\n", + " # Experiment\n", + " 'experiment_name': 'baseline_xgb', # Will be appended with timestamp\n", + " \n", + " # Date ranges\n", + " 'dt_range': ['2020-01-01', '2024-12-31'],\n", + " 'train_range': ['2020-01-01', '2022-12-31'],\n", + " 'test_range': ['2023-01-01', '2024-12-31'],\n", + " 'fit_range': ['2020-01-01', '2021-06-30'], # For normalization fitting\n", + " \n", + " # Data\n", + " 'feature_sets': ['alpha158', 'hffactor'],\n", + " 'return_type': 'o2c_twap1min',\n", + " 'normalization': 'dual',\n", + " 'blend_weights': None, # Use default [0.2, 0.1, 0.3, 0.4] or specify name/list\n", + " 'weight_factors': {'positive': 1.0, 'negative': 2.0},\n", + " \n", + " # Model\n", + " 'xgb_params': {\n", + " 'booster': 'gblinear',\n", + " 'eta': 0.5,\n", + " 'lambda_reg': 0.1,\n", + " 'num_round': 20,\n", + " },\n", + " \n", + " # Backtest\n", + " 'backtest_params': {\n", + " 'num_trades': 4,\n", + " 'signal_dist': 'normal',\n", + " 'pos_weight': True,\n", + " },\n", + " \n", + " # Output\n", + " 'save_results': True,\n", + "}\n", + "\n", + "print(\"Configuration:\")\n", + "print(f\" Experiment: {CONFIG['experiment_name']}\")\n", + "print(f\" Train: {CONFIG['train_range'][0]} to {CONFIG['train_range'][1]}\")\n", + "print(f\" Test: {CONFIG['test_range'][0]} to {CONFIG['test_range'][1]}\")\n", + "print(f\" Blend: {describe_blend_config(CONFIG['blend_weights'] or 'default')}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. 
Load Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Loading dataset...\")\n", + "df_full = load_dataset(\n", + " dt_range=CONFIG['dt_range'],\n", + " return_type=CONFIG['return_type'],\n", + " normalization=CONFIG['normalization'],\n", + " feature_sets=CONFIG['feature_sets'],\n", + " fit_range=CONFIG['fit_range'],\n", + " weight_factors=CONFIG['weight_factors'],\n", + " blend_weights=CONFIG['blend_weights'],\n", + ")\n", + "\n", + "print(f\"\\nDataset shape: {df_full.shape}\")\n", + "print(f\"Columns: {len(df_full.columns)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Split train/test\n", + "df_train = df_full.loc[CONFIG['train_range'][0]:CONFIG['train_range'][1]]\n", + "df_test = df_full.loc[CONFIG['test_range'][0]:CONFIG['test_range'][1]]\n", + "\n", + "print(f\"Train: {df_train.shape}\")\n", + "print(f\"Test: {df_test.shape}\")\n", + "\n", + "# Get feature columns\n", + "feature_cols = [c for c in df_train.columns\n", + " if c.startswith(('alpha158_', 'hf_', 'f_'))]\n", + "print(f\"\\nFeatures: {len(feature_cols)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. 
Train Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Training XGBoost model...\")\n", + "print(f\" Params: {CONFIG['xgb_params']}\")\n", + "\n", + "trainer = CTAXGBTrainer(**CONFIG['xgb_params'])\n", + "\n", + "trainer.fit(\n", + " df_train,\n", + " feature_cols=feature_cols,\n", + " target_col='label',\n", + " weight_col='weight'\n", + ")\n", + "\n", + "print(\"\\nTraining complete!\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Feature importance\n", + "importance = trainer.get_feature_importance()\n", + "print(\"\\nTop 10 Features:\")\n", + "print(importance.head(10))\n", + "\n", + "# Plot\n", + "fig, ax = plt.subplots(figsize=(10, 6))\n", + "importance.head(20).plot(kind='barh', ax=ax)\n", + "ax.set_title('Top 20 Feature Importance')\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Generate Predictions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Generating predictions on test set...\")\n", + "df_signal = trainer.predict(df_test)\n", + "\n", + "print(f\"\\nSignal statistics:\")\n", + "print(df_signal.describe())\n", + "\n", + "# Plot signal distribution\n", + "fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n", + "\n", + "df_signal.hist(bins=100, ax=axes[0], edgecolor='black')\n", + "axes[0].set_title('Signal Distribution')\n", + "axes[0].axvline(x=0, color='red', linestyle='--')\n", + "\n", + "signal_by_date = df_signal.groupby(level=0).mean()\n", + "axes[1].plot(signal_by_date.index, signal_by_date.values)\n", + "axes[1].set_title('Mean Signal by Date')\n", + "axes[1].axhline(y=0, color='red', linestyle='--')\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. 
Evaluate with Backtest" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Running backtest...\")\n", + "\n", + "returns = df_test['return'] if 'return' in df_test.columns else df_test['label']\n", + "\n", + "backtester = CTABacktester(**CONFIG['backtest_params'])\n", + "results = backtester.run(returns, df_signal)\n", + "\n", + "summary = backtester.summary()\n", + "print(\"\\nBacktest Summary:\")\n", + "for key, value in summary.items():\n", + " if isinstance(value, float):\n", + " print(f\" {key}: {value:.4f}\")\n", + " else:\n", + " print(f\" {key}: {value}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# IC Analysis\n", + "ic_by_date = results.groupby(results.index.get_level_values(0))['ic'].mean()\n", + "\n", + "fig = plot_ic_series(ic_by_date, title=\"IC Over Time (Test Set)\")\n", + "plt.show()\n", + "\n", + "print(f\"\\nIC Statistics:\")\n", + "print(f\" Mean: {ic_by_date.mean():.4f}\")\n", + "print(f\" Std: {ic_by_date.std():.4f}\")\n", + "print(f\" IR: {ic_by_date.mean() / ic_by_date.std():.4f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Cumulative returns\n", + "daily_returns = results.groupby(results.index.get_level_values(0))['pos_return'].mean()\n", + "\n", + "fig = plot_cumulative_returns(daily_returns, title=\"Cumulative Strategy Returns\")\n", + "plt.show()\n", + "\n", + "total_return = (1 + daily_returns).prod() - 1\n", + "annual_return = (1 + total_return) ** (252 / len(daily_returns)) - 1\n", + "sharpe = daily_returns.mean() / daily_returns.std() * np.sqrt(252)\n", + "\n", + "print(f\"\\nReturn Statistics:\")\n", + "print(f\" Total Return: {total_return:.2%}\")\n", + "print(f\" Annual Return: {annual_return:.2%}\")\n", + "print(f\" Sharpe Ratio: {sharpe:.2f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "## 6. Save Results\n", + "\n", + "Save model, predictions, and metrics for later analysis." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if CONFIG['save_results']:\n", + " # Create output directory\n", + " output_dir = create_experiment_dir('cta_1d', CONFIG['experiment_name'])\n", + " print(f\"Saving results to: {output_dir}\")\n", + " \n", + " # Save config\n", + " with open(output_dir / 'config.json', 'w') as f:\n", + " json.dump(CONFIG, f, indent=2, default=str)\n", + " \n", + " # Save model\n", + " trainer.save_model(str(output_dir / 'model.pkl'))\n", + " \n", + " # Save feature importance\n", + " importance.to_csv(output_dir / 'feature_importance.csv')\n", + " \n", + " # Save predictions\n", + " df_signal.to_csv(output_dir / 'predictions.csv')\n", + " \n", + " # Save backtest results\n", + " results.to_csv(output_dir / 'backtest_results.csv')\n", + " \n", + " # Save summary\n", + " with open(output_dir / 'summary.json', 'w') as f:\n", + " json.dump(summary, f, indent=2, default=str)\n", + " \n", + " print(\"\\nFiles saved:\")\n", + " for f in output_dir.iterdir():\n", + " print(f\" - {f.name}\")\n", + "else:\n", + " print(\"Results not saved (save_results=False)\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.8.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/cta_1d/04_blend_comparison.ipynb b/cta_1d/04_blend_comparison.ipynb new file mode 100644 index 0000000..220a535 --- /dev/null +++ b/cta_1d/04_blend_comparison.ipynb @@ -0,0 +1,439 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# CTA 1D Blend Comparison\n", + "\n", + "Compare model performance across different label blending configurations.\n", + "\n", + "**Purpose**: Identify the optimal normalization 
blend for the CTA 1-day prediction task." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "from qshare.data.pandas.cta_1d import load_dataset\n", + "from qshare.algo.learning.cta_trainer import CTAXGBTrainer\n", + "from qshare.eval.cta.backtest import CTABacktester\n", + "\n", + "import sys\n", + "sys.path.insert(0, '../')\n", + "from common.plotting import setup_plot_style, plot_ic_series\n", + "from src.labels import BLEND_CONFIGS, get_blend_weights\n", + "\n", + "setup_plot_style()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Configuration\n", + "\n", + "Define base configuration shared across all blend experiments." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "BASE_CONFIG = {\n", + " # Date ranges\n", + " 'dt_range': ['2020-01-01', '2024-12-31'],\n", + " 'train_range': ['2020-01-01', '2022-12-31'],\n", + " 'test_range': ['2023-01-01', '2024-12-31'],\n", + " 'fit_range': ['2020-01-01', '2021-06-30'],\n", + " \n", + " # Data\n", + " 'feature_sets': ['alpha158', 'hffactor'],\n", + " 'return_type': 'o2c_twap1min',\n", + " 'normalization': 'dual',\n", + " 'weight_factors': {'positive': 1.0, 'negative': 2.0},\n", + " \n", + " # Model (fixed for fair comparison)\n", + " 'xgb_params': {\n", + " 'booster': 'gblinear',\n", + " 'eta': 0.5,\n", + " 'lambda_reg': 0.1,\n", + " 'num_round': 20,\n", + " },\n", + " \n", + " # Backtest\n", + " 'backtest_params': {\n", + " 'num_trades': 4,\n", + " 'signal_dist': 'normal',\n", + " 'pos_weight': True,\n", + " },\n", + "}\n", + "\n", + "print(\"Blend configurations to compare:\")\n", + "for name, weights in BLEND_CONFIGS.items():\n", + " print(f\" {name}: {weights}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "## 2. Run Experiments\n", + "\n", + "Train and evaluate a model for each blend configuration." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def run_single_experiment(blend_name, blend_weights):\n", + " \"\"\"Run experiment with specific blend configuration.\"\"\"\n", + " print(f\"\\n{'='*60}\")\n", + " print(f\"Running: {blend_name}\")\n", + " print(f\"Weights: {blend_weights}\")\n", + " print(f\"{'='*60}\")\n", + " \n", + " # Load data\n", + " df_full = load_dataset(\n", + " dt_range=BASE_CONFIG['dt_range'],\n", + " return_type=BASE_CONFIG['return_type'],\n", + " normalization=BASE_CONFIG['normalization'],\n", + " feature_sets=BASE_CONFIG['feature_sets'],\n", + " fit_range=BASE_CONFIG['fit_range'],\n", + " weight_factors=BASE_CONFIG['weight_factors'],\n", + " blend_weights=blend_weights,\n", + " )\n", + " \n", + " # Split\n", + " df_train = df_full.loc[BASE_CONFIG['train_range'][0]:BASE_CONFIG['train_range'][1]]\n", + " df_test = df_full.loc[BASE_CONFIG['test_range'][0]:BASE_CONFIG['test_range'][1]]\n", + " \n", + " # Features\n", + " feature_cols = [c for c in df_train.columns\n", + " if c.startswith(('alpha158_', 'hf_', 'f_'))]\n", + " \n", + " # Train\n", + " trainer = CTAXGBTrainer(**BASE_CONFIG['xgb_params'])\n", + " trainer.fit(\n", + " df_train,\n", + " feature_cols=feature_cols,\n", + " target_col='label',\n", + " weight_col='weight'\n", + " )\n", + " \n", + " # Predict\n", + " df_signal = trainer.predict(df_test)\n", + " \n", + " # Backtest\n", + " returns = df_test['return'] if 'return' in df_test.columns else df_test['label']\n", + " backtester = CTABacktester(**BASE_CONFIG['backtest_params'])\n", + " results = backtester.run(returns, df_signal)\n", + " \n", + " # Metrics\n", + " summary = backtester.summary()\n", + " ic_by_date = results.groupby(results.index.get_level_values(0))['ic'].mean()\n", + " \n", + " return {\n", + " 'name': blend_name,\n", + " 'weights': 
blend_weights,\n", + "        'summary': summary,\n", + "        'ic_by_date': ic_by_date,\n", + "        'results': results,\n", + "        'importance': trainer.get_feature_importance(),\n", + "    }\n", + "\n", + "# Run all experiments\n", + "all_results = []\n", + "for name, weights in BLEND_CONFIGS.items():\n", + "    result = run_single_experiment(name, weights)\n", + "    all_results.append(result)\n", + "    \n", + "print(\"\\n\\nAll experiments complete!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Results Summary" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create comparison table\n", + "comparison_data = []\n", + "for r in all_results:\n", + "    ic_mean = r['ic_by_date'].mean()\n", + "    ic_std = r['ic_by_date'].std()\n", + "    comparison_data.append({\n", + "        'Blend': r['name'],\n", + "        'Weights': str(r['weights']),\n", + "        'IC Mean': ic_mean,\n", + "        'IC Std': ic_std,\n", + "        'IR': ic_mean / ic_std if ic_std > 0 else 0,\n", + "        'Return': r['summary'].get('return', np.nan),\n", + "        'Sharpe': r['summary'].get('sharpe', np.nan),\n", + "        'Turnover': r['summary'].get('turnover', np.nan),\n", + "    })\n", + "\n", + "df_comparison = pd.DataFrame(comparison_data)\n", + "\n", + "# Sort by IC Mean\n", + "df_comparison = df_comparison.sort_values('IC Mean', ascending=False)\n", + "\n", + "print(\"Comparison Summary (sorted by IC Mean):\")\n", + "print(df_comparison.to_string(index=False))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Visual comparison\n", + "fig, axes = plt.subplots(2, 2, figsize=(14, 10))\n", + "\n", + "# IC Mean\n", + "axes[0, 0].barh(df_comparison['Blend'], df_comparison['IC Mean'])\n", + "axes[0, 0].set_title('IC Mean')\n", + "axes[0, 0].axvline(x=0, color='red', linestyle='--', alpha=0.5)\n", + "\n", + "# Information Ratio\n", + "axes[0, 1].barh(df_comparison['Blend'], df_comparison['IR'])\n", + "axes[0, 
1].set_title('Information Ratio')\n", + "axes[0, 1].axvline(x=0, color='red', linestyle='--', alpha=0.5)\n", + "\n", + "# Return\n", + "axes[1, 0].barh(df_comparison['Blend'], df_comparison['Return'])\n", + "axes[1, 0].set_title('Return')\n", + "axes[1, 0].axvline(x=0, color='red', linestyle='--', alpha=0.5)\n", + "\n", + "# Sharpe\n", + "axes[1, 1].barh(df_comparison['Blend'], df_comparison['Sharpe'])\n", + "axes[1, 1].set_title('Sharpe Ratio')\n", + "axes[1, 1].axvline(x=0, color='red', linestyle='--', alpha=0.5)\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. IC Time Series Comparison" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Plot IC series for all configurations\n", + "fig, ax = plt.subplots(figsize=(16, 6))\n", + "\n", + "for r in all_results:\n", + " ic_rolling = r['ic_by_date'].rolling(20, min_periods=5).mean()\n", + " ax.plot(ic_rolling.index, ic_rolling.values, label=r['name'], alpha=0.8)\n", + "\n", + "ax.axhline(y=0, color='black', linestyle='-', alpha=0.3)\n", + "ax.set_title('Rolling IC Comparison (20-day MA)')\n", + "ax.set_xlabel('Date')\n", + "ax.set_ylabel('Information Coefficient')\n", + "ax.legend(loc='upper right')\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. 
Feature Importance Comparison" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get top features from each blend\n", + "n_top = 10\n", + "top_features_by_blend = {}\n", + "\n", + "for r in all_results:\n", + "    top_features_by_blend[r['name']] = set(r['importance'].head(n_top).index)\n", + "\n", + "# Find common features across all blends\n", + "common_features = set.intersection(*top_features_by_blend.values())\n", + "print(f\"\\nCommon top-{n_top} features across all blends:\")\n", + "for f in sorted(common_features):\n", + "    print(f\"  - {f}\")\n", + "\n", + "# Find unique features per blend\n", + "print(\"\\nUnique top features by blend:\")\n", + "for name, features in top_features_by_blend.items():\n", + "    others = [v for k, v in top_features_by_blend.items() if k != name]\n", + "    unique = features - set.union(*others) if others else features\n", + "    if unique:\n", + "        print(f\"\\n  {name}:\")\n", + "        for f in sorted(unique):\n", + "            print(f\"    - {f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Heatmap of top feature importance across blends\n", + "all_top_features = set.union(*top_features_by_blend.values())\n", + "\n", + "importance_matrix = []\n", + "for r in all_results:\n", + "    row = []\n", + "    for f in sorted(all_top_features):\n", + "        if f in r['importance'].index:\n", + "            row.append(r['importance'].loc[f, 'importance'])\n", + "        else:\n", + "            row.append(0)\n", + "    importance_matrix.append(row)\n", + "\n", + "df_importance = pd.DataFrame(\n", + "    importance_matrix,\n", + "    index=[r['name'] for r in all_results],\n", + "    columns=sorted(all_top_features)\n", + ")\n", + "\n", + "fig, ax = plt.subplots(figsize=(14, 6))\n", + "sns.heatmap(df_importance, cmap='YlOrRd', ax=ax, cbar_kws={'label': 'Importance'})\n", + "ax.set_title('Feature Importance Comparison Across Blends')\n", + "ax.set_xlabel('Features')\n", + "ax.set_ylabel('Blend Configuration')\n", + "plt.xticks(rotation=45, 
ha='right')\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Custom Weight Exploration\n", + "\n", + "Test custom blend weights." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Define custom weights to test\n", + "CUSTOM_WEIGHTS = [\n", + " [0.0, 0.0, 0.5, 0.5], # Only rolling\n", + " [0.3, 0.3, 0.2, 0.2], # Fit-time heavy\n", + " [0.1, 0.4, 0.25, 0.25], # CS heavy + balanced rolling\n", + "]\n", + "\n", + "custom_results = []\n", + "for i, weights in enumerate(CUSTOM_WEIGHTS):\n", + " result = run_single_experiment(f'custom_{i+1}', weights)\n", + " custom_results.append(result)\n", + "\n", + "print(\"\\n\\nCustom weights experiments complete!\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Compare custom with standard\n", + "all_comparison_data = comparison_data.copy()\n", + "\n", + "for r in custom_results:\n", + " ic_mean = r['ic_by_date'].mean()\n", + " ic_std = r['ic_by_date'].std()\n", + " all_comparison_data.append({\n", + " 'Blend': r['name'],\n", + " 'Weights': str(r['weights']),\n", + " 'IC Mean': ic_mean,\n", + " 'IC Std': ic_std,\n", + " 'IR': ic_mean / ic_std if ic_std > 0 else 0,\n", + " 'Return': r['summary'].get('return', np.nan),\n", + " 'Sharpe': r['summary'].get('sharpe', np.nan),\n", + " 'Turnover': r['summary'].get('turnover', np.nan),\n", + " })\n", + "\n", + "df_all = pd.DataFrame(all_comparison_data)\n", + "df_all = df_all.sort_values('IC Mean', ascending=False)\n", + "\n", + "print(\"All Results (standard + custom):\")\n", + "print(df_all.to_string(index=False))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. Conclusion\n", + "\n", + "Summarize findings and recommend best blend configuration." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Best configuration\n", + "best = df_comparison.iloc[0]\n", + "print(\"Recommended Blend Configuration:\")\n", + "print(f\" Name: {best['Blend']}\")\n", + "print(f\" Weights: {best['Weights']}\")\n", + "print(f\"\\nPerformance:\")\n", + "print(f\" IC Mean: {best['IC Mean']:.4f}\")\n", + "print(f\" IC Std: {best['IC Std']:.4f}\")\n", + "print(f\" IR: {best['IR']:.4f}\")\n", + "print(f\" Return: {best['Return']:.4f}\")\n", + "print(f\" Sharpe: {best['Sharpe']:.4f}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.8.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/cta_1d/README.md b/cta_1d/README.md new file mode 100644 index 0000000..664105a --- /dev/null +++ b/cta_1d/README.md @@ -0,0 +1,36 @@ +# CTA 1-Day Return Prediction + +Experiments for predicting CTA (Commodity Trading Advisor) futures 1-day returns. + +## Data + +- **Features**: alpha158, hffactor +- **Labels**: Return indicators (o2c_twap1min, o2o_twap1min, etc.) 
+- **Normalization**: dual (blend of zscore, cs_zscore, rolling_20, rolling_60) + +## Notebooks + +| Notebook | Purpose | +|----------|---------| +| `01_data_check.ipynb` | Load and validate CTA data | +| `02_label_analysis.ipynb` | Explore label distributions and blending | +| `03_baseline_xgb.ipynb` | Train baseline XGBoost model | +| `04_blend_comparison.ipynb` | Compare different normalization blends | + +## Blend Configurations + +The label blending combines 4 normalization methods: +- **zscore**: Fit-time mean/std normalization +- **cs_zscore**: Cross-sectional z-score per datetime +- **rolling_20**: 20-day rolling window normalization +- **rolling_60**: 60-day rolling window normalization + +Predefined weights (from qshare.config.research.cta.labels): +- `equal`: [0.25, 0.25, 0.25, 0.25] +- `zscore_heavy`: [0.5, 0.2, 0.15, 0.15] +- `rolling_heavy`: [0.1, 0.1, 0.3, 0.5] +- `cs_heavy`: [0.2, 0.5, 0.15, 0.15] +- `short_term`: [0.1, 0.1, 0.4, 0.4] +- `long_term`: [0.4, 0.2, 0.2, 0.2] + +Default: [0.2, 0.1, 0.3, 0.4] diff --git a/cta_1d/src/__init__.py b/cta_1d/src/__init__.py new file mode 100644 index 0000000..615b18d --- /dev/null +++ b/cta_1d/src/__init__.py @@ -0,0 +1,5 @@ +"""CTA 1-day task-specific utilities.""" + +from .labels import get_blend_weights, describe_blend_config + +__all__ = ['get_blend_weights', 'describe_blend_config'] diff --git a/cta_1d/src/labels.py b/cta_1d/src/labels.py new file mode 100644 index 0000000..f43a44e --- /dev/null +++ b/cta_1d/src/labels.py @@ -0,0 +1,63 @@ +"""Label blending utilities for CTA experiments.""" + +from typing import Union, List + + +# Predefined blend configurations +BLEND_CONFIGS = { + 'equal': [0.25, 0.25, 0.25, 0.25], + 'zscore_heavy': [0.5, 0.2, 0.15, 0.15], + 'rolling_heavy': [0.1, 0.1, 0.3, 0.5], + 'cs_heavy': [0.2, 0.5, 0.15, 0.15], + 'short_term': [0.1, 0.1, 0.4, 0.4], + 'long_term': [0.4, 0.2, 0.2, 0.2], +} + +DEFAULT_BLEND = [0.2, 0.1, 0.3, 0.4] # [zscore, cs_zscore, roll20, roll60] + + +def 
get_blend_weights(weights: Union[str, List[float], None]) -> List[float]: + """Resolve blend weights from string name or list. + + Args: + weights: Config name, list of 4 floats, or None for default + + Returns: + List of 4 weights summing to 1.0 + """ + if weights is None: + return DEFAULT_BLEND + + if isinstance(weights, str): + if weights not in BLEND_CONFIGS: + raise ValueError(f"Unknown blend config: {weights}. " + f"Available: {list(BLEND_CONFIGS.keys())}") + return BLEND_CONFIGS[weights] + + if isinstance(weights, (list, tuple)): + if len(weights) != 4: + raise ValueError(f"Blend weights must have 4 values, got {len(weights)}") + if abs(sum(weights) - 1.0) > 1e-6: + raise ValueError(f"Blend weights must sum to 1.0, got {sum(weights)}") + return list(weights) + + raise ValueError(f"Invalid blend weights type: {type(weights)}") + + +def describe_blend_config(weights: Union[str, List[float]]) -> str: + """Get human-readable description of blend config. + + Args: + weights: Config name or list of weights + + Returns: + Description string + """ + names = ['zscore', 'cs_zscore', 'rolling_20', 'rolling_60'] + + if isinstance(weights, str): + w = get_blend_weights(weights) + return f"{weights}: {dict(zip(names, w))}" + + w = weights + return f"custom: {dict(zip(names, w))}" diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..a518866 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,22 @@ +# Alpha Lab - Experiment dependencies +# The qshare library is already installed in the virtual environment + +# Jupyter and visualization +jupyter>=7.0.0 +matplotlib>=3.7.0 +seaborn>=0.12.0 +plotly>=5.18.0 + +# Data processing +pandas>=2.0.0 +numpy>=1.24.0 +polars>=0.20.0 +pyarrow>=14.0.0 + +# Machine learning +xgboost>=2.0.0 +scikit-learn>=1.3.0 + +# Utilities +tqdm>=4.65.0 +python-dotenv>=1.0.0 diff --git a/stock_15m/01_data_exploration.ipynb b/stock_15m/01_data_exploration.ipynb new file mode 100644 index 0000000..c9726c5 --- /dev/null +++ 
b/stock_15m/01_data_exploration.ipynb @@ -0,0 +1,810 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Stock 15m Data Exploration\n", + "\n", + "Load and explore 15-minute return prediction data.\n", + "\n", + "**Purpose**: Understand data structure, check data quality, and visualize key statistics." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import polars as pl\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from qshare.data.polars.ret15m import load_dataset, calculate_weights\n", + "from qshare.io.polars import load_from_pq\n", + "\n", + "import sys\n", + "sys.path.insert(0, '../')\n", + "from common.plotting import setup_plot_style\n", + "\n", + "setup_plot_style()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Configuration\n", + "\n", + "Define data paths and parameters." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "CONFIG = {\n", + " # Data paths (adjust as needed)\n", + " 'path_a158': '/data/parquet/stock_1min_alpha158',\n", + " 'path_kline': '/data/parquet/stock_1min',\n", + " 'path_kline_daily': '/data/parquet/stock_1day',\n", + " 'path_industry': '/data/parquet/industry_idx',\n", + " \n", + " # Date range\n", + " 'dt_range': ['2022-01-01', '2024-12-31'],\n", + " \n", + " # Normalization mode\n", + " 'normalization_mode': 'dual', # 'industry', 'cs_zscore', or 'dual'\n", + " \n", + " # Sample weights\n", + " 'positive_factor': 1.0,\n", + " 'negative_factor': 2.0,\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Load Raw Data\n", + "\n", + "Load data as Polars lazy frames first." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load data sources\n", + "print(\"Loading data sources...\")\n", + "\n", + "pl_ldf_a158 = load_from_pq(\n", + " path=CONFIG['path_a158'],\n", + " table_alias=\"a158\",\n", + " start_time=CONFIG['dt_range'][0],\n", + " as_struct=True\n", + ")\n", + "\n", + "pl_ldf_kline = load_from_pq(\n", + " path=CONFIG['path_kline'],\n", + " table_alias=\"kline_1min\",\n", + " start_time=CONFIG['dt_range'][0],\n", + " as_struct=True\n", + ")\n", + "\n", + "pl_ldf_kline_daily = load_from_pq(\n", + " path=CONFIG['path_kline_daily'],\n", + " table_alias=\"kline_1day\",\n", + " start_time=CONFIG['dt_range'][0],\n", + ")\n", + "\n", + "pl_ldf_industry = load_from_pq(\n", + " path=CONFIG['path_industry'],\n", + " table_alias=\"indus_idx\",\n", + " start_time=CONFIG['dt_range'][0],\n", + ")\n", + "\n", + "print(\"Data sources loaded as lazy frames\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check schemas\n", + "print(\"Alpha158 schema:\")\n", + "print(pl_ldf_a158.schema)\n", + "\n", + "print(\"\\nKline 1min schema:\")\n", + "print(pl_ldf_kline.schema)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Load Training Dataset\n", + "\n", + "Use qshare's load_dataset to construct the full training data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Loading training dataset...\")\n", + "print(f\" Date range: {CONFIG['dt_range']}\")\n", + "print(f\" Normalization: {CONFIG['normalization_mode']}\")\n", + "\n", + "pl_df_train = load_dataset(\n", + " pl_ldf_a158_1min=pl_ldf_a158,\n", + " pl_ldf_kline_1min=pl_ldf_kline,\n", + " pl_ldf_kline_1day=pl_ldf_kline_daily,\n", + " pl_ldf_indus_idx=pl_ldf_industry,\n", + " dt_range=CONFIG['dt_range'],\n", + " normalization_mode=CONFIG['normalization_mode'],\n", + " negative_factor=CONFIG['negative_factor'],\n", + " positive_factor=CONFIG['positive_factor'],\n", + ")\n", + "\n", + "# Convert to pandas for easier exploration\n", + "df_train = pl_df_train.to_pandas()\n", + "\n", + "print(f\"\\nDataset shape: {df_train.shape}\")\n", + "print(f\"Columns: {len(df_train.columns)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check column types\n", + "feature_cols = [c for c in df_train.columns if c.startswith('alpha158_')]\n", + "print(f\"\\nAlpha158 features: {len(feature_cols)}\")\n", + "print(f\" Example: {feature_cols[:5]}\")\n", + "\n", + "print(f\"\\nTarget column: {[c for c in df_train.columns if 'return' in c.lower()]}\")\n", + "print(f\"Weight column: {[c for c in df_train.columns if 'weight' in c.lower()]}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. 
Data Quality Check" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Missing values\n", + "missing = df_train.isnull().sum()\n", + "missing_pct = missing / len(df_train) * 100\n", + "\n", + "print(\"Missing values:\")\n", + "print(f\" Columns with missing: {(missing > 0).sum()}\")\n", + "if (missing > 0).sum() > 0:\n", + " print(\"\\nTop columns by missing %:\")\n", + " print(missing_pct[missing_pct > 0].sort_values(ascending=False).head(10))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Data coverage by date\n", + "df_train['datetime'] = pd.to_datetime(df_train.index.get_level_values(0))\n", + "df_train['instrument'] = df_train.index.get_level_values(1)\n", + "\n", + "daily_counts = df_train.groupby('datetime')['instrument'].nunique()\n", + "\n", + "fig, ax = plt.subplots(figsize=(14, 4))\n", + "daily_counts.plot(ax=ax)\n", + "ax.set_title('Number of Instruments per Day')\n", + "ax.set_xlabel('Date')\n", + "ax.set_ylabel('Instrument Count')\n", + "plt.tight_layout()\n", + "plt.show()\n", + "\n", + "print(f\"\\nInstruments per day: {daily_counts.mean():.0f} avg, {daily_counts.min()}-{daily_counts.max()} range\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Target Analysis\n", + "\n", + "Analyze the 15-minute return target distribution." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Identify target column\n", + "target_col = [c for c in df_train.columns if 'return' in c.lower()][0]\n", + "print(f\"Target column: {target_col}\")\n", + "\n", + "# Target statistics\n", + "print(f\"\\nTarget statistics:\")\n", + "print(df_train[target_col].describe())\n", + "\n", + "print(f\"\\nSkewness: {df_train[target_col].skew():.3f}\")\n", + "print(f\"Kurtosis: {df_train[target_col].kurtosis():.3f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Target distribution\n", + "fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n", + "\n", + "# Histogram\n", + "df_train[target_col].hist(bins=100, ax=axes[0], edgecolor='black', alpha=0.7)\n", + "axes[0].set_title(f'{target_col} Distribution')\n", + "axes[0].axvline(x=0, color='red', linestyle='--')\n", + "axes[0].set_xlim(-0.05, 0.05) # Focus on main distribution\n", + "\n", + "# Time series of daily mean target\n", + "daily_mean_target = df_train.groupby('datetime')[target_col].mean()\n", + "axes[1].plot(daily_mean_target.index, daily_mean_target.values)\n", + "axes[1].set_title('Daily Mean Target')\n", + "axes[1].axhline(y=0, color='red', linestyle='--')\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. 
Feature Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Feature statistics\n", + "feature_stats = df_train[feature_cols].describe().T\n", + "\n", + "print(\"Feature statistics summary:\")\n", + "print(f\" Mean range: [{feature_stats['mean'].min():.4f}, {feature_stats['mean'].max():.4f}]\")\n", + "print(f\" Std range: [{feature_stats['std'].min():.4f}, {feature_stats['std'].max():.4f}]\")\n", + "\n", + "# Check for features with extreme values\n", + "extreme_features = feature_stats[\n", + " (feature_stats['mean'].abs() > 10) | (feature_stats['std'] > 100)\n", + "]\n", + "if len(extreme_features) > 0:\n", + " print(f\"\\nFeatures with extreme values: {len(extreme_features)}\")\n", + " print(extreme_features.head())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Sample a few features for visualization\n", + "sample_features = feature_cols[:4]\n", + "\n", + "fig, axes = plt.subplots(2, 2, figsize=(12, 8))\n", + "axes = axes.flatten()\n", + "\n", + "for i, feat in enumerate(sample_features):\n", + " df_train[feat].hist(bins=100, ax=axes[i], edgecolor='black', alpha=0.7)\n", + " axes[i].set_title(feat)\n", + " axes[i].axvline(x=0, color='red', linestyle='--')\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. 
Sample Weights Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check weights if available\n", + "weight_cols = [c for c in df_train.columns if 'weight' in c.lower()]\n", + "if weight_cols:\n", + "    weight_col = weight_cols[0]\n", + "    print(f\"Weight column: {weight_col}\")\n", + "    print(f\"\\nWeight statistics:\")\n", + "    print(df_train[weight_col].describe())\n", + "    \n", + "    # Plot weight distribution by target sign\n", + "    fig, ax = plt.subplots(figsize=(10, 4))\n", + "    \n", + "    positive_mask = df_train[target_col] > 0\n", + "    df_train.loc[positive_mask, weight_col].hist(\n", + "        bins=50, alpha=0.5, label='Positive target', ax=ax\n", + "    )\n", + "    df_train.loc[~positive_mask, weight_col].hist(\n", + "        bins=50, alpha=0.5, label='Negative target', ax=ax\n", + "    )\n", + "    ax.set_title('Weight Distribution by Target Sign')\n", + "    ax.legend()\n", + "    plt.tight_layout()\n", + "    plt.show()\n", + "else:\n", + "    print(\"No weight column found\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.8.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/stock_15m/02_baseline_model.ipynb b/stock_15m/02_baseline_model.ipynb new file mode 100644 index 0000000..0000000 --- /dev/null +++ b/stock_15m/02_baseline_model.ipynb @@ -0,0 +1,430 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Stock 15m Baseline Model\n", + "\n", + "Train and evaluate a baseline XGBoost model for 15-minute return prediction.\n", + "\n", + "**Purpose**: Establish baseline performance with standard configuration." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import polars as pl\n", + "import matplotlib.pyplot as plt\n", + "import xgboost as xgb\n", + "from sklearn.metrics import r2_score\n", + "\n", + "from qshare.data.polars.ret15m import load_dataset\n", + "from qshare.io.polars import load_from_pq\n", + "\n", + "import sys\n", + "sys.path.insert(0, '../')\n", + "from common.plotting import setup_plot_style\n", + "from common.paths import create_experiment_dir\n", + "\n", + "setup_plot_style()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "CONFIG = {\n", + " # Experiment\n", + " 'experiment_name': 'baseline_xgb',\n", + " 'save_results': True,\n", + " \n", + " # Data paths\n", + " 'path_a158': '/data/parquet/stock_1min_alpha158',\n", + " 'path_kline': '/data/parquet/stock_1min',\n", + " 'path_kline_daily': '/data/parquet/stock_1day',\n", + " 'path_industry': '/data/parquet/industry_idx',\n", + " \n", + " # Date ranges\n", + " 'dt_range': ['2022-01-01', '2024-12-31'],\n", + " 'train_range': ['2022-01-01', '2023-12-31'],\n", + " 'test_range': ['2024-01-01', '2024-12-31'],\n", + " \n", + " # Data config\n", + " 'normalization_mode': 'dual',\n", + " 'positive_factor': 1.0,\n", + " 'negative_factor': 2.0,\n", + " \n", + " # Model\n", + " 'model_params': {\n", + " 'objective': 'reg:squarederror',\n", + " 'eval_metric': 'rmse',\n", + " 'max_depth': 6,\n", + " 'learning_rate': 0.1,\n", + " 'n_estimators': 100,\n", + " 'subsample': 0.8,\n", + " 'colsample_bytree': 0.8,\n", + " 'random_state': 42,\n", + " },\n", + "}\n", + "\n", + "print(\"Configuration:\")\n", + "for key, value in CONFIG.items():\n", + " if not isinstance(value, dict):\n", + " print(f\" {key}: {value}\")\n", + "print(f\"\\nModel 
params: {CONFIG['model_params']}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Load Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Loading data sources...\")\n", + "\n", + "pl_ldf_a158 = load_from_pq(\n", + " path=CONFIG['path_a158'],\n", + " table_alias=\"a158\",\n", + " start_time=CONFIG['dt_range'][0],\n", + " as_struct=True\n", + ")\n", + "\n", + "pl_ldf_kline = load_from_pq(\n", + " path=CONFIG['path_kline'],\n", + " table_alias=\"kline_1min\",\n", + " start_time=CONFIG['dt_range'][0],\n", + " as_struct=True\n", + ")\n", + "\n", + "pl_ldf_kline_daily = load_from_pq(\n", + " path=CONFIG['path_kline_daily'],\n", + " table_alias=\"kline_1day\",\n", + " start_time=CONFIG['dt_range'][0],\n", + ")\n", + "\n", + "pl_ldf_industry = load_from_pq(\n", + " path=CONFIG['path_industry'],\n", + " table_alias=\"indus_idx\",\n", + " start_time=CONFIG['dt_range'][0],\n", + ")\n", + "\n", + "print(\"Loading dataset...\")\n", + "pl_df = load_dataset(\n", + " pl_ldf_a158_1min=pl_ldf_a158,\n", + " pl_ldf_kline_1min=pl_ldf_kline,\n", + " pl_ldf_kline_1day=pl_ldf_kline_daily,\n", + " pl_ldf_indus_idx=pl_ldf_industry,\n", + " dt_range=CONFIG['dt_range'],\n", + " normalization_mode=CONFIG['normalization_mode'],\n", + " negative_factor=CONFIG['negative_factor'],\n", + " positive_factor=CONFIG['positive_factor'],\n", + ")\n", + "\n", + "# Convert to pandas\n", + "df_full = pl_df.to_pandas()\n", + "print(f\"\\nFull dataset shape: {df_full.shape}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. 
Prepare Train/Test Split" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Identify columns\n", + "feature_cols = [c for c in df_full.columns if c.startswith('alpha158_')]\n", + "target_cols = [c for c in df_full.columns if 'return' in c.lower()]\n", + "weight_cols = [c for c in df_full.columns if 'weight' in c.lower()]\n", + "\n", + "print(f\"Features: {len(feature_cols)}\")\n", + "print(f\"Targets: {target_cols}\")\n", + "print(f\"Weights: {weight_cols}\")\n", + "\n", + "# Select target\n", + "target_col = target_cols[0]\n", + "weight_col = weight_cols[0] if weight_cols else None\n", + "\n", + "# Split by date\n", + "df_train = df_full.loc[CONFIG['train_range'][0]:CONFIG['train_range'][1]]\n", + "df_test = df_full.loc[CONFIG['test_range'][0]:CONFIG['test_range'][1]]\n", + "\n", + "print(f\"\\nTrain: {df_train.shape}\")\n", + "print(f\"Test: {df_test.shape}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. 
Train Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Prepare data\n", + "X_train = df_train[feature_cols]\n", + "y_train = df_train[target_col]\n", + "w_train = df_train[weight_col] if weight_col else None\n", + "\n", + "X_test = df_test[feature_cols]\n", + "y_test = df_test[target_col]\n", + "\n", + "# Handle missing values\n", + "X_train = X_train.fillna(X_train.median())\n", + "X_test = X_test.fillna(X_train.median()) # Use train median\n", + "\n", + "print(\"Training XGBoost model...\")\n", + "print(f\" X shape: {X_train.shape}\")\n", + "print(f\" y mean: {y_train.mean():.6f}, std: {y_train.std():.6f}\")\n", + "\n", + "model = xgb.XGBRegressor(**CONFIG['model_params'])\n", + "\n", + "model.fit(\n", + " X_train, y_train,\n", + " sample_weight=w_train,\n", + " eval_set=[(X_test, y_test)],\n", + " verbose=False\n", + ")\n", + "\n", + "print(\"Training complete!\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Feature importance\n", + "importance = pd.DataFrame({\n", + " 'feature': feature_cols,\n", + " 'importance': model.feature_importances_\n", + "}).sort_values('importance', ascending=False)\n", + "\n", + "print(\"\\nTop 10 Features:\")\n", + "print(importance.head(10))\n", + "\n", + "# Plot\n", + "fig, ax = plt.subplots(figsize=(10, 6))\n", + "importance.head(20).plot(x='feature', y='importance', kind='barh', ax=ax)\n", + "ax.set_title('Top 20 Feature Importance')\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. 
Evaluate" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Generate predictions\n", + "y_pred_train = model.predict(X_train)\n", + "y_pred_test = model.predict(X_test)\n", + "\n", + "# Calculate metrics\n", + "train_r2 = r2_score(y_train, y_pred_train)\n", + "test_r2 = r2_score(y_test, y_pred_test)\n", + "\n", + "# IC (Information Coefficient)\n", + "train_ic = np.corrcoef(y_train, y_pred_train)[0, 1]\n", + "test_ic = np.corrcoef(y_test, y_pred_test)[0, 1]\n", + "\n", + "print(\"Performance Metrics:\")\n", + "print(f\" Train R2: {train_r2:.4f}\")\n", + "print(f\" Test R2: {test_r2:.4f}\")\n", + "print(f\" Train IC: {train_ic:.4f}\")\n", + "print(f\" Test IC: {test_ic:.4f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Daily IC analysis\n", + "df_test_eval = df_test.copy()\n", + "df_test_eval['pred'] = y_pred_test\n", + "df_test_eval['target'] = y_test\n", + "\n", + "df_test_eval['datetime'] = df_test_eval.index.get_level_values(0)\n", + "\n", + "# Calculate daily IC\n", + "daily_ic = df_test_eval.groupby('datetime').apply(\n", + " lambda x: x['target'].corr(x['pred'])\n", + ")\n", + "\n", + "print(\"\\nDaily IC Statistics:\")\n", + "print(f\" Mean: {daily_ic.mean():.4f}\")\n", + "print(f\" Std: {daily_ic.std():.4f}\")\n", + "print(f\" IR: {daily_ic.mean() / daily_ic.std():.4f}\")\n", + "print(f\" >0: {(daily_ic > 0).mean():.1%}\")\n", + "\n", + "# Plot\n", + "fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n", + "\n", + "# IC distribution\n", + "daily_ic.hist(bins=50, ax=axes[0], edgecolor='black')\n", + "axes[0].axvline(x=0, color='red', linestyle='--')\n", + "axes[0].axvline(x=daily_ic.mean(), color='green', linestyle='--', label=f'Mean: {daily_ic.mean():.3f}')\n", + "axes[0].set_title('Daily IC Distribution')\n", + "axes[0].legend()\n", + "\n", + "# IC time series\n", + "daily_ic.rolling(20, 
min_periods=5).mean().plot(ax=axes[1])\n", + "axes[1].axhline(y=0, color='red', linestyle='--')\n", + "axes[1].set_title('Rolling IC (20-day)')\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Prediction vs Actual scatter\n", + "fig, ax = plt.subplots(figsize=(8, 8))\n", + "\n", + "# Sample for plotting\n", + "sample_idx = np.random.choice(len(y_test), size=min(10000, len(y_test)), replace=False)\n", + "ax.scatter(y_test.iloc[sample_idx], y_pred_test[sample_idx], alpha=0.3, s=1)\n", + "\n", + "# Perfect prediction line\n", + "lims = [min(y_test.min(), y_pred_test.min()), max(y_test.max(), y_pred_test.max())]\n", + "ax.plot(lims, lims, 'r--', alpha=0.5)\n", + "\n", + "ax.set_xlabel('Actual')\n", + "ax.set_ylabel('Predicted')\n", + "ax.set_title(f'Prediction vs Actual (IC={test_ic:.3f})')\n", + "ax.grid(True, alpha=0.3)\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. 
Save Results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if CONFIG['save_results']:\n", + " import pickle\n", + " import json\n", + " \n", + " output_dir = create_experiment_dir('stock_15m', CONFIG['experiment_name'])\n", + " print(f\"Saving results to: {output_dir}\")\n", + " \n", + " # Save config\n", + " with open(output_dir / 'config.json', 'w') as f:\n", + " json.dump(CONFIG, f, indent=2, default=str)\n", + " \n", + " # Save model\n", + " with open(output_dir / 'model.pkl', 'wb') as f:\n", + " pickle.dump(model, f)\n", + " \n", + " # Save importance\n", + " importance.to_csv(output_dir / 'feature_importance.csv', index=False)\n", + " \n", + " # Save predictions\n", + " predictions = pd.DataFrame({\n", + " 'actual': y_test,\n", + " 'predicted': y_pred_test\n", + " }, index=df_test.index)\n", + " predictions.to_csv(output_dir / 'predictions.csv')\n", + " \n", + " # Save metrics\n", + " metrics = {\n", + " 'train_r2': float(train_r2),\n", + " 'test_r2': float(test_r2),\n", + " 'train_ic': float(train_ic),\n", + " 'test_ic': float(test_ic),\n", + " 'daily_ic_mean': float(daily_ic.mean()),\n", + " 'daily_ic_std': float(daily_ic.std()),\n", + " 'daily_ir': float(daily_ic.mean() / daily_ic.std()),\n", + " }\n", + " with open(output_dir / 'metrics.json', 'w') as f:\n", + " json.dump(metrics, f, indent=2)\n", + " \n", + " print(\"\\nFiles saved:\")\n", + " for f in output_dir.iterdir():\n", + " print(f\" - {f.name}\")\n", + "else:\n", + " print(\"Results not saved\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.8.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/stock_15m/02_baseline_model.ipynb b/stock_15m/02_baseline_model.ipynb new file mode 100644 index 0000000..6fbf850 --- /dev/null +++ 
b/stock_15m/02_baseline_model.ipynb @@ -0,0 +1,257 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Stock 15m Baseline Model\n", + "\n", + "Train and evaluate a baseline XGBoost model for 15-minute return prediction.\n", + "\n", + "**Purpose**: Establish baseline performance with standard configuration." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import polars as pl\n", + "import matplotlib.pyplot as plt\n", + "import xgboost as xgb\n", + "from sklearn.metrics import r2_score\n", + "\n", + "from qshare.data.polars.ret15m import load_dataset\n", + "from qshare.io.polars import load_from_pq\n", + "\n", + "import sys\n", + "sys.path.insert(0, '../')\n", + "from common.plotting import setup_plot_style\n", + "from common.paths import create_experiment_dir\n", + "\n", + "setup_plot_style()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "CONFIG = {\n", + " 'experiment_name': 'baseline_xgb',\n", + " 'save_results': True,\n", + " 'path_a158': '/data/parquet/stock_1min_alpha158',\n", + " 'path_kline': '/data/parquet/stock_1min',\n", + " 'path_kline_daily': '/data/parquet/stock_1day',\n", + " 'path_industry': '/data/parquet/industry_idx',\n", + " 'dt_range': ['2022-01-01', '2024-12-31'],\n", + " 'train_range': ['2022-01-01', '2023-12-31'],\n", + " 'test_range': ['2024-01-01', '2024-12-31'],\n", + " 'normalization_mode': 'dual',\n", + " 'positive_factor': 1.0,\n", + " 'negative_factor': 2.0,\n", + " 'model_params': {\n", + " 'objective': 'reg:squarederror',\n", + " 'eval_metric': 'rmse',\n", + " 'max_depth': 6,\n", + " 'learning_rate': 0.1,\n", + " 'n_estimators': 100,\n", + " 'subsample': 0.8,\n", + " 'colsample_bytree': 0.8,\n", + " 'random_state': 42,\n", + " },\n", + "}\n", + "\n", + "print('Configuration:')\n", + "for key, value in CONFIG.items():\n", + " if not isinstance(value, dict):\n", + " print(f' {key}: {value}')\n", + "print(f\"Model params: {CONFIG['model_params']}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. 
Load Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print('Loading data sources...')\n", + "\n", + "pl_ldf_a158 = load_from_pq(\n", + " path=CONFIG['path_a158'],\n", + " table_alias='a158',\n", + " start_time=CONFIG['dt_range'][0],\n", + " as_struct=True\n", + ")\n", + "\n", + "pl_ldf_kline = load_from_pq(\n", + " path=CONFIG['path_kline'],\n", + " table_alias='kline_1min',\n", + " start_time=CONFIG['dt_range'][0],\n", + " as_struct=True\n", + ")\n", + "\n", + "pl_ldf_kline_daily = load_from_pq(\n", + " path=CONFIG['path_kline_daily'],\n", + " table_alias='kline_1day',\n", + " start_time=CONFIG['dt_range'][0],\n", + ")\n", + "\n", + "pl_ldf_industry = load_from_pq(\n", + " path=CONFIG['path_industry'],\n", + " table_alias='indus_idx',\n", + " start_time=CONFIG['dt_range'][0],\n", + ")\n", + "\n", + "print('Loading dataset...')\n", + "pl_df = load_dataset(\n", + " pl_ldf_a158_1min=pl_ldf_a158,\n", + " pl_ldf_kline_1min=pl_ldf_kline,\n", + " pl_ldf_kline_1day=pl_ldf_kline_daily,\n", + " pl_ldf_indus_idx=pl_ldf_industry,\n", + " dt_range=CONFIG['dt_range'],\n", + " normalization_mode=CONFIG['normalization_mode'],\n", + " negative_factor=CONFIG['negative_factor'],\n", + " positive_factor=CONFIG['positive_factor'],\n", + ")\n", + "\n", + "df_full = pl_df.to_pandas()\n", + "print(f'Full dataset shape: {df_full.shape}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. 
Train/Test Split and Model Training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "feature_cols = [c for c in df_full.columns if c.startswith('alpha158_')]\n", + "target_cols = [c for c in df_full.columns if 'return' in c.lower()]\n", + "weight_cols = [c for c in df_full.columns if 'weight' in c.lower()]\n", + "\n", + "target_col = target_cols[0]\n", + "weight_col = weight_cols[0] if weight_cols else None\n", + "\n", + "df_train = df_full.loc[CONFIG['train_range'][0]:CONFIG['train_range'][1]]\n", + "df_test = df_full.loc[CONFIG['test_range'][0]:CONFIG['test_range'][1]]\n", + "\n", + "print(f'Train: {df_train.shape}, Test: {df_test.shape}')\n", + "print(f'Features: {len(feature_cols)}, Target: {target_col}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_train = df_train[feature_cols].fillna(df_train[feature_cols].median())\n", + "y_train = df_train[target_col]\n", + "w_train = df_train[weight_col] if weight_col else None\n", + "\n", + "X_test = df_test[feature_cols].fillna(df_train[feature_cols].median())\n", + "y_test = df_test[target_col]\n", + "\n", + "print('Training XGBoost...')\n", + "model = xgb.XGBRegressor(**CONFIG['model_params'])\n", + "model.fit(X_train, y_train, sample_weight=w_train, verbose=False)\n", + "print('Training complete!')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. 
Evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "y_pred_test = model.predict(X_test)\n", + "\n", + "test_r2 = r2_score(y_test, y_pred_test)\n", + "test_ic = np.corrcoef(y_test, y_pred_test)[0, 1]\n", + "\n", + "print(f'Test R2: {test_r2:.4f}')\n", + "print(f'Test IC: {test_ic:.4f}')\n", + "\n", + "# Daily IC\n", + "df_test_eval = df_test.copy()\n", + "df_test_eval['pred'] = y_pred_test\n", + "df_test_eval['target'] = y_test\n", + "df_test_eval['datetime'] = df_test_eval.index.get_level_values(0)\n", + "\n", + "daily_ic = df_test_eval.groupby('datetime').apply(\n", + " lambda x: x['target'].corr(x['pred'])\n", + ")\n", + "\n", + "print(f'Daily IC Mean: {daily_ic.mean():.4f}')\n", + "print(f'Daily IC Std: {daily_ic.std():.4f}')\n", + "print(f'IR: {daily_ic.mean() / daily_ic.std():.4f}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Plot daily IC\n", + "fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n", + "\n", + "daily_ic.hist(bins=50, ax=axes[0], edgecolor='black')\n", + "axes[0].axvline(x=daily_ic.mean(), color='green', linestyle='--')\n", + "axes[0].set_title('Daily IC Distribution')\n", + "\n", + "daily_ic.rolling(20, min_periods=5).mean().plot(ax=axes[1])\n", + "axes[1].axhline(y=0, color='red', linestyle='--')\n", + "axes[1].set_title('Rolling IC (20-day)')\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.8.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/stock_15m/README.md b/stock_15m/README.md new file mode 100644 index 0000000..907eb83 --- /dev/null +++ b/stock_15m/README.md @@ -0,0 +1,25 @@ +# Stock 15-Minute Return Prediction + +Experiments for predicting stock 15-minute returns 
using high-frequency features. + +## Data + +- **Features**: alpha158 computed on 1-minute data +- **Target**: 15-minute forward returns (close[t+16]/close[t+1]-1) +- **Normalization**: industry, cs_zscore, or dual + +## Notebooks + +| Notebook | Purpose | +|----------|---------| +| `01_data_exploration.ipynb` | Load and explore 15m data structure | +| `02_baseline_model.ipynb` | Train baseline XGBoost model | + +## Methodology + +1. Load 1-minute kline data via Polars lazy frames +2. Compute/retrieve alpha158 features +3. Calculate 15-minute forward returns +4. Apply normalization (industry-neutralized or cross-sectional z-score) +5. Train gradient boosting models +6. Evaluate with IC and backtest diff --git a/stock_15m/src/__init__.py b/stock_15m/src/__init__.py new file mode 100644 index 0000000..7177588 --- /dev/null +++ b/stock_15m/src/__init__.py @@ -0,0 +1,3 @@ +"""Stock 15m task-specific utilities.""" + +# Add task-specific functions here as needed