{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# CTA 1D Baseline XGBoost Model\n", "\n", "Train and evaluate a baseline XGBoost model for CTA 1-day return prediction.\n", "\n", "**Purpose**: Establish a baseline performance benchmark with standard configuration." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import json\n", "from datetime import datetime\n", "\n", "from qshare.data.pandas.cta_1d import load_dataset\n", "from qshare.algo.learning.cta_trainer import CTAXGBTrainer\n", "from qshare.eval.cta.backtest import CTABacktester\n", "\n", "import sys\n", "sys.path.insert(0, '../')\n", "from common.plotting import setup_plot_style, plot_ic_series, plot_cumulative_returns\n", "from common.paths import create_experiment_dir\n", "from src.labels import get_blend_weights, describe_blend_config\n", "\n", "setup_plot_style()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1. Configuration\n", "\n", "Edit this cell to modify experiment parameters." 
] },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "CONFIG = {\n",
    "    # Experiment\n",
    "    'experiment_name': 'baseline_xgb',  # Will be appended with timestamp\n",
    "\n",
    "    # Date ranges\n",
    "    'dt_range': ['2020-01-01', '2024-12-31'],\n",
    "    'train_range': ['2020-01-01', '2022-12-31'],\n",
    "    'test_range': ['2023-01-01', '2024-12-31'],\n",
    "    'fit_range': ['2020-01-01', '2021-06-30'],  # For normalization fitting\n",
    "\n",
    "    # Data\n",
    "    'feature_sets': ['alpha158', 'hffactor'],\n",
    "    'return_type': 'o2c_twap1min',\n",
    "    'normalization': 'dual',\n",
    "    'blend_weights': None,  # Use default [0.2, 0.1, 0.3, 0.4] or specify name/list\n",
    "    'weight_factors': {'positive': 1.0, 'negative': 2.0},\n",
    "\n",
    "    # Model (unpacked as keyword arguments into CTAXGBTrainer)\n",
    "    'xgb_params': {\n",
    "        'booster': 'gblinear',\n",
    "        'eta': 0.5,\n",
    "        'lambda_reg': 0.1,\n",
    "        'num_round': 20,\n",
    "    },\n",
    "\n",
    "    # Backtest (unpacked as keyword arguments into CTABacktester)\n",
    "    'backtest_params': {\n",
    "        'num_trades': 4,\n",
    "        'signal_dist': 'normal',\n",
    "        'pos_weight': True,\n",
    "    },\n",
    "\n",
    "    # Output\n",
    "    'save_results': True,\n",
    "}\n",
    "\n",
    "# Annualization factor shared by the annual-return and Sharpe calculations.\n",
    "TRADING_DAYS_PER_YEAR = 252\n",
    "\n",
    "# The train/test split below uses inclusive label slices, so the two windows\n",
    "# must not touch or overlap; otherwise test rows would leak into training.\n",
    "train_end = pd.Timestamp(CONFIG['train_range'][1])\n",
    "test_start = pd.Timestamp(CONFIG['test_range'][0])\n",
    "assert train_end < test_start, \"train_range must end before test_range starts\"\n",
    "\n",
    "print(\"Configuration:\")\n",
    "print(f\" Experiment: {CONFIG['experiment_name']}\")\n",
    "print(f\" Train: {CONFIG['train_range'][0]} to {CONFIG['train_range'][1]}\")\n",
    "print(f\" Test: {CONFIG['test_range'][0]} to {CONFIG['test_range'][1]}\")\n",
    "print(f\" Blend: {describe_blend_config(CONFIG['blend_weights'] or 'default')}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Load Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Loading dataset...\")\n",
    "df_full = load_dataset(\n",
    "    dt_range=CONFIG['dt_range'],\n",
    "    return_type=CONFIG['return_type'],\n",
    "    normalization=CONFIG['normalization'],\n",
    "    feature_sets=CONFIG['feature_sets'],\n",
    "    fit_range=CONFIG['fit_range'],\n",
    "    weight_factors=CONFIG['weight_factors'],\n",
    "    blend_weights=CONFIG['blend_weights'],\n",
    ")\n",
    "\n",
    "print(f\"\\nDataset shape: {df_full.shape}\")\n",
    "print(f\"Columns: {len(df_full.columns)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split train/test (label slices on the index are inclusive at both ends)\n",
    "df_train = df_full.loc[CONFIG['train_range'][0]:CONFIG['train_range'][1]]\n",
    "df_test = df_full.loc[CONFIG['test_range'][0]:CONFIG['test_range'][1]]\n",
    "\n",
    "print(f\"Train: {df_train.shape}\")\n",
    "print(f\"Test: {df_test.shape}\")\n",
    "\n",
    "# Fail fast here rather than deep inside training or the backtest.\n",
    "assert not df_train.empty, \"train_range selected no rows from df_full\"\n",
    "assert not df_test.empty, \"test_range selected no rows from df_full\"\n",
    "\n",
    "# Get feature columns: anything carrying one of the known feature prefixes\n",
    "feature_cols = [c for c in df_train.columns\n",
    "                if c.startswith(('alpha158_', 'hf_', 'f_'))]\n",
    "print(f\"\\nFeatures: {len(feature_cols)}\")\n",
    "assert feature_cols, \"no columns matched the feature prefixes\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Train Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Training XGBoost model...\")\n",
    "print(f\" Params: {CONFIG['xgb_params']}\")\n",
    "\n",
    "trainer = CTAXGBTrainer(**CONFIG['xgb_params'])\n",
    "\n",
    "trainer.fit(\n",
    "    df_train,\n",
    "    feature_cols=feature_cols,\n",
    "    target_col='label',\n",
    "    weight_col='weight'\n",
    ")\n",
    "\n",
    "print(\"\\nTraining complete!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Feature importance\n",
    "importance = trainer.get_feature_importance()\n",
    "print(\"\\nTop 10 Features:\")\n",
    "print(importance.head(10))\n",
    "\n",
    "# Plot\n",
    "fig, ax = plt.subplots(figsize=(10, 6))\n",
    "importance.head(20).plot(kind='barh', ax=ax)\n",
    "ax.set_title('Top 20 Feature Importance')\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Generate Predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Generating predictions on test set...\")\n",
    "df_signal = trainer.predict(df_test)\n",
    "\n",
    "print(\"\\nSignal statistics:\")\n",
    "print(df_signal.describe())\n",
    "\n",
    "# Plot signal distribution\n",
    "fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n",
    "\n",
    "df_signal.hist(bins=100, ax=axes[0], edgecolor='black')\n",
    "axes[0].set_title('Signal Distribution')\n",
    "axes[0].axvline(x=0, color='red', linestyle='--')\n",
    "\n",
    "# Average the signal over the first index level (one point per group)\n",
    "signal_by_date = df_signal.groupby(level=0).mean()\n",
    "axes[1].plot(signal_by_date.index, signal_by_date.values)\n",
    "axes[1].set_title('Mean Signal by Date')\n",
    "axes[1].axhline(y=0, color='red', linestyle='--')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Evaluate with Backtest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Running backtest...\")\n",
    "\n",
    "# Prefer the 'return' column; 'label' is only a stand-in when it is absent,\n",
    "# so say so loudly instead of silently changing what the backtest measures.\n",
    "if 'return' in df_test.columns:\n",
    "    returns = df_test['return']\n",
    "else:\n",
    "    print(\"WARNING: 'return' column not found; backtesting against 'label' instead\")\n",
    "    returns = df_test['label']\n",
    "\n",
    "backtester = CTABacktester(**CONFIG['backtest_params'])\n",
    "results = backtester.run(returns, df_signal)\n",
    "\n",
    "summary = backtester.summary()\n",
    "print(\"\\nBacktest Summary:\")\n",
    "for key, value in summary.items():\n",
    "    if isinstance(value, float):\n",
    "        print(f\" {key}: {value:.4f}\")\n",
    "    else:\n",
    "        print(f\" {key}: {value}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# IC Analysis: average the 'ic' column within each first-level index value\n",
    "ic_by_date = results.groupby(results.index.get_level_values(0))['ic'].mean()\n",
    "\n",
    "fig = plot_ic_series(ic_by_date, title=\"IC Over Time (Test Set)\")\n",
    "plt.show()\n",
    "\n",
    "ic_mean = ic_by_date.mean()\n",
    "ic_std = ic_by_date.std()\n",
    "# IR is undefined when the IC series has zero or NaN dispersion\n",
    "# (e.g. a single observation); report NaN rather than inf.\n",
    "ic_ir = ic_mean / ic_std if ic_std > 0 else float('nan')\n",
    "\n",
    "print(\"\\nIC Statistics:\")\n",
    "print(f\" Mean: {ic_mean:.4f}\")\n",
    "print(f\" Std: {ic_std:.4f}\")\n",
    "print(f\" IR: {ic_ir:.4f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cumulative returns: average 'pos_return' within each first-level index value\n",
    "daily_returns = results.groupby(results.index.get_level_values(0))['pos_return'].mean()\n",
    "\n",
    "fig = plot_cumulative_returns(daily_returns, title=\"Cumulative Strategy Returns\")\n",
    "plt.show()\n",
    "\n",
    "n_days = len(daily_returns)\n",
    "total_return = (1 + daily_returns).prod() - 1\n",
    "\n",
    "# Geometric annualization; undefined (NaN) when the backtest produced no rows.\n",
    "if n_days > 0:\n",
    "    annual_return = (1 + total_return) ** (TRADING_DAYS_PER_YEAR / n_days) - 1\n",
    "else:\n",
    "    annual_return = float('nan')\n",
    "\n",
    "# Sharpe is undefined when returns have zero or NaN dispersion.\n",
    "returns_std = daily_returns.std()\n",
    "if returns_std > 0:\n",
    "    sharpe = daily_returns.mean() / returns_std * np.sqrt(TRADING_DAYS_PER_YEAR)\n",
    "else:\n",
    "    sharpe = float('nan')\n",
    "\n",
    "print(\"\\nReturn Statistics:\")\n",
    "print(f\" Total Return: {total_return:.2%}\")\n",
    "print(f\" Annual Return: {annual_return:.2%}\")\n",
    "print(f\" Sharpe Ratio: {sharpe:.2f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Save Results\n",
    "\n",
    "Save model, predictions, and metrics for later analysis."
] },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if CONFIG['save_results']:\n",
    "    # Create output directory\n",
    "    output_dir = create_experiment_dir('cta_1d', CONFIG['experiment_name'])\n",
    "    print(f\"Saving results to: {output_dir}\")\n",
    "\n",
    "    # Save config (default=str stringifies any value json cannot encode)\n",
    "    with open(output_dir / 'config.json', 'w') as config_file:\n",
    "        json.dump(CONFIG, config_file, indent=2, default=str)\n",
    "\n",
    "    # Save model\n",
    "    trainer.save_model(str(output_dir / 'model.pkl'))\n",
    "\n",
    "    # Save feature importance\n",
    "    importance.to_csv(output_dir / 'feature_importance.csv')\n",
    "\n",
    "    # Save predictions\n",
    "    df_signal.to_csv(output_dir / 'predictions.csv')\n",
    "\n",
    "    # Save backtest results\n",
    "    results.to_csv(output_dir / 'backtest_results.csv')\n",
    "\n",
    "    # Save summary\n",
    "    with open(output_dir / 'summary.json', 'w') as summary_file:\n",
    "        json.dump(summary, summary_file, indent=2, default=str)\n",
    "\n",
    "    print(\"\\nFiles saved:\")\n",
    "    # iterdir() yields entries in arbitrary order; sort for a stable listing.\n",
    "    for saved_path in sorted(output_dir.iterdir()):\n",
    "        print(f\" - {saved_path.name}\")\n",
    "else:\n",
    "    print(\"Results not saved (save_results=False)\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}