You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
810 lines
23 KiB
810 lines
23 KiB
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Stock 15m Data Exploration\n",
|
|
"\n",
|
|
"Load and explore 15-minute return prediction data.\n",
|
|
"\n",
|
|
"**Purpose**: Understand data structure, check data quality, and visualize key statistics."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"import polars as pl\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"\n",
|
|
"from qshare.data.polars.ret15m import load_dataset, calculate_weights\n",
|
|
"from qshare.io.polars import load_from_pq\n",
|
|
"\n",
|
|
"import sys\n",
|
|
"sys.path.insert(0, '../')\n",
|
|
"from common.plotting import setup_plot_style\n",
|
|
"\n",
|
|
"setup_plot_style()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 1. Configuration\n",
|
|
"\n",
|
|
"Define data paths and parameters."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"CONFIG = {\n",
|
|
" # Data paths (adjust as needed)\n",
|
|
" 'path_a158': '/data/parquet/stock_1min_alpha158',\n",
|
|
" 'path_kline': '/data/parquet/stock_1min',\n",
|
|
" 'path_kline_daily': '/data/parquet/stock_1day',\n",
|
|
" 'path_industry': '/data/parquet/industry_idx',\n",
|
|
" \n",
|
|
" # Date range\n",
|
|
" 'dt_range': ['2022-01-01', '2024-12-31'],\n",
|
|
" \n",
|
|
" # Normalization mode\n",
|
|
" 'normalization_mode': 'dual', # 'industry', 'cs_zscore', or 'dual'\n",
|
|
" \n",
|
|
" # Sample weights\n",
|
|
" 'positive_factor': 1.0,\n",
|
|
" 'negative_factor': 2.0,\n",
|
|
"}"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 2. Load Raw Data\n",
|
|
"\n",
|
|
"Load data as Polars lazy frames first."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Load data sources\n",
|
|
"print(\"Loading data sources...\")\n",
|
|
"\n",
|
|
"pl_ldf_a158 = load_from_pq(\n",
|
|
" path=CONFIG['path_a158'],\n",
|
|
" table_alias=\"a158\",\n",
|
|
" start_time=CONFIG['dt_range'][0],\n",
|
|
" as_struct=True\n",
|
|
")\n",
|
|
"\n",
|
|
"pl_ldf_kline = load_from_pq(\n",
|
|
" path=CONFIG['path_kline'],\n",
|
|
" table_alias=\"kline_1min\",\n",
|
|
" start_time=CONFIG['dt_range'][0],\n",
|
|
" as_struct=True\n",
|
|
")\n",
|
|
"\n",
|
|
"pl_ldf_kline_daily = load_from_pq(\n",
|
|
" path=CONFIG['path_kline_daily'],\n",
|
|
" table_alias=\"kline_1day\",\n",
|
|
" start_time=CONFIG['dt_range'][0],\n",
|
|
")\n",
|
|
"\n",
|
|
"pl_ldf_industry = load_from_pq(\n",
|
|
" path=CONFIG['path_industry'],\n",
|
|
" table_alias=\"indus_idx\",\n",
|
|
" start_time=CONFIG['dt_range'][0],\n",
|
|
")\n",
|
|
"\n",
|
|
"print(\"Data sources loaded as lazy frames\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Check schemas\n",
|
|
"print(\"Alpha158 schema:\")\n",
|
|
"print(pl_ldf_a158.schema)\n",
|
|
"\n",
|
|
"print(\"\\nKline 1min schema:\")\n",
|
|
"print(pl_ldf_kline.schema)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 3. Load Training Dataset\n",
|
|
"\n",
|
|
"Use qshare's load_dataset to construct the full training data."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"print(\"Loading training dataset...\")\n",
|
|
"print(f\" Date range: {CONFIG['dt_range']}\")\n",
|
|
"print(f\" Normalization: {CONFIG['normalization_mode']}\")\n",
|
|
"\n",
|
|
"pl_df_train = load_dataset(\n",
|
|
" pl_ldf_a158_1min=pl_ldf_a158,\n",
|
|
" pl_ldf_kline_1min=pl_ldf_kline,\n",
|
|
" pl_ldf_kline_1day=pl_ldf_kline_daily,\n",
|
|
" pl_ldf_indus_idx=pl_ldf_industry,\n",
|
|
" dt_range=CONFIG['dt_range'],\n",
|
|
" normalization_mode=CONFIG['normalization_mode'],\n",
|
|
" negative_factor=CONFIG['negative_factor'],\n",
|
|
" positive_factor=CONFIG['positive_factor'],\n",
|
|
")\n",
|
|
"\n",
|
|
"# Convert to pandas for easier exploration\n",
|
|
"df_train = pl_df_train.to_pandas()\n",
|
|
"\n",
|
|
"print(f\"\\nDataset shape: {df_train.shape}\")\n",
|
|
"print(f\"Columns: {len(df_train.columns)}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Check column types\n",
|
|
"feature_cols = [c for c in df_train.columns if c.startswith('alpha158_')]\n",
|
|
"print(f\"\\nAlpha158 features: {len(feature_cols)}\")\n",
|
|
"print(f\" Example: {feature_cols[:5]}\")\n",
|
|
"\n",
|
|
"print(f\"\\nTarget column: {[c for c in df_train.columns if 'return' in c.lower()]}\")\n",
|
|
"print(f\"Weight column: {[c for c in df_train.columns if 'weight' in c.lower()]}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 4. Data Quality Check"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Missing values\n",
|
|
"missing = df_train.isnull().sum()\n",
|
|
"missing_pct = missing / len(df_train) * 100\n",
|
|
"\n",
|
|
"print(\"Missing values:\")\n",
|
|
"print(f\" Columns with missing: {(missing > 0).sum()}\")\n",
|
|
"if (missing > 0).sum() > 0:\n",
|
|
" print(\"\\nTop columns by missing %:\")\n",
|
|
" print(missing_pct[missing_pct > 0].sort_values(ascending=False).head(10))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Data coverage by date\n",
|
|
"df_train['datetime'] = pd.to_datetime(df_train.index.get_level_values(0))\n",
|
|
"df_train['instrument'] = df_train.index.get_level_values(1)\n",
|
|
"\n",
|
|
"daily_counts = df_train.groupby('datetime')['instrument'].nunique()\n",
|
|
"\n",
|
|
"fig, ax = plt.subplots(figsize=(14, 4))\n",
|
|
"daily_counts.plot(ax=ax)\n",
|
|
"ax.set_title('Number of Instruments per Day')\n",
|
|
"ax.set_xlabel('Date')\n",
|
|
"ax.set_ylabel('Instrument Count')\n",
|
|
"plt.tight_layout()\n",
|
|
"plt.show()\n",
|
|
"\n",
|
|
"print(f\"\\nInstruments per day: {daily_counts.mean():.0f} avg, {daily_counts.min()}-{daily_counts.max()} range\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 5. Target Analysis\n",
|
|
"\n",
|
|
"Analyze the 15-minute return target distribution."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Identify target column\n",
|
|
"target_col = [c for c in df_train.columns if 'return' in c.lower()][0]\n",
|
|
"print(f\"Target column: {target_col}\")\n",
|
|
"\n",
|
|
"# Target statistics\n",
|
|
"print(f\"\\nTarget statistics:\")\n",
|
|
"print(df_train[target_col].describe())\n",
|
|
"\n",
|
|
"print(f\"\\nSkewness: {df_train[target_col].skew():.3f}\")\n",
|
|
"print(f\"Kurtosis: {df_train[target_col].kurtosis():.3f}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Target distribution\n",
|
|
"fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n",
|
|
"\n",
|
|
"# Histogram\n",
|
|
"df_train[target_col].hist(bins=100, ax=axes[0], edgecolor='black', alpha=0.7)\n",
|
|
"axes[0].set_title(f'{target_col} Distribution')\n",
|
|
"axes[0].axvline(x=0, color='red', linestyle='--')\n",
|
|
"axes[0].set_xlim(-0.05, 0.05) # Focus on main distribution\n",
|
|
"\n",
|
|
"# Time series of daily mean target\n",
|
|
"daily_mean_target = df_train.groupby('datetime')[target_col].mean()\n",
|
|
"axes[1].plot(daily_mean_target.index, daily_mean_target.values)\n",
|
|
"axes[1].set_title('Daily Mean Target')\n",
|
|
"axes[1].axhline(y=0, color='red', linestyle='--')\n",
|
|
"\n",
|
|
"plt.tight_layout()\n",
|
|
"plt.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 6. Feature Analysis"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Feature statistics\n",
|
|
"feature_stats = df_train[feature_cols].describe().T\n",
|
|
"\n",
|
|
"print(\"Feature statistics summary:\")\n",
|
|
"print(f\" Mean range: [{feature_stats['mean'].min():.4f}, {feature_stats['mean'].max():.4f}]\")\n",
|
|
"print(f\" Std range: [{feature_stats['std'].min():.4f}, {feature_stats['std'].max():.4f}]\")\n",
|
|
"\n",
|
|
"# Check for features with extreme values\n",
|
|
"extreme_features = feature_stats[\n",
|
|
" (feature_stats['mean'].abs() > 10) | (feature_stats['std'] > 100)\n",
|
|
"]\n",
|
|
"if len(extreme_features) > 0:\n",
|
|
" print(f\"\\nFeatures with extreme values: {len(extreme_features)}\")\n",
|
|
" print(extreme_features.head())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Sample a few features for visualization\n",
|
|
"sample_features = feature_cols[:4]\n",
|
|
"\n",
|
|
"fig, axes = plt.subplots(2, 2, figsize=(12, 8))\n",
|
|
"axes = axes.flatten()\n",
|
|
"\n",
|
|
"for i, feat in enumerate(sample_features):\n",
|
|
" df_train[feat].hist(bins=100, ax=axes[i], edgecolor='black', alpha=0.7)\n",
|
|
" axes[i].set_title(feat)\n",
|
|
" axes[i].axvline(x=0, color='red', linestyle='--')\n",
|
|
"\n",
|
|
"plt.tight_layout()\n",
|
|
"plt.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 7. Sample Weights Analysis"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Check weights if available\n",
|
|
"weight_cols = [c for c in df_train.columns if 'weight' in c.lower()]\n",
|
|
"if weight_cols:\n",
|
|
" weight_col = weight_cols[0]\n",
|
|
" print(f\"Weight column: {weight_col}\")\n",
|
|
" print(f\"\\nWeight statistics:\")\n",
|
|
" print(df_train[weight_col].describe())\n",
|
|
" \n",
|
|
" # Plot weight distribution by target sign\n",
|
|
" fig, ax = plt.subplots(figsize=(10, 4))\n",
|
|
" \n",
|
|
" positive_mask = df_train[target_col] > 0\n",
|
|
" df_train.loc[positive_mask, weight_col].hist(\n",
|
|
" bins=50, alpha=0.5, label='Positive target', ax=ax\n",
|
|
" )\n",
|
|
" df_train.loc[~positive_mask, weight_col].hist(\n",
|
|
" bins=50, alpha=0.5, label='Negative target', ax=ax\n",
|
|
" )\n",
|
|
" ax.set_title('Weight Distribution by Target Sign')\n",
|
|
" ax.legend()\n",
|
|
" plt.tight_layout()\n",
|
|
" plt.show()\n",
|
|
"else:\n",
|
|
" print(\"No weight column found\")"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"name": "python",
|
|
"version": "3.8.0"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|
|
</thinking>
|
|
|
|
Let me now create the baseline model notebook for stock_15m:
|
|
|
|
<function=Write>
|
|
<parameter=content>{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Stock 15m Baseline Model\n",
|
|
"\n",
|
|
"Train and evaluate a baseline XGBoost model for 15-minute return prediction.\n",
|
|
"\n",
|
|
"**Purpose**: Establish baseline performance with standard configuration."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"import polars as pl\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"import xgboost as xgb\n",
|
|
"from sklearn.metrics import r2_score\n",
|
|
"\n",
|
|
"from qshare.data.polars.ret15m import load_dataset\n",
|
|
"from qshare.io.polars import load_from_pq\n",
|
|
"\n",
|
|
"import sys\n",
|
|
"sys.path.insert(0, '../')\n",
|
|
"from common.plotting import setup_plot_style\n",
|
|
"from common.paths import create_experiment_dir\n",
|
|
"\n",
|
|
"setup_plot_style()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 1. Configuration"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"CONFIG = {\n",
|
|
" # Experiment\n",
|
|
" 'experiment_name': 'baseline_xgb',\n",
|
|
" 'save_results': True,\n",
|
|
" \n",
|
|
" # Data paths\n",
|
|
" 'path_a158': '/data/parquet/stock_1min_alpha158',\n",
|
|
" 'path_kline': '/data/parquet/stock_1min',\n",
|
|
" 'path_kline_daily': '/data/parquet/stock_1day',\n",
|
|
" 'path_industry': '/data/parquet/industry_idx',\n",
|
|
" \n",
|
|
" # Date ranges\n",
|
|
" 'dt_range': ['2022-01-01', '2024-12-31'],\n",
|
|
" 'train_range': ['2022-01-01', '2023-12-31'],\n",
|
|
" 'test_range': ['2024-01-01', '2024-12-31'],\n",
|
|
" \n",
|
|
" # Data config\n",
|
|
" 'normalization_mode': 'dual',\n",
|
|
" 'positive_factor': 1.0,\n",
|
|
" 'negative_factor': 2.0,\n",
|
|
" \n",
|
|
" # Model\n",
|
|
" 'model_params': {\n",
|
|
" 'objective': 'reg:squarederror',\n",
|
|
" 'eval_metric': 'rmse',\n",
|
|
" 'max_depth': 6,\n",
|
|
" 'learning_rate': 0.1,\n",
|
|
" 'n_estimators': 100,\n",
|
|
" 'subsample': 0.8,\n",
|
|
" 'colsample_bytree': 0.8,\n",
|
|
" 'random_state': 42,\n",
|
|
" },\n",
|
|
"}\n",
|
|
"\n",
|
|
"print(\"Configuration:\")\n",
|
|
"for key, value in CONFIG.items():\n",
|
|
" if not isinstance(value, dict):\n",
|
|
" print(f\" {key}: {value}\")\n",
|
|
"print(f\"\\nModel params: {CONFIG['model_params']}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 2. Load Data"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"print(\"Loading data sources...\")\n",
|
|
"\n",
|
|
"pl_ldf_a158 = load_from_pq(\n",
|
|
" path=CONFIG['path_a158'],\n",
|
|
" table_alias=\"a158\",\n",
|
|
" start_time=CONFIG['dt_range'][0],\n",
|
|
" as_struct=True\n",
|
|
")\n",
|
|
"\n",
|
|
"pl_ldf_kline = load_from_pq(\n",
|
|
" path=CONFIG['path_kline'],\n",
|
|
" table_alias=\"kline_1min\",\n",
|
|
" start_time=CONFIG['dt_range'][0],\n",
|
|
" as_struct=True\n",
|
|
")\n",
|
|
"\n",
|
|
"pl_ldf_kline_daily = load_from_pq(\n",
|
|
" path=CONFIG['path_kline_daily'],\n",
|
|
" table_alias=\"kline_1day\",\n",
|
|
" start_time=CONFIG['dt_range'][0],\n",
|
|
")\n",
|
|
"\n",
|
|
"pl_ldf_industry = load_from_pq(\n",
|
|
" path=CONFIG['path_industry'],\n",
|
|
" table_alias=\"indus_idx\",\n",
|
|
" start_time=CONFIG['dt_range'][0],\n",
|
|
")\n",
|
|
"\n",
|
|
"print(\"Loading dataset...\")\n",
|
|
"pl_df = load_dataset(\n",
|
|
" pl_ldf_a158_1min=pl_ldf_a158,\n",
|
|
" pl_ldf_kline_1min=pl_ldf_kline,\n",
|
|
" pl_ldf_kline_1day=pl_ldf_kline_daily,\n",
|
|
" pl_ldf_indus_idx=pl_ldf_industry,\n",
|
|
" dt_range=CONFIG['dt_range'],\n",
|
|
" normalization_mode=CONFIG['normalization_mode'],\n",
|
|
" negative_factor=CONFIG['negative_factor'],\n",
|
|
" positive_factor=CONFIG['positive_factor'],\n",
|
|
")\n",
|
|
"\n",
|
|
"# Convert to pandas\n",
|
|
"df_full = pl_df.to_pandas()\n",
|
|
"print(f\"\\nFull dataset shape: {df_full.shape}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 3. Prepare Train/Test Split"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Identify columns\n",
|
|
"feature_cols = [c for c in df_full.columns if c.startswith('alpha158_')]\n",
|
|
"target_cols = [c for c in df_full.columns if 'return' in c.lower()]\n",
|
|
"weight_cols = [c for c in df_full.columns if 'weight' in c.lower()]\n",
|
|
"\n",
|
|
"print(f\"Features: {len(feature_cols)}\")\n",
|
|
"print(f\"Targets: {target_cols}\")\n",
|
|
"print(f\"Weights: {weight_cols}\")\n",
|
|
"\n",
|
|
"# Select target\n",
|
|
"target_col = target_cols[0]\n",
|
|
"weight_col = weight_cols[0] if weight_cols else None\n",
|
|
"\n",
|
|
"# Split by date\n",
|
|
"df_train = df_full.loc[CONFIG['train_range'][0]:CONFIG['train_range'][1]]\n",
|
|
"df_test = df_full.loc[CONFIG['test_range'][0]:CONFIG['test_range'][1]]\n",
|
|
"\n",
|
|
"print(f\"\\nTrain: {df_train.shape}\")\n",
|
|
"print(f\"Test: {df_test.shape}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 4. Train Model"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Prepare data\n",
|
|
"X_train = df_train[feature_cols]\n",
|
|
"y_train = df_train[target_col]\n",
|
|
"w_train = df_train[weight_col] if weight_col else None\n",
|
|
"\n",
|
|
"X_test = df_test[feature_cols]\n",
|
|
"y_test = df_test[target_col]\n",
|
|
"\n",
|
|
"# Handle missing values\n",
|
|
"X_train = X_train.fillna(X_train.median())\n",
|
|
"X_test = X_test.fillna(X_train.median()) # Use train median\n",
|
|
"\n",
|
|
"print(\"Training XGBoost model...\")\n",
|
|
"print(f\" X shape: {X_train.shape}\")\n",
|
|
"print(f\" y mean: {y_train.mean():.6f}, std: {y_train.std():.6f}\")\n",
|
|
"\n",
|
|
"model = xgb.XGBRegressor(**CONFIG['model_params'])\n",
|
|
"\n",
|
|
"model.fit(\n",
|
|
" X_train, y_train,\n",
|
|
" sample_weight=w_train,\n",
|
|
" eval_set=[(X_test, y_test)],\n",
|
|
" verbose=False\n",
|
|
")\n",
|
|
"\n",
|
|
"print(\"Training complete!\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Feature importance\n",
|
|
"importance = pd.DataFrame({\n",
|
|
" 'feature': feature_cols,\n",
|
|
" 'importance': model.feature_importances_\n",
|
|
"}).sort_values('importance', ascending=False)\n",
|
|
"\n",
|
|
"print(\"\\nTop 10 Features:\")\n",
|
|
"print(importance.head(10))\n",
|
|
"\n",
|
|
"# Plot\n",
|
|
"fig, ax = plt.subplots(figsize=(10, 6))\n",
|
|
"importance.head(20).plot(x='feature', y='importance', kind='barh', ax=ax)\n",
|
|
"ax.set_title('Top 20 Feature Importance')\n",
|
|
"plt.tight_layout()\n",
|
|
"plt.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 5. Evaluate"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Generate predictions\n",
|
|
"y_pred_train = model.predict(X_train)\n",
|
|
"y_pred_test = model.predict(X_test)\n",
|
|
"\n",
|
|
"# Calculate metrics\n",
|
|
"train_r2 = r2_score(y_train, y_pred_train)\n",
|
|
"test_r2 = r2_score(y_test, y_pred_test)\n",
|
|
"\n",
|
|
"# IC (Information Coefficient)\n",
|
|
"train_ic = np.corrcoef(y_train, y_pred_train)[0, 1]\n",
|
|
"test_ic = np.corrcoef(y_test, y_pred_test)[0, 1]\n",
|
|
"\n",
|
|
"print(\"Performance Metrics:\")\n",
|
|
"print(f\" Train R2: {train_r2:.4f}\")\n",
|
|
"print(f\" Test R2: {test_r2:.4f}\")\n",
|
|
"print(f\" Train IC: {train_ic:.4f}\")\n",
|
|
"print(f\" Test IC: {test_ic:.4f}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Daily IC analysis\n",
|
|
"df_test_eval = df_test.copy()\n",
|
|
"df_test_eval['pred'] = y_pred_test\n",
|
|
"df_test_eval['target'] = y_test\n",
|
|
"\n",
|
|
"df_test_eval['datetime'] = df_test_eval.index.get_level_values(0)\n",
|
|
"\n",
|
|
"# Calculate daily IC\n",
|
|
"daily_ic = df_test_eval.groupby('datetime').apply(\n",
|
|
" lambda x: x['target'].corr(x['pred'])\n",
|
|
")\n",
|
|
"\n",
|
|
"print(\"\\nDaily IC Statistics:\")\n",
|
|
"print(f\" Mean: {daily_ic.mean():.4f}\")\n",
|
|
"print(f\" Std: {daily_ic.std():.4f}\")\n",
|
|
"print(f\" IR: {daily_ic.mean() / daily_ic.std():.4f}\")\n",
|
|
"print(f\" >0: {(daily_ic > 0).mean():.1%}\")\n",
|
|
"\n",
|
|
"# Plot\n",
|
|
"fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n",
|
|
"\n",
|
|
"# IC distribution\n",
|
|
"daily_ic.hist(bins=50, ax=axes[0], edgecolor='black')\n",
|
|
"axes[0].axvline(x=0, color='red', linestyle='--')\n",
|
|
"axes[0].axvline(x=daily_ic.mean(), color='green', linestyle='--', label=f'Mean: {daily_ic.mean():.3f}')\n",
|
|
"axes[0].set_title('Daily IC Distribution')\n",
|
|
"axes[0].legend()\n",
|
|
"\n",
|
|
"# IC time series\n",
|
|
"daily_ic.rolling(20, min_periods=5).mean().plot(ax=axes[1])\n",
|
|
"axes[1].axhline(y=0, color='red', linestyle='--')\n",
|
|
"axes[1].set_title('Rolling IC (20-day)')\n",
|
|
"\n",
|
|
"plt.tight_layout()\n",
|
|
"plt.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Prediction vs Actual scatter\n",
|
|
"fig, ax = plt.subplots(figsize=(8, 8))\n",
|
|
"\n",
|
|
"# Sample for plotting\n",
|
|
"sample_idx = np.random.choice(len(y_test), size=min(10000, len(y_test)), replace=False)\n",
|
|
"ax.scatter(y_test.iloc[sample_idx], y_pred_test[sample_idx], alpha=0.3, s=1)\n",
|
|
"\n",
|
|
"# Perfect prediction line\n",
|
|
"lims = [min(y_test.min(), y_pred_test.min()), max(y_test.max(), y_pred_test.max())]\n",
|
|
"ax.plot(lims, lims, 'r--', alpha=0.5)\n",
|
|
"\n",
|
|
"ax.set_xlabel('Actual')\n",
|
|
"ax.set_ylabel('Predicted')\n",
|
|
"ax.set_title(f'Prediction vs Actual (IC={test_ic:.3f})')\n",
|
|
"ax.grid(True, alpha=0.3)\n",
|
|
"\n",
|
|
"plt.tight_layout()\n",
|
|
"plt.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 6. Save Results"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"if CONFIG['save_results']:\n",
|
|
" import pickle\n",
|
|
" import json\n",
|
|
" \n",
|
|
" output_dir = create_experiment_dir('stock_15m', CONFIG['experiment_name'])\n",
|
|
" print(f\"Saving results to: {output_dir}\")\n",
|
|
" \n",
|
|
" # Save config\n",
|
|
" with open(output_dir / 'config.json', 'w') as f:\n",
|
|
" json.dump(CONFIG, f, indent=2, default=str)\n",
|
|
" \n",
|
|
" # Save model\n",
|
|
" with open(output_dir / 'model.pkl', 'wb') as f:\n",
|
|
" pickle.dump(model, f)\n",
|
|
" \n",
|
|
" # Save importance\n",
|
|
" importance.to_csv(output_dir / 'feature_importance.csv', index=False)\n",
|
|
" \n",
|
|
" # Save predictions\n",
|
|
" predictions = pd.DataFrame({\n",
|
|
" 'actual': y_test,\n",
|
|
" 'predicted': y_pred_test\n",
|
|
" }, index=df_test.index)\n",
|
|
" predictions.to_csv(output_dir / 'predictions.csv')\n",
|
|
" \n",
|
|
" # Save metrics\n",
|
|
" metrics = {\n",
|
|
" 'train_r2': float(train_r2),\n",
|
|
" 'test_r2': float(test_r2),\n",
|
|
" 'train_ic': float(train_ic),\n",
|
|
" 'test_ic': float(test_ic),\n",
|
|
" 'daily_ic_mean': float(daily_ic.mean()),\n",
|
|
" 'daily_ic_std': float(daily_ic.std()),\n",
|
|
" 'daily_ir': float(daily_ic.mean() / daily_ic.std()),\n",
|
|
" }\n",
|
|
" with open(output_dir / 'metrics.json', 'w') as f:\n",
|
|
" json.dump(metrics, f, indent=2)\n",
|
|
" \n",
|
|
" print(\"\\nFiles saved:\")\n",
|
|
" for f in output_dir.iterdir():\n",
|
|
" print(f\" - {f.name}\")\n",
|
|
"else:\n",
|
|
" print(\"Results not saved\")"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"name": "python",
|
|
"version": "3.8.0"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
} |