From 26a694298d4ccfc16ccad140d7ac9366950a1176 Mon Sep 17 00:00:00 2001
From: guofu
Date: Sun, 1 Mar 2026 14:28:28 +0800
Subject: [PATCH] Clean obsolete debug files from alpha158_beta
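Remove bug analysis documentation (findings incorporated into README.md):
- BUG_ANALYSIS.md, BUG_ANALYSIS_FINAL.md
Remove one-off debug/exploration scripts:
- compare_embeddings.py (repository root)
- compare_gold_standard.py, debug_data_divergence.py
- verify_feature_order.py, regenerate_sample_embedding.py
- dump_qlib_gold_standard.py, dump_qlib_gold_standard_simple.py
Remove temporary log files (scripts/run*.log) and empty __pycache__ directories
Replace the top-level results/ ignore rules in .gitignore with per-project
rules for cta_1d/, stock_15m/ and stock_1d/
Add results/README.md experiment-log templates for cta_1d and stock_15m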
Co-Authored-By: Claude Opus 4.6
---
.gitignore | 18 +-
compare_embeddings.py | 293 ------------
cta_1d/results/README.md | 18 +
stock_15m/results/README.md | 18 +
stock_1d/d033/alpha158_beta/BUG_ANALYSIS.md | 123 -----
.../d033/alpha158_beta/BUG_ANALYSIS_FINAL.md | 159 -------
.../scripts/compare_gold_standard.py | 129 ------
.../scripts/debug_data_divergence.py | 254 -----------
.../scripts/dump_qlib_gold_standard.py | 421 ------------------
.../scripts/dump_qlib_gold_standard_simple.py | 270 -----------
.../scripts/regenerate_sample_embedding.py | 186 --------
stock_1d/d033/alpha158_beta/scripts/run.log | 394 ----------------
stock_1d/d033/alpha158_beta/scripts/run2.log | 373 ----------------
stock_1d/d033/alpha158_beta/scripts/run3.log | 373 ----------------
stock_1d/d033/alpha158_beta/scripts/run4.log | 321 -------------
.../d033/alpha158_beta/scripts/run_simple.log | 104 -----
.../alpha158_beta/scripts/run_simple2.log | 103 -----
.../scripts/verify_feature_order.py | 187 --------
18 files changed, 51 insertions(+), 3693 deletions(-)
delete mode 100644 compare_embeddings.py
create mode 100644 cta_1d/results/README.md
create mode 100644 stock_15m/results/README.md
delete mode 100644 stock_1d/d033/alpha158_beta/BUG_ANALYSIS.md
delete mode 100644 stock_1d/d033/alpha158_beta/BUG_ANALYSIS_FINAL.md
delete mode 100644 stock_1d/d033/alpha158_beta/scripts/compare_gold_standard.py
delete mode 100644 stock_1d/d033/alpha158_beta/scripts/debug_data_divergence.py
delete mode 100644 stock_1d/d033/alpha158_beta/scripts/dump_qlib_gold_standard.py
delete mode 100644 stock_1d/d033/alpha158_beta/scripts/dump_qlib_gold_standard_simple.py
delete mode 100644 stock_1d/d033/alpha158_beta/scripts/regenerate_sample_embedding.py
delete mode 100644 stock_1d/d033/alpha158_beta/scripts/run.log
delete mode 100644 stock_1d/d033/alpha158_beta/scripts/run2.log
delete mode 100644 stock_1d/d033/alpha158_beta/scripts/run3.log
delete mode 100644 stock_1d/d033/alpha158_beta/scripts/run4.log
delete mode 100644 stock_1d/d033/alpha158_beta/scripts/run_simple.log
delete mode 100644 stock_1d/d033/alpha158_beta/scripts/run_simple2.log
delete mode 100644 stock_1d/d033/alpha158_beta/scripts/verify_feature_order.py
diff --git a/.gitignore b/.gitignore
index dd4c38b..5378947 100644
--- a/.gitignore
+++ b/.gitignore
@@ -31,9 +31,21 @@ wheels/
*.ipynb_checkpoints
# Results and data
-results/*
-!results/*/.gitkeep
-!results/*/README.md
+cta_1d/results/*
+!cta_1d/results/.gitkeep
+!cta_1d/results/README.md
+!cta_1d/results/*/.gitkeep
+!cta_1d/results/*/README.md
+stock_15m/results/*
+!stock_15m/results/.gitkeep
+!stock_15m/results/README.md
+!stock_15m/results/*/.gitkeep
+!stock_15m/results/*/README.md
+stock_1d/results/*
+!stock_1d/results/.gitkeep
+!stock_1d/results/README.md
+!stock_1d/results/*/.gitkeep
+!stock_1d/results/*/README.md
*.parquet
*.pkl
*.h5
diff --git a/compare_embeddings.py b/compare_embeddings.py
deleted file mode 100644
index ace827f..0000000
--- a/compare_embeddings.py
+++ /dev/null
@@ -1,293 +0,0 @@
-#!/usr/bin/env python3
-"""
-Compare generated embeddings with database embeddings (0_7 version).
-Handles format conversion for datetime and instrument columns.
-
-SUMMARY OF FINDINGS:
-- Generated embeddings and database embeddings have DIFFERENT values
-- Instrument mapping: 430xxx -> SHxxxxx, 830xxx -> SZxxxxx, 6xxxxx -> SH6xxxxx
-- Correlation between corresponding dimensions: ~0.0067 (essentially zero)
-- The generated embeddings are NOT the same as the database 0_7 embeddings
-- Possible reasons:
- 1. Different model weights/versions used for generation
- 2. Different input features or normalization
- 3. Different random seed or inference configuration
-"""
-import polars as pl
-import numpy as np
-from pathlib import Path
-
-def instrument_int_to_code(inst_int: int) -> str:
- """Convert integer instrument code to exchange-prefixed string.
-
- The encoding in the embedding file uses:
- - 4xxxxx -> SHxxxxxx (Shanghai A-shares, but code mapping is non-trivial)
- - 8xxxxx -> SZxxxxxx (Shenzhen A-shares)
- - Direct 6-digit codes are also present (600xxx, 000xxx, 300xxx)
-
- Note: The exact mapping from 430017 -> SH600021 requires the original
- features file. We attempt an approximate mapping here.
- """
- inst_str = str(inst_int)
-
- # Already 6-digit code
- if len(inst_str) == 6 and inst_str[0] not in ('4', '8'):
- if inst_str.startswith('6'):
- return f"SH{inst_str}"
- else:
- return f"SZ{inst_str}"
-
- # 6-digit with exchange prefix (4=SH, 8=SZ)
- if len(inst_str) == 6 and inst_str[0] in ('4', '8'):
- exchange = 'SH' if inst_str[0] == '4' else 'SZ'
- # The mapping from 430xxx -> 600xxx is not 1:1
- # Return the code as-is for matching attempts
- return f"{exchange}{inst_str[1:]}"
-
- return inst_str
-
-def load_generated_embedding(date_int: int, sample_n: int = None):
- """Load generated embedding for a specific date."""
- gen_path = Path('/home/guofu/Workspaces/alpha_lab/stock_1d/d033/alpha158_beta/data/embedding_0_7_beta.parquet')
-
- lf = pl.scan_parquet(gen_path)
- lf = lf.filter(pl.col('datetime') == date_int)
-
- if sample_n:
- lf = lf.head(sample_n)
-
- df = lf.collect()
-
- # Convert wide format (embedding_0, embedding_1, ...) to list format
- embedding_cols = [c for c in df.columns if c.startswith('embedding_')]
- embedding_cols.sort(key=lambda x: int(x.split('_')[1]))
-
- embedding_structs = df.select(embedding_cols).to_struct()
- embeddings_list = [[v for v in struct.values()] for struct in embedding_structs]
-
- df = df.with_columns([
- pl.Series('values', embeddings_list),
- pl.col('datetime').cast(pl.UInt32).alias('datetime_uint32'),
- pl.col('instrument').alias('instrument_orig'),
- pl.col('instrument').cast(pl.String).alias('instrument_str'),
- pl.col('instrument').map_elements(instrument_int_to_code, return_dtype=pl.String).alias('instrument_code')
- ])
-
- return df
-
-def load_database_embedding(date_str: str):
- """Load database embedding for a specific date."""
- db_path = Path(f'/data/parquet/dataset/dwm_1day_multicast_csencode_1D/version=csiallx_feature2_ntrla_flag_pnlnorm_vae4_dim32a_beta0001/datetime={date_str}/0.parquet')
-
- if not db_path.exists():
- return None
-
- df = pl.read_parquet(db_path)
- df = df.with_columns([
- pl.col('datetime').cast(pl.Int64).alias('datetime_int')
- ])
- return df
-
-def analyze_instrument_mapping(date_int: int):
- """Analyze the instrument mapping between generated and database embeddings."""
- date_str = str(date_int)
-
- print(f"\n{'='*80}")
- print(f"Analyzing instrument mapping for date: {date_int}")
- print(f"{'='*80}")
-
- gen_df = load_generated_embedding(date_int)
- db_df = load_database_embedding(date_str)
-
- if db_df is None:
- print(f"ERROR: Database embedding not found for {date_str}")
- return
-
- print(f"\nGenerated embeddings: {gen_df.shape[0]} rows")
- print(f"Database embeddings: {db_df.shape[0]} rows")
-
- # Show samples
- print("\n--- Generated Embedding Sample ---")
- sample_gen = gen_df.select(['datetime', 'instrument_orig', 'instrument_str', 'instrument_code', 'values']).head(10)
- print(sample_gen)
-
- print("\n--- Database Embedding Sample ---")
- print(db_df.head(10))
-
- # Try different matching strategies
- gen_insts_set = set(gen_df['instrument_code'].to_list())
- db_insts_set = set(db_df['instrument'].to_list())
-
- common = gen_insts_set & db_insts_set
- gen_only = gen_insts_set - db_insts_set
- db_only = db_insts_set - gen_insts_set
-
- print(f"\n--- Matching Results (with code conversion) ---")
- print(f"Common instruments: {len(common)}")
- print(f"Generated only: {len(gen_only)}")
- print(f"Database only: {len(db_only)}")
-
- if len(common) == 0:
- print("\nNo common instruments found with code conversion!")
- print("\nTrying to find mapping patterns...")
-
- # Show some samples for analysis
- print("\nGenerated instrument samples (original, converted):")
- gen_samples = list(zip(gen_df['instrument_orig'].head(20).to_list(),
- gen_df['instrument_code'].head(20).to_list()))
- for orig, conv in gen_samples:
- print(f" {orig} -> {conv}")
-
- print("\nDatabase instrument samples:")
- db_samples = db_df['instrument'].head(20).to_list()
- for inst in db_samples:
- print(f" {inst}")
-
- # Check if there's a position-based alignment possible
- # Sort both and compare by position
- gen_sorted = sorted(gen_df['instrument_orig'].to_list())
- db_sorted = sorted([int(inst[2:]) for inst in db_df['instrument'].to_list()])
-
- print("\n--- Attempting position-based matching ---")
- print(f"Generated sorted (first 10): {gen_sorted[:10]}")
- print(f"Database sorted (first 10): {db_sorted[:10]}")
-
- else:
- # We have matches, compare embeddings
- print(f"\n--- Comparing embeddings for {len(common)} common instruments ---")
-
- gen_common = gen_df.filter(pl.col('instrument_code').is_in(list(common)))
- db_common = db_df.filter(pl.col('instrument').is_in(list(common)))
-
- # Join and compare
- comparison = gen_common.join(
- db_common,
- left_on='instrument_code',
- right_on='instrument',
- how='inner',
- suffix='_db'
- )
-
- # Calculate differences
- diffs = []
- for row in comparison.iter_rows():
- # Find indices for the values columns
- gen_vals_idx = comparison.columns.index('values')
- db_vals_idx = comparison.columns.index('values_db')
-
- gen_emb = np.array(row[gen_vals_idx])
- db_emb = np.array(row[db_vals_idx])
-
- diff = gen_emb - db_emb
- diff_norm = np.linalg.norm(diff)
- rel_diff = diff_norm / (np.linalg.norm(db_emb) + 1e-10)
-
- diffs.append({
- 'instrument': row[comparison.columns.index('instrument_code')],
- 'l2_norm_diff': diff_norm,
- 'relative_diff': rel_diff,
- 'max_abs_diff': np.max(np.abs(diff)),
- 'gen_emb_norm': np.linalg.norm(gen_emb),
- 'db_emb_norm': np.linalg.norm(db_emb)
- })
-
- if diffs:
- diff_df = pl.DataFrame(diffs)
- print("\nDifference statistics:")
- print(diff_df.select(['l2_norm_diff', 'relative_diff', 'max_abs_diff']).describe())
-
- max_rel_diff = diff_df['relative_diff'].max()
- print(f"\nMax relative difference: {max_rel_diff:.6e}")
-
- if max_rel_diff < 1e-5:
- print("✓ Embeddings match within numerical precision!")
- elif max_rel_diff < 0.01:
- print("~ Embeddings are very similar")
- else:
- print("✗ Embeddings differ significantly")
-
- # Show some comparison samples
- print("\nSample comparison:")
- for i in range(min(5, len(diffs))):
- d = diffs[i]
- print(f" {d['instrument']}: gen_norm={d['gen_emb_norm']:.4f}, "
- f"db_norm={d['db_emb_norm']:.4f}, rel_diff={d['relative_diff']:.6e}")
-
-def calculate_correlation(date_int: int):
- """Calculate correlation between generated and database embeddings."""
- import numpy as np
-
- date_str = str(date_int)
-
- print(f"\n{'='*80}")
- print(f"Correlation Analysis for date: {date_int}")
- print(f"{'='*80}")
-
- gen_df = load_generated_embedding(date_int)
- db_df = load_database_embedding(date_str)
-
- if db_df is None:
- print(f"ERROR: Database embedding not found for {date_str}")
- return
-
- # Find common instruments
- gen_insts = set(gen_df['instrument_code'].to_list())
- db_insts = set(db_df['instrument'].to_list())
- common = list(gen_insts & db_insts)
-
- print(f"\nCommon instruments: {len(common)}")
-
- if len(common) == 0:
- print("No common instruments found!")
- return
-
- # Filter to common and sort
- gen_common = gen_df.filter(pl.col('instrument_code').is_in(common)).sort('instrument_code')
- db_common = db_df.filter(pl.col('instrument').is_in(common)).sort('instrument')
-
- # Extract embedding matrices
- gen_embs = np.array(gen_common['values'].to_list())
- db_embs = np.array(db_common['values'].to_list())
-
- print(f"Generated embeddings shape: {gen_embs.shape}")
- print(f"Database embeddings shape: {db_embs.shape}")
-
- # Calculate correlation per dimension
- correlations = []
- for i in range(32):
- gen_dim = gen_embs[:, i]
- db_dim = db_embs[:, i]
- corr = np.corrcoef(gen_dim, db_dim)[0, 1]
- correlations.append(corr)
-
- print(f"\nCorrelation statistics across 32 dimensions:")
- print(f" Mean: {np.mean(correlations):.4f}")
- print(f" Median: {np.median(correlations):.4f}")
- print(f" Min: {np.min(correlations):.4f}")
- print(f" Max: {np.max(correlations):.4f}")
-
- # Overall correlation
- overall_corr = np.corrcoef(gen_embs.flatten(), db_embs.flatten())[0, 1]
- print(f"\nOverall correlation (all dims flattened): {overall_corr:.4f}")
-
- # Interpretation
- mean_corr = np.mean(correlations)
- if abs(mean_corr) < 0.1:
- print("\n✗ CONCLUSION: Embeddings are NOT correlated (essentially independent)")
- elif abs(mean_corr) < 0.5:
- print("\n~ CONCLUSION: Weak correlation between embeddings")
- else:
- print(f"\n✓ CONCLUSION: {'Strong' if abs(mean_corr) > 0.8 else 'Moderate'} correlation")
-
-if __name__ == '__main__':
- # Analyze for a few dates
- dates_to_compare = [20190102, 20200102, 20240102]
-
- for date in dates_to_compare:
- try:
- analyze_instrument_mapping(date)
- calculate_correlation(date)
- except Exception as e:
- print(f"\nError analyzing date {date}: {e}")
- import traceback
- traceback.print_exc()
diff --git a/cta_1d/results/README.md b/cta_1d/results/README.md
new file mode 100644
index 0000000..e5bc268
--- /dev/null
+++ b/cta_1d/results/README.md
@@ -0,0 +1,18 @@
+# CTA 1D Experiment Results
+
+Document experiments manually here.
+
+## Template
+
+```markdown
+## YYYY-MM-DD: Experiment Name
+- Notebook: `../XX_notebook.ipynb` (cell range)
+- Data: [dates]
+- Config: key parameters
+- Metrics: IC mean/std, returns, sharpe
+- Notes: observations, next steps
+```
+
+## Experiments
+
+*Add entries below as you run experiments*
diff --git a/stock_15m/results/README.md b/stock_15m/results/README.md
new file mode 100644
index 0000000..ba201f2
--- /dev/null
+++ b/stock_15m/results/README.md
@@ -0,0 +1,18 @@
+# Stock 15m Experiment Results
+
+Document experiments manually here.
+
+## Template
+
+```markdown
+## YYYY-MM-DD: Experiment Name
+- Notebook: `../XX_notebook.ipynb` (cell range)
+- Data: [dates]
+- Config: key parameters
+- Metrics: IC mean/std, returns, sharpe
+- Notes: observations, next steps
+```
+
+## Experiments
+
+*Add entries below as you run experiments*
diff --git a/stock_1d/d033/alpha158_beta/BUG_ANALYSIS.md b/stock_1d/d033/alpha158_beta/BUG_ANALYSIS.md
deleted file mode 100644
index 5ebda83..0000000
--- a/stock_1d/d033/alpha158_beta/BUG_ANALYSIS.md
+++ /dev/null
@@ -1,123 +0,0 @@
-# Data Pipeline Bug Analysis
-
-## Summary
-
-The generated embeddings do not match the database 0_7 embeddings due to multiple bugs in the data pipeline migration from qlib to standalone Polars implementation.
-
----
-
-## Bugs Fixed
-
-### 1. Market Classification (`FlagMarketInjector`) ✓ FIXED
-
-**Original (incorrect):**
-```python
-market_0 = (instrument >= 600000) # SH
-market_1 = (instrument < 600000) # SZ
-```
-
-**Fixed:**
-```python
-inst_str = str(instrument).zfill(6)
-market_0 = inst_str.startswith('6') # SH: 6xxxxx
-market_1 = inst_str.startswith('0') | inst_str.startswith('3') # SZ: 0xxx, 3xxx
-market_2 = inst_str.startswith('4') | inst_str.startswith('8') # NE: 4xxx, 8xxx
-```
-
-**Impact:** 167 instruments (4xxxxx, 8xxxxx - 新三板) were misclassified.
-
----
-
-### 2. ColumnRemover Missing `IsN` ✓ FIXED
-
-**Original (incorrect):**
-```python
-columns_to_remove = ['TotalValue_diff', 'IsZt', 'IsDt']
-```
-
-**Fixed:**
-```python
-columns_to_remove = ['TotalValue_diff', 'IsN', 'IsZt', 'IsDt']
-```
-
-**Impact:** Extra column caused feature dimension mismatch.
-
----
-
-### 3. RobustZScoreNorm Applied to Wrong Columns ✓ FIXED
-
-**Original (incorrect):**
-Applied normalization to ALL 341 features including market flags and indus_idx.
-
-**Fixed:**
-Only normalize `alpha158 + alpha158_ntrl + market_ext + market_ext_ntrl` (330 features), excluding:
-- Market flags (Limit, Stopping, IsTp, IsXD, IsXR, IsDR, market_0, market_1, market_2, IsST)
-- indus_idx
-
----
-
-## Critical Remaining Issue: Data Schema Mismatch
-
-### `Limit` and `Stopping` Column Types Changed
-
-**Original qlib pipeline expected:**
-- `Limit`: **Boolean** flag (True = limit up)
-- `Stopping`: **Boolean** flag (True = suspended trading)
-
-**Current Parquet data has:**
-- `Limit`: **Float64** price change percentage (0.0 to 1301.3)
-- `Stopping`: **Float64** price change percentage
-
-**Evidence:**
-```
-Limit values sample: [8.86, 9.36, 31.0, 7.32, 2.28, 6.39, 5.38, 4.03, 3.86, 9.89]
-Limit == 0: only 2 rows
-Limit > 0: 3738 rows
-```
-
-This is a **fundamental data schema change**. The current Parquet files contain different data than what the original VAE model was trained on.
-
-**Possible fixes:**
-1. Convert `Limit` and `Stopping` to boolean flags using a threshold
-2. Find the original data source that had boolean flags
-3. Re-train the VAE model with the new data schema
-
----
-
-## Correlation Results
-
-After fixing bugs 1-3, the embedding correlation with database 0_7:
-
-| Metric | Value |
-|--------|-------|
-| Mean correlation (32 dims) | 0.0068 |
-| Median correlation | 0.0094 |
-| Overall correlation | 0.2330 |
-
-**Conclusion:** Embeddings remain essentially uncorrelated (≈0).
-
----
-
-## Root Cause
-
-The **Limit/Stopping data schema change** is the most likely root cause. The VAE model learned to encode features that included binary limit/stopping flags, but the standalone pipeline feeds it continuous price change percentages instead.
-
----
-
-## Next Steps
-
-1. **Verify original data schema:**
- - Check if the original DolphinDB table had boolean `Limit` and `Stopping` columns
- - Compare with the current Parquet schema
-
-2. **Fix the data loading:**
- - Either convert continuous values to binary flags
- - Or use the correct boolean columns (`IsZt`, `IsDt`) for limit flags
-
-3. **Verify feature order:**
- - Ensure the qlib RobustZScoreNorm parameters are applied in the correct order
- - Check that `[alpha158, alpha158_ntrl, market_ext, market_ext_ntrl]` matches the 330-parameter shape
-
-4. **Re-run comparison:**
- - Generate new embeddings with the corrected pipeline
- - Compare correlation with database
diff --git a/stock_1d/d033/alpha158_beta/BUG_ANALYSIS_FINAL.md b/stock_1d/d033/alpha158_beta/BUG_ANALYSIS_FINAL.md
deleted file mode 100644
index d9439e9..0000000
--- a/stock_1d/d033/alpha158_beta/BUG_ANALYSIS_FINAL.md
+++ /dev/null
@@ -1,159 +0,0 @@
-# Data Pipeline Bug Analysis - Final Status
-
-## Summary
-
-After fixing all identified bugs, the feature count now matches (341), but the embeddings remain uncorrelated with the database 0_7 version.
-
-**Latest Version**: v6
-- Feature count: 341 ✓ (matches VAE input dim)
-- Mean correlation with DB: 0.0050 (essentially zero)
-- Status: All identified bugs fixed, IsST issue documented
-- **New**: Polars-based dataset generation script added (`scripts/dump_polars_dataset.py`)
-
----
-
-## Bugs Fixed
-
-### 1. Market Classification (`FlagMarketInjector`) ✓ FIXED
-- **Bug**: Used `instrument >= 600000` which misclassified 新三板 instruments
-- **Fix**: Use string prefix matching with vocab_size=2 (not 3)
-- **Impact**: 167 instruments corrected
-
-### 2. ColumnRemover Missing `IsN` ✓ FIXED
-- **Bug**: Only removed `IsZt, IsDt` but not `IsN`
-- **Fix**: Added `IsN` to removal list
-- **Impact**: Feature count alignment
-
-### 3. RobustZScoreNorm Scope ✓ FIXED
-- **Bug**: Applied normalization to all 341 features
-- **Fix**: Only normalize 330 features (alpha158 + market_ext, both original + neutralized)
-- **Impact**: Correct normalization scope
-
-### 4. Wrong Data Sources for Market Flags ✓ FIXED
-- **Bug**: Used `Limit, Stopping` (Float64) from kline_adjusted
-- **Fix**: Load from correct sources:
- - kline_adjusted: `IsZt, IsDt, IsN, IsXD, IsXR, IsDR` (Boolean)
- - market_flag: `open_limit, close_limit, low_limit, high_stop` (Boolean, 4 cols)
-- **Impact**: Correct boolean flag data
-
-### 5. Feature Count Mismatch ✓ FIXED
-- **Bug**: 344 features (3 extra)
-- **Fix**: vocab_size=2 + 4 market_flag cols = 341 features
-- **Impact**: VAE input dimension matches
-
-### 6. Fixed* Processors Not Adding Required Columns ✓ FIXED
-- **Bug**: `FixedFlagMarketInjector` only converted dtype but didn't add `market_0`, `market_1` columns
-- **Bug**: `FixedFlagSTInjector` only converted dtype but didn't create `IsST` column from `ST_S`, `ST_Y`
-- **Fix**:
- - `FixedFlagMarketInjector`: Now adds `market_0` (SH60xxx, SZ00xxx) and `market_1` (SH688xxx, SH689xxx, SZ300xxx, SZ301xxx)
- - `FixedFlagSTInjector`: Now creates `IsST = ST_S | ST_Y`
-- **Impact**: Processed data now has 408 columns (was 405), matching original qlib output
-
----
-
-## Important Discovery: IsST Column Issue in Gold-Standard Code
-
-### Problem Description
-
-The `FlagSTInjector` processor in the original qlib proc_list is supposed to create an `IsST` column in the `feature_flag` group from the `ST_S` and `ST_Y` columns in the `st_flag` group. However, this processor **fails silently** even in the gold-standard qlib code.
-
-### Root Cause
-
-The `FlagSTInjector` processor attempts to access columns using a format that doesn't match the actual column structure in the data:
-
-1. **Expected format**: The processor expects columns like `st_flag::ST_S` and `st_flag::ST_Y` (string format with `::` separator)
-2. **Actual format**: The qlib handler produces MultiIndex tuple columns like `('st_flag', 'ST_S')` and `('st_flag', 'ST_Y')`
-
-This format mismatch causes the processor to fail to find the ST flag columns, and thus no `IsST` column is created.
-
-### Evidence
-
-```python
-# Check proc_list
-import pickle as pkl
-with open('proc_list.proc', 'rb') as f:
- proc_list = pkl.load(f)
-
-# FlagSTInjector config
-flag_st = proc_list[2]
-print(f"fields_group: {flag_st.fields_group}") # 'feature_flag'
-print(f"col_name: {flag_st.col_name}") # 'IsST'
-print(f"st_group: {flag_st.st_group}") # 'st_flag'
-
-# Check if IsST exists in processed data
-with open('processed_data.pkl', 'rb') as f:
- df = pkl.load(f)
-
-feature_flag_cols = [c[1] for c in df.columns if c[0] == 'feature_flag']
-print('IsST' in feature_flag_cols) # False!
-```
-
-### Impact
-
-- **VAE training**: The VAE model was trained on data **without** the `IsST` column
-- **VAE input dimension**: 341 features (excluding IsST), not 342
-- **Polars pipeline**: Should also skip `IsST` to maintain compatibility
-
-### Resolution
-
-The polars-based pipeline (`dump_polars_dataset.py`) now correctly **skips** the `FlagSTInjector` step to match the gold-standard behavior:
-
-```python
-# Step 3: FlagSTInjector - SKIPPED (fails even in gold-standard)
-print("[3] Skipping FlagSTInjector (as per gold-standard behavior)...")
-market_flag_with_st = market_flag_with_market # No IsST added
-```
-
-### Lessons Learned
-
-1. **Verify processor execution**: Don't assume all processors in the proc_list executed successfully. Check the output data to verify expected columns exist.
-
-2. **Column format matters**: The qlib processors were designed for specific column formats (MultiIndex tuples vs `::` separator strings). Format mismatches can cause silent failures.
-
-3. **Match the gold-standard bugs**: When replicating a pipeline, sometimes you need to replicate the bugs too. The VAE was trained on data without `IsST`, so our pipeline must also exclude it.
-
-4. **Debug by comparing intermediate outputs**: Use scripts like `debug_data_divergence.py` to compare raw and processed data between the gold-standard and polars pipelines.
-
----
-
-## Correlation Results (v5)
-
-| Metric | Value |
-|--------|-------|
-| Mean correlation (32 dims) | 0.0050 |
-| Median correlation | 0.0079 |
-| Min | -0.0420 |
-| Max | 0.0372 |
-| Overall (flattened) | 0.2225 |
-
-**Conclusion**: Embeddings remain essentially uncorrelated with database.
-
----
-
-## Possible Remaining Issues
-
-1. **Different input data values**: The alpha158_0_7_beta Parquet files may contain different values than the original DolphinDB data used to train the VAE.
-
-2. **Feature ordering mismatch**: The 330 RobustZScoreNorm parameters must be applied in the exact order:
- - [0:158] = alpha158 original
- - [158:316] = alpha158_ntrl
- - [316:323] = market_ext original (7 cols)
- - [323:330] = market_ext_ntrl (7 cols)
-
-3. **Industry neutralization differences**: Our `IndusNtrlInjector` implementation may differ from qlib's.
-
-4. **Missing transformations**: There may be additional preprocessing steps not captured in handler.yaml.
-
-5. **VAE model mismatch**: The VAE model may have been trained with different data than what handler.yaml specifies.
-
----
-
-## Recommended Next Steps
-
-1. **Compare intermediate features**: Run both the qlib pipeline and our pipeline on the same input data and compare outputs at each step.
-
-2. **Verify RobustZScoreNorm parameter order**: Check if our feature ordering matches the order used during VAE training.
-
-3. **Compare predictions, not embeddings**: Instead of comparing VAE embeddings, compare the final d033 model predictions with the original 0_7 predictions.
-
-4. **Check alpha158 data source**: Verify that `stg_1day_wind_alpha158_0_7_beta_1D` contains the same data as the original DolphinDB `stg_1day_wind_alpha158_0_7_beta` table.
diff --git a/stock_1d/d033/alpha158_beta/scripts/compare_gold_standard.py b/stock_1d/d033/alpha158_beta/scripts/compare_gold_standard.py
deleted file mode 100644
index 22a539b..0000000
--- a/stock_1d/d033/alpha158_beta/scripts/compare_gold_standard.py
+++ /dev/null
@@ -1,129 +0,0 @@
-#!/usr/bin/env python
-"""
-Compare generated embeddings with gold standard embeddings from DolphinDB.
-"""
-
-import polars as pl
-import numpy as np
-from pathlib import Path
-
-DATA_DIR = Path(__file__).parent / "../data"
-
-
-def compare_embeddings():
- """Compare generated and gold standard embeddings."""
-
- # Load data
- gold_path = DATA_DIR / "embedding_0_7_beta_gold_standard.parquet"
- gen_path = DATA_DIR / "embedding_0_7_beta_sample.parquet"
-
- print("=" * 60)
- print("Loading embeddings")
- print("=" * 60)
-
- gold = pl.read_parquet(gold_path)
- gen = pl.read_parquet(gen_path)
-
- print(f"Gold standard: {gold.shape}")
- print(f"Generated: {gen.shape}")
-
- # Get embedding columns
- emb_cols = [f"embedding_{i}" for i in range(32)]
-
- # Compare by date
- dates = sorted(gold["datetime"].unique().to_list())
-
- print("\n" + "=" * 60)
- print("Comparison by date")
- print("=" * 60)
-
- for dt in dates:
- gold_dt = gold.filter(pl.col("datetime") == dt)
- gen_dt = gen.filter(pl.col("datetime") == dt)
-
- print(f"\nDate: {dt}")
- print(f" Gold instruments: {gold_dt.height}, Generated instruments: {gen_dt.height}")
- print(f" Gold instrument sample: {gold_dt['instrument'].head(5).to_list()}")
- print(f" Gen instrument sample: {gen_dt['instrument'].head(5).to_list()}")
-
- # Check for common instruments
- gold_insts = set(gold_dt["instrument"].to_list())
- gen_insts = set(gen_dt["instrument"].to_list())
- common = gold_insts & gen_insts
-
- print(f" Common instruments: {len(common)}")
-
- if len(common) > 0:
- # Compare embeddings for common instruments
- gold_common = gold_dt.filter(pl.col("instrument").is_in(list(common))).sort("instrument")
- gen_common = gen_dt.filter(pl.col("instrument").is_in(list(common))).sort("instrument")
-
- # Calculate embedding differences
- diffs = []
- for i in range(len(gold_common)):
- gold_emb = np.array([gold_common[col][i] for col in emb_cols])
- gen_emb = np.array([gen_common[col][i] for col in emb_cols])
-
- diff = gold_emb - gen_emb
- l2_norm = np.linalg.norm(diff)
- rel_diff = l2_norm / (np.linalg.norm(gold_emb) + 1e-8)
- max_abs_diff = np.max(np.abs(diff))
-
- diffs.append({
- "l2_norm": l2_norm,
- "rel_diff": rel_diff,
- "max_abs_diff": max_abs_diff,
- "gold_norm": np.linalg.norm(gold_emb),
- "gen_norm": np.linalg.norm(gen_emb)
- })
-
- diff_df = pl.DataFrame(diffs)
- print(f"\n Embedding comparison:")
- print(f" Mean L2 norm diff: {diff_df['l2_norm'].mean():.4f}")
- print(f" Mean rel diff: {diff_df['rel_diff'].mean():.4%}")
- print(f" Mean max abs diff: {diff_df['max_abs_diff'].mean():.4f}")
- print(f" Gold emb norm (mean): {diff_df['gold_norm'].mean():.4f}")
- print(f" Gen emb norm (mean): {diff_df['gen_norm'].mean():.4f}")
-
- # Correlation analysis
- gold_embs = np.array([[gold_common[col][i] for col in emb_cols] for i in range(len(gold_common))])
- gen_embs = np.array([[gen_common[col][i] for col in emb_cols] for i in range(len(gen_common))])
-
- correlations = []
- for d in range(32):
- corr = np.corrcoef(gold_embs[:, d], gen_embs[:, d])[0, 1]
- correlations.append(corr)
-
- print(f"\n Correlation by dimension:")
- print(f" Mean: {np.mean(correlations):.4f}")
- print(f" Median: {np.median(correlations):.4f}")
- print(f" Min: {np.min(correlations):.4f}")
- print(f" Max: {np.max(correlations):.4f}")
-
- # Overall correlation
- overall_corr = np.corrcoef(gold_embs.flatten(), gen_embs.flatten())[0, 1]
- print(f" Overall (flattened): {overall_corr:.4f}")
-
- print("\n" + "=" * 60)
- print("Summary Statistics")
- print("=" * 60)
-
- # Gold standard stats
- gold_embs = gold.select(emb_cols).to_numpy()
- print("\nGold standard embeddings:")
- print(f" Mean: {np.mean(gold_embs):.6f}")
- print(f" Std: {np.std(gold_embs):.6f}")
- print(f" Min: {np.min(gold_embs):.6f}")
- print(f" Max: {np.max(gold_embs):.6f}")
-
- # Generated stats
- gen_embs = gen.select(emb_cols).to_numpy()
- print("\nGenerated embeddings:")
- print(f" Mean: {np.mean(gen_embs):.6f}")
- print(f" Std: {np.std(gen_embs):.6f}")
- print(f" Min: {np.min(gen_embs):.6f}")
- print(f" Max: {np.max(gen_embs):.6f}")
-
-
-if __name__ == "__main__":
- compare_embeddings()
diff --git a/stock_1d/d033/alpha158_beta/scripts/debug_data_divergence.py b/stock_1d/d033/alpha158_beta/scripts/debug_data_divergence.py
deleted file mode 100644
index 5d6372c..0000000
--- a/stock_1d/d033/alpha158_beta/scripts/debug_data_divergence.py
+++ /dev/null
@@ -1,254 +0,0 @@
-#!/usr/bin/env python
-"""
-Debug script to compare gold-standard qlib data vs polars-based pipeline.
-
-This script helps identify where the data loading and processing pipeline
-starts to diverge from the gold-standard qlib output.
-"""
-
-import os
-import sys
-import pickle as pkl
-import numpy as np
-import pandas as pd
-import polars as pl
-from pathlib import Path
-
-# Paths
-GOLD_RAW_PATH = "/home/guofu/Workspaces/alpha_lab/stock_1d/d033/alpha158_beta/data/raw_data_20190101_20190131.pkl"
-GOLD_PROC_PATH = "/home/guofu/Workspaces/alpha_lab/stock_1d/d033/alpha158_beta/data/processed_data_20190101_20190131.pkl"
-PROC_LIST_PATH = "/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/proc_list.proc"
-
-sys.path.insert(0, str(Path(__file__).parent.parent / "scripts"))
-
-def compare_raw_data():
- """Compare raw data from gold standard vs polars pipeline."""
- print("=" * 80)
- print("STEP 1: Compare RAW DATA (before proc_list)")
- print("=" * 80)
-
- # Load gold standard raw data
- with open(GOLD_RAW_PATH, "rb") as f:
- gold_raw = pkl.load(f)
-
- print(f"\nGold standard raw data:")
- print(f" Shape: {gold_raw.shape}")
- print(f" Index: {gold_raw.index.names}")
- print(f" Column groups: {gold_raw.columns.get_level_values(0).unique().tolist()}")
-
- # Count columns per group
- for grp in gold_raw.columns.get_level_values(0).unique().tolist():
- count = (gold_raw.columns.get_level_values(0) == grp).sum()
- print(f" {grp}: {count} columns")
-
- # Show sample values for key columns
- print("\n Sample values (first 3 rows):")
- for col in [('feature', 'KMID'), ('feature_ext', 'turnover'), ('feature_ext', 'log_size')]:
- if col in gold_raw.columns:
- print(f" {col}: {gold_raw[col].iloc[:3].tolist()}")
-
- return gold_raw
-
-
-def compare_processed_data():
- """Compare processed data from gold standard vs polars pipeline."""
- print("\n" + "=" * 80)
- print("STEP 2: Compare PROCESSED DATA (after proc_list)")
- print("=" * 80)
-
- # Load gold standard processed data
- with open(GOLD_PROC_PATH, "rb") as f:
- gold_proc = pkl.load(f)
-
- print(f"\nGold standard processed data:")
- print(f" Shape: {gold_proc.shape}")
- print(f" Index: {gold_proc.index.names}")
- print(f" Column groups: {gold_proc.columns.get_level_values(0).unique().tolist()}")
-
- # Count columns per group
- for grp in gold_proc.columns.get_level_values(0).unique().tolist():
- count = (gold_proc.columns.get_level_values(0) == grp).sum()
- print(f" {grp}: {count} columns")
-
- # Show sample values for key columns
- print("\n Sample values (first 3 rows):")
- for col in [('feature', 'KMID'), ('feature', 'KMID_ntrl'),
- ('feature_ext', 'turnover'), ('feature_ext', 'turnover_ntrl')]:
- if col in gold_proc.columns:
- print(f" {col}: {gold_proc[col].iloc[:3].tolist()}")
-
- return gold_proc
-
-
-def analyze_processor_pipeline(gold_raw, gold_proc):
- """Analyze what transformations happened in the proc_list."""
- print("\n" + "=" * 80)
- print("STEP 3: Analyze Processor Transformations")
- print("=" * 80)
-
- # Load proc_list
- with open(PROC_LIST_PATH, "rb") as f:
- proc_list = pkl.load(f)
-
- print(f"\nProcessor pipeline ({len(proc_list)} processors):")
- for i, proc in enumerate(proc_list):
- print(f" [{i}] {type(proc).__name__}")
-
- # Analyze column changes
- print("\nColumn count changes:")
- print(f" Before: {gold_raw.shape[1]} columns")
- print(f" After: {gold_proc.shape[1]} columns")
- print(f" Change: +{gold_proc.shape[1] - gold_raw.shape[1]} columns")
-
- # Check which columns were added/removed
- gold_raw_cols = set(gold_raw.columns)
- gold_proc_cols = set(gold_proc.columns)
-
- added_cols = gold_proc_cols - gold_raw_cols
- removed_cols = gold_raw_cols - gold_proc_cols
-
- print(f"\n Added columns: {len(added_cols)}")
- print(f" Removed columns: {len(removed_cols)}")
-
- if removed_cols:
- print(f" Removed: {list(removed_cols)[:10]}...")
-
- # Check feature column patterns
- print("\nFeature column patterns in processed data:")
- feature_cols = [c for c in gold_proc.columns if c[0] == 'feature']
- ntrl_cols = [c for c in feature_cols if c[1].endswith('_ntrl')]
- raw_cols = [c for c in feature_cols if not c[1].endswith('_ntrl')]
- print(f" Total feature columns: {len(feature_cols)}")
- print(f" _ntrl columns: {len(ntrl_cols)}")
- print(f" raw columns: {len(raw_cols)}")
-
-
-def check_polars_pipeline():
- """Run the polars-based pipeline and compare."""
- print("\n" + "=" * 80)
- print("STEP 4: Generate data using Polars pipeline")
- print("=" * 80)
-
- try:
- from generate_beta_embedding import (
- load_all_data, merge_data_sources, apply_feature_pipeline,
- filter_stock_universe
- )
-
- # Load data using polars pipeline
- print("\nLoading data with polars pipeline...")
- df_alpha, df_kline, df_flag, df_industry = load_all_data(
- "2019-01-01", "2019-01-31"
- )
-
- print(f"\nPolars data sources loaded:")
- print(f" Alpha158: {df_alpha.shape}")
- print(f" Kline (market_ext): {df_kline.shape}")
- print(f" Flags: {df_flag.shape}")
- print(f" Industry: {df_industry.shape}")
-
- # Merge
- df_merged = merge_data_sources(df_alpha, df_kline, df_flag, df_industry)
- print(f"\nAfter merge: {df_merged.shape}")
-
- # Convert to pandas for easier comparison
- df_pandas = df_merged.to_pandas()
- df_pandas = df_pandas.set_index(['datetime', 'instrument'])
-
- print(f"\nAfter converting to pandas MultiIndex: {df_pandas.shape}")
-
- # Compare column names
- with open(GOLD_RAW_PATH, "rb") as f:
- gold_raw = pkl.load(f)
-
- print("\n" + "=" * 80)
- print("STEP 5: Compare Column Names (Gold vs Polars)")
- print("=" * 80)
-
- gold_cols = set(str(c) for c in gold_raw.columns)
- polars_cols = set(str(c) for c in df_pandas.columns)
-
- common_cols = gold_cols & polars_cols
- only_in_gold = gold_cols - polars_cols
- only_in_polars = polars_cols - gold_cols
-
- print(f"\n Common columns: {len(common_cols)}")
- print(f" Only in gold standard: {len(only_in_gold)}")
- print(f" Only in polars: {len(only_in_polars)}")
-
- if only_in_gold:
- print(f"\n Columns only in gold standard (first 20):")
- for col in list(only_in_gold)[:20]:
- print(f" {col}")
-
- if only_in_polars:
- print(f"\n Columns only in polars (first 20):")
- for col in list(only_in_polars)[:20]:
- print(f" {col}")
-
- # Check common columns values
- print("\n" + "=" * 80)
- print("STEP 6: Compare Values for Common Columns")
- print("=" * 80)
-
- # Get common columns as tuples
- common_tuples = []
- for gc in gold_raw.columns:
- gc_str = str(gc)
- for pc in df_pandas.columns:
- if str(pc) == gc_str:
- common_tuples.append((gc, pc))
- break
-
- print(f"\nComparing {len(common_tuples)} common columns...")
-
- # Compare first few columns
- matching_count = 0
- diff_count = 0
- for i, (gc, pc) in enumerate(common_tuples[:20]):
- gold_vals = gold_raw[gc].dropna().values
- polars_vals = df_pandas[pc].dropna().values
-
- if len(gold_vals) > 0 and len(polars_vals) > 0:
- # Compare min, max, mean
- if np.allclose([gold_vals.min(), gold_vals.max(), gold_vals.mean()],
- [polars_vals.min(), polars_vals.max(), polars_vals.mean()],
- rtol=1e-5):
- matching_count += 1
- else:
- diff_count += 1
- if diff_count <= 3:
- print(f" DIFF: {gc}")
- print(f" Gold: min={gold_vals.min():.6f}, max={gold_vals.max():.6f}, mean={gold_vals.mean():.6f}")
- print(f" Polars: min={polars_vals.min():.6f}, max={polars_vals.max():.6f}, mean={polars_vals.mean():.6f}")
-
- print(f"\n Matching columns: {matching_count}")
- print(f" Different columns: {diff_count}")
-
- except Exception as e:
- print(f"\nError running polars pipeline: {e}")
- import traceback
- traceback.print_exc()
-
-
-if __name__ == "__main__":
- print("=" * 80)
- print("DATA DIVERGENCE DEBUG SCRIPT")
- print("Comparing gold-standard qlib output vs polars-based pipeline")
- print("=" * 80)
-
- # Step 1: Check raw data
- gold_raw = compare_raw_data()
-
- # Step 2: Check processed data
- gold_proc = compare_processed_data()
-
- # Step 3: Analyze processor transformations
- analyze_processor_pipeline(gold_raw, gold_proc)
-
- # Step 4 & 5: Run polars pipeline and compare
- check_polars_pipeline()
-
- print("\n" + "=" * 80)
- print("DEBUG COMPLETE")
- print("=" * 80)
diff --git a/stock_1d/d033/alpha158_beta/scripts/dump_qlib_gold_standard.py b/stock_1d/d033/alpha158_beta/scripts/dump_qlib_gold_standard.py
deleted file mode 100644
index f311500..0000000
--- a/stock_1d/d033/alpha158_beta/scripts/dump_qlib_gold_standard.py
+++ /dev/null
@@ -1,421 +0,0 @@
-#!/usr/bin/env python
-"""
-Dump Gold-Standard Data from Qlib Pipeline
-
-This script exports processed feature data from the original Qlib pipeline
-in multiple formats for debugging and comparison with the standalone Polars implementation.
-
-Usage:
- python dump_qlib_gold_standard.py --start-date 2020-01-02 --end-date 2020-01-10 --output-dir ../data/
-"""
-
-import argparse
-import os
-import sys
-import pickle as pkl
-from datetime import datetime, timedelta
-from pathlib import Path
-
-import pandas as pd
-import polars as pl
-import numpy as np
-
-# Patch NumPy 2.0 compatibility: np.NaN was removed, use np.nan
-if not hasattr(np, 'NaN'):
- np.NaN = np.nan
-
-
-def parse_args():
- parser = argparse.ArgumentParser(
- description="Dump gold-standard data from Qlib pipeline"
- )
- parser.add_argument(
- "--start-date",
- type=str,
- default="2020-01-02",
- help="Start date for data export (YYYY-MM-DD)",
- )
- parser.add_argument(
- "--end-date",
- type=str,
- default="2020-01-10",
- help="End date for data export (YYYY-MM-DD)",
- )
- parser.add_argument(
- "--output-dir",
- type=str,
- default="../data/",
- help="Output directory for exported files",
- )
- parser.add_argument(
- "--qlib-dataset-path",
- type=str,
- default="/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/",
- help="Path to Qlib dataset module",
- )
- return parser.parse_args()
-
-
-def load_qlib_data(qlib_dataset_path, since_date):
- """
- Load processed data from Qlib pipeline.
-
- This function loads data using the original Qlib pipeline and handles
- the SepDataFrame return type by concatenating column groups.
-
- Args:
- qlib_dataset_path: Path to the Qlib dataset module
- since_date: Start date for loading data (YYYY-MM-DD)
-
- Returns:
- pd.DataFrame: Processed DataFrame from Qlib pipeline with all column groups concatenated
- """
- import importlib.util
- import datetime as dt
-
- # Patch ruamel.yaml to provide safe_load compatibility
- import ruamel.yaml as yaml
-
- # Create a YAML instance with safe loader for backward compatibility
- _yaml = yaml.YAML(typ='safe', pure=True)
-
- # Monkey-patch safe_load to use the new API
- def patched_safe_load(stream):
- import io
- if isinstance(stream, str):
- stream = io.StringIO(stream)
- return _yaml.load(stream)
-
- yaml.safe_load = patched_safe_load
-
- # Load the module directly
- spec = importlib.util.spec_from_file_location(
- "qlib_dataset",
- os.path.join(qlib_dataset_path, "__init__.py")
- )
- module = importlib.util.module_from_spec(spec)
- spec.loader.exec_module(module)
-
- # Parse since_date
- since_date_dt = pd.to_datetime(since_date)
- # Load with extra history for Diff processor
- load_start = (since_date_dt - dt.timedelta(days=20)).strftime("%Y-%m-%d")
-
- print(f" Loading data with handler (load_start={load_start})...")
-
- # Use _load_from_yaml to get raw handler data (SepDataFrame)
- handler_data = module._load_from_yaml(
- os.path.join(qlib_dataset_path, "handler.yaml"),
- load_start
- )
-
- # Handle SepDataFrame - extract and concatenate column groups
- if hasattr(handler_data, '_data') or hasattr(handler_data, '_df_dict'):
- # It's a SepDataFrame from AggHandler
- df_dict = getattr(handler_data, '_data', None) or getattr(handler_data, '_df_dict', {})
- group_names = list(df_dict.keys())
- print(f" Handler returned SepDataFrame with groups: {group_names}")
-
- # Concatenate all column groups into a single DataFrame
- all_dfs = []
- for group in group_names:
- df = df_dict[group]
- if df is not None and len(df.columns) > 0:
- df_copy = df.copy()
- # Add group prefix to columns
- df_copy.columns = [f"{group}::{col}" for col in df_copy.columns]
- all_dfs.append(df_copy)
- print(f" Group '{group}': {df_copy.shape}")
-
- # Concatenate all groups along axis 1
- raw_df = pd.concat(all_dfs, axis=1)
- print(f" Concatenated raw data shape: {raw_df.shape}")
- else:
- raw_df = handler_data
- print(f" Raw data shape: {raw_df.shape}")
-
- # Load processor list
- proc_path = os.path.join(qlib_dataset_path, "proc_list.proc")
- print(f" Loading processor list from: {proc_path}")
- with open(proc_path, "rb") as f:
- proc_list = pkl.load(f)
- print(f" Processor list has {len(proc_list)} processors")
- for i, proc in enumerate(proc_list):
- print(f" {i+1}. {type(proc).__name__}")
-
- # Apply processors
- from qlib.contrib.data.utils import apply_proc_list
- print(f" Applying processor list (with_fit=False)...")
-
- # The processor list expects columns without the group prefix
- # We need to strip the prefix before applying processors
- # Create a mapping and restore original column names
- col_mapping = {}
- for col in raw_df.columns:
- if '::' in col:
- original = col.split('::', 1)[1]
- col_mapping[col] = original
-
- # Rename columns back to original names for processor application
- raw_df_renamed = raw_df.rename(columns=col_mapping)
- print(f" Renamed columns for processor compatibility. Shape: {raw_df_renamed.shape}")
-
- # Convert boolean columns to object to avoid NaN -> int conversion issues
- bool_cols = raw_df_renamed.select_dtypes(include=['bool']).columns
- print(f" Converting {len(bool_cols)} boolean columns to object dtype")
- for col in bool_cols:
- raw_df_renamed[col] = raw_df_renamed[col].astype(object)
-
- # Apply processors
- df = apply_proc_list(raw_df_renamed, proc_list=proc_list, with_fit=False)
- print(f" Applied processor list. Result shape: {df.shape}")
-
- # Add back group prefixes to columns
- new_col_mapping = {v: k for k, v in col_mapping.items()}
- df = df.rename(columns=new_col_mapping)
- print(f" Restored column group prefixes. Shape: {df.shape}")
-
- # Filter to requested date range
- df = df.loc(axis=0)[slice(since_date_dt, None)]
- print(f" Filtered to since_date={since_date}. Final shape: {df.shape}")
-
- return df
-
-
-def export_column_groups(df, output_dir, prefix="gold_standard"):
- """
- Export separate files for different column groups.
-
- Column groups:
- - feature: alpha158 + alpha158_ntrl
- - feature_ext: extended features (log_size_diff, etc.)
- - feature_flag: market flags (IsST, IsN, IsZt, IsDt, etc.)
- - indus_idx: industry index columns
- """
- # Identify column groups based on naming conventions
- feature_cols = [c for c in df.columns if c.startswith("feature::")]
- feature_ext_cols = [c for c in df.columns if c.startswith("feature_ext::")]
- feature_flag_cols = [c for c in df.columns if c.startswith("feature_flag::")]
- indus_idx_cols = [c for c in df.columns if c.startswith("indus_idx::")]
-
- # Also include the ntrl suffixed columns
- feature_ntrl_cols = [c for c in df.columns if c.endswith("_ntrl")]
-
- export_paths = {}
-
- # Export feature columns (alpha158 + alpha158_ntrl)
- if feature_cols:
- feature_df = df[feature_cols]
- path = os.path.join(output_dir, f"{prefix}_feature.parquet")
- feature_df.to_parquet(path)
- export_paths["feature"] = path
- print(f" Exported feature columns ({len(feature_cols)}): {path}")
-
- # Export feature_ext columns
- if feature_ext_cols:
- feature_ext_df = df[feature_ext_cols]
- path = os.path.join(output_dir, f"{prefix}_feature_ext.parquet")
- feature_ext_df.to_parquet(path)
- export_paths["feature_ext"] = path
- print(f" Exported feature_ext columns ({len(feature_ext_cols)}): {path}")
-
- # Export feature_flag columns
- if feature_flag_cols:
- feature_flag_df = df[feature_flag_cols]
- path = os.path.join(output_dir, f"{prefix}_feature_flag.parquet")
- feature_flag_df.to_parquet(path)
- export_paths["feature_flag"] = path
- print(f" Exported feature_flag columns ({len(feature_flag_cols)}): {path}")
-
- # Export indus_idx columns
- if indus_idx_cols:
- indus_idx_df = df[indus_idx_cols]
- path = os.path.join(output_dir, f"{prefix}_indus_idx.parquet")
- indus_idx_df.to_parquet(path)
- export_paths["indus_idx"] = path
- print(f" Exported indus_idx columns ({len(indus_idx_cols)}): {path}")
-
- # Export feature_ntrl columns separately
- if feature_ntrl_cols:
- feature_ntrl_df = df[feature_ntrl_cols]
- path = os.path.join(output_dir, f"{prefix}_feature_ntrl.parquet")
- feature_ntrl_df.to_parquet(path)
- export_paths["feature_ntrl"] = path
- print(f" Exported feature_ntrl columns ({len(feature_ntrl_cols)}): {path}")
-
- return export_paths
-
-
-def export_metadata(df, output_dir, prefix="gold_standard", proc_list_path=None):
- """
- Export metadata about the dataset.
-
- Includes:
- - Column names and shapes
- - Processor list configuration
- - Date range coverage
- - NaN value statistics
- """
- metadata_path = os.path.join(output_dir, f"{prefix}_metadata.txt")
-
- with open(metadata_path, "w") as f:
- f.write("=" * 80 + "\n")
- f.write("GOLD-STANDARD QLIB PIPELINE OUTPUT - METADATA\n")
- f.write("=" * 80 + "\n\n")
-
- f.write(f"Export Date: {datetime.now().isoformat()}\n\n")
-
- f.write("DATAFRAME SHAPE\n")
- f.write("-" * 40 + "\n")
- f.write(f"Shape: {df.shape}\n")
- f.write(f"Rows: {len(df)}\n")
- f.write(f"Columns: {len(df.columns)}\n\n")
-
- f.write("DATE RANGE\n")
- f.write("-" * 40 + "\n")
- dates = df.index.get_level_values("datetime").unique()
- f.write(f"Min Date: {dates.min()}\n")
- f.write(f"Max Date: {dates.max()}\n")
- f.write(f"Unique Dates: {len(dates)}\n\n")
-
- f.write("INSTRUMENTS\n")
- f.write("-" * 40 + "\n")
- instruments = df.index.get_level_values("instrument").unique()
- f.write(f"Unique Instruments: {len(instruments)}\n")
- f.write(f"Sample Instruments: {list(instruments[:10])}\n\n")
-
- f.write("COLUMN GROUPS\n")
- f.write("-" * 40 + "\n")
-
- # Categorize columns
- feature_cols = [c for c in df.columns if c.startswith("feature::")]
- feature_ext_cols = [c for c in df.columns if c.startswith("feature_ext::")]
- feature_flag_cols = [c for c in df.columns if c.startswith("feature_flag::")]
- indus_idx_cols = [c for c in df.columns if c.startswith("indus_idx::")]
- feature_ntrl_cols = [c for c in df.columns if c.endswith("_ntrl")]
-
- f.write(f"feature:: columns: {len(feature_cols)}\n")
- f.write(f"feature_ext:: columns: {len(feature_ext_cols)}\n")
- f.write(f"feature_flag:: columns: {len(feature_flag_cols)}\n")
- f.write(f"indus_idx:: columns: {len(indus_idx_cols)}\n")
- f.write(f"*_ntrl columns: {len(feature_ntrl_cols)}\n\n")
-
- f.write("COLUMN DTYPES\n")
- f.write("-" * 40 + "\n")
- dtype_counts = df.dtypes.value_counts()
- for dtype, count in dtype_counts.items():
- f.write(f"{dtype}: {count}\n")
- f.write("\n")
-
- f.write("NAN STATISTICS\n")
- f.write("-" * 40 + "\n")
- nan_counts = df.isna().sum()
- cols_with_nan = nan_counts[nan_counts > 0]
- f.write(f"Columns with NaN: {len(cols_with_nan)}\n")
- f.write(f"Total NaN values: {df.isna().sum().sum()}\n\n")
-
- if len(cols_with_nan) > 0:
- f.write("NaN per column (top 20):\n")
- for col, cnt in cols_with_nan.nlargest(20).items():
- f.write(f" {col}: {cnt} ({100*cnt/len(df):.2f}%)\n")
- f.write("\n")
-
- f.write("ALL COLUMN NAMES\n")
- f.write("-" * 40 + "\n")
- for i, col in enumerate(df.columns):
- f.write(f" {i+1}. {col}\n")
- f.write("\n")
-
- if proc_list_path and os.path.exists(proc_list_path):
- f.write("PROCESSOR LIST\n")
- f.write("-" * 40 + "\n")
- f.write(f"Source: {proc_list_path}\n")
- try:
- with open(proc_list_path, "rb") as pf:
- proc_list = pkl.load(pf)
- f.write(f"Number of processors: {len(proc_list)}\n\n")
- for i, proc in enumerate(proc_list):
- f.write(f" {i+1}. {proc}\n")
- except Exception as e:
- f.write(f"Could not load processor list: {e}\n")
- f.write("\n")
-
- print(f"Exported metadata: {metadata_path}")
- return metadata_path
-
-
-def main():
- args = parse_args()
-
- # Parse dates
- start_date = pd.to_datetime(args.start_date)
- end_date = pd.to_datetime(args.end_date)
-
- # Create output directory if it doesn't exist
- output_dir = Path(args.output_dir).resolve()
- output_dir.mkdir(parents=True, exist_ok=True)
-
- print("=" * 80)
- print("DUMP GOLD-STANDARD DATA FROM QLIB PIPELINE")
- print("=" * 80)
- print(f"Date Range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
- print(f"Output Directory: {output_dir}")
- print(f"Qlib Dataset Path: {args.qlib_dataset_path}")
- print()
-
- # Load data from Qlib pipeline
- print("Step 1: Loading data from Qlib pipeline...")
- print(f" Loading since_date={start_date.strftime('%Y-%m-%d')}")
-
- try:
- df = load_qlib_data(args.qlib_dataset_path, start_date.strftime("%Y-%m-%d"))
- print(f" Loaded DataFrame with shape: {df.shape}")
- except Exception as e:
- print(f" ERROR: Failed to load data from Qlib pipeline: {e}")
- sys.exit(1)
-
- # Filter to requested date range
- print("\nStep 2: Filtering to requested date range...")
- df = df.loc(axis=0)[slice(start_date, end_date)]
- print(f" Filtered shape: {df.shape}")
-
- # Export full DataFrame
- print("\nStep 3: Exporting full DataFrame...")
- prefix = f"gold_standard_{start_date.strftime('%Y%m%d')}_{end_date.strftime('%Y%m%d')}"
-
- parquet_path = output_dir / f"{prefix}.parquet"
- df.to_parquet(parquet_path)
- print(f" Exported parquet: {parquet_path}")
-
- pkl_path = output_dir / f"{prefix}.pkl"
- df.to_pickle(pkl_path)
- print(f" Exported pickle: {pkl_path}")
-
- # Export column groups
- print("\nStep 4: Exporting column groups...")
- export_paths = export_column_groups(df, str(output_dir), prefix=prefix)
-
- # Export metadata
- print("\nStep 5: Exporting metadata...")
- proc_list_path = os.path.join(args.qlib_dataset_path, "proc_list.proc")
- export_metadata(df, str(output_dir), prefix=prefix, proc_list_path=proc_list_path)
-
- # Summary
- print("\n" + "=" * 80)
- print("EXPORT SUMMARY")
- print("=" * 80)
- print(f"Date range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
- print(f"Output directory: {output_dir}")
- print(f"Total rows: {len(df)}")
- print(f"Total columns: {len(df.columns)}")
- print(f"\nFiles exported:")
- print(f" - {prefix}.parquet (full DataFrame)")
- print(f" - {prefix}.pkl (pickle, preserves dtypes)")
- print(f" - {prefix}_metadata.txt (column info, statistics)")
- for group, path in export_paths.items():
- print(f" - {os.path.basename(path)} ({group} columns)")
- print("\nDone!")
-
-
-if __name__ == "__main__":
- main()
diff --git a/stock_1d/d033/alpha158_beta/scripts/dump_qlib_gold_standard_simple.py b/stock_1d/d033/alpha158_beta/scripts/dump_qlib_gold_standard_simple.py
deleted file mode 100644
index d475d30..0000000
--- a/stock_1d/d033/alpha158_beta/scripts/dump_qlib_gold_standard_simple.py
+++ /dev/null
@@ -1,270 +0,0 @@
-#!/usr/bin/env python
-"""
-Dump Gold-Standard Data from Qlib Pipeline (Simple Version)
-
-This script exports the RAW feature data from the Qlib pipeline BEFORE
-any processors are applied. This is useful for debugging and comparison.
-
-NOTE: This script loads ALL data from DolphinDB and then filters to the
-requested date range. For large date ranges, this may require significant memory.
-
-Usage:
- python dump_qlib_gold_standard_simple.py --start-date 2020-01-02 --end-date 2020-01-10
-"""
-
-import argparse
-import os
-import sys
-import pickle as pkl
-from datetime import datetime, timedelta
-from pathlib import Path
-
-import pandas as pd
-import numpy as np
-
-# Patch NumPy 2.0 compatibility: np.NaN was removed, use np.nan
-if not hasattr(np, 'NaN'):
- np.NaN = np.nan
-
-
-def parse_args():
- parser = argparse.ArgumentParser(
- description="Dump gold-standard raw data from Qlib pipeline",
- formatter_class=argparse.RawDescriptionHelpFormatter,
- epilog="""
-Examples:
- # Export a few days for debugging (recommended)
- python dump_qlib_gold_standard_simple.py --start-date 2020-01-02 --end-date 2020-01-10
-
- # Export with custom output directory
- python dump_qlib_gold_standard_simple.py --start-date 2020-01-02 --end-date 2020-01-10 --output-dir /path/to/output
- """
- )
- parser.add_argument(
- "--start-date",
- type=str,
- default="2020-01-02",
- help="Start date for data export (YYYY-MM-DD)",
- )
- parser.add_argument(
- "--end-date",
- type=str,
- default="2020-01-10",
- help="End date for data export (YYYY-MM-DD)",
- )
- parser.add_argument(
- "--output-dir",
- type=str,
- default="../data/",
- help="Output directory for exported files",
- )
- parser.add_argument(
- "--qlib-dataset-path",
- type=str,
- default="/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/",
- help="Path to Qlib dataset module",
- )
- parser.add_argument(
- "--instruments",
- type=str,
- default=None,
- help="Comma-separated list of instrument codes to export (default: all)",
- )
- return parser.parse_args()
-
-
-def load_raw_data(qlib_dataset_path, since_date, instruments=None):
- """
- Load RAW data from Qlib pipeline (before processor list is applied).
-
- Returns a dict of DataFrames, one per column group.
-
- Args:
- qlib_dataset_path: Path to Qlib dataset module
- since_date: Start date for loading (needs history before for Diff)
- instruments: Optional list of instrument codes to filter
- """
- import importlib.util
- import ruamel.yaml as yaml
-
- # Create a YAML instance with safe loader for backward compatibility
- _yaml = yaml.YAML(typ='safe', pure=True)
-
- def patched_safe_load(stream):
- import io
- if isinstance(stream, str):
- stream = io.StringIO(stream)
- return _yaml.load(stream)
-
- yaml.safe_load = patched_safe_load
-
- # Load the module directly
- spec = importlib.util.spec_from_file_location(
- "qlib_dataset",
- os.path.join(qlib_dataset_path, "__init__.py")
- )
- module = importlib.util.module_from_spec(spec)
- spec.loader.exec_module(module)
-
- # Parse since_date
- since_date_dt = pd.to_datetime(since_date)
- # Load with extra history for Diff processor
- load_start = (since_date_dt - timedelta(days=20)).strftime("%Y-%m-%d")
-
- print(f" Loading raw data from handler (load_start={load_start})...")
- if instruments:
- print(f" Filtering instruments: {instruments[:5]}... ({len(instruments)} total)")
-
- # Use _load_from_yaml to get raw handler data (SepDataFrame)
- handler_data = module._load_from_yaml(
- os.path.join(qlib_dataset_path, "handler.yaml"),
- load_start
- )
-
- # Handle SepDataFrame - extract column groups
- if hasattr(handler_data, '_data') or hasattr(handler_data, '_df_dict'):
- df_dict = getattr(handler_data, '_data', None) or getattr(handler_data, '_df_dict', {})
- group_names = list(df_dict.keys())
- print(f" Handler returned SepDataFrame with groups: {group_names}")
-
- # Filter instruments if specified
- if instruments:
- print(f" Filtering to specified instruments...")
- for group in group_names:
- if df_dict[group] is not None:
- df = df_dict[group]
- # Filter by instrument level
- if isinstance(df.index, pd.MultiIndex):
- mask = df.index.get_level_values('instrument').isin(instruments)
- df_dict[group] = df[mask]
- print(f" Group '{group}': {df_dict[group].shape} (filtered)")
-
- for group in group_names:
- df = df_dict[group]
- if df is not None:
- print(f" Group '{group}': shape={df.shape}, columns={len(df.columns)}")
-
- return df_dict, handler_data.index
- else:
- print(f" Handler returned DataFrame: shape={handler_data.shape}")
- return {"default": handler_data}, handler_data.index
-
-
-def export_data(df_dict, index, output_dir, start_date, end_date):
- """Export data to parquet and pickle files."""
- output_dir = Path(output_dir).resolve()
- output_dir.mkdir(parents=True, exist_ok=True)
-
- start_date = pd.to_datetime(start_date)
- end_date = pd.to_datetime(end_date)
-
- # Filter index
- mask = (index >= start_date) & (index <= end_date)
- filtered_index = index[mask]
-
- print(f"\nExporting data for date range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
- print(f" Filtered index has {len(filtered_index)} dates")
-
- prefix = f"gold_standard_raw_{start_date.strftime('%Y%m%d')}_{end_date.strftime('%Y%m%d')}"
-
- exported_files = []
-
- # Export each group separately
- for group, df in df_dict.items():
- if df is None or len(df.columns) == 0:
- print(f" Skipping empty group '{group}'")
- continue
-
- # Filter by date
- df_filtered = df.loc[df.index.isin(filtered_index)]
- print(f" Group '{group}': {df_filtered.shape}")
-
- # Export to parquet
- parquet_path = output_dir / f"{prefix}_{group}.parquet"
- df_filtered.to_parquet(parquet_path)
- exported_files.append(str(parquet_path))
- print(f" -> {parquet_path}")
-
- # Export to pickle (preserves dtypes)
- pkl_path = output_dir / f"{prefix}_{group}.pkl"
- df_filtered.to_pickle(pkl_path)
- exported_files.append(str(pkl_path))
-
- # Also create a metadata file
- metadata_path = output_dir / f"{prefix}_metadata.txt"
- with open(metadata_path, "w") as f:
- f.write("=" * 80 + "\n")
- f.write("GOLD-STANDARD RAW DATA - METADATA\n")
- f.write("=" * 80 + "\n\n")
- f.write(f"Export Date: {datetime.now().isoformat()}\n")
- f.write(f"Date Range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}\n")
- f.write(f"Total Dates: {len(filtered_index)}\n\n")
-
- f.write("COLUMN GROUPS:\n")
- f.write("-" * 40 + "\n")
- for group, df in df_dict.items():
- if df is not None:
- f.write(f" {group}:\n")
- f.write(f" Shape: {df.shape}\n")
- f.write(f" Columns: {len(df.columns)}\n")
- f.write(f" Sample columns: {list(df.columns[:5])}...\n\n")
-
- f.write("\nPROCESSOR LIST (for reference):\n")
- f.write("-" * 40 + "\n")
- proc_path = "/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/proc_list.proc"
- if os.path.exists(proc_path):
- with open(proc_path, "rb") as pf:
- proc_list = pkl.load(pf)
- f.write(f"Number of processors: {len(proc_list)}\n\n")
- for i, proc in enumerate(proc_list):
- f.write(f" {i+1}. {type(proc).__module__}.{type(proc).__name__}\n")
- else:
- f.write(f"Processor list not found: {proc_path}\n")
-
- exported_files.append(str(metadata_path))
-
- return exported_files
-
-
-def main():
- args = parse_args()
-
- print("=" * 80)
- print("DUMP GOLD-STANDARD RAW DATA FROM QLIB PIPELINE")
- print("=" * 80)
- print(f"Date Range: {args.start_date} to {args.end_date}")
- print(f"Output Directory: {args.output_dir}")
- print(f"Qlib Dataset Path: {args.qlib_dataset_path}")
- print()
-
- # Load raw data
- print("Step 1: Loading raw data from Qlib pipeline...")
- try:
- instruments = None
- if args.instruments:
- instruments = args.instruments.split(',')
- df_dict, index = load_raw_data(args.qlib_dataset_path, args.start_date, instruments=instruments)
- except Exception as e:
- print(f" ERROR: Failed to load data: {e}")
- import traceback
- traceback.print_exc()
- sys.exit(1)
-
- # Export data
- print("\nStep 2: Exporting data...")
- exported_files = export_data(df_dict, index, args.output_dir, args.start_date, args.end_date)
-
- # Summary
- print("\n" + "=" * 80)
- print("EXPORT SUMMARY")
- print("=" * 80)
- print(f"Date range: {args.start_date} to {args.end_date}")
- print(f"Output directory: {Path(args.output_dir).resolve()}")
- print(f"\nFiles exported ({len(exported_files)}):")
- for f in exported_files:
- print(f" - {f}")
- print("\nDone!")
-
-
-if __name__ == "__main__":
- main()
diff --git a/stock_1d/d033/alpha158_beta/scripts/regenerate_sample_embedding.py b/stock_1d/d033/alpha158_beta/scripts/regenerate_sample_embedding.py
deleted file mode 100644
index 1ffe4d5..0000000
--- a/stock_1d/d033/alpha158_beta/scripts/regenerate_sample_embedding.py
+++ /dev/null
@@ -1,186 +0,0 @@
-#!/usr/bin/env python
-"""
-Regenerate beta embeddings for a few days of sample data.
-
-This script generates embeddings for a small date range to test the pipeline.
-"""
-
-import os
-import sys
-import pickle as pkl
-import numpy as np
-import polars as pl
-import torch
-import torch.nn as nn
-from pathlib import Path
-from datetime import datetime
-from typing import List, Dict, Optional
-
-# Add parent directory to path
-sys.path.insert(0, str(Path(__file__).parent))
-
-# Import from the main generate script
-from generate_beta_embedding import (
- load_all_data,
- merge_data_sources,
- apply_feature_pipeline,
- prepare_vae_features,
- load_vae_model,
- encode_with_vae,
- load_qlib_processor_params,
- VAE_INPUT_DIM,
- OUTPUT_DIR,
-)
-
-# Sample dates for testing (5 consecutive trading days)
-SAMPLE_DATES = [
- "2019-01-02",
- "2019-01-03",
- "2019-01-04",
- "2019-01-07",
- "2019-01-08",
-]
-
-VAE_MODEL_PATH = "/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/model/csiallx_feature2_ntrla_flag_pnlnorm_vae4_dim32a_beta0001/module.pt"
-
-
-def generate_sample_embeddings(
- dates: List[str] = SAMPLE_DATES,
- output_file: str = "embedding_0_7_beta_sample.parquet",
- use_vae: bool = True
-) -> pl.DataFrame:
- """
- Generate embeddings for a sample of dates.
-
- Args:
- dates: List of dates in YYYY-MM-DD format
- output_file: Output parquet file path
- use_vae: Whether to use VAE for encoding (or random embeddings)
- """
- start_date = dates[0]
- end_date = dates[-1]
-
- print("=" * 60)
- print("Generating Sample Beta Embeddings")
- print(f"Dates: {dates}")
- print(f"Use VAE: {use_vae}")
- print("=" * 60)
-
- # Load all data sources
- df_alpha, df_kline, df_flag, df_industry = load_all_data(start_date, end_date)
-
- print(f"\nLoaded data:")
- print(f" Alpha158: {df_alpha.shape}")
- print(f" Kline: {df_kline.shape}")
- print(f" Flags: {df_flag.shape}")
- print(f" Industry: {df_industry.shape}")
-
- # Filter to only the sample dates
- date_ints = [int(d.replace("-", "")) for d in dates]
- df_alpha = df_alpha.filter(pl.col("datetime").is_in(date_ints))
- df_kline = df_kline.filter(pl.col("datetime").is_in(date_ints))
- df_flag = df_flag.filter(pl.col("datetime").is_in(date_ints))
- df_industry = df_industry.filter(pl.col("datetime").is_in(date_ints))
-
- print(f"\nAfter filtering to sample dates:")
- print(f" Alpha158: {df_alpha.shape}")
- print(f" Kline: {df_kline.shape}")
- print(f" Flags: {df_flag.shape}")
- print(f" Industry: {df_industry.shape}")
-
- # Merge data sources
- df = merge_data_sources(df_alpha, df_kline, df_flag, df_industry)
- print(f"\nMerged data shape: {df.shape}")
-
- # Save datetime and instrument before processing
- datetime_col = df["datetime"].clone()
- instrument_col = df["instrument"].clone()
-
- # Apply feature transformation pipeline
- df_processed, feature_cols, norm_feature_cols, market_flag_for_vae = apply_feature_pipeline(df)
-
- # Prepare features for VAE
- features = prepare_vae_features(
- df_processed, feature_cols,
- norm_feature_cols=norm_feature_cols,
- market_flag_for_vae=market_flag_for_vae
- )
-
- print(f"\nFeature matrix shape: {features.shape}")
-
- # Encode with VAE
- if use_vae:
- try:
- model = load_vae_model(VAE_MODEL_PATH)
- embeddings = encode_with_vae(features, model)
- print(f"\nVAE encoding successful!")
- except Exception as e:
- print(f"\nVAE encoding failed: {e}")
- import traceback
- traceback.print_exc()
- print("\nFalling back to random embeddings...")
- np.random.seed(42)
- embeddings = np.random.randn(features.shape[0], 32).astype(np.float32)
- else:
- print("\nUsing random embeddings (VAE disabled)...")
- np.random.seed(42)
- embeddings = np.random.randn(features.shape[0], 32).astype(np.float32)
-
- # Create output DataFrame
- embedding_cols = [f"embedding_{i}" for i in range(embeddings.shape[1])]
-
- result_data = {
- "datetime": datetime_col.to_list(),
- "instrument": instrument_col.to_list(),
- **{col_name: embeddings[:, i].tolist() for i, col_name in enumerate(embedding_cols)}
- }
-
- df_result = pl.DataFrame(result_data)
-
- # Ensure output directory exists
- output_path = Path(output_file)
- output_path.parent.mkdir(parents=True, exist_ok=True)
-
- # Save to parquet
- df_result.write_parquet(output_path)
- print(f"\nEmbeddings saved to: {output_path}")
- print(f"Output shape: {df_result.shape}")
- print(f"\nSample output:")
- print(df_result.head(10))
-
- # Print summary statistics
- print("\n" + "=" * 60)
- print("Summary Statistics")
- print("=" * 60)
- print(f"Total samples: {len(df_result)}")
- print(f"Embedding dimension: {embeddings.shape[1]}")
- print(f"Date range: {df_result['datetime'].min()} to {df_result['datetime'].max()}")
- print(f"Instruments: {df_result['instrument'].n_unique()}")
- print(f"Embedding mean: {np.mean(embeddings):.6f}")
- print(f"Embedding std: {np.std(embeddings):.6f}")
- print(f"Embedding min: {np.min(embeddings):.6f}")
- print(f"Embedding max: {np.max(embeddings):.6f}")
-
- return df_result
-
-
-if __name__ == "__main__":
- import argparse
-
- parser = argparse.ArgumentParser(description="Generate sample beta embeddings")
- parser.add_argument("--dates", nargs="+", default=SAMPLE_DATES,
- help="List of dates (YYYY-MM-DD)")
- parser.add_argument("--output", type=str, default="embedding_0_7_beta_sample.parquet",
- help="Output parquet file")
- parser.add_argument("--no-vae", action="store_true",
- help="Skip VAE encoding (use random embeddings)")
-
- args = parser.parse_args()
-
- generate_sample_embeddings(
- dates=args.dates,
- output_file=args.output,
- use_vae=not args.no_vae
- )
-
- print("\nDone!")
diff --git a/stock_1d/d033/alpha158_beta/scripts/run.log b/stock_1d/d033/alpha158_beta/scripts/run.log
deleted file mode 100644
index 8d58b7c..0000000
--- a/stock_1d/d033/alpha158_beta/scripts/run.log
+++ /dev/null
@@ -1,394 +0,0 @@
-[2715583:MainThread](2026-02-26 19:58:16,674) INFO - qlib.Initialization - [config.py:413] - default_conf: client.
-[2715583:MainThread](2026-02-26 19:58:16,680) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
-[2715583:MainThread](2026-02-26 19:58:16,681) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/qlib/default/data_ops/target')}
-================================================================================
-DUMP GOLD-STANDARD DATA FROM QLIB PIPELINE
-================================================================================
-Date Range: 2020-01-02 to 2020-01-10
-Output Directory: /home/guofu/Workspaces/alpha_lab/stock_1d/d033/alpha158_beta/data
-Qlib Dataset Path: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/
-
-Step 1: Loading data from Qlib pipeline...
- Loading since_date=2020-01-02
-Will use `placehorder_value` from module: qlib.contrib.data.config
-Will init handler object from config:
-{'data_handler_config': {'ddb_config': {'host': '192.168.1.146',
- 'password': '123456',
- 'port': 8848,
- 'username': 'admin'},
- 'end_time': datetime.date(2026, 2, 26),
- 'handler_list': [{'class': 'DDBAlpha158Handler',
- 'kwargs': {'col_set': 'feature',
- 'query_config': [{'alpha158_config': 'alpha158_expr.csv',
- 'db_path': 'dfs://daily_stock_run',
- 'dtype': 'float32',
- 'field_list': 'alpha158',
- 'table_name': 'stg_1day_wind_alpha158_0_7'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
- {'class': 'DDBMarketExtHandler',
- 'kwargs': {'col_set': 'feature_ext',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'float32',
- 'field_list': ['Turnover '
- 'as '
- 'turnover',
- 'FreeTurnover '
- 'as '
- 'free_turnover',
- 'log(MarketValue) '
- 'as '
- 'log_size'],
- 'table_name': 'stg_1day_wind_kline_adjusted'},
- {'db_path': 'dfs://daily_stock_run',
- 'dtype': 'float32',
- 'field_list': ['con_rating_strength'],
- 'table_name': 'stg_1day_gds_con_rating'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
- {'class': 'DDBMarketFlagHandler',
- 'kwargs': {'col_set': 'feature_flag',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': ['IsZt',
- 'IsDt',
- 'IsN',
- 'IsXD',
- 'IsXR',
- 'IsDR'],
- 'table_name': 'stg_1day_wind_kline_adjusted'},
- {'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': ['open_limit',
- 'close_limit',
- 'low_limit',
- 'open_stop',
- 'close_stop',
- 'high_stop'],
- 'table_name': 'stg_1day_wind_market_flag'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
- {'class': 'DDBIndusFlagHandler',
- 'kwargs': {'col_set': 'indus_flag',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': 'industry_code_cc.csv',
- 'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
- {'class': 'DDBStFlagHandler',
- 'kwargs': {'col_set': 'st_flag',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': ['ST_Y',
- 'ST_S',
- 'ST_T',
- 'ST_L',
- 'ST_Z',
- 'ST_X'],
- 'table_name': 'stg_1day_wind_st_flag'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
- 'instruments': 'csiallx',
- 'start_time': Timestamp('2019-12-03 00:00:00')},
- 'ddb_config': {'host': '192.168.1.146',
- 'password': '123456',
- 'port': 8848,
- 'username': 'admin'},
- 'handler': {'class': 'AggHandler',
- 'kwargs': {'ddb_config': {'host': '192.168.1.146',
- 'password': '123456',
- 'port': 8848,
- 'username': 'admin'},
- 'end_time': datetime.date(2026, 2, 26)[2715583:MainThread](2026-02-26 19:58:16,707) INFO - qlib.AggHandler - [agg_handler.py:79] - Will use AggHandler
-[2715583:MainThread](2026-02-26 19:58:16,707) WARNING - qlib.DataLoaderDH - [agg_handler.py:191] - instruments[csiallx] is ignored
-[2715583:MainThread](2026-02-26 19:58:17,067) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
-
- use mytt;
- select code,m_nDate,KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,HIGH0,LOW0,VWAP0,ROC5,ROC10,ROC20,ROC30,ROC60,MA5,MA10,MA20,MA30,MA60,STD5,STD10,STD20,STD30,STD60,BETA5,BETA10,BETA20,BETA30,BETA60,RSQR5,RSQR10,RSQR20,RSQR30,RSQR60,RESI5,RESI10,RESI20,RESI30,RESI60,MAX5,MAX10,MAX20,MAX30,MAX60,MIN5,MIN10,MIN20,MIN30,MIN60,QTLU5,QTLU10,QTLU20,QTLU30,QTLU60,QTLD5,QTLD10,QTLD20,QTLD30,QTLD60,RANK5,RANK10,RANK20,RANK30,RANK60,RSV5,RSV10,RSV20,RSV30,RSV60,IMAX5,IMAX
-[2715583:MainThread](2026-02-26 20:05:39,665) INFO - qlib.timer - [log.py:117] - Time cost: 442.946s | DDB query: Done
-[2715583:MainThread](2026-02-26 20:05:40,469) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
-,
- 'handler_list': [{'class': 'DDBAlpha158Handler',
- 'kwargs': {'col_set': 'feature',
- 'query_config': [{'alpha158_config': 'alpha158_expr.csv',
- 'db_path': 'dfs://daily_stock_run',
- 'dtype': 'float32',
- 'field_list': 'alpha158',
- 'table_name': 'stg_1day_wind_alpha158_0_7'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
- {'class': 'DDBMarketExtHandler',
- 'kwargs': {'col_set': 'feature_ext',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'float32',
- 'field_list': ['Turnover '
- 'as '
- 'turnover',
- 'FreeTurnover '
- 'as '
- 'free_turnover',
- 'log(MarketValue) '
- 'as '
- 'log_size'],
- 'table_name': 'stg_1day_wind_kline_adjusted'},
- {'db_path': 'dfs://daily_stock_run',
- 'dtype': 'float32',
- 'field_list': ['con_rating_strength'],
- 'table_name': 'stg_1day_gds_con_rating'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
- {'class': 'DDBMarketFlagHandler',
- 'kwargs': {'col_set': 'feature_flag',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': ['IsZt',
- 'IsDt',
- 'IsN',
- 'IsXD',
- 'IsXR',
- 'IsDR'],
- 'table_name': 'stg_1day_wind_kline_adjusted'},
- {'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': ['open_limit',
- 'close_limit',
- 'low_limit',
- 'open_stop',
- 'close_stop',
- 'high_stop'],
- 'table_name': 'stg_1day_wind_market_flag'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
- {'class': 'DDBIndusFlagHandler',
- 'kwargs': {'col_set': 'indus_flag',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': 'industry_code_cc.csv',
- 'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
- {'class': 'DDBStFlagHandler',
- 'kwargs': {'col_set': 'st_flag',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': ['ST_Y',
- 'ST_S',
- 'ST_T',
- 'ST_L',
- 'ST_Z',
- 'ST_X'],
- 'table_name': 'stg_1day_wind_st_flag'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
- 'instruments': 'csiallx',
- 'start_time': Timestamp('2019-12-03 00:00:00')},
- 'module_path': 'qlib.contrib.data.agg_handler'},
- 'load_end': datetime.date(2026, 2, 26),
- 'load_start': Timestamp('2019-12-03 00:00:00'),
- 'market': 'csiallx',
- 'qlib_init': {'provider_uri': '/home/guofu/.qlib/data_ops/target',
- 'region': 'cn'}}
-Query config:
-#alpha158: 1;
-Will use float32 for
-[2715583:MainThread](2026-02-26 20:07:46,118) INFO - qlib.timer - [log.py:117] - Time cost: 115.964s | Instruments filter: Done
-[2715583:MainThread](2026-02-26 20:07:53,273) INFO - qlib.timer - [log.py:117] - Time cost: 576.561s | Loading data () Done
-[2715583:MainThread](2026-02-26 20:07:53,274) INFO - qlib.timer - [log.py:117] - Time cost: 576.562s | Init data () Done
-[2715583:MainThread](2026-02-26 20:07:53,276) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
-[2715583:MainThread](2026-02-26 20:07:56,700) INFO - qlib.timer - [log.py:117] - Time cost: 3.423s | fetch_df_by_index Done
-[2715583:MainThread](2026-02-26 20:07:58,185) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-03 00:00:00'), datetime.date(2026, 2, 26), None)
- KMID KLEN ... VSUMD30 VSUMD60
-datetime instrument ...
-2019-12-03 SH600000 0.004234 0.011008 ... -0.031454 -0.009671
- SH600004 0.015467 0.031529 ... -0.004401 0.007701
- SH600006 0.022573 0.033860 ... 0.060561 -0.000159
- SH600007 0.012129 0.025470 ... 0.008489 -0.054056
- SH600008 0.006173 0.009259 ... -0.088065 -0.080770
-... ... ... ... ... ...
-2026-02-26 SZ301658 -0.017231 0.025231 ... -0.018706 0.003708
- SZ301662 0.060584 0.087834 ... -0.014658 -0.014613
- SZ301665 -0.012899 0.040541 ... 0.083229 0.055994
- SZ301678 0.018182 0.027879 ... -0.054124 0.014202
- SZ302132 0.001754 0.016416 ... -0.049558 -0.038667
-
-[6886779 rows x 158 columns]
-[2715583:MainThread](2026-02-26 20:07:58,186) INFO - qlib.timer - [log.py:117] - Time cost: 4.911s | Fetching dataframe Done
-[2715583:MainThread](2026-02-26 20:07:58,203) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
-
- use mytt;
- select code,m_nDate,Turnover as turnover,FreeTurnover as free_turnover,log(MarketValue) as log_size from
- loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
- where m_nDate>=2019.12.03 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ3
-[2715583:MainThread](2026-02-26 20:08:15,182) INFO - qlib.timer - [log.py:117] - Time cost: 16.990s | DDB query: Done
-[2715583:MainThread](2026-02-26 20:08:15,974) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
-[2715583:MainThread](2026-02-26 20:08:16,548) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
-
- use mytt;
- select code,m_nDate,con_rating_strength from
- loadTable("dfs://daily_stock_run", "stg_1day_gds_con_rating")
- where m_nDate>=2019.12.03 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002833','SH
-[2715583:MainThread](2026-02-26 20:08:27,838) INFO - qlib.timer - [log.py:117] - Time cost: 11.299s | DDB query: Done
-[2715583:MainThread](2026-02-26 20:08:28,690) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-25 00:00:00
-[2715583:MainThread](2026-02-26 20:09:53,616) INFO - qlib.timer - [log.py:117] - Time cost: 81.815s | Instruments filter: Done
-[2715583:MainThread](2026-02-26 20:09:54,168) INFO - qlib.timer - [log.py:117] - Time cost: 115.981s | Loading data () Done
-[2715583:MainThread](2026-02-26 20:09:54,169) INFO - qlib.timer - [log.py:117] - Time cost: 115.982s | Init data () Done
-[2715583:MainThread](2026-02-26 20:09:54,170) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
-[2715583:MainThread](2026-02-26 20:09:54,893) INFO - qlib.timer - [log.py:117] - Time cost: 0.723s | fetch_df_by_index Done
-[2715583:MainThread](2026-02-26 20:09:54,901) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-03 00:00:00'), datetime.date(2026, 2, 26), None)
- turnover free_turnover log_size con_rating_strength
-datetime instrument
-2019-12-03 SH600000 0.0696 0.1275 17.322001 0.6618
- SH600004 0.6009 1.2276 15.077468 0.8269
- SH600006 0.5976 1.5087 13.716795 1.0000
- SH600007 0.0961 0.4969 14.334991 0.7500
- SH600008 0.0967 0.1793 14.432563 0.6591
-... ... ... ... ...
-2026-02-26 SZ301658 6.0785 6.0785 11.788368 NaN
- SZ301662 12.5950 12.5950 12.681215 NaN
- SZ301665 14.0077 14.0077 11.719415 NaN
- SZ301678 6.6518 6.6518 12.799973 NaN
- SZ302132 1.3868 3.0296 15.359885 NaN
-
-[7601552 rows x 4 columns]
-[2715583:MainThread](2026-02-26 20:09:54,902) INFO - qlib.timer - [log.py:117] - Time cost: 0.732s | Fetching dataframe Done
-[2715583:MainThread](2026-02-26 20:09:54,917) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
-
- use mytt;
- select code,m_nDate,IsZt,IsDt,IsN,IsXD,IsXR,IsDR from
- loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
- where m_nDate>=2019.12.03 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657',
-[2715583:MainThread](2026-02-26 20:10:15,465) INFO - qlib.timer - [log.py:117] - Time cost: 20.556s | DDB query: Done
-[2715583:MainThread](2026-02-26 20:10:16,265) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
-[2715583:MainThread](2026-02-26 20:10:16,775) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
-
- use mytt;
- select code,m_nDate,open_limit,close_limit,low_limit,open_stop,close_stop,high_stop from
- loadTable("dfs://daily_stock_run", "stg_1day_wind_market_flag")
- where m_nDate>=2019.12.03 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','
-[2715583:MainThread](2026-02-26 20:10:36,740) INFO - qlib.timer - [log.py:117] - Time cost: 19.975s | DDB query: Done
-[2715583:MainThread](2026-02-26 20:10:37,558) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
-[2715583:MainThread](2026-02-26 20:12:04,978) INFO - qlib.timer - [log.py:117] - Time cost: 84.148s | Instruments filter: Done
-[2715583:MainThread](2026-02-26 20:12:05,899) INFO - qlib.timer - [log.py:117] - Time cost: 130.996s | Loading data () Done
-[2715583:MainThread](2026-02-26 20:12:05,900) INFO - qlib.timer - [log.py:117] - Time cost: 130.997s | Init data () Done
-[2715583:MainThread](2026-02-26 20:12:05,902) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
-[2715583:MainThread](2026-02-26 20:12:06,745) INFO - qlib.timer - [log.py:117] - Time cost: 0.842s | fetch_df_by_index Done
-[2715583:MainThread](2026-02-26 20:12:06,758) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-03 00:00:00'), datetime.date(2026, 2, 26), None)
- IsZt IsDt IsN ... open_stop close_stop high_stop
-datetime instrument ...
-2019-12-03 SH600000 False False False ... False False False
- SH600004 False False False ... False False False
- SH600006 False False False ... False False False
- SH600007 False False False ... False False False
- SH600008 False False False ... False False False
-... ... ... ... ... ... ... ...
-2026-02-26 SZ301658 False False False ... False False False
- SZ301662 False False False ... False False False
- SZ301665 False False False ... False False False
- SZ301678 False False False ... False False False
- SZ302132 False False False ... False False False
-
-[6903684 rows x 12 columns]
-[2715583:MainThread](2026-02-26 20:12:06,759) INFO - qlib.timer - [log.py:117] - Time cost: 0.857s | Fetching dataframe Done
-[2715583:MainThread](2026-02-26 20:12:06,777) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
-
- use mytt;
- select code,m_nDate,gds_CC10,gds_CC11,gds_CC12,gds_CC20,gds_CC21,gds_CC22,gds_CC23,gds_CC24,gds_CC25,gds_CC26,gds_CC27,gds_CC28,gds_CC30,gds_CC31,gds_CC32,gds_CC33,gds_CC34,gds_CC35,gds_CC36,gds_CC37,gds_CC40,gds_CC41,gds_CC42,gds_CC43,gds_CC50,gds_CC60,gds_CC61,gds_CC62,gds_CC63,gds_CC70 from
- loadTable("dfs://daily_stock_run", "stg_1day_gds_indus_flag_cc1")
- where m_nDate>=2019.12.03 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','S
-[2715583:MainThread](2026-02-26 20:12:08,840) INFO - qlib.timer - [log.py:117] - Time cost: 2.073s | DDB query: Done
-[2715583:MainThread](2026-02-26 20:12:08,849) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
-[2715583:MainThread](2026-02-26 20:13:26,572) INFO - qlib.timer - [log.py:117] - Time cost: 77.719s | Instruments filter: Done
-[2715583:MainThread](2026-02-26 20:13:26,601) INFO - qlib.timer - [log.py:117] - Time cost: 79.839s | Loading data () Done
-[2715583:MainThread](2026-02-26 20:13:26,602) INFO - qlib.timer - [log.py:117] - Time cost: 79.840s | Init data () Done
-[2715583:MainThread](2026-02-26 20:13:26,603) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
-[2715583:MainThread](2026-02-26 20:13:26,612) INFO - qlib.timer - [log.py:117] - Time cost: 0.008s | fetch_df_by_index Done
-[2715583:MainThread](2026-02-26 20:13:26,633) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-03 00:00:00'), datetime.date(2026, 2, 26), None)
- gds_CC10 gds_CC11 ... gds_CC63 gds_CC70
-datetime instrument ...
-2026-02-09 SH600000 False False ... False False
- SH600004 False False ... False False
- SH600006 False False ... False False
- SH600007 False False ... False False
- SH600008 False False ... False False
-... ... ... ... ... ...
-2026-02-26 SZ301658 False False ... False False
- SZ301662 False False ... False False
- SZ301665 False False ... False False
- SZ301678 False False ... False False
- SZ302132 False False ... False False
-
-[41168 rows x 30 columns]
-[2715583:MainThread](2026-02-26 20:13:26,634) INFO - qlib.timer - [log.py:117] - Time cost: 0.031s | Fetching dataframe Done
-[2715583:MainThread](2026-02-26 20:13:26,652) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
-
- use mytt;
- select code,m_nDate,ST_Y,ST_S,ST_T,ST_L,ST_Z,ST_X from
- loadTable("dfs://daily_stock_run", "stg_1day_wind_st_flag")
- where m_nDate>=2019.12.03 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002
-[2715583:MainThread](2026-02-26 20:13:55,744) INFO - qlib.timer - [log.py:117] - Time cost: 29.102s | DDB query: Done
-[2715583:MainThread](2026-02-26 20:13:56,520) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
-[2715583:MainThread](2026-02-26 20:15:27,625) INFO - qlib.timer - [log.py:117] - Time cost: 90.586s | Instruments filter: Done
-[2715583:MainThread](2026-02-26 20:15:28,257) INFO - qlib.timer - [log.py:117] - Time cost: 121.621s | Loading data () Done
-[2715583:MainThread](2026-02-26 20:15:28,257) INFO - qlib.timer - [log.py:117] - Time cost: 121.622s | Init data () Done
-[2715583:MainThread](2026-02-26 20:15:28,258) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
-[2715583:MainThread](2026-02-26 20:15:28,867) INFO - qlib.timer - [log.py:117] - Time cost: 0.608s | fetch_df_by_index Done
-[2715583:MainThread](2026-02-26 20:15:28,875) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-03 00:00:00'), datetime.date(2026, 2, 26), None)
- ST_Y ST_S ST_T ST_L ST_Z ST_X
-datetime instrument
-2019-12-03 SH600000 False False False False False False
- SH600004 False False False False False False
- SH600006 False False False False False False
- SH600007 False False False False False False
- SH600008 False False False False False False
-... ... ... ... ... ... ...
-2026-02-26 SZ301658 False False False False False False
- SZ301662 False False False False False False
- SZ301665 False False False False False False
- SZ301678 False False False False False False
- SZ302132 False False False False False False
-
-[6903687 rows x 6 columns]
-[2715583:MainThread](2026-02-26 20:15:28,876) INFO - qlib.timer - [log.py:117] - Time cost: 0.617s | Fetching dataframe Done
-/home/guofu/.venv/alpha2/lib/python3.12/site-packages/qlib/contrib/utils/paral.py:22: FutureWarning: 'M' is deprecated and will be removed in a future version, please use 'ME' instead.
- group_list = [_df.resample("M", level="datetime")\
-Will use float32 for
-Will use float32 for
-Query config:
-#concepts: 2;
-Will use bool for
-Will use bool for
-Field list: ['gds_CC10', 'gds_CC11', 'gds_CC12', 'gds_CC20', 'gds_CC21', 'gds_CC22', 'gds_CC23', 'gds_CC24', 'gds_CC25', 'gds_CC26', 'gds_CC27', 'gds_CC28', 'gds_CC30', 'gds_CC31', 'gds_CC32', 'gds_CC33', 'gds_CC34', 'gds_CC35', 'gds_CC36', 'gds_CC37', 'gds_CC40', 'gds_CC41', 'gds_CC42', 'gds_CC43', 'gds_CC50', 'gds_CC60', 'gds_CC61', 'gds_CC62', 'gds_CC63', 'gds_CC70']
-Will use bool for
-Will use bool for
-[2715583:MainThread](2026-02-26 20:15:32,735) INFO - qlib.timer - [log.py:117] - Time cost: 3.858s | Concat index: Done
-[2715583:MainThread](2026-02-26 20:15:32,737) INFO - qlib.timer - [log.py:117] - Time cost: 0.001s | Sort index: Done
-[2715583:MainThread](2026-02-26 20:15:36,349) INFO - qlib.timer - [log.py:117] - Time cost: 3.611s | Creating SepDataFrame: Done
-[2715583:MainThread](2026-02-26 20:15:37,245) INFO - qlib.timer - [log.py:117] - Time cost: 1040.537s | Loading data () Done
-[2715583:MainThread](2026-02-26 20:15:37,246) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
-[]
-[2715583:MainThread](2026-02-26 20:15:37,248) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
-[2715583:MainThread](2026-02-26 20:15:37,265) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
-[]
-[2715583:MainThread](2026-02-26 20:15:37,266) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
-[2715583:MainThread](2026-02-26 20:15:37,277) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
-[]
-[2715583:MainThread](2026-02-26 20:15:37,277) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
-[2715583:MainThread](2026-02-26 20:15:37,293) INFO - qlib.timer - [log.py:117] - Time cost: 0.047s | fit & process data Done
-[2715583:MainThread](2026-02-26 20:15:37,294) INFO - qlib.timer - [log.py:117] - Time cost: 1040.587s | Init data () Done
-[2715583:MainThread](2026-02-26 20:15:37,963) INFO - qlib.DataHandlerLP - [handler.py:487] - Will apply processor
-[2715583:MainThread](2026-02-26 20:15:40,135) INFO - qlib.timer - [log.py:117] - Time cost: 2.171s | Diff Done
-[2715583:MainThread](2026-02-26 20:15:40,136) INFO - qlib.DataHandlerLP - [handler.py:487] - Will apply processor
-All processors are readonly
-All processors are readonly
-All processors are readonly
-Did load data from config: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/handler.yaml
-Did load norm from: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/proc_list.proc
-Will assign `feature_ext` with
- turnover ... con_rating_strength_diff
-datetime instrument ...
-2026-02-09 SH600000 0.1837 ... 0.0
- SH600004 0.6948 ... 0.0
- SH600006 0.5542 ... 0.0
- SH600007 0.2057 ... 0.0
- SH600008 0.9809 ... 0.0
-... ... ... ...
-2026-02-26 SZ301658 6.0785 ... 0.0
- SZ301662 12.5950 ... 0.0
- SZ301665 14.0077 ... 0.0
- SZ301678 6.6518 ... 0.0
- SZ302132 1.3868 ... 0.0
-
-[41085 rows x 8 columns]
----
- ERROR: Failed to load data from Qlib pipeline: Cannot convert non-finite values (NA or inf) to integer
diff --git a/stock_1d/d033/alpha158_beta/scripts/run2.log b/stock_1d/d033/alpha158_beta/scripts/run2.log
deleted file mode 100644
index dd3e579..0000000
--- a/stock_1d/d033/alpha158_beta/scripts/run2.log
+++ /dev/null
@@ -1,373 +0,0 @@
-[2730312:MainThread](2026-02-26 21:28:33,675) INFO - qlib.Initialization - [config.py:413] - default_conf: client.
-[2730312:MainThread](2026-02-26 21:28:33,679) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
-[2730312:MainThread](2026-02-26 21:28:33,680) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/qlib/default/data_ops/target')}
-================================================================================
-DUMP GOLD-STANDARD DATA FROM QLIB PIPELINE
-================================================================================
-Date Range: 2020-01-02 to 2020-01-10
-Output Directory: /home/guofu/Workspaces/alpha_lab/stock_1d/d033/alpha158_beta/data
-Qlib Dataset Path: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/
-
-Step 1: Loading data from Qlib pipeline...
- Loading since_date=2020-01-02
- Loading raw data from handler.yaml...
-Will use `placehorder_value` from module: qlib.contrib.data.config
-Will init handler object from config:
-{'data_handler_config': {'ddb_config': {'host': '192.168.1.146',
- 'password': '123456',
- 'port': 8848,
- 'username': 'admin'},
- 'end_time': datetime.date(2026, 2, 26),
- 'handler_list': [{'class': 'DDBAlpha158Handler',
- 'kwargs': {'col_set': 'feature',
- 'query_config': [{'alpha158_config': 'alpha158_expr.csv',
- 'db_path': 'dfs://daily_stock_run',
- 'dtype': 'float32',
- 'field_list': 'alpha158',
- 'table_name': 'stg_1day_wind_alpha158_0_7'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
- {'class': 'DDBMarketExtHandler',
- 'kwargs': {'col_set': 'feature_ext',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'float32',
- 'field_list': ['Turnover '
- 'as '
- 'turnover',
- 'FreeTurnover '
- 'as '
- 'free_turnover',
- 'log(MarketValue) '
- 'as '
- 'log_size'],
- 'table_name': 'stg_1day_wind_kline_adjusted'},
- {'db_path': 'dfs://daily_stock_run',
- 'dtype': 'float32',
- 'field_list': ['con_rating_strength'],
- 'table_name': 'stg_1day_gds_con_rating'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
- {'class': 'DDBMarketFlagHandler',
- 'kwargs': {'col_set': 'feature_flag',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': ['IsZt',
- 'IsDt',
- 'IsN',
- 'IsXD',
- 'IsXR',
- 'IsDR'],
- 'table_name': 'stg_1day_wind_kline_adjusted'},
- {'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': ['open_limit',
- 'close_limit',
- 'low_limit',
- 'open_stop',
- 'close_stop',
- 'high_stop'],
- 'table_name': 'stg_1day_wind_market_flag'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
- {'class': 'DDBIndusFlagHandler',
- 'kwargs': {'col_set': 'indus_flag',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': 'industry_code_cc.csv',
- 'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
- {'class': 'DDBStFlagHandler',
- 'kwargs': {'col_set': 'st_flag',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': ['ST_Y',
- 'ST_S',
- 'ST_T',
- 'ST_L',
- 'ST_Z',
- 'ST_X'],
- 'table_name': 'stg_1day_wind_st_flag'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
- 'instruments': 'csiallx',
- 'start_time': Timestamp('2019-12-13 00:00:00')},
- 'ddb_config': {'host': '192.168.1.146',
- 'password': '123456',
- 'port': 8848,
- 'username': 'admin'},
- 'handler': {'class': 'AggHandler',
- 'kwargs': {'ddb_config': {'host': '192.168.1.146',
- 'password': '123456',
- 'port': 8848,
- 'username': 'admin'},
- 'end_time': [2730312:MainThread](2026-02-26 21:28:33,704) INFO - qlib.AggHandler - [agg_handler.py:79] - Will use AggHandler
-[2730312:MainThread](2026-02-26 21:28:33,704) WARNING - qlib.DataLoaderDH - [agg_handler.py:191] - instruments[csiallx] is ignored
-[2730312:MainThread](2026-02-26 21:28:34,011) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
-
- use mytt;
- select code,m_nDate,KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,HIGH0,LOW0,VWAP0,ROC5,ROC10,ROC20,ROC30,ROC60,MA5,MA10,MA20,MA30,MA60,STD5,STD10,STD20,STD30,STD60,BETA5,BETA10,BETA20,BETA30,BETA60,RSQR5,RSQR10,RSQR20,RSQR30,RSQR60,RESI5,RESI10,RESI20,RESI30,RESI60,MAX5,MAX10,MAX20,MAX30,MAX60,MIN5,MIN10,MIN20,MIN30,MIN60,QTLU5,QTLU10,QTLU20,QTLU30,QTLU60,QTLD5,QTLD10,QTLD20,QTLD30,QTLD60,RANK5,RANK10,RANK20,RANK30,RANK60,RSV5,RSV10,RSV20,RSV30,RSV60,IMAX5,IMAX
-[2730312:MainThread](2026-02-26 21:36:00,317) INFO - qlib.timer - [log.py:117] - Time cost: 446.602s | DDB query: Done
-[2730312:MainThread](2026-02-26 21:36:01,106) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
-datetime.date(2026, 2, 26),
- 'handler_list': [{'class': 'DDBAlpha158Handler',
- 'kwargs': {'col_set': 'feature',
- 'query_config': [{'alpha158_config': 'alpha158_expr.csv',
- 'db_path': 'dfs://daily_stock_run',
- 'dtype': 'float32',
- 'field_list': 'alpha158',
- 'table_name': 'stg_1day_wind_alpha158_0_7'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
- {'class': 'DDBMarketExtHandler',
- 'kwargs': {'col_set': 'feature_ext',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'float32',
- 'field_list': ['Turnover '
- 'as '
- 'turnover',
- 'FreeTurnover '
- 'as '
- 'free_turnover',
- 'log(MarketValue) '
- 'as '
- 'log_size'],
- 'table_name': 'stg_1day_wind_kline_adjusted'},
- {'db_path': 'dfs://daily_stock_run',
- 'dtype': 'float32',
- 'field_list': ['con_rating_strength'],
- 'table_name': 'stg_1day_gds_con_rating'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
- {'class': 'DDBMarketFlagHandler',
- 'kwargs': {'col_set': 'feature_flag',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': ['IsZt',
- 'IsDt',
- 'IsN',
- 'IsXD',
- 'IsXR',
- 'IsDR'],
- 'table_name': 'stg_1day_wind_kline_adjusted'},
- {'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': ['open_limit',
- 'close_limit',
- 'low_limit',
- 'open_stop',
- 'close_stop',
- 'high_stop'],
- 'table_name': 'stg_1day_wind_market_flag'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
- {'class': 'DDBIndusFlagHandler',
- 'kwargs': {'col_set': 'indus_flag',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': 'industry_code_cc.csv',
- 'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
- {'class': 'DDBStFlagHandler',
- 'kwargs': {'col_set': 'st_flag',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': ['ST_Y',
- 'ST_S',
- 'ST_T',
- 'ST_L',
- 'ST_Z',
- 'ST_X'],
- 'table_name': 'stg_1day_wind_st_flag'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
- 'instruments': 'csiallx',
- 'start_time': Timestamp('2019-12-13 00:00:00')},
- 'module_path': 'qlib.contrib.data.agg_handler'},
- 'load_end': datetime.date(2026, 2, 26),
- 'load_start': Timestamp('2019-12-13 00:00:00'),
- 'market': 'csiallx',
- 'qlib_init': {'provider_uri': '/home/guofu/.qlib/data_ops/target',
- 'region': 'cn'}}
-Query config:
-#alpha158: 1;
-Will use float32 for
-[2730312:MainThread](2026-02-26 21:38:13,636) INFO - qlib.timer - [log.py:117] - Time cost: 123.423s | Instruments filter: Done
-[2730312:MainThread](2026-02-26 21:38:20,733) INFO - qlib.timer - [log.py:117] - Time cost: 587.024s | Loading data () Done
-[2730312:MainThread](2026-02-26 21:38:20,734) INFO - qlib.timer - [log.py:117] - Time cost: 587.026s | Init data () Done
-[2730312:MainThread](2026-02-26 21:38:20,736) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
-[2730312:MainThread](2026-02-26 21:38:24,302) INFO - qlib.timer - [log.py:117] - Time cost: 3.564s | fetch_df_by_index Done
-[2730312:MainThread](2026-02-26 21:38:25,946) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-13 00:00:00'), datetime.date(2026, 2, 26), None)
- KMID KLEN ... VSUMD30 VSUMD60
-datetime instrument ...
-2019-12-13 SH600000 0.011686 0.015025 ... -0.011573 0.039735
- SH600004 0.000000 0.009169 ... -0.146051 0.024757
- SH600006 -0.004329 0.015152 ... 0.136883 0.024626
- SH600007 0.005590 0.019005 ... -0.012912 0.017215
- SH600008 0.012270 0.012270 ... 0.039878 -0.013888
-... ... ... ... ... ...
-2026-02-26 SZ301658 -0.017231 0.025231 ... -0.018706 0.003708
- SZ301662 0.060584 0.087834 ... -0.014658 -0.014613
- SZ301665 -0.012899 0.040541 ... 0.083229 0.055994
- SZ301678 0.018182 0.027879 ... -0.054124 0.014202
- SZ302132 0.001754 0.016416 ... -0.049558 -0.038667
-
-[6858048 rows x 158 columns]
-[2730312:MainThread](2026-02-26 21:38:25,947) INFO - qlib.timer - [log.py:117] - Time cost: 5.212s | Fetching dataframe Done
-[2730312:MainThread](2026-02-26 21:38:25,965) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
-
- use mytt;
- select code,m_nDate,Turnover as turnover,FreeTurnover as free_turnover,log(MarketValue) as log_size from
- loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
- where m_nDate>=2019.12.13 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ3
-[2730312:MainThread](2026-02-26 21:38:43,081) INFO - qlib.timer - [log.py:117] - Time cost: 17.127s | DDB query: Done
-[2730312:MainThread](2026-02-26 21:38:43,874) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
-[2730312:MainThread](2026-02-26 21:38:44,458) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
-
- use mytt;
- select code,m_nDate,con_rating_strength from
- loadTable("dfs://daily_stock_run", "stg_1day_gds_con_rating")
- where m_nDate>=2019.12.13 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002833','SH
-[2730312:MainThread](2026-02-26 21:38:55,720) INFO - qlib.timer - [log.py:117] - Time cost: 11.271s | DDB query: Done
-[2730312:MainThread](2026-02-26 21:38:56,586) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-25 00:00:00
-[2730312:MainThread](2026-02-26 21:40:21,007) INFO - qlib.timer - [log.py:117] - Time cost: 81.315s | Instruments filter: Done
-[2730312:MainThread](2026-02-26 21:40:21,576) INFO - qlib.timer - [log.py:117] - Time cost: 115.627s | Loading data () Done
-[2730312:MainThread](2026-02-26 21:40:21,576) INFO - qlib.timer - [log.py:117] - Time cost: 115.628s | Init data () Done
-[2730312:MainThread](2026-02-26 21:40:21,577) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
-[2730312:MainThread](2026-02-26 21:40:22,309) INFO - qlib.timer - [log.py:117] - Time cost: 0.731s | fetch_df_by_index Done
-[2730312:MainThread](2026-02-26 21:40:22,317) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-13 00:00:00'), datetime.date(2026, 2, 26), None)
- turnover free_turnover log_size con_rating_strength
-datetime instrument
-2019-12-13 SH600000 0.2118 0.3879 17.343685 0.7143
- SH600004 0.7518 1.5357 15.099485 0.8214
- SH600006 0.7827 1.9762 13.732129 1.0000
- SH600007 0.1368 0.7071 14.409998 0.7500
- SH600008 0.2152 0.3990 14.444757 0.7500
-... ... ... ... ...
-2026-02-26 SZ301658 6.0785 6.0785 11.788368 NaN
- SZ301662 12.5950 12.5950 12.681215 NaN
- SZ301665 14.0077 14.0077 11.719415 NaN
- SZ301678 6.6518 6.6518 12.799973 NaN
- SZ302132 1.3868 3.0296 15.359885 NaN
-
-[7572626 rows x 4 columns]
-[2730312:MainThread](2026-02-26 21:40:22,318) INFO - qlib.timer - [log.py:117] - Time cost: 0.741s | Fetching dataframe Done
-[2730312:MainThread](2026-02-26 21:40:22,334) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
-
- use mytt;
- select code,m_nDate,IsZt,IsDt,IsN,IsXD,IsXR,IsDR from
- loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
- where m_nDate>=2019.12.13 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657',
-[2730312:MainThread](2026-02-26 21:40:43,075) INFO - qlib.timer - [log.py:117] - Time cost: 20.751s | DDB query: Done
-[2730312:MainThread](2026-02-26 21:40:43,889) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
-[2730312:MainThread](2026-02-26 21:40:44,394) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
-
- use mytt;
- select code,m_nDate,open_limit,close_limit,low_limit,open_stop,close_stop,high_stop from
- loadTable("dfs://daily_stock_run", "stg_1day_wind_market_flag")
- where m_nDate>=2019.12.13 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','
-[2730312:MainThread](2026-02-26 21:41:04,632) INFO - qlib.timer - [log.py:117] - Time cost: 20.246s | DDB query: Done
-[2730312:MainThread](2026-02-26 21:41:05,434) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
-[2730312:MainThread](2026-02-26 21:42:33,029) INFO - qlib.timer - [log.py:117] - Time cost: 84.294s | Instruments filter: Done
-[2730312:MainThread](2026-02-26 21:42:34,049) INFO - qlib.timer - [log.py:117] - Time cost: 131.730s | Loading data () Done
-[2730312:MainThread](2026-02-26 21:42:34,050) INFO - qlib.timer - [log.py:117] - Time cost: 131.731s | Init data () Done
-[2730312:MainThread](2026-02-26 21:42:34,051) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
-[2730312:MainThread](2026-02-26 21:42:34,895) INFO - qlib.timer - [log.py:117] - Time cost: 0.843s | fetch_df_by_index Done
-[2730312:MainThread](2026-02-26 21:42:34,907) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-13 00:00:00'), datetime.date(2026, 2, 26), None)
- IsZt IsDt IsN ... open_stop close_stop high_stop
-datetime instrument ...
-2019-12-13 SH600000 False False False ... False False False
- SH600004 False False False ... False False False
- SH600006 False False False ... False False False
- SH600007 False False False ... False False False
- SH600008 False False False ... False False False
-... ... ... ... ... ... ... ...
-2026-02-26 SZ301658 False False False ... False False False
- SZ301662 False False False ... False False False
- SZ301665 False False False ... False False False
- SZ301678 False False False ... False False False
- SZ302132 False False False ... False False False
-
-[6874830 rows x 12 columns]
-[2730312:MainThread](2026-02-26 21:42:34,908) INFO - qlib.timer - [log.py:117] - Time cost: 0.857s | Fetching dataframe Done
-[2730312:MainThread](2026-02-26 21:42:34,927) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
-
- use mytt;
- select code,m_nDate,gds_CC10,gds_CC11,gds_CC12,gds_CC20,gds_CC21,gds_CC22,gds_CC23,gds_CC24,gds_CC25,gds_CC26,gds_CC27,gds_CC28,gds_CC30,gds_CC31,gds_CC32,gds_CC33,gds_CC34,gds_CC35,gds_CC36,gds_CC37,gds_CC40,gds_CC41,gds_CC42,gds_CC43,gds_CC50,gds_CC60,gds_CC61,gds_CC62,gds_CC63,gds_CC70 from
- loadTable("dfs://daily_stock_run", "stg_1day_gds_indus_flag_cc1")
- where m_nDate>=2019.12.13 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','S
-[2730312:MainThread](2026-02-26 21:42:36,986) INFO - qlib.timer - [log.py:117] - Time cost: 2.069s | DDB query: Done
-[2730312:MainThread](2026-02-26 21:42:36,996) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
-[2730312:MainThread](2026-02-26 21:43:53,198) INFO - qlib.timer - [log.py:117] - Time cost: 76.199s | Instruments filter: Done
-[2730312:MainThread](2026-02-26 21:43:53,230) INFO - qlib.timer - [log.py:117] - Time cost: 78.318s | Loading data () Done
-[2730312:MainThread](2026-02-26 21:43:53,231) INFO - qlib.timer - [log.py:117] - Time cost: 78.319s | Init data () Done
-[2730312:MainThread](2026-02-26 21:43:53,231) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
-[2730312:MainThread](2026-02-26 21:43:53,239) INFO - qlib.timer - [log.py:117] - Time cost: 0.007s | fetch_df_by_index Done
-[2730312:MainThread](2026-02-26 21:43:53,257) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-13 00:00:00'), datetime.date(2026, 2, 26), None)
- gds_CC10 gds_CC11 ... gds_CC63 gds_CC70
-datetime instrument ...
-2026-02-09 SH600000 False False ... False False
- SH600004 False False ... False False
- SH600006 False False ... False False
- SH600007 False False ... False False
- SH600008 False False ... False False
-... ... ... ... ... ...
-2026-02-26 SZ301658 False False ... False False
- SZ301662 False False ... False False
- SZ301665 False False ... False False
- SZ301678 False False ... False False
- SZ302132 False False ... False False
-
-[41168 rows x 30 columns]
-[2730312:MainThread](2026-02-26 21:43:53,258) INFO - qlib.timer - [log.py:117] - Time cost: 0.027s | Fetching dataframe Done
-[2730312:MainThread](2026-02-26 21:43:53,274) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
-
- use mytt;
- select code,m_nDate,ST_Y,ST_S,ST_T,ST_L,ST_Z,ST_X from
- loadTable("dfs://daily_stock_run", "stg_1day_wind_st_flag")
- where m_nDate>=2019.12.13 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002
-[2730312:MainThread](2026-02-26 21:44:44,876) INFO - qlib.timer - [log.py:117] - Time cost: 51.611s | DDB query: Done
-[2730312:MainThread](2026-02-26 21:44:45,602) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
-[2730312:MainThread](2026-02-26 21:46:07,184) INFO - qlib.timer - [log.py:117] - Time cost: 81.056s | Instruments filter: Done
-[2730312:MainThread](2026-02-26 21:46:07,747) INFO - qlib.timer - [log.py:117] - Time cost: 134.487s | Loading data () Done
-[2730312:MainThread](2026-02-26 21:46:07,748) INFO - qlib.timer - [log.py:117] - Time cost: 134.488s | Init data () Done
-[2730312:MainThread](2026-02-26 21:46:07,748) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
-[2730312:MainThread](2026-02-26 21:46:08,349) INFO - qlib.timer - [log.py:117] - Time cost: 0.600s | fetch_df_by_index Done
-[2730312:MainThread](2026-02-26 21:46:08,358) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-13 00:00:00'), datetime.date(2026, 2, 26), None)
- ST_Y ST_S ST_T ST_L ST_Z ST_X
-datetime instrument
-2019-12-13 SH600000 False False False False False False
- SH600004 False False False False False False
- SH600006 False False False False False False
- SH600007 False False False False False False
- SH600008 False False False False False False
-... ... ... ... ... ... ...
-2026-02-26 SZ301658 False False False False False False
- SZ301662 False False False False False False
- SZ301665 False False False False False False
- SZ301678 False False False False False False
- SZ302132 False False False False False False
-
-[6874833 rows x 6 columns]
-[2730312:MainThread](2026-02-26 21:46:08,359) INFO - qlib.timer - [log.py:117] - Time cost: 0.610s | Fetching dataframe Done
-/home/guofu/.venv/alpha2/lib/python3.12/site-packages/qlib/contrib/utils/paral.py:22: FutureWarning: 'M' is deprecated and will be removed in a future version, please use 'ME' instead.
- group_list = [_df.resample("M", level="datetime")\
-Will use float32 for
-Will use float32 for
-Query config:
-#concepts: 2;
-Will use bool for
-Will use bool for
-Field list: ['gds_CC10', 'gds_CC11', 'gds_CC12', 'gds_CC20', 'gds_CC21', 'gds_CC22', 'gds_CC23', 'gds_CC24', 'gds_CC25', 'gds_CC26', 'gds_CC27', 'gds_CC28', 'gds_CC30', 'gds_CC31', 'gds_CC32', 'gds_CC33', 'gds_CC34', 'gds_CC35', 'gds_CC36', 'gds_CC37', 'gds_CC40', 'gds_CC41', 'gds_CC42', 'gds_CC43', 'gds_CC50', 'gds_CC60', 'gds_CC61', 'gds_CC62', 'gds_CC63', 'gds_CC70']
-Will use bool for
-Will use bool for
-[2730312:MainThread](2026-02-26 21:46:11,623) INFO - qlib.timer - [log.py:117] - Time cost: 3.264s | Concat index: Done
-[2730312:MainThread](2026-02-26 21:46:11,625) INFO - qlib.timer - [log.py:117] - Time cost: 0.001s | Sort index: Done
-[2730312:MainThread](2026-02-26 21:46:15,058) INFO - qlib.timer - [log.py:117] - Time cost: 3.433s | Creating SepDataFrame: Done
-[2730312:MainThread](2026-02-26 21:46:15,928) INFO - qlib.timer - [log.py:117] - Time cost: 1062.224s | Loading data () Done
-[2730312:MainThread](2026-02-26 21:46:15,929) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
-[]
-[2730312:MainThread](2026-02-26 21:46:15,931) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
-[2730312:MainThread](2026-02-26 21:46:15,935) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
-[]
-[2730312:MainThread](2026-02-26 21:46:15,936) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
-[2730312:MainThread](2026-02-26 21:46:15,939) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
-[]
-[2730312:MainThread](2026-02-26 21:46:15,940) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
-[2730312:MainThread](2026-02-26 21:46:15,943) INFO - qlib.timer - [log.py:117] - Time cost: 0.014s | fit & process data Done
-[2730312:MainThread](2026-02-26 21:46:15,943) INFO - qlib.timer - [log.py:117] - Time cost: 1062.239s | Init data () Done
-All processors are readonly
-All processors are readonly
-All processors are readonly
- ERROR: Failed to load data from Qlib pipeline: 'SepDataFrame' object has no attribute 'shape'
diff --git a/stock_1d/d033/alpha158_beta/scripts/run3.log b/stock_1d/d033/alpha158_beta/scripts/run3.log
deleted file mode 100644
index 0745991..0000000
--- a/stock_1d/d033/alpha158_beta/scripts/run3.log
+++ /dev/null
@@ -1,373 +0,0 @@
-[2734404:MainThread](2026-02-26 22:10:11,609) INFO - qlib.Initialization - [config.py:413] - default_conf: client.
-[2734404:MainThread](2026-02-26 22:10:11,613) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
-[2734404:MainThread](2026-02-26 22:10:11,613) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/qlib/default/data_ops/target')}
-================================================================================
-DUMP GOLD-STANDARD DATA FROM QLIB PIPELINE
-================================================================================
-Date Range: 2020-01-02 to 2020-01-10
-Output Directory: /home/guofu/Workspaces/alpha_lab/stock_1d/d033/alpha158_beta/data
-Qlib Dataset Path: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/
-
-Step 1: Loading data from Qlib pipeline...
- Loading since_date=2020-01-02
- Loading data with handler (load_start=2019-12-13)...
-Will use `placehorder_value` from module: qlib.contrib.data.config
-Will init handler object from config:
-{'data_handler_config': {'ddb_config': {'host': '192.168.1.146',
- 'password': '123456',
- 'port': 8848,
- 'username': 'admin'},
- 'end_time': datetime.date(2026, 2, 26),
- 'handler_list': [{'class': 'DDBAlpha158Handler',
- 'kwargs': {'col_set': 'feature',
- 'query_config': [{'alpha158_config': 'alpha158_expr.csv',
- 'db_path': 'dfs://daily_stock_run',
- 'dtype': 'float32',
- 'field_list': 'alpha158',
- 'table_name': 'stg_1day_wind_alpha158_0_7'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
- {'class': 'DDBMarketExtHandler',
- 'kwargs': {'col_set': 'feature_ext',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'float32',
- 'field_list': ['Turnover '
- 'as '
- 'turnover',
- 'FreeTurnover '
- 'as '
- 'free_turnover',
- 'log(MarketValue) '
- 'as '
- 'log_size'],
- 'table_name': 'stg_1day_wind_kline_adjusted'},
- {'db_path': 'dfs://daily_stock_run',
- 'dtype': 'float32',
- 'field_list': ['con_rating_strength'],
- 'table_name': 'stg_1day_gds_con_rating'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
- {'class': 'DDBMarketFlagHandler',
- 'kwargs': {'col_set': 'feature_flag',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': ['IsZt',
- 'IsDt',
- 'IsN',
- 'IsXD',
- 'IsXR',
- 'IsDR'],
- 'table_name': 'stg_1day_wind_kline_adjusted'},
- {'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': ['open_limit',
- 'close_limit',
- 'low_limit',
- 'open_stop',
- 'close_stop',
- 'high_stop'],
- 'table_name': 'stg_1day_wind_market_flag'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
- {'class': 'DDBIndusFlagHandler',
- 'kwargs': {'col_set': 'indus_flag',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': 'industry_code_cc.csv',
- 'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
- {'class': 'DDBStFlagHandler',
- 'kwargs': {'col_set': 'st_flag',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': ['ST_Y',
- 'ST_S',
- 'ST_T',
- 'ST_L',
- 'ST_Z',
- 'ST_X'],
- 'table_name': 'stg_1day_wind_st_flag'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
- 'instruments': 'csiallx',
- 'start_time': Timestamp('2019-11-23 00:00:00')},
- 'ddb_config': {'host': '192.168.1.146',
- 'password': '123456',
- 'port': 8848,
- 'username': 'admin'},
- 'handler': {'class': 'AggHandler',
- 'kwargs': {'ddb_config': {'host': '192.168.1.146',
- 'password': '123456',
- 'port': 8848,
- 'username': 'admin'},
- [2734404:MainThread](2026-02-26 22:10:11,634) INFO - qlib.AggHandler - [agg_handler.py:79] - Will use AggHandler
-[2734404:MainThread](2026-02-26 22:10:11,634) WARNING - qlib.DataLoaderDH - [agg_handler.py:191] - instruments[csiallx] is ignored
-[2734404:MainThread](2026-02-26 22:10:11,842) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
-
- use mytt;
- select code,m_nDate,KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,HIGH0,LOW0,VWAP0,ROC5,ROC10,ROC20,ROC30,ROC60,MA5,MA10,MA20,MA30,MA60,STD5,STD10,STD20,STD30,STD60,BETA5,BETA10,BETA20,BETA30,BETA60,RSQR5,RSQR10,RSQR20,RSQR30,RSQR60,RESI5,RESI10,RESI20,RESI30,RESI60,MAX5,MAX10,MAX20,MAX30,MAX60,MIN5,MIN10,MIN20,MIN30,MIN60,QTLU5,QTLU10,QTLU20,QTLU30,QTLU60,QTLD5,QTLD10,QTLD20,QTLD30,QTLD60,RANK5,RANK10,RANK20,RANK30,RANK60,RSV5,RSV10,RSV20,RSV30,RSV60,IMAX5,IMAX
-[2734404:MainThread](2026-02-26 22:17:41,432) INFO - qlib.timer - [log.py:117] - Time cost: 449.788s | DDB query: Done
-[2734404:MainThread](2026-02-26 22:17:42,271) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
-'end_time': datetime.date(2026, 2, 26),
- 'handler_list': [{'class': 'DDBAlpha158Handler',
- 'kwargs': {'col_set': 'feature',
- 'query_config': [{'alpha158_config': 'alpha158_expr.csv',
- 'db_path': 'dfs://daily_stock_run',
- 'dtype': 'float32',
- 'field_list': 'alpha158',
- 'table_name': 'stg_1day_wind_alpha158_0_7'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
- {'class': 'DDBMarketExtHandler',
- 'kwargs': {'col_set': 'feature_ext',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'float32',
- 'field_list': ['Turnover '
- 'as '
- 'turnover',
- 'FreeTurnover '
- 'as '
- 'free_turnover',
- 'log(MarketValue) '
- 'as '
- 'log_size'],
- 'table_name': 'stg_1day_wind_kline_adjusted'},
- {'db_path': 'dfs://daily_stock_run',
- 'dtype': 'float32',
- 'field_list': ['con_rating_strength'],
- 'table_name': 'stg_1day_gds_con_rating'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
- {'class': 'DDBMarketFlagHandler',
- 'kwargs': {'col_set': 'feature_flag',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': ['IsZt',
- 'IsDt',
- 'IsN',
- 'IsXD',
- 'IsXR',
- 'IsDR'],
- 'table_name': 'stg_1day_wind_kline_adjusted'},
- {'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': ['open_limit',
- 'close_limit',
- 'low_limit',
- 'open_stop',
- 'close_stop',
- 'high_stop'],
- 'table_name': 'stg_1day_wind_market_flag'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
- {'class': 'DDBIndusFlagHandler',
- 'kwargs': {'col_set': 'indus_flag',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': 'industry_code_cc.csv',
- 'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
- {'class': 'DDBStFlagHandler',
- 'kwargs': {'col_set': 'st_flag',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': ['ST_Y',
- 'ST_S',
- 'ST_T',
- 'ST_L',
- 'ST_Z',
- 'ST_X'],
- 'table_name': 'stg_1day_wind_st_flag'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
- 'instruments': 'csiallx',
- 'start_time': Timestamp('2019-11-23 00:00:00')},
- 'module_path': 'qlib.contrib.data.agg_handler'},
- 'load_end': datetime.date(2026, 2, 26),
- 'load_start': Timestamp('2019-11-23 00:00:00'),
- 'market': 'csiallx',
- 'qlib_init': {'provider_uri': '/home/guofu/.qlib/data_ops/target',
- 'region': 'cn'}}
-Query config:
-#alpha158: 1;
-Will use float32 for
-[2734404:MainThread](2026-02-26 22:19:46,550) INFO - qlib.timer - [log.py:117] - Time cost: 115.118s | Instruments filter: Done
-[2734404:MainThread](2026-02-26 22:19:53,556) INFO - qlib.timer - [log.py:117] - Time cost: 581.918s | Loading data () Done
-[2734404:MainThread](2026-02-26 22:19:53,557) INFO - qlib.timer - [log.py:117] - Time cost: 581.920s | Init data () Done
-[2734404:MainThread](2026-02-26 22:19:53,560) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
-[2734404:MainThread](2026-02-26 22:19:57,060) INFO - qlib.timer - [log.py:117] - Time cost: 3.499s | fetch_df_by_index Done
-[2734404:MainThread](2026-02-26 22:19:58,834) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
- KMID KLEN ... VSUMD30 VSUMD60
-datetime instrument ...
-2019-11-25 SH600000 0.003325 0.011638 ... -0.238055 -0.010125
- SH600004 -0.013806 0.030012 ... -0.017610 0.039195
- SH600006 0.009238 0.016166 ... -0.034782 -0.014306
- SH600007 -0.014749 0.018879 ... -0.032427 0.034279
- SH600008 0.009259 0.024691 ... -0.063490 0.003978
-... ... ... ... ... ...
-2026-02-26 SZ301658 -0.017231 0.025231 ... -0.018706 0.003708
- SZ301662 0.060584 0.087834 ... -0.014658 -0.014613
- SZ301665 -0.012899 0.040541 ... 0.083229 0.055994
- SZ301678 0.018182 0.027879 ... -0.054124 0.014202
- SZ302132 0.001754 0.016416 ... -0.049558 -0.038667
-
-[6908346 rows x 158 columns]
-[2734404:MainThread](2026-02-26 22:19:58,835) INFO - qlib.timer - [log.py:117] - Time cost: 5.276s | Fetching dataframe Done
-[2734404:MainThread](2026-02-26 22:19:59,042) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
-
- use mytt;
- select code,m_nDate,Turnover as turnover,FreeTurnover as free_turnover,log(MarketValue) as log_size from
- loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
- where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ3
-[2734404:MainThread](2026-02-26 22:20:16,326) INFO - qlib.timer - [log.py:117] - Time cost: 17.485s | DDB query: Done
-[2734404:MainThread](2026-02-26 22:20:17,102) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
-[2734404:MainThread](2026-02-26 22:20:17,676) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
-
- use mytt;
- select code,m_nDate,con_rating_strength from
- loadTable("dfs://daily_stock_run", "stg_1day_gds_con_rating")
- where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002833','SH
-[2734404:MainThread](2026-02-26 22:20:29,343) INFO - qlib.timer - [log.py:117] - Time cost: 11.676s | DDB query: Done
-[2734404:MainThread](2026-02-26 22:20:30,245) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-25 00:00:00
-[2734404:MainThread](2026-02-26 22:21:55,033) INFO - qlib.timer - [log.py:117] - Time cost: 81.592s | Instruments filter: Done
-[2734404:MainThread](2026-02-26 22:21:55,586) INFO - qlib.timer - [log.py:117] - Time cost: 116.751s | Loading data () Done
-[2734404:MainThread](2026-02-26 22:21:55,587) INFO - qlib.timer - [log.py:117] - Time cost: 116.752s | Init data () Done
-[2734404:MainThread](2026-02-26 22:21:55,588) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
-[2734404:MainThread](2026-02-26 22:21:56,302) INFO - qlib.timer - [log.py:117] - Time cost: 0.713s | fetch_df_by_index Done
-[2734404:MainThread](2026-02-26 22:21:56,309) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
- turnover free_turnover log_size con_rating_strength
-datetime instrument
-2019-11-25 SH600000 0.0895 0.1639 17.339552 0.8214
- SH600004 0.9386 1.9173 15.039255 0.8125
- SH600006 0.2566 0.6479 13.680836 1.0000
- SH600007 0.1647 0.8513 14.335590 0.7500
- SH600008 0.1813 0.3362 14.435625 0.6875
-... ... ... ... ...
-2026-02-26 SZ301658 6.0785 6.0785 11.788368 NaN
- SZ301662 12.5950 12.5950 12.681215 NaN
- SZ301665 14.0077 14.0077 11.719415 NaN
- SZ301678 6.6518 6.6518 12.799973 NaN
- SZ302132 1.3868 3.0296 15.359885 NaN
-
-[7623242 rows x 4 columns]
-[2734404:MainThread](2026-02-26 22:21:56,310) INFO - qlib.timer - [log.py:117] - Time cost: 0.722s | Fetching dataframe Done
-[2734404:MainThread](2026-02-26 22:21:56,327) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
-
- use mytt;
- select code,m_nDate,IsZt,IsDt,IsN,IsXD,IsXR,IsDR from
- loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
- where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657',
-[2734404:MainThread](2026-02-26 22:22:17,215) INFO - qlib.timer - [log.py:117] - Time cost: 20.899s | DDB query: Done
-[2734404:MainThread](2026-02-26 22:22:17,952) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
-[2734404:MainThread](2026-02-26 22:22:18,463) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
-
- use mytt;
- select code,m_nDate,open_limit,close_limit,low_limit,open_stop,close_stop,high_stop from
- loadTable("dfs://daily_stock_run", "stg_1day_wind_market_flag")
- where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','
-[2734404:MainThread](2026-02-26 22:22:38,963) INFO - qlib.timer - [log.py:117] - Time cost: 20.509s | DDB query: Done
-[2734404:MainThread](2026-02-26 22:22:39,774) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
-[2734404:MainThread](2026-02-26 22:24:07,744) INFO - qlib.timer - [log.py:117] - Time cost: 84.654s | Instruments filter: Done
-[2734404:MainThread](2026-02-26 22:24:08,702) INFO - qlib.timer - [log.py:117] - Time cost: 132.391s | Loading data () Done
-[2734404:MainThread](2026-02-26 22:24:08,703) INFO - qlib.timer - [log.py:117] - Time cost: 132.392s | Init data () Done
-[2734404:MainThread](2026-02-26 22:24:08,704) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
-[2734404:MainThread](2026-02-26 22:24:09,549) INFO - qlib.timer - [log.py:117] - Time cost: 0.844s | fetch_df_by_index Done
-[2734404:MainThread](2026-02-26 22:24:09,561) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
- IsZt IsDt IsN ... open_stop close_stop high_stop
-datetime instrument ...
-2019-11-25 SH600000 False False False ... False False False
- SH600004 False False False ... False False False
- SH600006 False False False ... False False False
- SH600007 False False False ... False False False
- SH600008 False False False ... False False False
-... ... ... ... ... ... ... ...
-2026-02-26 SZ301658 False False False ... False False False
- SZ301662 False False False ... False False False
- SZ301665 False False False ... False False False
- SZ301678 False False False ... False False False
- SZ302132 False False False ... False False False
-
-[6925320 rows x 12 columns]
-[2734404:MainThread](2026-02-26 22:24:09,562) INFO - qlib.timer - [log.py:117] - Time cost: 0.858s | Fetching dataframe Done
-[2734404:MainThread](2026-02-26 22:24:09,760) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
-
- use mytt;
- select code,m_nDate,gds_CC10,gds_CC11,gds_CC12,gds_CC20,gds_CC21,gds_CC22,gds_CC23,gds_CC24,gds_CC25,gds_CC26,gds_CC27,gds_CC28,gds_CC30,gds_CC31,gds_CC32,gds_CC33,gds_CC34,gds_CC35,gds_CC36,gds_CC37,gds_CC40,gds_CC41,gds_CC42,gds_CC43,gds_CC50,gds_CC60,gds_CC61,gds_CC62,gds_CC63,gds_CC70 from
- loadTable("dfs://daily_stock_run", "stg_1day_gds_indus_flag_cc1")
- where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','S
-[2734404:MainThread](2026-02-26 22:24:11,809) INFO - qlib.timer - [log.py:117] - Time cost: 2.238s | DDB query: Done
-[2734404:MainThread](2026-02-26 22:24:11,822) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
-[2734404:MainThread](2026-02-26 22:25:28,259) INFO - qlib.timer - [log.py:117] - Time cost: 76.433s | Instruments filter: Done
-[2734404:MainThread](2026-02-26 22:25:28,285) INFO - qlib.timer - [log.py:117] - Time cost: 78.720s | Loading data () Done
-[2734404:MainThread](2026-02-26 22:25:28,285) INFO - qlib.timer - [log.py:117] - Time cost: 78.720s | Init data () Done
-[2734404:MainThread](2026-02-26 22:25:28,286) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
-[2734404:MainThread](2026-02-26 22:25:28,290) INFO - qlib.timer - [log.py:117] - Time cost: 0.003s | fetch_df_by_index Done
-[2734404:MainThread](2026-02-26 22:25:28,310) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
- gds_CC10 gds_CC11 ... gds_CC63 gds_CC70
-datetime instrument ...
-2026-02-09 SH600000 False False ... False False
- SH600004 False False ... False False
- SH600006 False False ... False False
- SH600007 False False ... False False
- SH600008 False False ... False False
-... ... ... ... ... ...
-2026-02-26 SZ301658 False False ... False False
- SZ301662 False False ... False False
- SZ301665 False False ... False False
- SZ301678 False False ... False False
- SZ302132 False False ... False False
-
-[41168 rows x 30 columns]
-[2734404:MainThread](2026-02-26 22:25:28,311) INFO - qlib.timer - [log.py:117] - Time cost: 0.025s | Fetching dataframe Done
-[2734404:MainThread](2026-02-26 22:25:28,470) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
-
- use mytt;
- select code,m_nDate,ST_Y,ST_S,ST_T,ST_L,ST_Z,ST_X from
- loadTable("dfs://daily_stock_run", "stg_1day_wind_st_flag")
- where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002
-[2734404:MainThread](2026-02-26 22:25:58,108) INFO - qlib.timer - [log.py:117] - Time cost: 29.791s | DDB query: Done
-[2734404:MainThread](2026-02-26 22:25:58,818) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
-[2734404:MainThread](2026-02-26 22:27:21,291) INFO - qlib.timer - [log.py:117] - Time cost: 81.957s | Instruments filter: Done
-[2734404:MainThread](2026-02-26 22:27:21,828) INFO - qlib.timer - [log.py:117] - Time cost: 113.516s | Loading data () Done
-[2734404:MainThread](2026-02-26 22:27:21,829) INFO - qlib.timer - [log.py:117] - Time cost: 113.517s | Init data () Done
-[2734404:MainThread](2026-02-26 22:27:21,830) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
-[2734404:MainThread](2026-02-26 22:27:22,439) INFO - qlib.timer - [log.py:117] - Time cost: 0.608s | fetch_df_by_index Done
-[2734404:MainThread](2026-02-26 22:27:22,448) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
- ST_Y ST_S ST_T ST_L ST_Z ST_X
-datetime instrument
-2019-11-25 SH600000 False False False False False False
- SH600004 False False False False False False
- SH600006 False False False False False False
- SH600007 False False False False False False
- SH600008 False False False False False False
-... ... ... ... ... ... ...
-2026-02-26 SZ301658 False False False False False False
- SZ301662 False False False False False False
- SZ301665 False False False False False False
- SZ301678 False False False False False False
- SZ302132 False False False False False False
-
-[6925323 rows x 6 columns]
-[2734404:MainThread](2026-02-26 22:27:22,448) INFO - qlib.timer - [log.py:117] - Time cost: 0.618s | Fetching dataframe Done
-/home/guofu/.venv/alpha2/lib/python3.12/site-packages/qlib/contrib/utils/paral.py:22: FutureWarning: 'M' is deprecated and will be removed in a future version, please use 'ME' instead.
- group_list = [_df.resample("M", level="datetime")\
-Will use float32 for
-Will use float32 for
-Query config:
-#concepts: 2;
-Will use bool for
-Will use bool for
-Field list: ['gds_CC10', 'gds_CC11', 'gds_CC12', 'gds_CC20', 'gds_CC21', 'gds_CC22', 'gds_CC23', 'gds_CC24', 'gds_CC25', 'gds_CC26', 'gds_CC27', 'gds_CC28', 'gds_CC30', 'gds_CC31', 'gds_CC32', 'gds_CC33', 'gds_CC34', 'gds_CC35', 'gds_CC36', 'gds_CC37', 'gds_CC40', 'gds_CC41', 'gds_CC42', 'gds_CC43', 'gds_CC50', 'gds_CC60', 'gds_CC61', 'gds_CC62', 'gds_CC63', 'gds_CC70']
-Will use bool for
-Will use bool for
-[2734404:MainThread](2026-02-26 22:27:25,764) INFO - qlib.timer - [log.py:117] - Time cost: 3.315s | Concat index: Done
-[2734404:MainThread](2026-02-26 22:27:25,766) INFO - qlib.timer - [log.py:117] - Time cost: 0.001s | Sort index: Done
-[2734404:MainThread](2026-02-26 22:27:29,485) INFO - qlib.timer - [log.py:117] - Time cost: 3.718s | Creating SepDataFrame: Done
-[2734404:MainThread](2026-02-26 22:27:30,310) INFO - qlib.timer - [log.py:117] - Time cost: 1038.675s | Loading data () Done
-[2734404:MainThread](2026-02-26 22:27:30,311) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
-[]
-[2734404:MainThread](2026-02-26 22:27:30,313) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
-[2734404:MainThread](2026-02-26 22:27:30,318) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
-[]
-[2734404:MainThread](2026-02-26 22:27:30,319) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
-[2734404:MainThread](2026-02-26 22:27:30,322) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
-[]
-[2734404:MainThread](2026-02-26 22:27:30,323) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
-[2734404:MainThread](2026-02-26 22:27:30,326) INFO - qlib.timer - [log.py:117] - Time cost: 0.015s | fit & process data Done
-[2734404:MainThread](2026-02-26 22:27:30,327) INFO - qlib.timer - [log.py:117] - Time cost: 1038.692s | Init data () Done
-All processors are readonly
-All processors are readonly
-All processors are readonly
- ERROR: Failed to load data from Qlib pipeline: 'SepDataFrame' object has no attribute 'shape'
diff --git a/stock_1d/d033/alpha158_beta/scripts/run4.log b/stock_1d/d033/alpha158_beta/scripts/run4.log
deleted file mode 100644
index 42eef2a..0000000
--- a/stock_1d/d033/alpha158_beta/scripts/run4.log
+++ /dev/null
@@ -1,321 +0,0 @@
-[2739486:MainThread](2026-02-26 22:59:30,849) INFO - qlib.Initialization - [config.py:413] - default_conf: client.
-[2739486:MainThread](2026-02-26 22:59:30,854) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
-[2739486:MainThread](2026-02-26 22:59:30,855) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/qlib/default/data_ops/target')}
-================================================================================
-DUMP GOLD-STANDARD DATA FROM QLIB PIPELINE
-================================================================================
-Date Range: 2020-01-02 to 2020-01-10
-Output Directory: /home/guofu/Workspaces/alpha_lab/stock_1d/d033/alpha158_beta/data
-Qlib Dataset Path: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/
-
-Step 1: Loading data from Qlib pipeline...
- Loading since_date=2020-01-02
- Loading data with handler (load_start=2019-12-13)...
-Will use `placehorder_value` from module: qlib.contrib.data.config
-Will init handler object from config:
-{'data_handler_config': {'ddb_config': {'host': '192.168.1.146',
- 'password': '123456',
- 'port': 8848,
- 'username': 'admin'},
- 'end_time': datetime.date(2026, 2, 26),
- 'handler_list': [{'class': 'DDBAlpha158Handler',
- 'kwargs': {'col_set': 'feature',
- 'query_config': [{'alpha158_config': 'alpha158_expr.csv',
- 'db_path': 'dfs://daily_stock_run',
- 'dtype': 'float32',
- 'field_list': 'alpha158',
- 'table_name': 'stg_1day_wind_alpha158_0_7'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
- {'class': 'DDBMarketExtHandler',
- 'kwargs': {'col_set': 'feature_ext',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'float32',
- 'field_list': ['Turnover '
- 'as '
- 'turnover',
- 'FreeTurnover '
- 'as '
- 'free_turnover',
- 'log(MarketValue) '
- 'as '
- 'log_size'],
- 'table_name': 'stg_1day_wind_kline_adjusted'},
- {'db_path': 'dfs://daily_stock_run',
- 'dtype': 'float32',
- 'field_list': ['con_rating_strength'],
- 'table_name': 'stg_1day_gds_con_rating'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
- {'class': 'DDBMarketFlagHandler',
- 'kwargs': {'col_set': 'feature_flag',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': ['IsZt',
- 'IsDt',
- 'IsN',
- 'IsXD',
- 'IsXR',
- 'IsDR'],
- 'table_name': 'stg_1day_wind_kline_adjusted'},
- {'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': ['open_limit',
- 'close_limit',
- 'low_limit',
- 'open_stop',
- 'close_stop',
- 'high_stop'],
- 'table_name': 'stg_1day_wind_market_flag'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
- {'class': 'DDBIndusFlagHandler',
- 'kwargs': {'col_set': 'indus_flag',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': 'industry_code_cc.csv',
- 'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
- {'class': 'DDBStFlagHandler',
- 'kwargs': {'col_set': 'st_flag',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': ['ST_Y',
- 'ST_S',
- 'ST_T',
- 'ST_L',
- 'ST_Z',
- 'ST_X'],
- 'table_name': 'stg_1day_wind_st_flag'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
- 'instruments': 'csiallx',
- 'start_time': Timestamp('2019-11-23 00:00:00')},
- 'ddb_config': {'host': '192.168.1.146',
- 'password': '123456',
- 'port': 8848,
- 'username': 'admin'},
- 'handler': {'class': 'AggHandler',
- 'kwargs': {'ddb_config': {'host': '192.168.1.146',
- 'password': '123456',
- 'port': 8848,
- 'username': 'admin'},
- [2739486:MainThread](2026-02-26 22:59:30,878) INFO - qlib.AggHandler - [agg_handler.py:79] - Will use AggHandler
-[2739486:MainThread](2026-02-26 22:59:30,878) WARNING - qlib.DataLoaderDH - [agg_handler.py:191] - instruments[csiallx] is ignored
-[2739486:MainThread](2026-02-26 22:59:30,938) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
-
- use mytt;
- select code,m_nDate,KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,HIGH0,LOW0,VWAP0,ROC5,ROC10,ROC20,ROC30,ROC60,MA5,MA10,MA20,MA30,MA60,STD5,STD10,STD20,STD30,STD60,BETA5,BETA10,BETA20,BETA30,BETA60,RSQR5,RSQR10,RSQR20,RSQR30,RSQR60,RESI5,RESI10,RESI20,RESI30,RESI60,MAX5,MAX10,MAX20,MAX30,MAX60,MIN5,MIN10,MIN20,MIN30,MIN60,QTLU5,QTLU10,QTLU20,QTLU30,QTLU60,QTLD5,QTLD10,QTLD20,QTLD30,QTLD60,RANK5,RANK10,RANK20,RANK30,RANK60,RSV5,RSV10,RSV20,RSV30,RSV60,IMAX5,IMAX
-[2739486:MainThread](2026-02-26 23:07:16,353) INFO - qlib.timer - [log.py:117] - Time cost: 465.464s | DDB query: Done
-[2739486:MainThread](2026-02-26 23:07:17,149) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
-'end_time': datetime.date(2026, 2, 26),
- 'handler_list': [{'class': 'DDBAlpha158Handler',
- 'kwargs': {'col_set': 'feature',
- 'query_config': [{'alpha158_config': 'alpha158_expr.csv',
- 'db_path': 'dfs://daily_stock_run',
- 'dtype': 'float32',
- 'field_list': 'alpha158',
- 'table_name': 'stg_1day_wind_alpha158_0_7'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
- {'class': 'DDBMarketExtHandler',
- 'kwargs': {'col_set': 'feature_ext',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'float32',
- 'field_list': ['Turnover '
- 'as '
- 'turnover',
- 'FreeTurnover '
- 'as '
- 'free_turnover',
- 'log(MarketValue) '
- 'as '
- 'log_size'],
- 'table_name': 'stg_1day_wind_kline_adjusted'},
- {'db_path': 'dfs://daily_stock_run',
- 'dtype': 'float32',
- 'field_list': ['con_rating_strength'],
- 'table_name': 'stg_1day_gds_con_rating'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
- {'class': 'DDBMarketFlagHandler',
- 'kwargs': {'col_set': 'feature_flag',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': ['IsZt',
- 'IsDt',
- 'IsN',
- 'IsXD',
- 'IsXR',
- 'IsDR'],
- 'table_name': 'stg_1day_wind_kline_adjusted'},
- {'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': ['open_limit',
- 'close_limit',
- 'low_limit',
- 'open_stop',
- 'close_stop',
- 'high_stop'],
- 'table_name': 'stg_1day_wind_market_flag'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
- {'class': 'DDBIndusFlagHandler',
- 'kwargs': {'col_set': 'indus_flag',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': 'industry_code_cc.csv',
- 'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
- {'class': 'DDBStFlagHandler',
- 'kwargs': {'col_set': 'st_flag',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': ['ST_Y',
- 'ST_S',
- 'ST_T',
- 'ST_L',
- 'ST_Z',
- 'ST_X'],
- 'table_name': 'stg_1day_wind_st_flag'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
- 'instruments': 'csiallx',
- 'start_time': Timestamp('2019-11-23 00:00:00')},
- 'module_path': 'qlib.contrib.data.agg_handler'},
- 'load_end': datetime.date(2026, 2, 26),
- 'load_start': Timestamp('2019-11-23 00:00:00'),
- 'market': 'csiallx',
- 'qlib_init': {'provider_uri': '/home/guofu/.qlib/data_ops/target',
- 'region': 'cn'}}
-Query config:
-#alpha158: 1;
-Will use float32 for
-[2739486:MainThread](2026-02-26 23:09:19,001) INFO - qlib.timer - [log.py:117] - Time cost: 112.707s | Instruments filter: Done
-[2739486:MainThread](2026-02-26 23:09:26,016) INFO - qlib.timer - [log.py:117] - Time cost: 595.133s | Loading data () Done
-[2739486:MainThread](2026-02-26 23:09:26,017) INFO - qlib.timer - [log.py:117] - Time cost: 595.135s | Init data () Done
-[2739486:MainThread](2026-02-26 23:09:26,019) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
-[2739486:MainThread](2026-02-26 23:09:29,432) INFO - qlib.timer - [log.py:117] - Time cost: 3.412s | fetch_df_by_index Done
-[2739486:MainThread](2026-02-26 23:09:31,228) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
- KMID KLEN ... VSUMD30 VSUMD60
-datetime instrument ...
-2019-11-25 SH600000 0.003325 0.011638 ... -0.238055 -0.010125
- SH600004 -0.013806 0.030012 ... -0.017610 0.039195
- SH600006 0.009238 0.016166 ... -0.034782 -0.014306
- SH600007 -0.014749 0.018879 ... -0.032427 0.034279
- SH600008 0.009259 0.024691 ... -0.063490 0.003978
-... ... ... ... ... ...
-2026-02-26 SZ301658 -0.017231 0.025231 ... -0.018706 0.003708
- SZ301662 0.060584 0.087834 ... -0.014658 -0.014613
- SZ301665 -0.012899 0.040541 ... 0.083229 0.055994
- SZ301678 0.018182 0.027879 ... -0.054124 0.014202
- SZ302132 0.001754 0.016416 ... -0.049558 -0.038667
-
-[6908346 rows x 158 columns]
-[2739486:MainThread](2026-02-26 23:09:31,229) INFO - qlib.timer - [log.py:117] - Time cost: 5.211s | Fetching dataframe Done
-[2739486:MainThread](2026-02-26 23:09:31,242) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
-
- use mytt;
- select code,m_nDate,Turnover as turnover,FreeTurnover as free_turnover,log(MarketValue) as log_size from
- loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
- where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ3
-[2739486:MainThread](2026-02-26 23:09:54,142) INFO - qlib.timer - [log.py:117] - Time cost: 22.909s | DDB query: Done
-[2739486:MainThread](2026-02-26 23:09:54,927) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
-[2739486:MainThread](2026-02-26 23:09:55,507) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
-
- use mytt;
- select code,m_nDate,con_rating_strength from
- loadTable("dfs://daily_stock_run", "stg_1day_gds_con_rating")
- where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002833','SH
-[2739486:MainThread](2026-02-26 23:10:10,691) INFO - qlib.timer - [log.py:117] - Time cost: 15.192s | DDB query: Done
-[2739486:MainThread](2026-02-26 23:10:11,588) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
-[2739486:MainThread](2026-02-26 23:11:37,528) INFO - qlib.timer - [log.py:117] - Time cost: 82.525s | Instruments filter: Done
-[2739486:MainThread](2026-02-26 23:11:38,259) INFO - qlib.timer - [log.py:117] - Time cost: 127.029s | Loading data () Done
-[2739486:MainThread](2026-02-26 23:11:38,260) INFO - qlib.timer - [log.py:117] - Time cost: 127.030s | Init data () Done
-[2739486:MainThread](2026-02-26 23:11:38,261) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
-[2739486:MainThread](2026-02-26 23:11:39,000) INFO - qlib.timer - [log.py:117] - Time cost: 0.738s | fetch_df_by_index Done
-[2739486:MainThread](2026-02-26 23:11:39,009) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
- turnover free_turnover log_size con_rating_strength
-datetime instrument
-2019-11-25 SH600000 0.0895 0.1639 17.339552 0.8214
- SH600004 0.9386 1.9173 15.039255 0.8125
- SH600006 0.2566 0.6479 13.680836 1.0000
- SH600007 0.1647 0.8513 14.335590 0.7500
- SH600008 0.1813 0.3362 14.435625 0.6875
-... ... ... ... ...
-2026-02-26 SZ301658 6.0785 6.0785 11.788368 NaN
- SZ301662 12.5950 12.5950 12.681215 1.0000
- SZ301665 14.0077 14.0077 11.719415 1.0000
- SZ301678 6.6518 6.6518 12.799973 0.7500
- SZ302132 1.3868 3.0296 15.359885 0.8750
-
-[7623255 rows x 4 columns]
-[2739486:MainThread](2026-02-26 23:11:39,010) INFO - qlib.timer - [log.py:117] - Time cost: 0.749s | Fetching dataframe Done
-[2739486:MainThread](2026-02-26 23:11:39,191) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
-
- use mytt;
- select code,m_nDate,IsZt,IsDt,IsN,IsXD,IsXR,IsDR from
- loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
- where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657',
-[2739486:MainThread](2026-02-26 23:12:05,839) INFO - qlib.timer - [log.py:117] - Time cost: 26.825s | DDB query: Done
-[2739486:MainThread](2026-02-26 23:12:06,554) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
-[2739486:MainThread](2026-02-26 23:12:07,075) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
-
- use mytt;
- select code,m_nDate,open_limit,close_limit,low_limit,open_stop,close_stop,high_stop from
- loadTable("dfs://daily_stock_run", "stg_1day_wind_market_flag")
- where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','
-[2739486:MainThread](2026-02-26 23:12:32,695) INFO - qlib.timer - [log.py:117] - Time cost: 25.629s | DDB query: Done
-[2739486:MainThread](2026-02-26 23:12:33,566) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
-[2739486:MainThread](2026-02-26 23:14:02,232) INFO - qlib.timer - [log.py:117] - Time cost: 85.158s | Instruments filter: Done
-[2739486:MainThread](2026-02-26 23:14:03,155) INFO - qlib.timer - [log.py:117] - Time cost: 144.143s | Loading data () Done
-[2739486:MainThread](2026-02-26 23:14:03,156) INFO - qlib.timer - [log.py:117] - Time cost: 144.144s | Init data () Done
-[2739486:MainThread](2026-02-26 23:14:03,156) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
-[2739486:MainThread](2026-02-26 23:14:04,046) INFO - qlib.timer - [log.py:117] - Time cost: 0.889s | fetch_df_by_index Done
-[2739486:MainThread](2026-02-26 23:14:04,060) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
- IsZt IsDt IsN ... open_stop close_stop high_stop
-datetime instrument ...
-2019-11-25 SH600000 False False False ... False False False
- SH600004 False False False ... False False False
- SH600006 False False False ... False False False
- SH600007 False False False ... False False False
- SH600008 False False False ... False False False
-... ... ... ... ... ... ... ...
-2026-02-26 SZ301658 False False False ... False False False
- SZ301662 False False False ... False False False
- SZ301665 False False False ... False False False
- SZ301678 False False False ... False False False
- SZ302132 False False False ... False False False
-
-[6925320 rows x 12 columns]
-[2739486:MainThread](2026-02-26 23:14:04,061) INFO - qlib.timer - [log.py:117] - Time cost: 0.904s | Fetching dataframe Done
-[2739486:MainThread](2026-02-26 23:14:04,079) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
-
- use mytt;
- select code,m_nDate,gds_CC10,gds_CC11,gds_CC12,gds_CC20,gds_CC21,gds_CC22,gds_CC23,gds_CC24,gds_CC25,gds_CC26,gds_CC27,gds_CC28,gds_CC30,gds_CC31,gds_CC32,gds_CC33,gds_CC34,gds_CC35,gds_CC36,gds_CC37,gds_CC40,gds_CC41,gds_CC42,gds_CC43,gds_CC50,gds_CC60,gds_CC61,gds_CC62,gds_CC63,gds_CC70 from
- loadTable("dfs://daily_stock_run", "stg_1day_gds_indus_flag_cc1")
- where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','S
-[2739486:MainThread](2026-02-26 23:14:06,440) INFO - qlib.timer - [log.py:117] - Time cost: 2.370s | DDB query: Done
-[2739486:MainThread](2026-02-26 23:14:06,448) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
-[2739486:MainThread](2026-02-26 23:15:23,146) INFO - qlib.timer - [log.py:117] - Time cost: 76.695s | Instruments filter: Done
-[2739486:MainThread](2026-02-26 23:15:23,184) INFO - qlib.timer - [log.py:117] - Time cost: 79.120s | Loading data () Done
-[2739486:MainThread](2026-02-26 23:15:23,185) INFO - qlib.timer - [log.py:117] - Time cost: 79.121s | Init data () Done
-[2739486:MainThread](2026-02-26 23:15:23,186) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
-[2739486:MainThread](2026-02-26 23:15:23,190) INFO - qlib.timer - [log.py:117] - Time cost: 0.003s | fetch_df_by_index Done
-[2739486:MainThread](2026-02-26 23:15:23,210) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
- gds_CC10 gds_CC11 ... gds_CC63 gds_CC70
-datetime instrument ...
-2026-02-09 SH600000 False False ... False False
- SH600004 False False ... False False
- SH600006 False False ... False False
- SH600007 False False ... False False
- SH600008 False False ... False False
-... ... ... ... ... ...
-2026-02-26 SZ301658 False False ... False False
- SZ301662 False False ... False False
- SZ301665 False False ... False False
- SZ301678 False False ... False False
- SZ302132 False False ... False False
-
-[41168 rows x 30 columns]
-[2739486:MainThread](2026-02-26 23:15:23,210) INFO - qlib.timer - [log.py:117] - Time cost: 0.025s | Fetching dataframe Done
-[2739486:MainThread](2026-02-26 23:15:23,226) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
-
- use mytt;
- select code,m_nDate,ST_Y,ST_S,ST_T,ST_L,ST_Z,ST_X from
- loadTable("dfs://daily_stock_run", "stg_1day_wind_st_flag")
- where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002
-[2739486:MainThread](2026-02-26 23:15:53,388) INFO - qlib.timer - [log.py:117] - Time cost: 30.171s | DDB query: Done
-[2739486:MainThread](2026-02-26 23:15:54,166) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
diff --git a/stock_1d/d033/alpha158_beta/scripts/run_simple.log b/stock_1d/d033/alpha158_beta/scripts/run_simple.log
deleted file mode 100644
index c72ec48..0000000
--- a/stock_1d/d033/alpha158_beta/scripts/run_simple.log
+++ /dev/null
@@ -1,104 +0,0 @@
-[2745445:MainThread](2026-02-26 23:18:06,410) INFO - qlib.Initialization - [config.py:413] - default_conf: client.
-[2745445:MainThread](2026-02-26 23:18:06,414) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
-[2745445:MainThread](2026-02-26 23:18:06,415) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/qlib/default/data_ops/target')}
-================================================================================
-DUMP GOLD-STANDARD RAW DATA FROM QLIB PIPELINE
-================================================================================
-Date Range: 2020-01-02 to 2020-01-10
-Output Directory: ../data/
-Qlib Dataset Path: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/
-
-Step 1: Loading raw data from Qlib pipeline...
- Loading raw data from handler (load_start=2019-12-13)...
-Will use `placehorder_value` from module: qlib.contrib.data.config
-Will init handler object from config:
-{'data_handler_config': {'ddb_config': {'host': '192.168.1.146',
- 'password': '123456',
- 'port': 8848,
- 'username': 'admin'},
- 'end_time': datetime.date(2026, 2, 26),
- 'handler_list': [{'class': 'DDBAlpha158Handler',
- 'kwargs': {'col_set': 'feature',
- 'query_config': [{'alpha158_config': 'alpha158_expr.csv',
- 'db_path': 'dfs://daily_stock_run',
- 'dtype': 'float32',
- 'field_list': 'alpha158',
- 'table_name': 'stg_1day_wind_alpha158_0_7'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
- {'class': 'DDBMarketExtHandler',
- 'kwargs': {'col_set': 'feature_ext',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'float32',
- 'field_list': ['Turnover '
- 'as '
- 'turnover',
- 'FreeTurnover '
- 'as '
- 'free_turnover',
- 'log(MarketValue) '
- 'as '
- 'log_size'],
- 'table_name': 'stg_1day_wind_kline_adjusted'},
- {'db_path': 'dfs://daily_stock_run',
- 'dtype': 'float32',
- 'field_list': ['con_rating_strength'],
- 'table_name': 'stg_1day_gds_con_rating'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
- {'class': 'DDBMarketFlagHandler',
- 'kwargs': {'col_set': 'feature_flag',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': ['IsZt',
- 'IsDt',
- 'IsN',
- 'IsXD',
- 'IsXR',
- 'IsDR'],
- 'table_name': 'stg_1day_wind_kline_adjusted'},
- {'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': ['open_limit',
- 'close_limit',
- 'low_limit',
- 'open_stop',
- 'close_stop',
- 'high_stop'],
- 'table_name': 'stg_1day_wind_market_flag'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
- {'class': 'DDBIndusFlagHandler',
- 'kwargs': {'col_set': 'indus_flag',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': 'industry_code_cc.csv',
- 'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
- {'class': 'DDBStFlagHandler',
- 'kwargs': {'col_set': 'st_flag',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': ['ST_Y',
- 'ST_S',
- 'ST_T',
- 'ST_L',
- 'ST_Z',
- 'ST_X'],
- 'table_name': 'stg_1day_wind_st_flag'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
- 'instruments': 'csiallx',
- 'start_time': Timestamp('2019-11-23 00:00:00')},
- 'ddb_config': {'host': '192.168.1.146',
- 'password': '123456',
- 'port': 8848,
- 'username': 'admin'},
- 'handler': {'class': 'AggHandler',
- 'kwargs': {'ddb_config': {'host': '192.168.1.146',
- 'password': '123456',
- 'port': 8848,
- 'username': 'admin'},
- 'end_time': datetime.date(2026, 2, 26),
- 'handler_list': [{[2745445:MainThread](2026-02-26 23:18:06,436) INFO - qlib.AggHandler - [agg_handler.py:79] - Will use AggHandler
-[2745445:MainThread](2026-02-26 23:18:06,437) WARNING - qlib.DataLoaderDH - [agg_handler.py:191] - instruments[csiallx] is ignored
-[2745445:MainThread](2026-02-26 23:18:06,492) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
-
- use mytt;
- select code,m_nDate,KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,HIGH0,LOW0,VWAP0,ROC5,ROC10,ROC20,ROC30,ROC60,MA5,MA10,MA20,MA30,MA60,STD5,STD10,STD20,STD30,STD60,BETA5,BETA10,BETA20,BETA30,BETA60,RSQR5,RSQR10,RSQR20,RSQR30,RSQR60,RESI5,RESI10,RESI20,RESI30,RESI60,MAX5,MAX10,MAX20,MAX30,MAX60,MIN5,MIN10,MIN20,MIN30,MIN60,QTLU5,QTLU10,QTLU20,QTLU30,QTLU60,QTLD5,QTLD10,QTLD20,QTLD30,QTLD60,RANK5,RANK10,RANK20,RANK30,RANK60,RSV5,RSV10,RSV20,RSV30,RSV60,IMAX5,IMAX
diff --git a/stock_1d/d033/alpha158_beta/scripts/run_simple2.log b/stock_1d/d033/alpha158_beta/scripts/run_simple2.log
deleted file mode 100644
index 8a868fe..0000000
--- a/stock_1d/d033/alpha158_beta/scripts/run_simple2.log
+++ /dev/null
@@ -1,103 +0,0 @@
-[2746177:MainThread](2026-02-26 23:21:56,618) INFO - qlib.Initialization - [config.py:413] - default_conf: client.
-[2746177:MainThread](2026-02-26 23:21:56,622) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
-[2746177:MainThread](2026-02-26 23:21:56,623) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/qlib/default/data_ops/target')}
-================================================================================
-DUMP GOLD-STANDARD RAW DATA FROM QLIB PIPELINE
-================================================================================
-Date Range: 2020-01-02 to 2020-01-10
-Output Directory: ../data/
-Qlib Dataset Path: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/
-
-Step 1: Loading raw data from Qlib pipeline...
- Loading raw data from handler (load_start=2019-12-13)...
- Filtering instruments: ['SH600000', 'SH600004', 'SH600006', 'SH600007', 'SH600008']... (5 total)
-Will use `placehorder_value` from module: qlib.contrib.data.config
-Will init handler object from config:
-{'data_handler_config': {'ddb_config': {'host': '192.168.1.146',
- 'password': '123456',
- 'port': 8848,
- 'username': 'admin'},
- 'end_time': datetime.date(2026, 2, 26),
- 'handler_list': [{'class': 'DDBAlpha158Handler',
- 'kwargs': {'col_set': 'feature',
- 'query_config': [{'alpha158_config': 'alpha158_expr.csv',
- 'db_path': 'dfs://daily_stock_run',
- 'dtype': 'float32',
- 'field_list': 'alpha158',
- 'table_name': 'stg_1day_wind_alpha158_0_7'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
- {'class': 'DDBMarketExtHandler',
- 'kwargs': {'col_set': 'feature_ext',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'float32',
- 'field_list': ['Turnover '
- 'as '
- 'turnover',
- 'FreeTurnover '
- 'as '
- 'free_turnover',
- 'log(MarketValue) '
- 'as '
- 'log_size'],
- 'table_name': 'stg_1day_wind_kline_adjusted'},
- {'db_path': 'dfs://daily_stock_run',
- 'dtype': 'float32',
- 'field_list': ['con_rating_strength'],
- 'table_name': 'stg_1day_gds_con_rating'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
- {'class': 'DDBMarketFlagHandler',
- 'kwargs': {'col_set': 'feature_flag',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': ['IsZt',
- 'IsDt',
- 'IsN',
- 'IsXD',
- 'IsXR',
- 'IsDR'],
- 'table_name': 'stg_1day_wind_kline_adjusted'},
- {'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': ['open_limit',
- 'close_limit',
- 'low_limit',
- 'open_stop',
- 'close_stop',
- 'high_stop'],
- 'table_name': 'stg_1day_wind_market_flag'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
- {'class': 'DDBIndusFlagHandler',
- 'kwargs': {'col_set': 'indus_flag',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': 'industry_code_cc.csv',
- 'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
- {'class': 'DDBStFlagHandler',
- 'kwargs': {'col_set': 'st_flag',
- 'query_config': [{'db_path': 'dfs://daily_stock_run',
- 'dtype': 'bool',
- 'field_list': ['ST_Y',
- 'ST_S',
- 'ST_T',
- 'ST_L',
- 'ST_Z',
- 'ST_X'],
- 'table_name': 'stg_1day_wind_st_flag'}]},
- 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
- 'instruments': 'csiallx',
- 'start_time': Timestamp('2019-11-23 00:00:00')},
- 'ddb_config': {'host': '192.168.1.146',
- 'password': '123456',
- 'port': 8848,
- 'username': 'admin'},
- 'handler': {'class': 'AggHandler',
- 'kwargs': {'ddb_config': {'host': '192.168.1.146',
- 'password': '123456',
- 'port': 8848,
- 'username': 'admin'}[2746177:MainThread](2026-02-26 23:21:56,647) INFO - qlib.AggHandler - [agg_handler.py:79] - Will use AggHandler
-[2746177:MainThread](2026-02-26 23:21:56,648) WARNING - qlib.DataLoaderDH - [agg_handler.py:191] - instruments[csiallx] is ignored
-[2746177:MainThread](2026-02-26 23:21:56,716) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
-
- use mytt;
- select code,m_nDate,KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,HIGH0,LOW0,VWAP0,ROC5,ROC10,ROC20,ROC30,ROC60,MA5,MA10,MA20,MA30,MA60,STD5,STD10,STD20,STD30,STD60,BETA5,BETA10,BETA20,BETA30,BETA60,RSQR5,RSQR10,RSQR20,RSQR30,RSQR60,RESI5,RESI10,RESI20,RESI30,RESI60,MAX5,MAX10,MAX20,MAX30,MAX60,MIN5,MIN10,MIN20,MIN30,MIN60,QTLU5,QTLU10,QTLU20,QTLU30,QTLU60,QTLD5,QTLD10,QTLD20,QTLD30,QTLD60,RANK5,RANK10,RANK20,RANK30,RANK60,RSV5,RSV10,RSV20,RSV30,RSV60,IMAX5,IMAX
diff --git a/stock_1d/d033/alpha158_beta/scripts/verify_feature_order.py b/stock_1d/d033/alpha158_beta/scripts/verify_feature_order.py
deleted file mode 100644
index d08a7b0..0000000
--- a/stock_1d/d033/alpha158_beta/scripts/verify_feature_order.py
+++ /dev/null
@@ -1,187 +0,0 @@
-#!/usr/bin/env python
-"""
-Verify feature column order between standalone pipeline and qlib gold standard.
-
-This script:
-1. Loads a small sample using the qlib pipeline
-2. Runs the same sample through the standalone generate_beta_embedding pipeline
-3. Compares the column order and feature values
-"""
-
-import pickle as pkl
-import ruamel.yaml as yaml
-import pandas as pd
-import polars as pl
-import numpy as np
-import sys
-import os
-
-# Patch yaml.safe_load for compatibility
-_yaml = yaml.YAML(typ='safe', pure=True)
-def patched_safe_load(stream):
- import io
- if isinstance(stream, str):
- stream = io.StringIO(stream)
- return _yaml.load(stream)
-yaml.safe_load = patched_safe_load
-
-# Add scripts directory to path
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'scripts'))
-
-def main():
- print("=" * 70)
- print("VERIFY FEATURE ORDER: Standalone vs Qlib Gold Standard")
- print("=" * 70)
-
- # Step 1: Load processor list
- print("\nStep 1: Loading processor list...")
- proc_path = "/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/proc_list.proc"
- with open(proc_path, "rb") as f:
- proc_list = pkl.load(f)
- print(f" Loaded {len(proc_list)} processors")
-
- # Step 2: Load small sample from qlib pipeline
- print("\nStep 2: Loading sample from qlib pipeline...")
-
- import qlib
- from qlib.config import REG_CN
- qlib.init(provider_uri='/home/guofu/.qlib/data_ops/target', region=REG_CN)
-
- from qlib.workflow.cli import sys_config
- from qlib.utils import fill_placeholder
- import datetime as dt
-
- yaml_path = "/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/handler.yaml"
- with open(yaml_path) as fin:
- config = yaml.safe_load(fin)
-
- sys_config(config, "qlib.contrib.data.config")
- qlib.init(**config.get("qlib_init"))
-
- load_start = pd.to_datetime("2020-01-02") - dt.timedelta(days=20)
- placehorder_value = {
- "": load_start,
- "": dt.date.today()
- }
-
- config_filled = fill_placeholder(config, placehorder_value)
- handler = qlib.init_instance_by_config(config_filled["handler"])
- handler_data = handler._data
-
- # Get data from SepDataFrame
- if hasattr(handler_data, '_data'):
- df_dict = handler_data._data
- print(f" Handler groups: {list(df_dict.keys())}")
-
- # Concatenate groups
- raw_dfs = []
- for group, df in df_dict.items():
- df_copy = df.copy()
- df_copy.columns = [f"{group}::{col}" for col in df_copy.columns]
- raw_dfs.append(df_copy)
- print(f" {group}: {len(df_copy.columns)} columns")
-
- raw_df = pd.concat(raw_dfs, axis=1)
- print(f" Raw concatenated shape: {raw_df.shape}")
-
- # Step 3: Apply processors to get gold standard features
- print("\nStep 3: Applying processors (qlib gold standard)...")
- from qlib.contrib.data.utils import apply_proc_list
-
- # Strip group prefixes for processor application
- col_mapping = {col: col.split('::', 1)[1] for col in raw_df.columns if '::' in col}
- raw_df_stripped = raw_df.rename(columns=col_mapping)
-
- # Convert bool to object for processor compatibility
- bool_cols = raw_df_stripped.select_dtypes(include=['bool']).columns
- for col in bool_cols:
- raw_df_stripped[col] = raw_df_stripped[col].astype(object)
-
- df_gold = apply_proc_list(raw_df_stripped, proc_list=proc_list, with_fit=False)
- print(f" Gold standard shape after processors: {df_gold.shape}")
-
- # Restore group prefixes
- reverse_mapping = {v: k for k, v in col_mapping.items()}
- df_gold = df_gold.rename(columns=reverse_mapping)
-
- # Get gold standard column order
- gold_columns = list(df_gold.columns)
- print(f"\nGold standard column groups:")
-
- feature_cols = [c for c in gold_columns if c.startswith('feature::')]
- feature_ext_cols = [c for c in gold_columns if c.startswith('feature_ext::')]
- feature_flag_cols = [c for c in gold_columns if c.startswith('feature_flag::')]
- indus_idx_cols = [c for c in gold_columns if c.startswith('indus_idx::')]
-
- print(f" feature:: {len(feature_cols)} cols")
- print(f" feature_ext:: {len(feature_ext_cols)} cols")
- print(f" feature_flag:: {len(feature_flag_cols)} cols")
- print(f" indus_idx:: {len(indus_idx_cols)} cols")
-
- # Step 4: Now run standalone pipeline on same data
- print("\nStep 4: Running standalone pipeline...")
-
- # Load parquet data for same date range
- from generate_beta_embedding import load_all_data, merge_data_sources, apply_feature_pipeline
-
- df_alpha, df_kline, df_flag, df_industry = load_all_data("2020-01-02", "2020-01-10")
- df_standalone = merge_data_sources(df_alpha, df_kline, df_flag, df_industry)
-
- print(f" Standalone loaded shape: {df_standalone.shape}")
-
- # Apply feature pipeline
- df_processed, feature_cols_standalone = apply_feature_pipeline(df_standalone)
- print(f" Standalone processed shape: {df_processed.shape}")
- print(f" Standalone feature columns: {len(feature_cols_standalone)}")
-
- # Step 5: Compare column counts
- print("\n" + "=" * 70)
- print("COMPARISON SUMMARY")
- print("=" * 70)
-
- print(f"\nGold standard total columns: {len(gold_columns)}")
- print(f" feature:: {len(feature_cols)}")
- print(f" feature_ext:: {len(feature_ext_cols)}")
- print(f" feature_flag:: {len(feature_flag_cols)}")
- print(f" indus_idx:: {len(indus_idx_cols)}")
-
- print(f"\nStandalone feature columns: {len(feature_cols_standalone)}")
-
- # The gold standard columns (without prefix) should match standalone
- gold_feature_cols = [c.split('::', 1)[1] for c in feature_cols]
- gold_feature_ext_cols = [c.split('::', 1)[1] for c in feature_ext_cols]
- gold_feature_flag_cols = [c.split('::', 1)[1] for c in feature_flag_cols]
- gold_indus_idx_cols = [c.split('::', 1)[1] for c in indus_idx_cols]
-
- gold_all = gold_feature_cols + gold_feature_ext_cols + gold_feature_flag_cols + gold_indus_idx_cols
-
- print(f"\nGold standard (flat): {len(gold_all)} features")
- print(f"Standalone: {len(feature_cols_standalone)} features")
-
- if len(gold_all) != len(feature_cols_standalone):
- print(f"\nWARNING: Feature count mismatch! Difference: {len(gold_all) - len(feature_cols_standalone)}")
-
- # Check column order
- print("\nFirst 20 column comparison:")
- print(f"{'Idx':<5} {'Gold Standard':<40} {'Standalone':<40} {'Match':<6}")
- print("-" * 90)
- for i in range(min(20, len(gold_all), len(feature_cols_standalone))):
- match = "✓" if gold_all[i] == feature_cols_standalone[i] else "✗"
- print(f"{i:<5} {gold_all[i]:<40} {feature_cols_standalone[i]:<40} {match:<6}")
-
- # Check if orders match
- if gold_all == feature_cols_standalone:
- print("\n✓ Column order MATCHES!")
- else:
- print("\n✗ Column order DOES NOT MATCH!")
- print("\nFinding differences...")
- diff_count = 0
- for i in range(min(len(gold_all), len(feature_cols_standalone))):
- if gold_all[i] != feature_cols_standalone[i]:
- diff_count += 1
- if diff_count <= 20:
- print(f" [{i}] Gold: {gold_all[i]} vs Standalone: {feature_cols_standalone[i]}")
- print(f"Total differences: {diff_count}")
-
-if __name__ == "__main__":
- main()