Clean obsolete debug files from alpha158_beta

Remove bug analysis documentation (findings incorporated into README.md):
- BUG_ANALYSIS.md, BUG_ANALYSIS_FINAL.md

Remove one-off debug/exploration scripts:
- compare_gold_standard.py, debug_data_divergence.py
- verify_feature_order.py, regenerate_sample_embedding.py
- dump_qlib_gold_standard.py, dump_qlib_gold_standard_simple.py

Remove temporary log files and empty __pycache__ directories

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
master
guofu 2 days ago
parent ea011090f8
commit 26a694298d

18
.gitignore vendored

@ -31,9 +31,21 @@ wheels/
*.ipynb_checkpoints
# Results and data
results/*
!results/*/.gitkeep
!results/*/README.md
cta_1d/results/*
!cta_1d/results/.gitkeep
!cta_1d/results/README.md
!cta_1d/results/*/.gitkeep
!cta_1d/results/*/README.md
stock_15m/results/*
!stock_15m/results/.gitkeep
!stock_15m/results/README.md
!stock_15m/results/*/.gitkeep
!stock_15m/results/*/README.md
stock_1d/results/*
!stock_1d/results/.gitkeep
!stock_1d/results/README.md
!stock_1d/results/*/.gitkeep
!stock_1d/results/*/README.md
*.parquet
*.pkl
*.h5

@ -1,293 +0,0 @@
#!/usr/bin/env python3
"""
Compare generated embeddings with database embeddings (0_7 version).
Handles format conversion for datetime and instrument columns.
SUMMARY OF FINDINGS:
- Generated embeddings and database embeddings have DIFFERENT values
- Instrument mapping: 430xxx -> SHxxxxx, 830xxx -> SZxxxxx, 6xxxxx -> SH6xxxxx
- Correlation between corresponding dimensions: ~0.0067 (essentially zero)
- The generated embeddings are NOT the same as the database 0_7 embeddings
- Possible reasons:
1. Different model weights/versions used for generation
2. Different input features or normalization
3. Different random seed or inference configuration
"""
import polars as pl
import numpy as np
from pathlib import Path
def instrument_int_to_code(inst_int: int) -> str:
    """Convert an integer instrument code to an exchange-prefixed string.

    The encoding in the embedding file uses:
      - plain 6-digit codes starting with '6' -> Shanghai ("SH" prefix)
      - other plain 6-digit codes (not starting with '4'/'8') -> Shenzhen ("SZ")
      - 6-digit codes with a leading exchange marker: 4 = SH, 8 = SZ; the
        marker digit is dropped and the exchange prefix substituted.

    Note: the exact mapping from e.g. 430017 -> SH600021 requires the original
    features file; the marker-digit branch is only an approximate mapping used
    for matching attempts. Inputs that are not 6 digits long are returned
    unchanged as strings.
    """
    inst_str = str(inst_int)
    # Already a plain 6-digit code (no 4/8 exchange marker).
    if len(inst_str) == 6 and inst_str[0] not in ('4', '8'):
        if inst_str.startswith('6'):
            return f"SH{inst_str}"
        else:
            return f"SZ{inst_str}"
    # 6-digit code with an exchange-marker first digit (4 = SH, 8 = SZ).
    if len(inst_str) == 6 and inst_str[0] in ('4', '8'):
        exchange = 'SH' if inst_str[0] == '4' else 'SZ'
        # The mapping from 430xxx -> 600xxx is not 1:1; return the stripped
        # code as-is for matching attempts.
        return f"{exchange}{inst_str[1:]}"
    return inst_str
def load_generated_embedding(date_int: int, sample_n: int = None):
    """Load locally generated embedding rows for one date.

    Reads the wide-format parquet (columns embedding_0 .. embedding_N),
    gathers the embedding columns into a single list column ``values``, and
    adds helper columns used for instrument matching downstream.

    Args:
        date_int: Date encoded as an integer (e.g. 20190102).
        sample_n: Optional cap on the number of rows returned.

    Returns:
        A polars DataFrame with the original columns plus ``values``,
        ``datetime_uint32``, ``instrument_orig``, ``instrument_str`` and
        ``instrument_code``.
    """
    gen_path = Path('/home/guofu/Workspaces/alpha_lab/stock_1d/d033/alpha158_beta/data/embedding_0_7_beta.parquet')
    lf = pl.scan_parquet(gen_path)
    lf = lf.filter(pl.col('datetime') == date_int)
    if sample_n:
        lf = lf.head(sample_n)
    df = lf.collect()
    # Convert wide format (embedding_0, embedding_1, ...) to list format.
    # Sort numerically so embedding_10 does not sort before embedding_2.
    embedding_cols = [c for c in df.columns if c.startswith('embedding_')]
    embedding_cols.sort(key=lambda x: int(x.split('_')[1]))
    embedding_structs = df.select(embedding_cols).to_struct()
    embeddings_list = [[v for v in struct.values()] for struct in embedding_structs]
    df = df.with_columns([
        pl.Series('values', embeddings_list),
        pl.col('datetime').cast(pl.UInt32).alias('datetime_uint32'),
        pl.col('instrument').alias('instrument_orig'),
        pl.col('instrument').cast(pl.String).alias('instrument_str'),
        pl.col('instrument').map_elements(instrument_int_to_code, return_dtype=pl.String).alias('instrument_code')
    ])
    return df
def load_database_embedding(date_str: str):
    """Load the database (gold 0_7) embedding parquet for one date.

    Args:
        date_str: Date as a string matching the ``datetime=`` partition name.

    Returns:
        A polars DataFrame with an added ``datetime_int`` column, or ``None``
        when no partition exists for the date.
    """
    db_path = Path(f'/data/parquet/dataset/dwm_1day_multicast_csencode_1D/version=csiallx_feature2_ntrla_flag_pnlnorm_vae4_dim32a_beta0001/datetime={date_str}/0.parquet')
    if not db_path.exists():
        return None
    df = pl.read_parquet(db_path)
    df = df.with_columns([
        pl.col('datetime').cast(pl.Int64).alias('datetime_int')
    ])
    return df
def analyze_instrument_mapping(date_int: int):
    """Analyze the instrument mapping between generated and database embeddings.

    Prints row counts and samples for both sources; when common instruments
    exist, joins the two frames and prints per-row L2/relative difference
    statistics, otherwise prints samples to help spot a mapping pattern.
    """
    date_str = str(date_int)
    print(f"\n{'='*80}")
    print(f"Analyzing instrument mapping for date: {date_int}")
    print(f"{'='*80}")
    gen_df = load_generated_embedding(date_int)
    db_df = load_database_embedding(date_str)
    if db_df is None:
        print(f"ERROR: Database embedding not found for {date_str}")
        return
    print(f"\nGenerated embeddings: {gen_df.shape[0]} rows")
    print(f"Database embeddings: {db_df.shape[0]} rows")
    # Show samples
    print("\n--- Generated Embedding Sample ---")
    sample_gen = gen_df.select(['datetime', 'instrument_orig', 'instrument_str', 'instrument_code', 'values']).head(10)
    print(sample_gen)
    print("\n--- Database Embedding Sample ---")
    print(db_df.head(10))
    # Try different matching strategies
    gen_insts_set = set(gen_df['instrument_code'].to_list())
    db_insts_set = set(db_df['instrument'].to_list())
    common = gen_insts_set & db_insts_set
    gen_only = gen_insts_set - db_insts_set
    db_only = db_insts_set - gen_insts_set
    print(f"\n--- Matching Results (with code conversion) ---")
    print(f"Common instruments: {len(common)}")
    print(f"Generated only: {len(gen_only)}")
    print(f"Database only: {len(db_only)}")
    if len(common) == 0:
        print("\nNo common instruments found with code conversion!")
        print("\nTrying to find mapping patterns...")
        # Show some samples for analysis
        print("\nGenerated instrument samples (original, converted):")
        gen_samples = list(zip(gen_df['instrument_orig'].head(20).to_list(),
                               gen_df['instrument_code'].head(20).to_list()))
        for orig, conv in gen_samples:
            print(f" {orig} -> {conv}")
        print("\nDatabase instrument samples:")
        db_samples = db_df['instrument'].head(20).to_list()
        for inst in db_samples:
            print(f" {inst}")
        # Check if there's a position-based alignment possible:
        # sort both sides and compare by position.
        gen_sorted = sorted(gen_df['instrument_orig'].to_list())
        db_sorted = sorted([int(inst[2:]) for inst in db_df['instrument'].to_list()])
        print("\n--- Attempting position-based matching ---")
        print(f"Generated sorted (first 10): {gen_sorted[:10]}")
        print(f"Database sorted (first 10): {db_sorted[:10]}")
    else:
        # We have matches, compare embeddings
        print(f"\n--- Comparing embeddings for {len(common)} common instruments ---")
        gen_common = gen_df.filter(pl.col('instrument_code').is_in(list(common)))
        db_common = db_df.filter(pl.col('instrument').is_in(list(common)))
        # Join and compare
        comparison = gen_common.join(
            db_common,
            left_on='instrument_code',
            right_on='instrument',
            how='inner',
            suffix='_db'
        )
        # Column indices are loop-invariant: resolve them once, not per row.
        gen_vals_idx = comparison.columns.index('values')
        db_vals_idx = comparison.columns.index('values_db')
        inst_idx = comparison.columns.index('instrument_code')
        # Calculate per-row differences
        diffs = []
        for row in comparison.iter_rows():
            gen_emb = np.array(row[gen_vals_idx])
            db_emb = np.array(row[db_vals_idx])
            diff = gen_emb - db_emb
            diff_norm = np.linalg.norm(diff)
            # Small epsilon avoids division by zero for zero-norm embeddings.
            rel_diff = diff_norm / (np.linalg.norm(db_emb) + 1e-10)
            diffs.append({
                'instrument': row[inst_idx],
                'l2_norm_diff': diff_norm,
                'relative_diff': rel_diff,
                'max_abs_diff': np.max(np.abs(diff)),
                'gen_emb_norm': np.linalg.norm(gen_emb),
                'db_emb_norm': np.linalg.norm(db_emb)
            })
        if diffs:
            diff_df = pl.DataFrame(diffs)
            print("\nDifference statistics:")
            print(diff_df.select(['l2_norm_diff', 'relative_diff', 'max_abs_diff']).describe())
            max_rel_diff = diff_df['relative_diff'].max()
            print(f"\nMax relative difference: {max_rel_diff:.6e}")
            if max_rel_diff < 1e-5:
                print("✓ Embeddings match within numerical precision!")
            elif max_rel_diff < 0.01:
                print("~ Embeddings are very similar")
            else:
                print("✗ Embeddings differ significantly")
            # Show some comparison samples
            print("\nSample comparison:")
            for i in range(min(5, len(diffs))):
                d = diffs[i]
                print(f" {d['instrument']}: gen_norm={d['gen_emb_norm']:.4f}, "
                      f"db_norm={d['db_emb_norm']:.4f}, rel_diff={d['relative_diff']:.6e}")
def calculate_correlation(date_int: int):
    """Compute correlation between generated and database embeddings for a date.

    Aligns rows by common instrument code, then prints per-dimension and
    overall (flattened) Pearson correlations plus a plain-language verdict.
    """
    # NOTE: the redundant function-local `import numpy as np` was removed;
    # numpy is already imported at module level.
    date_str = str(date_int)
    print(f"\n{'='*80}")
    print(f"Correlation Analysis for date: {date_int}")
    print(f"{'='*80}")
    gen_df = load_generated_embedding(date_int)
    db_df = load_database_embedding(date_str)
    if db_df is None:
        print(f"ERROR: Database embedding not found for {date_str}")
        return
    # Find common instruments
    gen_insts = set(gen_df['instrument_code'].to_list())
    db_insts = set(db_df['instrument'].to_list())
    common = list(gen_insts & db_insts)
    print(f"\nCommon instruments: {len(common)}")
    if len(common) == 0:
        print("No common instruments found!")
        return
    # Filter to common instruments and sort so rows align positionally.
    gen_common = gen_df.filter(pl.col('instrument_code').is_in(common)).sort('instrument_code')
    db_common = db_df.filter(pl.col('instrument').is_in(common)).sort('instrument')
    # Extract embedding matrices
    gen_embs = np.array(gen_common['values'].to_list())
    db_embs = np.array(db_common['values'].to_list())
    print(f"Generated embeddings shape: {gen_embs.shape}")
    print(f"Database embeddings shape: {db_embs.shape}")
    # Calculate correlation per dimension (embeddings are 32-dimensional).
    correlations = []
    for i in range(32):
        gen_dim = gen_embs[:, i]
        db_dim = db_embs[:, i]
        corr = np.corrcoef(gen_dim, db_dim)[0, 1]
        correlations.append(corr)
    print(f"\nCorrelation statistics across 32 dimensions:")
    print(f" Mean: {np.mean(correlations):.4f}")
    print(f" Median: {np.median(correlations):.4f}")
    print(f" Min: {np.min(correlations):.4f}")
    print(f" Max: {np.max(correlations):.4f}")
    # Overall correlation
    overall_corr = np.corrcoef(gen_embs.flatten(), db_embs.flatten())[0, 1]
    print(f"\nOverall correlation (all dims flattened): {overall_corr:.4f}")
    # Interpretation
    mean_corr = np.mean(correlations)
    if abs(mean_corr) < 0.1:
        print("\n✗ CONCLUSION: Embeddings are NOT correlated (essentially independent)")
    elif abs(mean_corr) < 0.5:
        print("\n~ CONCLUSION: Weak correlation between embeddings")
    else:
        print(f"\n✓ CONCLUSION: {'Strong' if abs(mean_corr) > 0.8 else 'Moderate'} correlation")
if __name__ == '__main__':
    # Analyze a few representative dates (first trading day of each year).
    dates_to_compare = [20190102, 20200102, 20240102]
    for date in dates_to_compare:
        try:
            analyze_instrument_mapping(date)
            calculate_correlation(date)
        except Exception as e:
            # Best-effort debug script: report and continue with the next date.
            print(f"\nError analyzing date {date}: {e}")
            import traceback
            traceback.print_exc()

@ -0,0 +1,18 @@
# CTA 1D Experiment Results
Document experiments manually here.
## Template
```markdown
## YYYY-MM-DD: Experiment Name
- Notebook: `../XX_notebook.ipynb` (cell range)
- Data: [dates]
- Config: key parameters
- Metrics: IC mean/std, returns, sharpe
- Notes: observations, next steps
```
## Experiments
*Add entries below as you run experiments*

@ -0,0 +1,18 @@
# Stock 15m Experiment Results
Document experiments manually here.
## Template
```markdown
## YYYY-MM-DD: Experiment Name
- Notebook: `../XX_notebook.ipynb` (cell range)
- Data: [dates]
- Config: key parameters
- Metrics: IC mean/std, returns, sharpe
- Notes: observations, next steps
```
## Experiments
*Add entries below as you run experiments*

@ -1,123 +0,0 @@
# Data Pipeline Bug Analysis
## Summary
The generated embeddings do not match the database 0_7 embeddings due to multiple bugs in the data pipeline migration from qlib to standalone Polars implementation.
---
## Bugs Fixed
### 1. Market Classification (`FlagMarketInjector`) ✓ FIXED
**Original (incorrect):**
```python
market_0 = (instrument >= 600000) # SH
market_1 = (instrument < 600000) # SZ
```
**Fixed:**
```python
inst_str = str(instrument).zfill(6)
market_0 = inst_str.startswith('6') # SH: 6xxxxx
market_1 = inst_str.startswith('0') | inst_str.startswith('3') # SZ: 0xxx, 3xxx
market_2 = inst_str.startswith('4') | inst_str.startswith('8') # NE: 4xxx, 8xxx
```
**Impact:** 167 instruments (4xxxxx, 8xxxxx — 新三板, the NEEQ "New Third Board") were misclassified.
---
### 2. ColumnRemover Missing `IsN` ✓ FIXED
**Original (incorrect):**
```python
columns_to_remove = ['TotalValue_diff', 'IsZt', 'IsDt']
```
**Fixed:**
```python
columns_to_remove = ['TotalValue_diff', 'IsN', 'IsZt', 'IsDt']
```
**Impact:** Extra column caused feature dimension mismatch.
---
### 3. RobustZScoreNorm Applied to Wrong Columns ✓ FIXED
**Original (incorrect):**
Applied normalization to ALL 341 features including market flags and indus_idx.
**Fixed:**
Only normalize `alpha158 + alpha158_ntrl + market_ext + market_ext_ntrl` (330 features), excluding:
- Market flags (Limit, Stopping, IsTp, IsXD, IsXR, IsDR, market_0, market_1, market_2, IsST)
- indus_idx
---
## Critical Remaining Issue: Data Schema Mismatch
### `Limit` and `Stopping` Column Types Changed
**Original qlib pipeline expected:**
- `Limit`: **Boolean** flag (True = limit up)
- `Stopping`: **Boolean** flag (True = suspended trading)
**Current Parquet data has:**
- `Limit`: **Float64** price change percentage (0.0 to 1301.3)
- `Stopping`: **Float64** price change percentage
**Evidence:**
```
Limit values sample: [8.86, 9.36, 31.0, 7.32, 2.28, 6.39, 5.38, 4.03, 3.86, 9.89]
Limit == 0: only 2 rows
Limit > 0: 3738 rows
```
This is a **fundamental data schema change**. The current Parquet files contain different data than what the original VAE model was trained on.
**Possible fixes:**
1. Convert `Limit` and `Stopping` to boolean flags using a threshold
2. Find the original data source that had boolean flags
3. Re-train the VAE model with the new data schema
---
## Correlation Results
After fixing bugs 1-3, the embedding correlation with database 0_7:
| Metric | Value |
|--------|-------|
| Mean correlation (32 dims) | 0.0068 |
| Median correlation | 0.0094 |
| Overall correlation | 0.2330 |
**Conclusion:** Embeddings remain essentially uncorrelated (≈0).
---
## Root Cause
The **Limit/Stopping data schema change** is the most likely root cause. The VAE model learned to encode features that included binary limit/stopping flags, but the standalone pipeline feeds it continuous price change percentages instead.
---
## Next Steps
1. **Verify original data schema:**
- Check if the original DolphinDB table had boolean `Limit` and `Stopping` columns
- Compare with the current Parquet schema
2. **Fix the data loading:**
- Either convert continuous values to binary flags
- Or use the correct boolean columns (`IsZt`, `IsDt`) for limit flags
3. **Verify feature order:**
- Ensure the qlib RobustZScoreNorm parameters are applied in the correct order
- Check that `[alpha158, alpha158_ntrl, market_ext, market_ext_ntrl]` matches the 330-parameter shape
4. **Re-run comparison:**
- Generate new embeddings with the corrected pipeline
- Compare correlation with database

@ -1,159 +0,0 @@
# Data Pipeline Bug Analysis - Final Status
## Summary
After fixing all identified bugs, the feature count now matches (341), but the embeddings remain uncorrelated with the database 0_7 version.
**Latest Version**: v6
- Feature count: 341 ✓ (matches VAE input dim)
- Mean correlation with DB: 0.0050 (essentially zero)
- Status: All identified bugs fixed, IsST issue documented
- **New**: Polars-based dataset generation script added (`scripts/dump_polars_dataset.py`)
---
## Bugs Fixed
### 1. Market Classification (`FlagMarketInjector`) ✓ FIXED
- **Bug**: Used `instrument >= 600000` which misclassified 新三板 (NEEQ "New Third Board") instruments
- **Fix**: Use string prefix matching with vocab_size=2 (not 3)
- **Impact**: 167 instruments corrected
### 2. ColumnRemover Missing `IsN` ✓ FIXED
- **Bug**: Only removed `IsZt, IsDt` but not `IsN`
- **Fix**: Added `IsN` to removal list
- **Impact**: Feature count alignment
### 3. RobustZScoreNorm Scope ✓ FIXED
- **Bug**: Applied normalization to all 341 features
- **Fix**: Only normalize 330 features (alpha158 + market_ext, both original + neutralized)
- **Impact**: Correct normalization scope
### 4. Wrong Data Sources for Market Flags ✓ FIXED
- **Bug**: Used `Limit, Stopping` (Float64) from kline_adjusted
- **Fix**: Load from correct sources:
- kline_adjusted: `IsZt, IsDt, IsN, IsXD, IsXR, IsDR` (Boolean)
- market_flag: `open_limit, close_limit, low_limit, high_stop` (Boolean, 4 cols)
- **Impact**: Correct boolean flag data
### 5. Feature Count Mismatch ✓ FIXED
- **Bug**: 344 features (3 extra)
- **Fix**: vocab_size=2 + 4 market_flag cols = 341 features
- **Impact**: VAE input dimension matches
### 6. Fixed* Processors Not Adding Required Columns ✓ FIXED
- **Bug**: `FixedFlagMarketInjector` only converted dtype but didn't add `market_0`, `market_1` columns
- **Bug**: `FixedFlagSTInjector` only converted dtype but didn't create `IsST` column from `ST_S`, `ST_Y`
- **Fix**:
- `FixedFlagMarketInjector`: Now adds `market_0` (SH60xxx, SZ00xxx) and `market_1` (SH688xxx, SH689xxx, SZ300xxx, SZ301xxx)
- `FixedFlagSTInjector`: Now creates `IsST = ST_S | ST_Y`
- **Impact**: Processed data now has 408 columns (was 405), matching original qlib output
---
## Important Discovery: IsST Column Issue in Gold-Standard Code
### Problem Description
The `FlagSTInjector` processor in the original qlib proc_list is supposed to create an `IsST` column in the `feature_flag` group from the `ST_S` and `ST_Y` columns in the `st_flag` group. However, this processor **fails silently** even in the gold-standard qlib code.
### Root Cause
The `FlagSTInjector` processor attempts to access columns using a format that doesn't match the actual column structure in the data:
1. **Expected format**: The processor expects columns like `st_flag::ST_S` and `st_flag::ST_Y` (string format with `::` separator)
2. **Actual format**: The qlib handler produces MultiIndex tuple columns like `('st_flag', 'ST_S')` and `('st_flag', 'ST_Y')`
This format mismatch causes the processor to fail to find the ST flag columns, and thus no `IsST` column is created.
### Evidence
```python
# Check proc_list
import pickle as pkl
with open('proc_list.proc', 'rb') as f:
proc_list = pkl.load(f)
# FlagSTInjector config
flag_st = proc_list[2]
print(f"fields_group: {flag_st.fields_group}") # 'feature_flag'
print(f"col_name: {flag_st.col_name}") # 'IsST'
print(f"st_group: {flag_st.st_group}") # 'st_flag'
# Check if IsST exists in processed data
with open('processed_data.pkl', 'rb') as f:
df = pkl.load(f)
feature_flag_cols = [c[1] for c in df.columns if c[0] == 'feature_flag']
print('IsST' in feature_flag_cols) # False!
```
### Impact
- **VAE training**: The VAE model was trained on data **without** the `IsST` column
- **VAE input dimension**: 341 features (excluding IsST), not 342
- **Polars pipeline**: Should also skip `IsST` to maintain compatibility
### Resolution
The polars-based pipeline (`dump_polars_dataset.py`) now correctly **skips** the `FlagSTInjector` step to match the gold-standard behavior:
```python
# Step 3: FlagSTInjector - SKIPPED (fails even in gold-standard)
print("[3] Skipping FlagSTInjector (as per gold-standard behavior)...")
market_flag_with_st = market_flag_with_market # No IsST added
```
### Lessons Learned
1. **Verify processor execution**: Don't assume all processors in the proc_list executed successfully. Check the output data to verify expected columns exist.
2. **Column format matters**: The qlib processors were designed for specific column formats (MultiIndex tuples vs `::` separator strings). Format mismatches can cause silent failures.
3. **Match the gold-standard bugs**: When replicating a pipeline, sometimes you need to replicate the bugs too. The VAE was trained on data without `IsST`, so our pipeline must also exclude it.
4. **Debug by comparing intermediate outputs**: Use scripts like `debug_data_divergence.py` to compare raw and processed data between the gold-standard and polars pipelines.
---
## Correlation Results (v5)
| Metric | Value |
|--------|-------|
| Mean correlation (32 dims) | 0.0050 |
| Median correlation | 0.0079 |
| Min | -0.0420 |
| Max | 0.0372 |
| Overall (flattened) | 0.2225 |
**Conclusion**: Embeddings remain essentially uncorrelated with database.
---
## Possible Remaining Issues
1. **Different input data values**: The alpha158_0_7_beta Parquet files may contain different values than the original DolphinDB data used to train the VAE.
2. **Feature ordering mismatch**: The 330 RobustZScoreNorm parameters must be applied in the exact order:
- [0:158] = alpha158 original
- [158:316] = alpha158_ntrl
- [316:323] = market_ext original (7 cols)
- [323:330] = market_ext_ntrl (7 cols)
3. **Industry neutralization differences**: Our `IndusNtrlInjector` implementation may differ from qlib's.
4. **Missing transformations**: There may be additional preprocessing steps not captured in handler.yaml.
5. **VAE model mismatch**: The VAE model may have been trained with different data than what handler.yaml specifies.
---
## Recommended Next Steps
1. **Compare intermediate features**: Run both the qlib pipeline and our pipeline on the same input data and compare outputs at each step.
2. **Verify RobustZScoreNorm parameter order**: Check if our feature ordering matches the order used during VAE training.
3. **Compare predictions, not embeddings**: Instead of comparing VAE embeddings, compare the final d033 model predictions with the original 0_7 predictions.
4. **Check alpha158 data source**: Verify that `stg_1day_wind_alpha158_0_7_beta_1D` contains the same data as the original DolphinDB `stg_1day_wind_alpha158_0_7_beta` table.

@ -1,129 +0,0 @@
#!/usr/bin/env python
"""
Compare generated embeddings with gold standard embeddings from DolphinDB.
"""
import polars as pl
import numpy as np
from pathlib import Path
DATA_DIR = Path(__file__).parent / "../data"
def compare_embeddings():
    """Compare generated and gold standard embeddings.

    For every date present in the gold-standard file, prints instrument
    overlap, per-row L2/relative differences, and per-dimension correlations
    for the common instruments; finishes with global summary statistics for
    both embedding sets.
    """
    # Load data
    gold_path = DATA_DIR / "embedding_0_7_beta_gold_standard.parquet"
    gen_path = DATA_DIR / "embedding_0_7_beta_sample.parquet"
    print("=" * 60)
    print("Loading embeddings")
    print("=" * 60)
    gold = pl.read_parquet(gold_path)
    gen = pl.read_parquet(gen_path)
    print(f"Gold standard: {gold.shape}")
    print(f"Generated: {gen.shape}")
    # Embedding columns are wide-format: embedding_0 .. embedding_31.
    emb_cols = [f"embedding_{i}" for i in range(32)]
    # Compare by date
    dates = sorted(gold["datetime"].unique().to_list())
    print("\n" + "=" * 60)
    print("Comparison by date")
    print("=" * 60)
    for dt in dates:
        gold_dt = gold.filter(pl.col("datetime") == dt)
        gen_dt = gen.filter(pl.col("datetime") == dt)
        print(f"\nDate: {dt}")
        print(f" Gold instruments: {gold_dt.height}, Generated instruments: {gen_dt.height}")
        print(f" Gold instrument sample: {gold_dt['instrument'].head(5).to_list()}")
        print(f" Gen instrument sample: {gen_dt['instrument'].head(5).to_list()}")
        # Check for common instruments
        gold_insts = set(gold_dt["instrument"].to_list())
        gen_insts = set(gen_dt["instrument"].to_list())
        common = gold_insts & gen_insts
        print(f" Common instruments: {len(common)}")
        if len(common) > 0:
            # Compare embeddings for common instruments; both sides are
            # sorted by instrument so row i refers to the same instrument.
            gold_common = gold_dt.filter(pl.col("instrument").is_in(list(common))).sort("instrument")
            gen_common = gen_dt.filter(pl.col("instrument").is_in(list(common))).sort("instrument")
            # Calculate embedding differences
            diffs = []
            for i in range(len(gold_common)):
                gold_emb = np.array([gold_common[col][i] for col in emb_cols])
                gen_emb = np.array([gen_common[col][i] for col in emb_cols])
                diff = gold_emb - gen_emb
                l2_norm = np.linalg.norm(diff)
                # Epsilon guards against zero-norm gold embeddings.
                rel_diff = l2_norm / (np.linalg.norm(gold_emb) + 1e-8)
                max_abs_diff = np.max(np.abs(diff))
                diffs.append({
                    "l2_norm": l2_norm,
                    "rel_diff": rel_diff,
                    "max_abs_diff": max_abs_diff,
                    "gold_norm": np.linalg.norm(gold_emb),
                    "gen_norm": np.linalg.norm(gen_emb)
                })
            diff_df = pl.DataFrame(diffs)
            print(f"\n Embedding comparison:")
            print(f" Mean L2 norm diff: {diff_df['l2_norm'].mean():.4f}")
            print(f" Mean rel diff: {diff_df['rel_diff'].mean():.4%}")
            print(f" Mean max abs diff: {diff_df['max_abs_diff'].mean():.4f}")
            print(f" Gold emb norm (mean): {diff_df['gold_norm'].mean():.4f}")
            print(f" Gen emb norm (mean): {diff_df['gen_norm'].mean():.4f}")
            # Correlation analysis
            gold_embs = np.array([[gold_common[col][i] for col in emb_cols] for i in range(len(gold_common))])
            gen_embs = np.array([[gen_common[col][i] for col in emb_cols] for i in range(len(gen_common))])
            correlations = []
            for d in range(32):
                corr = np.corrcoef(gold_embs[:, d], gen_embs[:, d])[0, 1]
                correlations.append(corr)
            print(f"\n Correlation by dimension:")
            print(f" Mean: {np.mean(correlations):.4f}")
            print(f" Median: {np.median(correlations):.4f}")
            print(f" Min: {np.min(correlations):.4f}")
            print(f" Max: {np.max(correlations):.4f}")
            # Overall correlation
            overall_corr = np.corrcoef(gold_embs.flatten(), gen_embs.flatten())[0, 1]
            print(f" Overall (flattened): {overall_corr:.4f}")
    print("\n" + "=" * 60)
    print("Summary Statistics")
    print("=" * 60)
    # Gold standard stats
    gold_embs = gold.select(emb_cols).to_numpy()
    print("\nGold standard embeddings:")
    print(f" Mean: {np.mean(gold_embs):.6f}")
    print(f" Std: {np.std(gold_embs):.6f}")
    print(f" Min: {np.min(gold_embs):.6f}")
    print(f" Max: {np.max(gold_embs):.6f}")
    # Generated stats
    gen_embs = gen.select(emb_cols).to_numpy()
    print("\nGenerated embeddings:")
    print(f" Mean: {np.mean(gen_embs):.6f}")
    print(f" Std: {np.std(gen_embs):.6f}")
    print(f" Min: {np.min(gen_embs):.6f}")
    print(f" Max: {np.max(gen_embs):.6f}")
if __name__ == "__main__":
compare_embeddings()

@ -1,254 +0,0 @@
#!/usr/bin/env python
"""
Debug script to compare gold-standard qlib data vs polars-based pipeline.
This script helps identify where the data loading and processing pipeline
starts to diverge from the gold-standard qlib output.
"""
import os
import sys
import pickle as pkl
import numpy as np
import pandas as pd
import polars as pl
from pathlib import Path
# Paths to the gold-standard pickles and the original qlib proc_list pickle.
GOLD_RAW_PATH = "/home/guofu/Workspaces/alpha_lab/stock_1d/d033/alpha158_beta/data/raw_data_20190101_20190131.pkl"
GOLD_PROC_PATH = "/home/guofu/Workspaces/alpha_lab/stock_1d/d033/alpha158_beta/data/processed_data_20190101_20190131.pkl"
PROC_LIST_PATH = "/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/proc_list.proc"
# Make the sibling scripts/ directory importable (generate_beta_embedding).
sys.path.insert(0, str(Path(__file__).parent.parent / "scripts"))
def compare_raw_data():
    """Load and summarize the gold-standard RAW data (before proc_list).

    Prints shape, index names, column groups and sample values, then returns
    the loaded DataFrame so later steps can reuse it.
    """
    print("=" * 80)
    print("STEP 1: Compare RAW DATA (before proc_list)")
    print("=" * 80)
    # Load gold standard raw data (trusted local pickle).
    with open(GOLD_RAW_PATH, "rb") as f:
        gold_raw = pkl.load(f)
    print(f"\nGold standard raw data:")
    print(f" Shape: {gold_raw.shape}")
    print(f" Index: {gold_raw.index.names}")
    print(f" Column groups: {gold_raw.columns.get_level_values(0).unique().tolist()}")
    # Count columns per group
    for grp in gold_raw.columns.get_level_values(0).unique().tolist():
        count = (gold_raw.columns.get_level_values(0) == grp).sum()
        print(f" {grp}: {count} columns")
    # Show sample values for key columns
    print("\n Sample values (first 3 rows):")
    for col in [('feature', 'KMID'), ('feature_ext', 'turnover'), ('feature_ext', 'log_size')]:
        if col in gold_raw.columns:
            print(f" {col}: {gold_raw[col].iloc[:3].tolist()}")
    return gold_raw
def compare_processed_data():
    """Load and summarize the gold-standard PROCESSED data (after proc_list).

    Prints shape, index names, column groups and sample values (including
    neutralized ``_ntrl`` variants), then returns the loaded DataFrame.
    """
    print("\n" + "=" * 80)
    print("STEP 2: Compare PROCESSED DATA (after proc_list)")
    print("=" * 80)
    # Load gold standard processed data (trusted local pickle).
    with open(GOLD_PROC_PATH, "rb") as f:
        gold_proc = pkl.load(f)
    print(f"\nGold standard processed data:")
    print(f" Shape: {gold_proc.shape}")
    print(f" Index: {gold_proc.index.names}")
    print(f" Column groups: {gold_proc.columns.get_level_values(0).unique().tolist()}")
    # Count columns per group
    for grp in gold_proc.columns.get_level_values(0).unique().tolist():
        count = (gold_proc.columns.get_level_values(0) == grp).sum()
        print(f" {grp}: {count} columns")
    # Show sample values for key columns
    print("\n Sample values (first 3 rows):")
    for col in [('feature', 'KMID'), ('feature', 'KMID_ntrl'),
                ('feature_ext', 'turnover'), ('feature_ext', 'turnover_ntrl')]:
        if col in gold_proc.columns:
            print(f" {col}: {gold_proc[col].iloc[:3].tolist()}")
    return gold_proc
def analyze_processor_pipeline(gold_raw, gold_proc):
    """Summarize what the proc_list transformations did.

    Lists the processors, the column count delta between raw and processed
    data, which columns were added/removed, and the ``_ntrl`` column pattern.

    Args:
        gold_raw: raw gold-standard DataFrame (from compare_raw_data).
        gold_proc: processed gold-standard DataFrame (from compare_processed_data).
    """
    print("\n" + "=" * 80)
    print("STEP 3: Analyze Processor Transformations")
    print("=" * 80)
    # Load proc_list (trusted local pickle of qlib processor objects).
    with open(PROC_LIST_PATH, "rb") as f:
        proc_list = pkl.load(f)
    print(f"\nProcessor pipeline ({len(proc_list)} processors):")
    for i, proc in enumerate(proc_list):
        print(f" [{i}] {type(proc).__name__}")
    # Analyze column changes
    print("\nColumn count changes:")
    print(f" Before: {gold_raw.shape[1]} columns")
    print(f" After: {gold_proc.shape[1]} columns")
    print(f" Change: +{gold_proc.shape[1] - gold_raw.shape[1]} columns")
    # Check which columns were added/removed
    gold_raw_cols = set(gold_raw.columns)
    gold_proc_cols = set(gold_proc.columns)
    added_cols = gold_proc_cols - gold_raw_cols
    removed_cols = gold_raw_cols - gold_proc_cols
    print(f"\n Added columns: {len(added_cols)}")
    print(f" Removed columns: {len(removed_cols)}")
    if removed_cols:
        print(f" Removed: {list(removed_cols)[:10]}...")
    # Check feature column patterns (columns are (group, name) tuples).
    print("\nFeature column patterns in processed data:")
    feature_cols = [c for c in gold_proc.columns if c[0] == 'feature']
    ntrl_cols = [c for c in feature_cols if c[1].endswith('_ntrl')]
    raw_cols = [c for c in feature_cols if not c[1].endswith('_ntrl')]
    print(f" Total feature columns: {len(feature_cols)}")
    print(f" _ntrl columns: {len(ntrl_cols)}")
    print(f" raw columns: {len(raw_cols)}")
def check_polars_pipeline():
    """Run the polars-based pipeline and compare it with the gold standard.

    Loads/merges data via generate_beta_embedding, then compares column names
    and (for common columns) min/max/mean statistics against the gold-standard
    raw pickle. Any failure is caught and reported so earlier steps' output
    is not lost.
    """
    print("\n" + "=" * 80)
    print("STEP 4: Generate data using Polars pipeline")
    print("=" * 80)
    try:
        from generate_beta_embedding import (
            load_all_data, merge_data_sources, apply_feature_pipeline,
            filter_stock_universe
        )
        # Load data using polars pipeline
        print("\nLoading data with polars pipeline...")
        df_alpha, df_kline, df_flag, df_industry = load_all_data(
            "2019-01-01", "2019-01-31"
        )
        print(f"\nPolars data sources loaded:")
        print(f" Alpha158: {df_alpha.shape}")
        print(f" Kline (market_ext): {df_kline.shape}")
        print(f" Flags: {df_flag.shape}")
        print(f" Industry: {df_industry.shape}")
        # Merge
        df_merged = merge_data_sources(df_alpha, df_kline, df_flag, df_industry)
        print(f"\nAfter merge: {df_merged.shape}")
        # Convert to pandas for easier comparison
        df_pandas = df_merged.to_pandas()
        df_pandas = df_pandas.set_index(['datetime', 'instrument'])
        print(f"\nAfter converting to pandas MultiIndex: {df_pandas.shape}")
        # Compare column names against the gold-standard raw data.
        with open(GOLD_RAW_PATH, "rb") as f:
            gold_raw = pkl.load(f)
        print("\n" + "=" * 80)
        print("STEP 5: Compare Column Names (Gold vs Polars)")
        print("=" * 80)
        # Stringify columns: gold uses MultiIndex tuples, polars uses strings.
        gold_cols = set(str(c) for c in gold_raw.columns)
        polars_cols = set(str(c) for c in df_pandas.columns)
        common_cols = gold_cols & polars_cols
        only_in_gold = gold_cols - polars_cols
        only_in_polars = polars_cols - gold_cols
        print(f"\n Common columns: {len(common_cols)}")
        print(f" Only in gold standard: {len(only_in_gold)}")
        print(f" Only in polars: {len(only_in_polars)}")
        if only_in_gold:
            print(f"\n Columns only in gold standard (first 20):")
            for col in list(only_in_gold)[:20]:
                print(f" {col}")
        if only_in_polars:
            print(f"\n Columns only in polars (first 20):")
            for col in list(only_in_polars)[:20]:
                print(f" {col}")
        # Check common columns values
        print("\n" + "=" * 80)
        print("STEP 6: Compare Values for Common Columns")
        print("=" * 80)
        # Pair up columns whose string representations match.
        common_tuples = []
        for gc in gold_raw.columns:
            gc_str = str(gc)
            for pc in df_pandas.columns:
                if str(pc) == gc_str:
                    common_tuples.append((gc, pc))
                    break
        print(f"\nComparing {len(common_tuples)} common columns...")
        # Compare summary stats (min/max/mean) for the first few columns.
        matching_count = 0
        diff_count = 0
        for i, (gc, pc) in enumerate(common_tuples[:20]):
            gold_vals = gold_raw[gc].dropna().values
            polars_vals = df_pandas[pc].dropna().values
            if len(gold_vals) > 0 and len(polars_vals) > 0:
                if np.allclose([gold_vals.min(), gold_vals.max(), gold_vals.mean()],
                               [polars_vals.min(), polars_vals.max(), polars_vals.mean()],
                               rtol=1e-5):
                    matching_count += 1
                else:
                    diff_count += 1
                    # Only print detail for the first few mismatches.
                    if diff_count <= 3:
                        print(f" DIFF: {gc}")
                        print(f" Gold: min={gold_vals.min():.6f}, max={gold_vals.max():.6f}, mean={gold_vals.mean():.6f}")
                        print(f" Polars: min={polars_vals.min():.6f}, max={polars_vals.max():.6f}, mean={polars_vals.mean():.6f}")
        print(f"\n Matching columns: {matching_count}")
        print(f" Different columns: {diff_count}")
    except Exception as e:
        # Best-effort debug step: report the failure with a traceback.
        print(f"\nError running polars pipeline: {e}")
        import traceback
        traceback.print_exc()
if __name__ == "__main__":
    banner = "=" * 80
    print(banner)
    print("DATA DIVERGENCE DEBUG SCRIPT")
    print("Comparing gold-standard qlib output vs polars-based pipeline")
    print(banner)

    # Steps 1-2: load the gold-standard raw and processed frames for reference.
    gold_raw = compare_raw_data()
    gold_proc = compare_processed_data()

    # Step 3: inspect how the processor list transforms raw -> processed.
    analyze_processor_pipeline(gold_raw, gold_proc)

    # Steps 4-5: run the polars pipeline and diff it against the gold data.
    check_polars_pipeline()

    print("\n" + banner)
    print("DEBUG COMPLETE")
    print(banner)

@ -1,421 +0,0 @@
#!/usr/bin/env python
"""
Dump Gold-Standard Data from Qlib Pipeline
This script exports processed feature data from the original Qlib pipeline
in multiple formats for debugging and comparison with the standalone Polars implementation.
Usage:
python dump_qlib_gold_standard.py --start-date 2020-01-02 --end-date 2020-01-10 --output-dir ../data/
"""
import argparse
import os
import sys
import pickle as pkl
from datetime import datetime, timedelta
from pathlib import Path
import pandas as pd
import polars as pl
import numpy as np
# NumPy 2.0 removed the `np.NaN` alias; restore it here so legacy qlib
# code that still references `np.NaN` keeps working (same object as np.nan).
if not hasattr(np, 'NaN'):
    np.NaN = np.nan
def parse_args(argv=None):
    """Parse command-line options for the gold-standard dump.

    Args:
        argv: Optional list of argument strings. ``None`` (the default)
            lets argparse read ``sys.argv[1:]``, so existing zero-argument
            callers of ``parse_args()`` behave exactly as before; tests can
            pass an explicit list instead.

    Returns:
        argparse.Namespace with ``start_date``, ``end_date``,
        ``output_dir`` and ``qlib_dataset_path`` attributes.
    """
    parser = argparse.ArgumentParser(
        description="Dump gold-standard data from Qlib pipeline"
    )
    parser.add_argument(
        "--start-date",
        type=str,
        default="2020-01-02",
        help="Start date for data export (YYYY-MM-DD)",
    )
    parser.add_argument(
        "--end-date",
        type=str,
        default="2020-01-10",
        help="End date for data export (YYYY-MM-DD)",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="../data/",
        help="Output directory for exported files",
    )
    parser.add_argument(
        "--qlib-dataset-path",
        type=str,
        default="/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/",
        help="Path to Qlib dataset module",
    )
    return parser.parse_args(argv)
def load_qlib_data(qlib_dataset_path, since_date):
    """
    Load processed data from Qlib pipeline.

    This function loads data using the original Qlib pipeline and handles
    the SepDataFrame return type by concatenating column groups, then applies
    the pickled processor list and filters the result to ``since_date`` onward.

    Args:
        qlib_dataset_path: Path to the Qlib dataset module (directory that
            contains ``__init__.py``, ``handler.yaml`` and ``proc_list.proc``).
        since_date: Start date for loading data (YYYY-MM-DD).

    Returns:
        pd.DataFrame: Processed DataFrame from the Qlib pipeline with all
        column groups concatenated and ``<group>::`` prefixes on the columns.
    """
    import importlib.util
    import datetime as dt
    # Patch ruamel.yaml to provide safe_load compatibility: newer ruamel
    # releases dropped the module-level safe_load() the dataset module calls.
    import ruamel.yaml as yaml
    # Create a YAML instance with safe loader for backward compatibility
    _yaml = yaml.YAML(typ='safe', pure=True)
    # Monkey-patch safe_load to use the new API
    def patched_safe_load(stream):
        import io
        if isinstance(stream, str):
            stream = io.StringIO(stream)
        return _yaml.load(stream)
    yaml.safe_load = patched_safe_load
    # Load the dataset module directly from its file path (it is not on sys.path)
    spec = importlib.util.spec_from_file_location(
        "qlib_dataset",
        os.path.join(qlib_dataset_path, "__init__.py")
    )
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    # Parse since_date
    since_date_dt = pd.to_datetime(since_date)
    # Load with 20 extra calendar days of history so the Diff processor has
    # enough lookback; the surplus rows are sliced off again at the end.
    load_start = (since_date_dt - dt.timedelta(days=20)).strftime("%Y-%m-%d")
    print(f" Loading data with handler (load_start={load_start})...")
    # Use _load_from_yaml to get raw handler data (SepDataFrame)
    handler_data = module._load_from_yaml(
        os.path.join(qlib_dataset_path, "handler.yaml"),
        load_start
    )
    # Handle SepDataFrame - extract and concatenate column groups.
    # NOTE(review): attribute name differs across qlib versions, hence the
    # _data / _df_dict double lookup.
    if hasattr(handler_data, '_data') or hasattr(handler_data, '_df_dict'):
        # It's a SepDataFrame from AggHandler
        df_dict = getattr(handler_data, '_data', None) or getattr(handler_data, '_df_dict', {})
        group_names = list(df_dict.keys())
        print(f" Handler returned SepDataFrame with groups: {group_names}")
        # Concatenate all column groups into a single DataFrame
        all_dfs = []
        for group in group_names:
            df = df_dict[group]
            if df is not None and len(df.columns) > 0:
                df_copy = df.copy()
                # Add group prefix to columns so groups stay distinguishable
                df_copy.columns = [f"{group}::{col}" for col in df_copy.columns]
                all_dfs.append(df_copy)
                print(f" Group '{group}': {df_copy.shape}")
        # Concatenate all groups along axis 1
        raw_df = pd.concat(all_dfs, axis=1)
        print(f" Concatenated raw data shape: {raw_df.shape}")
    else:
        raw_df = handler_data
        print(f" Raw data shape: {raw_df.shape}")
    # Load processor list (pickled list of fitted qlib processors)
    proc_path = os.path.join(qlib_dataset_path, "proc_list.proc")
    print(f" Loading processor list from: {proc_path}")
    with open(proc_path, "rb") as f:
        proc_list = pkl.load(f)
    print(f" Processor list has {len(proc_list)} processors")
    for i, proc in enumerate(proc_list):
        print(f" {i+1}. {type(proc).__name__}")
    # Apply processors
    from qlib.contrib.data.utils import apply_proc_list
    print(f" Applying processor list (with_fit=False)...")
    # The processor list expects columns without the group prefix.
    # We need to strip the prefix before applying processors:
    # create a mapping so the original names can be restored afterwards.
    col_mapping = {}
    for col in raw_df.columns:
        if '::' in col:
            original = col.split('::', 1)[1]
            col_mapping[col] = original
    # Rename columns back to original names for processor application
    raw_df_renamed = raw_df.rename(columns=col_mapping)
    print(f" Renamed columns for processor compatibility. Shape: {raw_df_renamed.shape}")
    # Convert boolean columns to object to avoid NaN -> int conversion issues
    bool_cols = raw_df_renamed.select_dtypes(include=['bool']).columns
    print(f" Converting {len(bool_cols)} boolean columns to object dtype")
    for col in bool_cols:
        raw_df_renamed[col] = raw_df_renamed[col].astype(object)
    # Apply processors (with_fit=False: processors are already fitted)
    df = apply_proc_list(raw_df_renamed, proc_list=proc_list, with_fit=False)
    print(f" Applied processor list. Result shape: {df.shape}")
    # Add back group prefixes to columns.
    # NOTE(review): assumes un-prefixed column names are unique across groups;
    # a duplicate would make this reverse mapping lossy — verify upstream.
    new_col_mapping = {v: k for k, v in col_mapping.items()}
    df = df.rename(columns=new_col_mapping)
    print(f" Restored column group prefixes. Shape: {df.shape}")
    # Drop the extra lookback history: keep rows from since_date onward
    df = df.loc(axis=0)[slice(since_date_dt, None)]
    print(f" Filtered to since_date={since_date}. Final shape: {df.shape}")
    return df
def export_column_groups(df, output_dir, prefix="gold_standard"):
    """
    Export separate parquet files for different column groups.

    Column groups (identified purely by column-name conventions):
      - feature:      columns prefixed "feature::" (alpha158 + alpha158_ntrl)
      - feature_ext:  columns prefixed "feature_ext::" (log_size_diff, etc.)
      - feature_flag: columns prefixed "feature_flag::" (IsST, IsN, IsZt, IsDt, etc.)
      - indus_idx:    columns prefixed "indus_idx::" (industry index columns)
      - feature_ntrl: columns suffixed "_ntrl" (may overlap with "feature::")

    Args:
        df: DataFrame whose columns follow the prefix/suffix conventions above.
        output_dir: Directory the parquet files are written into.
        prefix: Filename prefix for every exported file.

    Returns:
        dict mapping group name -> path of the written parquet file.
        Groups with no matching columns are skipped and omitted from the dict.
    """
    # (group name, predicate selecting that group's columns); evaluated in the
    # same order as the former per-group branches so output order is unchanged.
    group_selectors = [
        ("feature", lambda c: c.startswith("feature::")),
        ("feature_ext", lambda c: c.startswith("feature_ext::")),
        ("feature_flag", lambda c: c.startswith("feature_flag::")),
        ("indus_idx", lambda c: c.startswith("indus_idx::")),
        ("feature_ntrl", lambda c: c.endswith("_ntrl")),
    ]
    export_paths = {}
    for name, selector in group_selectors:
        cols = [c for c in df.columns if selector(c)]
        if not cols:
            # Nothing to export for this group
            continue
        path = os.path.join(output_dir, f"{prefix}_{name}.parquet")
        df[cols].to_parquet(path)
        export_paths[name] = path
        print(f" Exported {name} columns ({len(cols)}): {path}")
    return export_paths
def export_metadata(df, output_dir, prefix="gold_standard", proc_list_path=None):
    """
    Export metadata about the dataset to ``<prefix>_metadata.txt``.

    Includes:
    - Column names and shapes
    - Processor list configuration (if ``proc_list_path`` is given and exists)
    - Date range coverage
    - NaN value statistics

    Args:
        df: DataFrame with a (datetime, instrument) MultiIndex.
        output_dir: Directory the metadata file is written into.
        prefix: Filename prefix for the metadata file.
        proc_list_path: Optional path to the pickled processor list.

    Returns:
        str: Path of the written metadata file.
    """
    metadata_path = os.path.join(output_dir, f"{prefix}_metadata.txt")
    with open(metadata_path, "w") as f:
        f.write("=" * 80 + "\n")
        f.write("GOLD-STANDARD QLIB PIPELINE OUTPUT - METADATA\n")
        f.write("=" * 80 + "\n\n")
        f.write(f"Export Date: {datetime.now().isoformat()}\n\n")
        f.write("DATAFRAME SHAPE\n")
        f.write("-" * 40 + "\n")
        f.write(f"Shape: {df.shape}\n")
        f.write(f"Rows: {len(df)}\n")
        f.write(f"Columns: {len(df.columns)}\n\n")
        f.write("DATE RANGE\n")
        f.write("-" * 40 + "\n")
        # Requires a MultiIndex level literally named "datetime"
        dates = df.index.get_level_values("datetime").unique()
        f.write(f"Min Date: {dates.min()}\n")
        f.write(f"Max Date: {dates.max()}\n")
        f.write(f"Unique Dates: {len(dates)}\n\n")
        f.write("INSTRUMENTS\n")
        f.write("-" * 40 + "\n")
        instruments = df.index.get_level_values("instrument").unique()
        f.write(f"Unique Instruments: {len(instruments)}\n")
        f.write(f"Sample Instruments: {list(instruments[:10])}\n\n")
        f.write("COLUMN GROUPS\n")
        f.write("-" * 40 + "\n")
        # Categorize columns by the same naming conventions used at export time
        feature_cols = [c for c in df.columns if c.startswith("feature::")]
        feature_ext_cols = [c for c in df.columns if c.startswith("feature_ext::")]
        feature_flag_cols = [c for c in df.columns if c.startswith("feature_flag::")]
        indus_idx_cols = [c for c in df.columns if c.startswith("indus_idx::")]
        feature_ntrl_cols = [c for c in df.columns if c.endswith("_ntrl")]
        f.write(f"feature:: columns: {len(feature_cols)}\n")
        f.write(f"feature_ext:: columns: {len(feature_ext_cols)}\n")
        f.write(f"feature_flag:: columns: {len(feature_flag_cols)}\n")
        f.write(f"indus_idx:: columns: {len(indus_idx_cols)}\n")
        f.write(f"*_ntrl columns: {len(feature_ntrl_cols)}\n\n")
        f.write("COLUMN DTYPES\n")
        f.write("-" * 40 + "\n")
        dtype_counts = df.dtypes.value_counts()
        for dtype, count in dtype_counts.items():
            f.write(f"{dtype}: {count}\n")
        f.write("\n")
        f.write("NAN STATISTICS\n")
        f.write("-" * 40 + "\n")
        nan_counts = df.isna().sum()
        cols_with_nan = nan_counts[nan_counts > 0]
        f.write(f"Columns with NaN: {len(cols_with_nan)}\n")
        f.write(f"Total NaN values: {df.isna().sum().sum()}\n\n")
        if len(cols_with_nan) > 0:
            f.write("NaN per column (top 20):\n")
            for col, cnt in cols_with_nan.nlargest(20).items():
                f.write(f" {col}: {cnt} ({100*cnt/len(df):.2f}%)\n")
            f.write("\n")
        f.write("ALL COLUMN NAMES\n")
        f.write("-" * 40 + "\n")
        for i, col in enumerate(df.columns):
            f.write(f" {i+1}. {col}\n")
        f.write("\n")
        if proc_list_path and os.path.exists(proc_list_path):
            f.write("PROCESSOR LIST\n")
            f.write("-" * 40 + "\n")
            f.write(f"Source: {proc_list_path}\n")
            # Best-effort: the pickle may reference classes not importable here
            try:
                with open(proc_list_path, "rb") as pf:
                    proc_list = pkl.load(pf)
                f.write(f"Number of processors: {len(proc_list)}\n\n")
                for i, proc in enumerate(proc_list):
                    f.write(f" {i+1}. {proc}\n")
            except Exception as e:
                f.write(f"Could not load processor list: {e}\n")
            f.write("\n")
    print(f"Exported metadata: {metadata_path}")
    return metadata_path
def main():
    """Run the full dump: load from Qlib, filter, export data + metadata.

    Exits with status 1 if the Qlib pipeline load fails.
    """
    args = parse_args()
    # Parse dates
    start_date = pd.to_datetime(args.start_date)
    end_date = pd.to_datetime(args.end_date)
    # Create output directory if it doesn't exist
    output_dir = Path(args.output_dir).resolve()
    output_dir.mkdir(parents=True, exist_ok=True)
    print("=" * 80)
    print("DUMP GOLD-STANDARD DATA FROM QLIB PIPELINE")
    print("=" * 80)
    print(f"Date Range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
    print(f"Output Directory: {output_dir}")
    print(f"Qlib Dataset Path: {args.qlib_dataset_path}")
    print()
    # Load data from Qlib pipeline (loads from start_date onward; the end
    # bound is applied afterwards in Step 2)
    print("Step 1: Loading data from Qlib pipeline...")
    print(f" Loading since_date={start_date.strftime('%Y-%m-%d')}")
    try:
        df = load_qlib_data(args.qlib_dataset_path, start_date.strftime("%Y-%m-%d"))
        print(f" Loaded DataFrame with shape: {df.shape}")
    except Exception as e:
        print(f" ERROR: Failed to load data from Qlib pipeline: {e}")
        sys.exit(1)
    # Filter to requested date range
    print("\nStep 2: Filtering to requested date range...")
    df = df.loc(axis=0)[slice(start_date, end_date)]
    print(f" Filtered shape: {df.shape}")
    # Export full DataFrame (parquet for interop, pickle to preserve dtypes)
    print("\nStep 3: Exporting full DataFrame...")
    prefix = f"gold_standard_{start_date.strftime('%Y%m%d')}_{end_date.strftime('%Y%m%d')}"
    parquet_path = output_dir / f"{prefix}.parquet"
    df.to_parquet(parquet_path)
    print(f" Exported parquet: {parquet_path}")
    pkl_path = output_dir / f"{prefix}.pkl"
    df.to_pickle(pkl_path)
    print(f" Exported pickle: {pkl_path}")
    # Export column groups
    print("\nStep 4: Exporting column groups...")
    export_paths = export_column_groups(df, str(output_dir), prefix=prefix)
    # Export metadata
    print("\nStep 5: Exporting metadata...")
    proc_list_path = os.path.join(args.qlib_dataset_path, "proc_list.proc")
    export_metadata(df, str(output_dir), prefix=prefix, proc_list_path=proc_list_path)
    # Summary
    print("\n" + "=" * 80)
    print("EXPORT SUMMARY")
    print("=" * 80)
    print(f"Date range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
    print(f"Output directory: {output_dir}")
    print(f"Total rows: {len(df)}")
    print(f"Total columns: {len(df.columns)}")
    print(f"\nFiles exported:")
    print(f" - {prefix}.parquet (full DataFrame)")
    print(f" - {prefix}.pkl (pickle, preserves dtypes)")
    print(f" - {prefix}_metadata.txt (column info, statistics)")
    for group, path in export_paths.items():
        print(f" - {os.path.basename(path)} ({group} columns)")
    print("\nDone!")
if __name__ == "__main__":
    main()

@ -1,270 +0,0 @@
#!/usr/bin/env python
"""
Dump Gold-Standard Data from Qlib Pipeline (Simple Version)
This script exports the RAW feature data from the Qlib pipeline BEFORE
any processors are applied. This is useful for debugging and comparison.
NOTE: This script loads ALL data from DolphinDB and then filters to the
requested date range. For large date ranges, this may require significant memory.
Usage:
python dump_qlib_gold_standard_simple.py --start-date 2020-01-02 --end-date 2020-01-10
"""
import argparse
import os
import sys
import pickle as pkl
from datetime import datetime, timedelta
from pathlib import Path
import pandas as pd
import numpy as np
# NumPy 2.0 removed the `np.NaN` alias; restore it here so legacy qlib
# code that still references `np.NaN` keeps working (same object as np.nan).
if not hasattr(np, 'NaN'):
    np.NaN = np.nan
def parse_args(argv=None):
    """Parse command-line options for the raw-data dump.

    Args:
        argv: Optional list of argument strings. ``None`` (the default)
            lets argparse read ``sys.argv[1:]``, keeping existing
            zero-argument callers of ``parse_args()`` unchanged; tests
            can pass an explicit list instead.

    Returns:
        argparse.Namespace with ``start_date``, ``end_date``, ``output_dir``,
        ``qlib_dataset_path`` and ``instruments`` attributes.
    """
    parser = argparse.ArgumentParser(
        description="Dump gold-standard raw data from Qlib pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Export a few days for debugging (recommended)
  python dump_qlib_gold_standard_simple.py --start-date 2020-01-02 --end-date 2020-01-10
  # Export with custom output directory
  python dump_qlib_gold_standard_simple.py --start-date 2020-01-02 --end-date 2020-01-10 --output-dir /path/to/output
"""
    )
    parser.add_argument(
        "--start-date",
        type=str,
        default="2020-01-02",
        help="Start date for data export (YYYY-MM-DD)",
    )
    parser.add_argument(
        "--end-date",
        type=str,
        default="2020-01-10",
        help="End date for data export (YYYY-MM-DD)",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="../data/",
        help="Output directory for exported files",
    )
    parser.add_argument(
        "--qlib-dataset-path",
        type=str,
        default="/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/",
        help="Path to Qlib dataset module",
    )
    parser.add_argument(
        "--instruments",
        type=str,
        default=None,
        help="Comma-separated list of instrument codes to export (default: all)",
    )
    return parser.parse_args(argv)
def load_raw_data(qlib_dataset_path, since_date, instruments=None):
    """
    Load RAW data from the Qlib pipeline (before the processor list is applied).

    Args:
        qlib_dataset_path: Path to the Qlib dataset module (directory that
            contains ``__init__.py`` and ``handler.yaml``).
        since_date: Start date for loading (YYYY-MM-DD); 20 extra calendar
            days of history are loaded before it for the Diff processor.
        instruments: Optional list of instrument codes to filter.

    Returns:
        tuple: (dict of group name -> DataFrame, handler index).
    """
    import importlib.util
    # Patch ruamel.yaml: newer releases dropped the module-level safe_load()
    # that the dataset module calls.
    import ruamel.yaml as yaml
    # Create a YAML instance with safe loader for backward compatibility
    _yaml = yaml.YAML(typ='safe', pure=True)
    def patched_safe_load(stream):
        import io
        if isinstance(stream, str):
            stream = io.StringIO(stream)
        return _yaml.load(stream)
    yaml.safe_load = patched_safe_load
    # Load the module directly (it is not importable from sys.path)
    spec = importlib.util.spec_from_file_location(
        "qlib_dataset",
        os.path.join(qlib_dataset_path, "__init__.py")
    )
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    # Parse since_date
    since_date_dt = pd.to_datetime(since_date)
    # Load with extra history for Diff processor
    load_start = (since_date_dt - timedelta(days=20)).strftime("%Y-%m-%d")
    print(f" Loading raw data from handler (load_start={load_start})...")
    if instruments:
        print(f" Filtering instruments: {instruments[:5]}... ({len(instruments)} total)")
    # Use _load_from_yaml to get raw handler data (SepDataFrame)
    handler_data = module._load_from_yaml(
        os.path.join(qlib_dataset_path, "handler.yaml"),
        load_start
    )
    # Handle SepDataFrame - extract column groups.
    # NOTE(review): attribute name differs across qlib versions, hence the
    # _data / _df_dict double lookup.
    if hasattr(handler_data, '_data') or hasattr(handler_data, '_df_dict'):
        df_dict = getattr(handler_data, '_data', None) or getattr(handler_data, '_df_dict', {})
        group_names = list(df_dict.keys())
        print(f" Handler returned SepDataFrame with groups: {group_names}")
        # Filter instruments if specified (mutates df_dict in place)
        if instruments:
            print(f" Filtering to specified instruments...")
            for group in group_names:
                if df_dict[group] is not None:
                    df = df_dict[group]
                    # Filter by instrument level; non-MultiIndex groups are
                    # left untouched
                    if isinstance(df.index, pd.MultiIndex):
                        mask = df.index.get_level_values('instrument').isin(instruments)
                        df_dict[group] = df[mask]
                        print(f" Group '{group}': {df_dict[group].shape} (filtered)")
        for group in group_names:
            df = df_dict[group]
            if df is not None:
                print(f" Group '{group}': shape={df.shape}, columns={len(df.columns)}")
        return df_dict, handler_data.index
    else:
        # Plain DataFrame fallback: expose it under a single "default" group
        print(f" Handler returned DataFrame: shape={handler_data.shape}")
        return {"default": handler_data}, handler_data.index
def export_data(df_dict, index, output_dir, start_date, end_date):
    """Export each column group to parquet + pickle, plus a metadata file.

    Args:
        df_dict: dict of group name -> DataFrame (values may be None).
        index: Date index used to restrict rows to the requested range.
        output_dir: Directory to write into (created if missing).
        start_date: Inclusive range start (YYYY-MM-DD or datetime-like).
        end_date: Inclusive range end (YYYY-MM-DD or datetime-like).

    Returns:
        list[str]: Paths of all files written (data files + metadata file).
    """
    output_dir = Path(output_dir).resolve()
    output_dir.mkdir(parents=True, exist_ok=True)
    start_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)
    # Filter index to the requested inclusive date range
    mask = (index >= start_date) & (index <= end_date)
    filtered_index = index[mask]
    print(f"\nExporting data for date range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
    print(f" Filtered index has {len(filtered_index)} dates")
    prefix = f"gold_standard_raw_{start_date.strftime('%Y%m%d')}_{end_date.strftime('%Y%m%d')}"
    exported_files = []
    # Export each group separately
    for group, df in df_dict.items():
        if df is None or len(df.columns) == 0:
            print(f" Skipping empty group '{group}'")
            continue
        # Filter rows by date membership in the filtered index
        df_filtered = df.loc[df.index.isin(filtered_index)]
        print(f" Group '{group}': {df_filtered.shape}")
        # Export to parquet (interoperable format)
        parquet_path = output_dir / f"{prefix}_{group}.parquet"
        df_filtered.to_parquet(parquet_path)
        exported_files.append(str(parquet_path))
        print(f" -> {parquet_path}")
        # Export to pickle (preserves dtypes)
        pkl_path = output_dir / f"{prefix}_{group}.pkl"
        df_filtered.to_pickle(pkl_path)
        exported_files.append(str(pkl_path))
    # Also create a metadata file describing the export
    metadata_path = output_dir / f"{prefix}_metadata.txt"
    with open(metadata_path, "w") as f:
        f.write("=" * 80 + "\n")
        f.write("GOLD-STANDARD RAW DATA - METADATA\n")
        f.write("=" * 80 + "\n\n")
        f.write(f"Export Date: {datetime.now().isoformat()}\n")
        f.write(f"Date Range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}\n")
        f.write(f"Total Dates: {len(filtered_index)}\n\n")
        f.write("COLUMN GROUPS:\n")
        f.write("-" * 40 + "\n")
        # Note: shapes reported here are the UNFILTERED group shapes
        for group, df in df_dict.items():
            if df is not None:
                f.write(f" {group}:\n")
                f.write(f" Shape: {df.shape}\n")
                f.write(f" Columns: {len(df.columns)}\n")
                f.write(f" Sample columns: {list(df.columns[:5])}...\n\n")
        f.write("\nPROCESSOR LIST (for reference):\n")
        f.write("-" * 40 + "\n")
        # HACK: hardcoded absolute path to the processor list on the training
        # host — ignored (with a note in the file) when it does not exist.
        proc_path = "/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/proc_list.proc"
        if os.path.exists(proc_path):
            with open(proc_path, "rb") as pf:
                proc_list = pkl.load(pf)
            f.write(f"Number of processors: {len(proc_list)}\n\n")
            for i, proc in enumerate(proc_list):
                f.write(f" {i+1}. {type(proc).__module__}.{type(proc).__name__}\n")
        else:
            f.write(f"Processor list not found: {proc_path}\n")
    exported_files.append(str(metadata_path))
    return exported_files
def main():
    """Run the raw-data dump: parse args, load from Qlib, export files.

    Exits with status 1 (after printing a traceback) if loading fails.
    """
    args = parse_args()
    print("=" * 80)
    print("DUMP GOLD-STANDARD RAW DATA FROM QLIB PIPELINE")
    print("=" * 80)
    print(f"Date Range: {args.start_date} to {args.end_date}")
    print(f"Output Directory: {args.output_dir}")
    print(f"Qlib Dataset Path: {args.qlib_dataset_path}")
    print()
    # Load raw data (all groups; optionally filtered to given instruments)
    print("Step 1: Loading raw data from Qlib pipeline...")
    try:
        instruments = None
        if args.instruments:
            # CLI takes a comma-separated string; the loader wants a list
            instruments = args.instruments.split(',')
        df_dict, index = load_raw_data(args.qlib_dataset_path, args.start_date, instruments=instruments)
    except Exception as e:
        print(f" ERROR: Failed to load data: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
    # Export data
    print("\nStep 2: Exporting data...")
    exported_files = export_data(df_dict, index, args.output_dir, args.start_date, args.end_date)
    # Summary
    print("\n" + "=" * 80)
    print("EXPORT SUMMARY")
    print("=" * 80)
    print(f"Date range: {args.start_date} to {args.end_date}")
    print(f"Output directory: {Path(args.output_dir).resolve()}")
    print(f"\nFiles exported ({len(exported_files)}):")
    for f in exported_files:
        print(f" - {f}")
    print("\nDone!")
if __name__ == "__main__":
    main()

@ -1,186 +0,0 @@
#!/usr/bin/env python
"""
Regenerate beta embeddings for a few days of sample data.
This script generates embeddings for a small date range to test the pipeline.
"""
import os
import sys
import pickle as pkl
import numpy as np
import polars as pl
import torch
import torch.nn as nn
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Optional
# Add parent directory to path
sys.path.insert(0, str(Path(__file__).parent))
# Import from the main generate script
from generate_beta_embedding import (
load_all_data,
merge_data_sources,
apply_feature_pipeline,
prepare_vae_features,
load_vae_model,
encode_with_vae,
load_qlib_processor_params,
VAE_INPUT_DIM,
OUTPUT_DIR,
)
# Sample dates for testing (5 consecutive trading days; Jan 5-6 2019 fell
# on a weekend, hence the gap between the 4th and the 7th)
SAMPLE_DATES = [
    "2019-01-02",
    "2019-01-03",
    "2019-01-04",
    "2019-01-07",
    "2019-01-08",
]
# Pretrained VAE checkpoint used for encoding. NOTE(review): absolute path
# on the training host — confirm it exists before running elsewhere.
VAE_MODEL_PATH = "/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/model/csiallx_feature2_ntrla_flag_pnlnorm_vae4_dim32a_beta0001/module.pt"
def generate_sample_embeddings(
    dates: List[str] = SAMPLE_DATES,
    output_file: str = "embedding_0_7_beta_sample.parquet",
    use_vae: bool = True
) -> pl.DataFrame:
    """
    Generate embeddings for a sample of dates and write them to parquet.

    NOTE(review): the default for ``dates`` is the shared module-level list
    SAMPLE_DATES (a mutable default); the function only reads it, but do not
    mutate the argument.

    Args:
        dates: List of dates in YYYY-MM-DD format (first/last define the
            load range; must be non-empty).
        output_file: Output parquet file path.
        use_vae: Whether to use VAE for encoding; on failure (or when False)
            falls back to seeded random 32-dim embeddings.

    Returns:
        pl.DataFrame with datetime, instrument and embedding_0..N columns
        (also written to ``output_file``).
    """
    start_date = dates[0]
    end_date = dates[-1]
    print("=" * 60)
    print("Generating Sample Beta Embeddings")
    print(f"Dates: {dates}")
    print(f"Use VAE: {use_vae}")
    print("=" * 60)
    # Load all data sources for the full [start, end] range
    df_alpha, df_kline, df_flag, df_industry = load_all_data(start_date, end_date)
    print(f"\nLoaded data:")
    print(f" Alpha158: {df_alpha.shape}")
    print(f" Kline: {df_kline.shape}")
    print(f" Flags: {df_flag.shape}")
    print(f" Industry: {df_industry.shape}")
    # Filter to only the sample dates (datetime column is an int like 20190102)
    date_ints = [int(d.replace("-", "")) for d in dates]
    df_alpha = df_alpha.filter(pl.col("datetime").is_in(date_ints))
    df_kline = df_kline.filter(pl.col("datetime").is_in(date_ints))
    df_flag = df_flag.filter(pl.col("datetime").is_in(date_ints))
    df_industry = df_industry.filter(pl.col("datetime").is_in(date_ints))
    print(f"\nAfter filtering to sample dates:")
    print(f" Alpha158: {df_alpha.shape}")
    print(f" Kline: {df_kline.shape}")
    print(f" Flags: {df_flag.shape}")
    print(f" Industry: {df_industry.shape}")
    # Merge data sources
    df = merge_data_sources(df_alpha, df_kline, df_flag, df_industry)
    print(f"\nMerged data shape: {df.shape}")
    # Save datetime and instrument before processing — assumes the feature
    # pipeline preserves row order/count so these stay aligned with the
    # embedding rows; TODO confirm against apply_feature_pipeline
    datetime_col = df["datetime"].clone()
    instrument_col = df["instrument"].clone()
    # Apply feature transformation pipeline
    df_processed, feature_cols, norm_feature_cols, market_flag_for_vae = apply_feature_pipeline(df)
    # Prepare features for VAE
    features = prepare_vae_features(
        df_processed, feature_cols,
        norm_feature_cols=norm_feature_cols,
        market_flag_for_vae=market_flag_for_vae
    )
    print(f"\nFeature matrix shape: {features.shape}")
    # Encode with VAE; any failure falls back to seeded random embeddings so
    # the rest of the pipeline can still be exercised
    if use_vae:
        try:
            model = load_vae_model(VAE_MODEL_PATH)
            embeddings = encode_with_vae(features, model)
            print(f"\nVAE encoding successful!")
        except Exception as e:
            print(f"\nVAE encoding failed: {e}")
            import traceback
            traceback.print_exc()
            print("\nFalling back to random embeddings...")
            np.random.seed(42)
            embeddings = np.random.randn(features.shape[0], 32).astype(np.float32)
    else:
        print("\nUsing random embeddings (VAE disabled)...")
        np.random.seed(42)
        embeddings = np.random.randn(features.shape[0], 32).astype(np.float32)
    # Create output DataFrame: keys + one column per embedding dimension
    embedding_cols = [f"embedding_{i}" for i in range(embeddings.shape[1])]
    result_data = {
        "datetime": datetime_col.to_list(),
        "instrument": instrument_col.to_list(),
        **{col_name: embeddings[:, i].tolist() for i, col_name in enumerate(embedding_cols)}
    }
    df_result = pl.DataFrame(result_data)
    # Ensure output directory exists
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Save to parquet
    df_result.write_parquet(output_path)
    print(f"\nEmbeddings saved to: {output_path}")
    print(f"Output shape: {df_result.shape}")
    print(f"\nSample output:")
    print(df_result.head(10))
    # Print summary statistics
    print("\n" + "=" * 60)
    print("Summary Statistics")
    print("=" * 60)
    print(f"Total samples: {len(df_result)}")
    print(f"Embedding dimension: {embeddings.shape[1]}")
    print(f"Date range: {df_result['datetime'].min()} to {df_result['datetime'].max()}")
    print(f"Instruments: {df_result['instrument'].n_unique()}")
    print(f"Embedding mean: {np.mean(embeddings):.6f}")
    print(f"Embedding std: {np.std(embeddings):.6f}")
    print(f"Embedding min: {np.min(embeddings):.6f}")
    print(f"Embedding max: {np.max(embeddings):.6f}")
    return df_result
if __name__ == "__main__":
    import argparse

    # CLI entry point: run the sample-embedding generation with overridable
    # dates/output path, plus an escape hatch to skip the VAE entirely.
    cli = argparse.ArgumentParser(description="Generate sample beta embeddings")
    cli.add_argument("--dates", nargs="+", default=SAMPLE_DATES,
                     help="List of dates (YYYY-MM-DD)")
    cli.add_argument("--output", type=str, default="embedding_0_7_beta_sample.parquet",
                     help="Output parquet file")
    cli.add_argument("--no-vae", action="store_true",
                     help="Skip VAE encoding (use random embeddings)")
    opts = cli.parse_args()

    generate_sample_embeddings(dates=opts.dates,
                               output_file=opts.output,
                               use_vae=not opts.no_vae)
    print("\nDone!")

@ -1,394 +0,0 @@
[2715583:MainThread](2026-02-26 19:58:16,674) INFO - qlib.Initialization - [config.py:413] - default_conf: client.
[2715583:MainThread](2026-02-26 19:58:16,680) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[2715583:MainThread](2026-02-26 19:58:16,681) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/qlib/default/data_ops/target')}
================================================================================
DUMP GOLD-STANDARD DATA FROM QLIB PIPELINE
================================================================================
Date Range: 2020-01-02 to 2020-01-10
Output Directory: /home/guofu/Workspaces/alpha_lab/stock_1d/d033/alpha158_beta/data
Qlib Dataset Path: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/
Step 1: Loading data from Qlib pipeline...
Loading since_date=2020-01-02
Will use `placehorder_value` from module: qlib.contrib.data.config
Will init handler object from config:
{'data_handler_config': {'ddb_config': {'host': '192.168.1.146',
'password': '123456',
'port': 8848,
'username': 'admin'},
'end_time': datetime.date(2026, 2, 26),
'handler_list': [{'class': 'DDBAlpha158Handler',
'kwargs': {'col_set': 'feature',
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': 'alpha158',
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
{'class': 'DDBMarketExtHandler',
'kwargs': {'col_set': 'feature_ext',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['Turnover '
'as '
'turnover',
'FreeTurnover '
'as '
'free_turnover',
'log(MarketValue) '
'as '
'log_size'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['con_rating_strength'],
'table_name': 'stg_1day_gds_con_rating'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
{'class': 'DDBMarketFlagHandler',
'kwargs': {'col_set': 'feature_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['IsZt',
'IsDt',
'IsN',
'IsXD',
'IsXR',
'IsDR'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['open_limit',
'close_limit',
'low_limit',
'open_stop',
'close_stop',
'high_stop'],
'table_name': 'stg_1day_wind_market_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
{'class': 'DDBIndusFlagHandler',
'kwargs': {'col_set': 'indus_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': 'industry_code_cc.csv',
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
{'class': 'DDBStFlagHandler',
'kwargs': {'col_set': 'st_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['ST_Y',
'ST_S',
'ST_T',
'ST_L',
'ST_Z',
'ST_X'],
'table_name': 'stg_1day_wind_st_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
'instruments': 'csiallx',
'start_time': Timestamp('2019-12-03 00:00:00')},
'ddb_config': {'host': '192.168.1.146',
'password': '123456',
'port': 8848,
'username': 'admin'},
'handler': {'class': 'AggHandler',
'kwargs': {'ddb_config': {'host': '192.168.1.146',
'password': '123456',
'port': 8848,
'username': 'admin'},
'end_time': datetime.date(2026, 2, 26)[2715583:MainThread](2026-02-26 19:58:16,707) INFO - qlib.AggHandler - [agg_handler.py:79] - Will use AggHandler
[2715583:MainThread](2026-02-26 19:58:16,707) WARNING - qlib.DataLoaderDH - [agg_handler.py:191] - instruments[csiallx] is ignored
[2715583:MainThread](2026-02-26 19:58:17,067) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,HIGH0,LOW0,VWAP0,ROC5,ROC10,ROC20,ROC30,ROC60,MA5,MA10,MA20,MA30,MA60,STD5,STD10,STD20,STD30,STD60,BETA5,BETA10,BETA20,BETA30,BETA60,RSQR5,RSQR10,RSQR20,RSQR30,RSQR60,RESI5,RESI10,RESI20,RESI30,RESI60,MAX5,MAX10,MAX20,MAX30,MAX60,MIN5,MIN10,MIN20,MIN30,MIN60,QTLU5,QTLU10,QTLU20,QTLU30,QTLU60,QTLD5,QTLD10,QTLD20,QTLD30,QTLD60,RANK5,RANK10,RANK20,RANK30,RANK60,RSV5,RSV10,RSV20,RSV30,RSV60,IMAX5,IMAX
[2715583:MainThread](2026-02-26 20:05:39,665) INFO - qlib.timer - [log.py:117] - Time cost: 442.946s | DDB query: Done
[2715583:MainThread](2026-02-26 20:05:40,469) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
,
'handler_list': [{'class': 'DDBAlpha158Handler',
'kwargs': {'col_set': 'feature',
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': 'alpha158',
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
{'class': 'DDBMarketExtHandler',
'kwargs': {'col_set': 'feature_ext',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['Turnover '
'as '
'turnover',
'FreeTurnover '
'as '
'free_turnover',
'log(MarketValue) '
'as '
'log_size'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['con_rating_strength'],
'table_name': 'stg_1day_gds_con_rating'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
{'class': 'DDBMarketFlagHandler',
'kwargs': {'col_set': 'feature_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['IsZt',
'IsDt',
'IsN',
'IsXD',
'IsXR',
'IsDR'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['open_limit',
'close_limit',
'low_limit',
'open_stop',
'close_stop',
'high_stop'],
'table_name': 'stg_1day_wind_market_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
{'class': 'DDBIndusFlagHandler',
'kwargs': {'col_set': 'indus_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': 'industry_code_cc.csv',
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
{'class': 'DDBStFlagHandler',
'kwargs': {'col_set': 'st_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['ST_Y',
'ST_S',
'ST_T',
'ST_L',
'ST_Z',
'ST_X'],
'table_name': 'stg_1day_wind_st_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
'instruments': 'csiallx',
'start_time': Timestamp('2019-12-03 00:00:00')},
'module_path': 'qlib.contrib.data.agg_handler'},
'load_end': datetime.date(2026, 2, 26),
'load_start': Timestamp('2019-12-03 00:00:00'),
'market': 'csiallx',
'qlib_init': {'provider_uri': '/home/guofu/.qlib/data_ops/target',
'region': 'cn'}}
Query config:
#alpha158: 1;
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x70e88b0587d0>
[2715583:MainThread](2026-02-26 20:07:46,118) INFO - qlib.timer - [log.py:117] - Time cost: 115.964s | Instruments filter: Done
[2715583:MainThread](2026-02-26 20:07:53,273) INFO - qlib.timer - [log.py:117] - Time cost: 576.561s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_alpha158_handler.DDBAlpha158Handler object at 0x70e88b086d80>) Done
[2715583:MainThread](2026-02-26 20:07:53,274) INFO - qlib.timer - [log.py:117] - Time cost: 576.562s | Init data (<qlib.contrib.data.ddb_handlers.ddb_alpha158_handler.DDBAlpha158Handler object at 0x70e88b086d80>) Done
[2715583:MainThread](2026-02-26 20:07:53,276) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
[2715583:MainThread](2026-02-26 20:07:56,700) INFO - qlib.timer - [log.py:117] - Time cost: 3.423s | fetch_df_by_index Done
[2715583:MainThread](2026-02-26 20:07:58,185) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-03 00:00:00'), datetime.date(2026, 2, 26), None)
KMID KLEN ... VSUMD30 VSUMD60
datetime instrument ...
2019-12-03 SH600000 0.004234 0.011008 ... -0.031454 -0.009671
SH600004 0.015467 0.031529 ... -0.004401 0.007701
SH600006 0.022573 0.033860 ... 0.060561 -0.000159
SH600007 0.012129 0.025470 ... 0.008489 -0.054056
SH600008 0.006173 0.009259 ... -0.088065 -0.080770
... ... ... ... ... ...
2026-02-26 SZ301658 -0.017231 0.025231 ... -0.018706 0.003708
SZ301662 0.060584 0.087834 ... -0.014658 -0.014613
SZ301665 -0.012899 0.040541 ... 0.083229 0.055994
SZ301678 0.018182 0.027879 ... -0.054124 0.014202
SZ302132 0.001754 0.016416 ... -0.049558 -0.038667
[6886779 rows x 158 columns]
[2715583:MainThread](2026-02-26 20:07:58,186) INFO - qlib.timer - [log.py:117] - Time cost: 4.911s | Fetching dataframe Done
[2715583:MainThread](2026-02-26 20:07:58,203) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,Turnover as turnover,FreeTurnover as free_turnover,log(MarketValue) as log_size from
loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
where m_nDate>=2019.12.03 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ3
[2715583:MainThread](2026-02-26 20:08:15,182) INFO - qlib.timer - [log.py:117] - Time cost: 16.990s | DDB query: Done
[2715583:MainThread](2026-02-26 20:08:15,974) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2715583:MainThread](2026-02-26 20:08:16,548) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,con_rating_strength from
loadTable("dfs://daily_stock_run", "stg_1day_gds_con_rating")
where m_nDate>=2019.12.03 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002833','SH
[2715583:MainThread](2026-02-26 20:08:27,838) INFO - qlib.timer - [log.py:117] - Time cost: 11.299s | DDB query: Done
[2715583:MainThread](2026-02-26 20:08:28,690) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-25 00:00:00
[2715583:MainThread](2026-02-26 20:09:53,616) INFO - qlib.timer - [log.py:117] - Time cost: 81.815s | Instruments filter: Done
[2715583:MainThread](2026-02-26 20:09:54,168) INFO - qlib.timer - [log.py:117] - Time cost: 115.981s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_market_ext_handler.DDBMarketExtHandler object at 0x70ea4ba85f40>) Done
[2715583:MainThread](2026-02-26 20:09:54,169) INFO - qlib.timer - [log.py:117] - Time cost: 115.982s | Init data (<qlib.contrib.data.ddb_handlers.ddb_market_ext_handler.DDBMarketExtHandler object at 0x70ea4ba85f40>) Done
[2715583:MainThread](2026-02-26 20:09:54,170) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
[2715583:MainThread](2026-02-26 20:09:54,893) INFO - qlib.timer - [log.py:117] - Time cost: 0.723s | fetch_df_by_index Done
[2715583:MainThread](2026-02-26 20:09:54,901) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-03 00:00:00'), datetime.date(2026, 2, 26), None)
turnover free_turnover log_size con_rating_strength
datetime instrument
2019-12-03 SH600000 0.0696 0.1275 17.322001 0.6618
SH600004 0.6009 1.2276 15.077468 0.8269
SH600006 0.5976 1.5087 13.716795 1.0000
SH600007 0.0961 0.4969 14.334991 0.7500
SH600008 0.0967 0.1793 14.432563 0.6591
... ... ... ... ...
2026-02-26 SZ301658 6.0785 6.0785 11.788368 NaN
SZ301662 12.5950 12.5950 12.681215 NaN
SZ301665 14.0077 14.0077 11.719415 NaN
SZ301678 6.6518 6.6518 12.799973 NaN
SZ302132 1.3868 3.0296 15.359885 NaN
[7601552 rows x 4 columns]
[2715583:MainThread](2026-02-26 20:09:54,902) INFO - qlib.timer - [log.py:117] - Time cost: 0.732s | Fetching dataframe Done
[2715583:MainThread](2026-02-26 20:09:54,917) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,IsZt,IsDt,IsN,IsXD,IsXR,IsDR from
loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
where m_nDate>=2019.12.03 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657',
[2715583:MainThread](2026-02-26 20:10:15,465) INFO - qlib.timer - [log.py:117] - Time cost: 20.556s | DDB query: Done
[2715583:MainThread](2026-02-26 20:10:16,265) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2715583:MainThread](2026-02-26 20:10:16,775) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,open_limit,close_limit,low_limit,open_stop,close_stop,high_stop from
loadTable("dfs://daily_stock_run", "stg_1day_wind_market_flag")
where m_nDate>=2019.12.03 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','
[2715583:MainThread](2026-02-26 20:10:36,740) INFO - qlib.timer - [log.py:117] - Time cost: 19.975s | DDB query: Done
[2715583:MainThread](2026-02-26 20:10:37,558) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2715583:MainThread](2026-02-26 20:12:04,978) INFO - qlib.timer - [log.py:117] - Time cost: 84.148s | Instruments filter: Done
[2715583:MainThread](2026-02-26 20:12:05,899) INFO - qlib.timer - [log.py:117] - Time cost: 130.996s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_market_flag_handler.DDBMarketFlagHandler object at 0x70e88d06acc0>) Done
[2715583:MainThread](2026-02-26 20:12:05,900) INFO - qlib.timer - [log.py:117] - Time cost: 130.997s | Init data (<qlib.contrib.data.ddb_handlers.ddb_market_flag_handler.DDBMarketFlagHandler object at 0x70e88d06acc0>) Done
[2715583:MainThread](2026-02-26 20:12:05,902) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
[2715583:MainThread](2026-02-26 20:12:06,745) INFO - qlib.timer - [log.py:117] - Time cost: 0.842s | fetch_df_by_index Done
[2715583:MainThread](2026-02-26 20:12:06,758) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-03 00:00:00'), datetime.date(2026, 2, 26), None)
IsZt IsDt IsN ... open_stop close_stop high_stop
datetime instrument ...
2019-12-03 SH600000 False False False ... False False False
SH600004 False False False ... False False False
SH600006 False False False ... False False False
SH600007 False False False ... False False False
SH600008 False False False ... False False False
... ... ... ... ... ... ... ...
2026-02-26 SZ301658 False False False ... False False False
SZ301662 False False False ... False False False
SZ301665 False False False ... False False False
SZ301678 False False False ... False False False
SZ302132 False False False ... False False False
[6903684 rows x 12 columns]
[2715583:MainThread](2026-02-26 20:12:06,759) INFO - qlib.timer - [log.py:117] - Time cost: 0.857s | Fetching dataframe Done
[2715583:MainThread](2026-02-26 20:12:06,777) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,gds_CC10,gds_CC11,gds_CC12,gds_CC20,gds_CC21,gds_CC22,gds_CC23,gds_CC24,gds_CC25,gds_CC26,gds_CC27,gds_CC28,gds_CC30,gds_CC31,gds_CC32,gds_CC33,gds_CC34,gds_CC35,gds_CC36,gds_CC37,gds_CC40,gds_CC41,gds_CC42,gds_CC43,gds_CC50,gds_CC60,gds_CC61,gds_CC62,gds_CC63,gds_CC70 from
loadTable("dfs://daily_stock_run", "stg_1day_gds_indus_flag_cc1")
where m_nDate>=2019.12.03 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','S
[2715583:MainThread](2026-02-26 20:12:08,840) INFO - qlib.timer - [log.py:117] - Time cost: 2.073s | DDB query: Done
[2715583:MainThread](2026-02-26 20:12:08,849) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2715583:MainThread](2026-02-26 20:13:26,572) INFO - qlib.timer - [log.py:117] - Time cost: 77.719s | Instruments filter: Done
[2715583:MainThread](2026-02-26 20:13:26,601) INFO - qlib.timer - [log.py:117] - Time cost: 79.839s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler.DDBIndusFlagHandler object at 0x70e67060acc0>) Done
[2715583:MainThread](2026-02-26 20:13:26,602) INFO - qlib.timer - [log.py:117] - Time cost: 79.840s | Init data (<qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler.DDBIndusFlagHandler object at 0x70e67060acc0>) Done
[2715583:MainThread](2026-02-26 20:13:26,603) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
[2715583:MainThread](2026-02-26 20:13:26,612) INFO - qlib.timer - [log.py:117] - Time cost: 0.008s | fetch_df_by_index Done
[2715583:MainThread](2026-02-26 20:13:26,633) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-03 00:00:00'), datetime.date(2026, 2, 26), None)
gds_CC10 gds_CC11 ... gds_CC63 gds_CC70
datetime instrument ...
2026-02-09 SH600000 False False ... False False
SH600004 False False ... False False
SH600006 False False ... False False
SH600007 False False ... False False
SH600008 False False ... False False
... ... ... ... ... ...
2026-02-26 SZ301658 False False ... False False
SZ301662 False False ... False False
SZ301665 False False ... False False
SZ301678 False False ... False False
SZ302132 False False ... False False
[41168 rows x 30 columns]
[2715583:MainThread](2026-02-26 20:13:26,634) INFO - qlib.timer - [log.py:117] - Time cost: 0.031s | Fetching dataframe Done
[2715583:MainThread](2026-02-26 20:13:26,652) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,ST_Y,ST_S,ST_T,ST_L,ST_Z,ST_X from
loadTable("dfs://daily_stock_run", "stg_1day_wind_st_flag")
where m_nDate>=2019.12.03 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002
[2715583:MainThread](2026-02-26 20:13:55,744) INFO - qlib.timer - [log.py:117] - Time cost: 29.102s | DDB query: Done
[2715583:MainThread](2026-02-26 20:13:56,520) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2715583:MainThread](2026-02-26 20:15:27,625) INFO - qlib.timer - [log.py:117] - Time cost: 90.586s | Instruments filter: Done
[2715583:MainThread](2026-02-26 20:15:28,257) INFO - qlib.timer - [log.py:117] - Time cost: 121.621s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_st_flag_handler.DDBStFlagHandler object at 0x70e88c9710d0>) Done
[2715583:MainThread](2026-02-26 20:15:28,257) INFO - qlib.timer - [log.py:117] - Time cost: 121.622s | Init data (<qlib.contrib.data.ddb_handlers.ddb_st_flag_handler.DDBStFlagHandler object at 0x70e88c9710d0>) Done
[2715583:MainThread](2026-02-26 20:15:28,258) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
[2715583:MainThread](2026-02-26 20:15:28,867) INFO - qlib.timer - [log.py:117] - Time cost: 0.608s | fetch_df_by_index Done
[2715583:MainThread](2026-02-26 20:15:28,875) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-03 00:00:00'), datetime.date(2026, 2, 26), None)
ST_Y ST_S ST_T ST_L ST_Z ST_X
datetime instrument
2019-12-03 SH600000 False False False False False False
SH600004 False False False False False False
SH600006 False False False False False False
SH600007 False False False False False False
SH600008 False False False False False False
... ... ... ... ... ... ...
2026-02-26 SZ301658 False False False False False False
SZ301662 False False False False False False
SZ301665 False False False False False False
SZ301678 False False False False False False
SZ302132 False False False False False False
[6903687 rows x 6 columns]
[2715583:MainThread](2026-02-26 20:15:28,876) INFO - qlib.timer - [log.py:117] - Time cost: 0.617s | Fetching dataframe Done
/home/guofu/.venv/alpha2/lib/python3.12/site-packages/qlib/contrib/utils/paral.py:22: FutureWarning: 'M' is deprecated and will be removed in a future version, please use 'ME' instead.
group_list = [_df.resample("M", level="datetime")\
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x70e88bd2a000>
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x70e88bd2a000>
Query config:
#concepts: 2;
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x70e88c9cf6e0>
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x70e88c9cf6e0>
Field list: ['gds_CC10', 'gds_CC11', 'gds_CC12', 'gds_CC20', 'gds_CC21', 'gds_CC22', 'gds_CC23', 'gds_CC24', 'gds_CC25', 'gds_CC26', 'gds_CC27', 'gds_CC28', 'gds_CC30', 'gds_CC31', 'gds_CC32', 'gds_CC33', 'gds_CC34', 'gds_CC35', 'gds_CC36', 'gds_CC37', 'gds_CC40', 'gds_CC41', 'gds_CC42', 'gds_CC43', 'gds_CC50', 'gds_CC60', 'gds_CC61', 'gds_CC62', 'gds_CC63', 'gds_CC70']
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x70e6706082f0>
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x70e65fdafd40>
[2715583:MainThread](2026-02-26 20:15:32,735) INFO - qlib.timer - [log.py:117] - Time cost: 3.858s | Concat index: Done
[2715583:MainThread](2026-02-26 20:15:32,737) INFO - qlib.timer - [log.py:117] - Time cost: 0.001s | Sort index: Done
[2715583:MainThread](2026-02-26 20:15:36,349) INFO - qlib.timer - [log.py:117] - Time cost: 3.611s | Creating SepDataFrame: Done
[2715583:MainThread](2026-02-26 20:15:37,245) INFO - qlib.timer - [log.py:117] - Time cost: 1040.537s | Loading data (<qlib.contrib.data.agg_handler.AggHandler object at 0x70e88b12b3e0>) Done
[2715583:MainThread](2026-02-26 20:15:37,246) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
[]
[2715583:MainThread](2026-02-26 20:15:37,248) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
[2715583:MainThread](2026-02-26 20:15:37,265) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
[]
[2715583:MainThread](2026-02-26 20:15:37,266) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
[2715583:MainThread](2026-02-26 20:15:37,277) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
[]
[2715583:MainThread](2026-02-26 20:15:37,277) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
[2715583:MainThread](2026-02-26 20:15:37,293) INFO - qlib.timer - [log.py:117] - Time cost: 0.047s | fit & process data Done
[2715583:MainThread](2026-02-26 20:15:37,294) INFO - qlib.timer - [log.py:117] - Time cost: 1040.587s | Init data (<qlib.contrib.data.agg_handler.AggHandler object at 0x70e88b12b3e0>) Done
[2715583:MainThread](2026-02-26 20:15:37,963) INFO - qlib.DataHandlerLP - [handler.py:487] - Will apply processor <qlib.contrib.data.processor.common.Diff object at 0x70e88bf4af30>
[2715583:MainThread](2026-02-26 20:15:40,135) INFO - qlib.timer - [log.py:117] - Time cost: 2.171s | Diff Done
[2715583:MainThread](2026-02-26 20:15:40,136) INFO - qlib.DataHandlerLP - [handler.py:487] - Will apply processor <qlib.contrib.data.processor.flag.FlagMarketInjector object at 0x70e88cd8fd40>
All processors are readonly
All processors are readonly
All processors are readonly
Did load data from config: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/handler.yaml
Did load norm from: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/proc_list.proc
Will assign `feature_ext` with
turnover ... con_rating_strength_diff
datetime instrument ...
2026-02-09 SH600000 0.1837 ... 0.0
SH600004 0.6948 ... 0.0
SH600006 0.5542 ... 0.0
SH600007 0.2057 ... 0.0
SH600008 0.9809 ... 0.0
... ... ... ...
2026-02-26 SZ301658 6.0785 ... 0.0
SZ301662 12.5950 ... 0.0
SZ301665 14.0077 ... 0.0
SZ301678 6.6518 ... 0.0
SZ302132 1.3868 ... 0.0
[41085 rows x 8 columns]
---
ERROR: Failed to load data from Qlib pipeline: Cannot convert non-finite values (NA or inf) to integer

@ -1,373 +0,0 @@
[2730312:MainThread](2026-02-26 21:28:33,675) INFO - qlib.Initialization - [config.py:413] - default_conf: client.
[2730312:MainThread](2026-02-26 21:28:33,679) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[2730312:MainThread](2026-02-26 21:28:33,680) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/qlib/default/data_ops/target')}
================================================================================
DUMP GOLD-STANDARD DATA FROM QLIB PIPELINE
================================================================================
Date Range: 2020-01-02 to 2020-01-10
Output Directory: /home/guofu/Workspaces/alpha_lab/stock_1d/d033/alpha158_beta/data
Qlib Dataset Path: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/
Step 1: Loading data from Qlib pipeline...
Loading since_date=2020-01-02
Loading raw data from handler.yaml...
Will use `placehorder_value` from module: qlib.contrib.data.config
Will init handler object from config:
{'data_handler_config': {'ddb_config': {'host': '192.168.1.146',
'password': '123456',
'port': 8848,
'username': 'admin'},
'end_time': datetime.date(2026, 2, 26),
'handler_list': [{'class': 'DDBAlpha158Handler',
'kwargs': {'col_set': 'feature',
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': 'alpha158',
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
{'class': 'DDBMarketExtHandler',
'kwargs': {'col_set': 'feature_ext',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['Turnover '
'as '
'turnover',
'FreeTurnover '
'as '
'free_turnover',
'log(MarketValue) '
'as '
'log_size'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['con_rating_strength'],
'table_name': 'stg_1day_gds_con_rating'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
{'class': 'DDBMarketFlagHandler',
'kwargs': {'col_set': 'feature_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['IsZt',
'IsDt',
'IsN',
'IsXD',
'IsXR',
'IsDR'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['open_limit',
'close_limit',
'low_limit',
'open_stop',
'close_stop',
'high_stop'],
'table_name': 'stg_1day_wind_market_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
{'class': 'DDBIndusFlagHandler',
'kwargs': {'col_set': 'indus_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': 'industry_code_cc.csv',
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
{'class': 'DDBStFlagHandler',
'kwargs': {'col_set': 'st_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['ST_Y',
'ST_S',
'ST_T',
'ST_L',
'ST_Z',
'ST_X'],
'table_name': 'stg_1day_wind_st_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
'instruments': 'csiallx',
'start_time': Timestamp('2019-12-13 00:00:00')},
'ddb_config': {'host': '192.168.1.146',
'password': '123456',
'port': 8848,
'username': 'admin'},
'handler': {'class': 'AggHandler',
'kwargs': {'ddb_config': {'host': '192.168.1.146',
'password': '123456',
'port': 8848,
'username': 'admin'},
'end_time': [2730312:MainThread](2026-02-26 21:28:33,704) INFO - qlib.AggHandler - [agg_handler.py:79] - Will use AggHandler
[2730312:MainThread](2026-02-26 21:28:33,704) WARNING - qlib.DataLoaderDH - [agg_handler.py:191] - instruments[csiallx] is ignored
[2730312:MainThread](2026-02-26 21:28:34,011) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,HIGH0,LOW0,VWAP0,ROC5,ROC10,ROC20,ROC30,ROC60,MA5,MA10,MA20,MA30,MA60,STD5,STD10,STD20,STD30,STD60,BETA5,BETA10,BETA20,BETA30,BETA60,RSQR5,RSQR10,RSQR20,RSQR30,RSQR60,RESI5,RESI10,RESI20,RESI30,RESI60,MAX5,MAX10,MAX20,MAX30,MAX60,MIN5,MIN10,MIN20,MIN30,MIN60,QTLU5,QTLU10,QTLU20,QTLU30,QTLU60,QTLD5,QTLD10,QTLD20,QTLD30,QTLD60,RANK5,RANK10,RANK20,RANK30,RANK60,RSV5,RSV10,RSV20,RSV30,RSV60,IMAX5,IMAX
[2730312:MainThread](2026-02-26 21:36:00,317) INFO - qlib.timer - [log.py:117] - Time cost: 446.602s | DDB query: Done
[2730312:MainThread](2026-02-26 21:36:01,106) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
datetime.date(2026, 2, 26),
'handler_list': [{'class': 'DDBAlpha158Handler',
'kwargs': {'col_set': 'feature',
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': 'alpha158',
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
{'class': 'DDBMarketExtHandler',
'kwargs': {'col_set': 'feature_ext',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['Turnover '
'as '
'turnover',
'FreeTurnover '
'as '
'free_turnover',
'log(MarketValue) '
'as '
'log_size'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['con_rating_strength'],
'table_name': 'stg_1day_gds_con_rating'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
{'class': 'DDBMarketFlagHandler',
'kwargs': {'col_set': 'feature_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['IsZt',
'IsDt',
'IsN',
'IsXD',
'IsXR',
'IsDR'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['open_limit',
'close_limit',
'low_limit',
'open_stop',
'close_stop',
'high_stop'],
'table_name': 'stg_1day_wind_market_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
{'class': 'DDBIndusFlagHandler',
'kwargs': {'col_set': 'indus_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': 'industry_code_cc.csv',
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
{'class': 'DDBStFlagHandler',
'kwargs': {'col_set': 'st_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['ST_Y',
'ST_S',
'ST_T',
'ST_L',
'ST_Z',
'ST_X'],
'table_name': 'stg_1day_wind_st_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
'instruments': 'csiallx',
'start_time': Timestamp('2019-12-13 00:00:00')},
'module_path': 'qlib.contrib.data.agg_handler'},
'load_end': datetime.date(2026, 2, 26),
'load_start': Timestamp('2019-12-13 00:00:00'),
'market': 'csiallx',
'qlib_init': {'provider_uri': '/home/guofu/.qlib/data_ops/target',
'region': 'cn'}}
Query config:
#alpha158: 1;
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x761f37e75a60>
[2730312:MainThread](2026-02-26 21:38:13,636) INFO - qlib.timer - [log.py:117] - Time cost: 123.423s | Instruments filter: Done
[2730312:MainThread](2026-02-26 21:38:20,733) INFO - qlib.timer - [log.py:117] - Time cost: 587.024s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_alpha158_handler.DDBAlpha158Handler object at 0x761f366b3bc0>) Done
[2730312:MainThread](2026-02-26 21:38:20,734) INFO - qlib.timer - [log.py:117] - Time cost: 587.026s | Init data (<qlib.contrib.data.ddb_handlers.ddb_alpha158_handler.DDBAlpha158Handler object at 0x761f366b3bc0>) Done
[2730312:MainThread](2026-02-26 21:38:20,736) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
[2730312:MainThread](2026-02-26 21:38:24,302) INFO - qlib.timer - [log.py:117] - Time cost: 3.564s | fetch_df_by_index Done
[2730312:MainThread](2026-02-26 21:38:25,946) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-13 00:00:00'), datetime.date(2026, 2, 26), None)
KMID KLEN ... VSUMD30 VSUMD60
datetime instrument ...
2019-12-13 SH600000 0.011686 0.015025 ... -0.011573 0.039735
SH600004 0.000000 0.009169 ... -0.146051 0.024757
SH600006 -0.004329 0.015152 ... 0.136883 0.024626
SH600007 0.005590 0.019005 ... -0.012912 0.017215
SH600008 0.012270 0.012270 ... 0.039878 -0.013888
... ... ... ... ... ...
2026-02-26 SZ301658 -0.017231 0.025231 ... -0.018706 0.003708
SZ301662 0.060584 0.087834 ... -0.014658 -0.014613
SZ301665 -0.012899 0.040541 ... 0.083229 0.055994
SZ301678 0.018182 0.027879 ... -0.054124 0.014202
SZ302132 0.001754 0.016416 ... -0.049558 -0.038667
[6858048 rows x 158 columns]
[2730312:MainThread](2026-02-26 21:38:25,947) INFO - qlib.timer - [log.py:117] - Time cost: 5.212s | Fetching dataframe Done
[2730312:MainThread](2026-02-26 21:38:25,965) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,Turnover as turnover,FreeTurnover as free_turnover,log(MarketValue) as log_size from
loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
where m_nDate>=2019.12.13 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ3
[2730312:MainThread](2026-02-26 21:38:43,081) INFO - qlib.timer - [log.py:117] - Time cost: 17.127s | DDB query: Done
[2730312:MainThread](2026-02-26 21:38:43,874) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2730312:MainThread](2026-02-26 21:38:44,458) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,con_rating_strength from
loadTable("dfs://daily_stock_run", "stg_1day_gds_con_rating")
where m_nDate>=2019.12.13 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002833','SH
[2730312:MainThread](2026-02-26 21:38:55,720) INFO - qlib.timer - [log.py:117] - Time cost: 11.271s | DDB query: Done
[2730312:MainThread](2026-02-26 21:38:56,586) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-25 00:00:00
[2730312:MainThread](2026-02-26 21:40:21,007) INFO - qlib.timer - [log.py:117] - Time cost: 81.315s | Instruments filter: Done
[2730312:MainThread](2026-02-26 21:40:21,576) INFO - qlib.timer - [log.py:117] - Time cost: 115.627s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_market_ext_handler.DDBMarketExtHandler object at 0x7620fb822930>) Done
[2730312:MainThread](2026-02-26 21:40:21,576) INFO - qlib.timer - [log.py:117] - Time cost: 115.628s | Init data (<qlib.contrib.data.ddb_handlers.ddb_market_ext_handler.DDBMarketExtHandler object at 0x7620fb822930>) Done
[2730312:MainThread](2026-02-26 21:40:21,577) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
[2730312:MainThread](2026-02-26 21:40:22,309) INFO - qlib.timer - [log.py:117] - Time cost: 0.731s | fetch_df_by_index Done
[2730312:MainThread](2026-02-26 21:40:22,317) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-13 00:00:00'), datetime.date(2026, 2, 26), None)
turnover free_turnover log_size con_rating_strength
datetime instrument
2019-12-13 SH600000 0.2118 0.3879 17.343685 0.7143
SH600004 0.7518 1.5357 15.099485 0.8214
SH600006 0.7827 1.9762 13.732129 1.0000
SH600007 0.1368 0.7071 14.409998 0.7500
SH600008 0.2152 0.3990 14.444757 0.7500
... ... ... ... ...
2026-02-26 SZ301658 6.0785 6.0785 11.788368 NaN
SZ301662 12.5950 12.5950 12.681215 NaN
SZ301665 14.0077 14.0077 11.719415 NaN
SZ301678 6.6518 6.6518 12.799973 NaN
SZ302132 1.3868 3.0296 15.359885 NaN
[7572626 rows x 4 columns]
[2730312:MainThread](2026-02-26 21:40:22,318) INFO - qlib.timer - [log.py:117] - Time cost: 0.741s | Fetching dataframe Done
[2730312:MainThread](2026-02-26 21:40:22,334) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,IsZt,IsDt,IsN,IsXD,IsXR,IsDR from
loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
where m_nDate>=2019.12.13 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657',
[2730312:MainThread](2026-02-26 21:40:43,075) INFO - qlib.timer - [log.py:117] - Time cost: 20.751s | DDB query: Done
[2730312:MainThread](2026-02-26 21:40:43,889) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2730312:MainThread](2026-02-26 21:40:44,394) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,open_limit,close_limit,low_limit,open_stop,close_stop,high_stop from
loadTable("dfs://daily_stock_run", "stg_1day_wind_market_flag")
where m_nDate>=2019.12.13 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','
[2730312:MainThread](2026-02-26 21:41:04,632) INFO - qlib.timer - [log.py:117] - Time cost: 20.246s | DDB query: Done
[2730312:MainThread](2026-02-26 21:41:05,434) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2730312:MainThread](2026-02-26 21:42:33,029) INFO - qlib.timer - [log.py:117] - Time cost: 84.294s | Instruments filter: Done
[2730312:MainThread](2026-02-26 21:42:34,049) INFO - qlib.timer - [log.py:117] - Time cost: 131.730s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_market_flag_handler.DDBMarketFlagHandler object at 0x761f37e74470>) Done
[2730312:MainThread](2026-02-26 21:42:34,050) INFO - qlib.timer - [log.py:117] - Time cost: 131.731s | Init data (<qlib.contrib.data.ddb_handlers.ddb_market_flag_handler.DDBMarketFlagHandler object at 0x761f37e74470>) Done
[2730312:MainThread](2026-02-26 21:42:34,051) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
[2730312:MainThread](2026-02-26 21:42:34,895) INFO - qlib.timer - [log.py:117] - Time cost: 0.843s | fetch_df_by_index Done
[2730312:MainThread](2026-02-26 21:42:34,907) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-13 00:00:00'), datetime.date(2026, 2, 26), None)
IsZt IsDt IsN ... open_stop close_stop high_stop
datetime instrument ...
2019-12-13 SH600000 False False False ... False False False
SH600004 False False False ... False False False
SH600006 False False False ... False False False
SH600007 False False False ... False False False
SH600008 False False False ... False False False
... ... ... ... ... ... ... ...
2026-02-26 SZ301658 False False False ... False False False
SZ301662 False False False ... False False False
SZ301665 False False False ... False False False
SZ301678 False False False ... False False False
SZ302132 False False False ... False False False
[6874830 rows x 12 columns]
[2730312:MainThread](2026-02-26 21:42:34,908) INFO - qlib.timer - [log.py:117] - Time cost: 0.857s | Fetching dataframe Done
[2730312:MainThread](2026-02-26 21:42:34,927) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,gds_CC10,gds_CC11,gds_CC12,gds_CC20,gds_CC21,gds_CC22,gds_CC23,gds_CC24,gds_CC25,gds_CC26,gds_CC27,gds_CC28,gds_CC30,gds_CC31,gds_CC32,gds_CC33,gds_CC34,gds_CC35,gds_CC36,gds_CC37,gds_CC40,gds_CC41,gds_CC42,gds_CC43,gds_CC50,gds_CC60,gds_CC61,gds_CC62,gds_CC63,gds_CC70 from
loadTable("dfs://daily_stock_run", "stg_1day_gds_indus_flag_cc1")
where m_nDate>=2019.12.13 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','S
[2730312:MainThread](2026-02-26 21:42:36,986) INFO - qlib.timer - [log.py:117] - Time cost: 2.069s | DDB query: Done
[2730312:MainThread](2026-02-26 21:42:36,996) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2730312:MainThread](2026-02-26 21:43:53,198) INFO - qlib.timer - [log.py:117] - Time cost: 76.199s | Instruments filter: Done
[2730312:MainThread](2026-02-26 21:43:53,230) INFO - qlib.timer - [log.py:117] - Time cost: 78.318s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler.DDBIndusFlagHandler object at 0x761f381e1c40>) Done
[2730312:MainThread](2026-02-26 21:43:53,231) INFO - qlib.timer - [log.py:117] - Time cost: 78.319s | Init data (<qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler.DDBIndusFlagHandler object at 0x761f381e1c40>) Done
[2730312:MainThread](2026-02-26 21:43:53,231) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
[2730312:MainThread](2026-02-26 21:43:53,239) INFO - qlib.timer - [log.py:117] - Time cost: 0.007s | fetch_df_by_index Done
[2730312:MainThread](2026-02-26 21:43:53,257) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-13 00:00:00'), datetime.date(2026, 2, 26), None)
gds_CC10 gds_CC11 ... gds_CC63 gds_CC70
datetime instrument ...
2026-02-09 SH600000 False False ... False False
SH600004 False False ... False False
SH600006 False False ... False False
SH600007 False False ... False False
SH600008 False False ... False False
... ... ... ... ... ...
2026-02-26 SZ301658 False False ... False False
SZ301662 False False ... False False
SZ301665 False False ... False False
SZ301678 False False ... False False
SZ302132 False False ... False False
[41168 rows x 30 columns]
[2730312:MainThread](2026-02-26 21:43:53,258) INFO - qlib.timer - [log.py:117] - Time cost: 0.027s | Fetching dataframe Done
[2730312:MainThread](2026-02-26 21:43:53,274) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,ST_Y,ST_S,ST_T,ST_L,ST_Z,ST_X from
loadTable("dfs://daily_stock_run", "stg_1day_wind_st_flag")
where m_nDate>=2019.12.13 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002
[2730312:MainThread](2026-02-26 21:44:44,876) INFO - qlib.timer - [log.py:117] - Time cost: 51.611s | DDB query: Done
[2730312:MainThread](2026-02-26 21:44:45,602) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2730312:MainThread](2026-02-26 21:46:07,184) INFO - qlib.timer - [log.py:117] - Time cost: 81.056s | Instruments filter: Done
[2730312:MainThread](2026-02-26 21:46:07,747) INFO - qlib.timer - [log.py:117] - Time cost: 134.487s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_st_flag_handler.DDBStFlagHandler object at 0x761f387b3080>) Done
[2730312:MainThread](2026-02-26 21:46:07,748) INFO - qlib.timer - [log.py:117] - Time cost: 134.488s | Init data (<qlib.contrib.data.ddb_handlers.ddb_st_flag_handler.DDBStFlagHandler object at 0x761f387b3080>) Done
[2730312:MainThread](2026-02-26 21:46:07,748) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
[2730312:MainThread](2026-02-26 21:46:08,349) INFO - qlib.timer - [log.py:117] - Time cost: 0.600s | fetch_df_by_index Done
[2730312:MainThread](2026-02-26 21:46:08,358) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-13 00:00:00'), datetime.date(2026, 2, 26), None)
ST_Y ST_S ST_T ST_L ST_Z ST_X
datetime instrument
2019-12-13 SH600000 False False False False False False
SH600004 False False False False False False
SH600006 False False False False False False
SH600007 False False False False False False
SH600008 False False False False False False
... ... ... ... ... ... ...
2026-02-26 SZ301658 False False False False False False
SZ301662 False False False False False False
SZ301665 False False False False False False
SZ301678 False False False False False False
SZ302132 False False False False False False
[6874833 rows x 6 columns]
[2730312:MainThread](2026-02-26 21:46:08,359) INFO - qlib.timer - [log.py:117] - Time cost: 0.610s | Fetching dataframe Done
/home/guofu/.venv/alpha2/lib/python3.12/site-packages/qlib/contrib/utils/paral.py:22: FutureWarning: 'M' is deprecated and will be removed in a future version, please use 'ME' instead.
group_list = [_df.resample("M", level="datetime")\
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x76203dfa91f0>
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x76203dfa91f0>
Query config:
#concepts: 2;
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x761f346e7aa0>
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x761f346e7aa0>
Field list: ['gds_CC10', 'gds_CC11', 'gds_CC12', 'gds_CC20', 'gds_CC21', 'gds_CC22', 'gds_CC23', 'gds_CC24', 'gds_CC25', 'gds_CC26', 'gds_CC27', 'gds_CC28', 'gds_CC30', 'gds_CC31', 'gds_CC32', 'gds_CC33', 'gds_CC34', 'gds_CC35', 'gds_CC36', 'gds_CC37', 'gds_CC40', 'gds_CC41', 'gds_CC42', 'gds_CC43', 'gds_CC50', 'gds_CC60', 'gds_CC61', 'gds_CC62', 'gds_CC63', 'gds_CC70']
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x761cc3995760>
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x761a968d1d00>
[2730312:MainThread](2026-02-26 21:46:11,623) INFO - qlib.timer - [log.py:117] - Time cost: 3.264s | Concat index: Done
[2730312:MainThread](2026-02-26 21:46:11,625) INFO - qlib.timer - [log.py:117] - Time cost: 0.001s | Sort index: Done
[2730312:MainThread](2026-02-26 21:46:15,058) INFO - qlib.timer - [log.py:117] - Time cost: 3.433s | Creating SepDataFrame: Done
[2730312:MainThread](2026-02-26 21:46:15,928) INFO - qlib.timer - [log.py:117] - Time cost: 1062.224s | Loading data (<qlib.contrib.data.agg_handler.AggHandler object at 0x761f36612720>) Done
[2730312:MainThread](2026-02-26 21:46:15,929) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
[]
[2730312:MainThread](2026-02-26 21:46:15,931) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
[2730312:MainThread](2026-02-26 21:46:15,935) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
[]
[2730312:MainThread](2026-02-26 21:46:15,936) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
[2730312:MainThread](2026-02-26 21:46:15,939) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
[]
[2730312:MainThread](2026-02-26 21:46:15,940) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
[2730312:MainThread](2026-02-26 21:46:15,943) INFO - qlib.timer - [log.py:117] - Time cost: 0.014s | fit & process data Done
[2730312:MainThread](2026-02-26 21:46:15,943) INFO - qlib.timer - [log.py:117] - Time cost: 1062.239s | Init data (<qlib.contrib.data.agg_handler.AggHandler object at 0x761f36612720>) Done
All processors are readonly
All processors are readonly
All processors are readonly
ERROR: Failed to load data from Qlib pipeline: 'SepDataFrame' object has no attribute 'shape'

@ -1,373 +0,0 @@
[2734404:MainThread](2026-02-26 22:10:11,609) INFO - qlib.Initialization - [config.py:413] - default_conf: client.
[2734404:MainThread](2026-02-26 22:10:11,613) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[2734404:MainThread](2026-02-26 22:10:11,613) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/qlib/default/data_ops/target')}
================================================================================
DUMP GOLD-STANDARD DATA FROM QLIB PIPELINE
================================================================================
Date Range: 2020-01-02 to 2020-01-10
Output Directory: /home/guofu/Workspaces/alpha_lab/stock_1d/d033/alpha158_beta/data
Qlib Dataset Path: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/
Step 1: Loading data from Qlib pipeline...
Loading since_date=2020-01-02
Loading data with handler (load_start=2019-12-13)...
Will use `placehorder_value` from module: qlib.contrib.data.config
Will init handler object from config:
{'data_handler_config': {'ddb_config': {'host': '192.168.1.146',
'password': '123456',
'port': 8848,
'username': 'admin'},
'end_time': datetime.date(2026, 2, 26),
'handler_list': [{'class': 'DDBAlpha158Handler',
'kwargs': {'col_set': 'feature',
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': 'alpha158',
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
{'class': 'DDBMarketExtHandler',
'kwargs': {'col_set': 'feature_ext',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['Turnover '
'as '
'turnover',
'FreeTurnover '
'as '
'free_turnover',
'log(MarketValue) '
'as '
'log_size'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['con_rating_strength'],
'table_name': 'stg_1day_gds_con_rating'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
{'class': 'DDBMarketFlagHandler',
'kwargs': {'col_set': 'feature_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['IsZt',
'IsDt',
'IsN',
'IsXD',
'IsXR',
'IsDR'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['open_limit',
'close_limit',
'low_limit',
'open_stop',
'close_stop',
'high_stop'],
'table_name': 'stg_1day_wind_market_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
{'class': 'DDBIndusFlagHandler',
'kwargs': {'col_set': 'indus_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': 'industry_code_cc.csv',
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
{'class': 'DDBStFlagHandler',
'kwargs': {'col_set': 'st_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['ST_Y',
'ST_S',
'ST_T',
'ST_L',
'ST_Z',
'ST_X'],
'table_name': 'stg_1day_wind_st_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
'instruments': 'csiallx',
'start_time': Timestamp('2019-11-23 00:00:00')},
'ddb_config': {'host': '192.168.1.146',
'password': '123456',
'port': 8848,
'username': 'admin'},
'handler': {'class': 'AggHandler',
'kwargs': {'ddb_config': {'host': '192.168.1.146',
'password': '123456',
'port': 8848,
'username': 'admin'},
[2734404:MainThread](2026-02-26 22:10:11,634) INFO - qlib.AggHandler - [agg_handler.py:79] - Will use AggHandler
[2734404:MainThread](2026-02-26 22:10:11,634) WARNING - qlib.DataLoaderDH - [agg_handler.py:191] - instruments[csiallx] is ignored
[2734404:MainThread](2026-02-26 22:10:11,842) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,HIGH0,LOW0,VWAP0,ROC5,ROC10,ROC20,ROC30,ROC60,MA5,MA10,MA20,MA30,MA60,STD5,STD10,STD20,STD30,STD60,BETA5,BETA10,BETA20,BETA30,BETA60,RSQR5,RSQR10,RSQR20,RSQR30,RSQR60,RESI5,RESI10,RESI20,RESI30,RESI60,MAX5,MAX10,MAX20,MAX30,MAX60,MIN5,MIN10,MIN20,MIN30,MIN60,QTLU5,QTLU10,QTLU20,QTLU30,QTLU60,QTLD5,QTLD10,QTLD20,QTLD30,QTLD60,RANK5,RANK10,RANK20,RANK30,RANK60,RSV5,RSV10,RSV20,RSV30,RSV60,IMAX5,IMAX
[2734404:MainThread](2026-02-26 22:17:41,432) INFO - qlib.timer - [log.py:117] - Time cost: 449.788s | DDB query: Done
[2734404:MainThread](2026-02-26 22:17:42,271) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
'end_time': datetime.date(2026, 2, 26),
'handler_list': [{'class': 'DDBAlpha158Handler',
'kwargs': {'col_set': 'feature',
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': 'alpha158',
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
{'class': 'DDBMarketExtHandler',
'kwargs': {'col_set': 'feature_ext',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['Turnover '
'as '
'turnover',
'FreeTurnover '
'as '
'free_turnover',
'log(MarketValue) '
'as '
'log_size'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['con_rating_strength'],
'table_name': 'stg_1day_gds_con_rating'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
{'class': 'DDBMarketFlagHandler',
'kwargs': {'col_set': 'feature_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['IsZt',
'IsDt',
'IsN',
'IsXD',
'IsXR',
'IsDR'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['open_limit',
'close_limit',
'low_limit',
'open_stop',
'close_stop',
'high_stop'],
'table_name': 'stg_1day_wind_market_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
{'class': 'DDBIndusFlagHandler',
'kwargs': {'col_set': 'indus_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': 'industry_code_cc.csv',
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
{'class': 'DDBStFlagHandler',
'kwargs': {'col_set': 'st_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['ST_Y',
'ST_S',
'ST_T',
'ST_L',
'ST_Z',
'ST_X'],
'table_name': 'stg_1day_wind_st_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
'instruments': 'csiallx',
'start_time': Timestamp('2019-11-23 00:00:00')},
'module_path': 'qlib.contrib.data.agg_handler'},
'load_end': datetime.date(2026, 2, 26),
'load_start': Timestamp('2019-11-23 00:00:00'),
'market': 'csiallx',
'qlib_init': {'provider_uri': '/home/guofu/.qlib/data_ops/target',
'region': 'cn'}}
Query config:
#alpha158: 1;
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x71c5e04773e0>
[2734404:MainThread](2026-02-26 22:19:46,550) INFO - qlib.timer - [log.py:117] - Time cost: 115.118s | Instruments filter: Done
[2734404:MainThread](2026-02-26 22:19:53,556) INFO - qlib.timer - [log.py:117] - Time cost: 581.918s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_alpha158_handler.DDBAlpha158Handler object at 0x71c5e0475d60>) Done
[2734404:MainThread](2026-02-26 22:19:53,557) INFO - qlib.timer - [log.py:117] - Time cost: 581.920s | Init data (<qlib.contrib.data.ddb_handlers.ddb_alpha158_handler.DDBAlpha158Handler object at 0x71c5e0475d60>) Done
[2734404:MainThread](2026-02-26 22:19:53,560) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
[2734404:MainThread](2026-02-26 22:19:57,060) INFO - qlib.timer - [log.py:117] - Time cost: 3.499s | fetch_df_by_index Done
[2734404:MainThread](2026-02-26 22:19:58,834) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
KMID KLEN ... VSUMD30 VSUMD60
datetime instrument ...
2019-11-25 SH600000 0.003325 0.011638 ... -0.238055 -0.010125
SH600004 -0.013806 0.030012 ... -0.017610 0.039195
SH600006 0.009238 0.016166 ... -0.034782 -0.014306
SH600007 -0.014749 0.018879 ... -0.032427 0.034279
SH600008 0.009259 0.024691 ... -0.063490 0.003978
... ... ... ... ... ...
2026-02-26 SZ301658 -0.017231 0.025231 ... -0.018706 0.003708
SZ301662 0.060584 0.087834 ... -0.014658 -0.014613
SZ301665 -0.012899 0.040541 ... 0.083229 0.055994
SZ301678 0.018182 0.027879 ... -0.054124 0.014202
SZ302132 0.001754 0.016416 ... -0.049558 -0.038667
[6908346 rows x 158 columns]
[2734404:MainThread](2026-02-26 22:19:58,835) INFO - qlib.timer - [log.py:117] - Time cost: 5.276s | Fetching dataframe Done
[2734404:MainThread](2026-02-26 22:19:59,042) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,Turnover as turnover,FreeTurnover as free_turnover,log(MarketValue) as log_size from
loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ3
[2734404:MainThread](2026-02-26 22:20:16,326) INFO - qlib.timer - [log.py:117] - Time cost: 17.485s | DDB query: Done
[2734404:MainThread](2026-02-26 22:20:17,102) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2734404:MainThread](2026-02-26 22:20:17,676) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,con_rating_strength from
loadTable("dfs://daily_stock_run", "stg_1day_gds_con_rating")
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002833','SH
[2734404:MainThread](2026-02-26 22:20:29,343) INFO - qlib.timer - [log.py:117] - Time cost: 11.676s | DDB query: Done
[2734404:MainThread](2026-02-26 22:20:30,245) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-25 00:00:00
[2734404:MainThread](2026-02-26 22:21:55,033) INFO - qlib.timer - [log.py:117] - Time cost: 81.592s | Instruments filter: Done
[2734404:MainThread](2026-02-26 22:21:55,586) INFO - qlib.timer - [log.py:117] - Time cost: 116.751s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_market_ext_handler.DDBMarketExtHandler object at 0x71c7a492c4d0>) Done
[2734404:MainThread](2026-02-26 22:21:55,587) INFO - qlib.timer - [log.py:117] - Time cost: 116.752s | Init data (<qlib.contrib.data.ddb_handlers.ddb_market_ext_handler.DDBMarketExtHandler object at 0x71c7a492c4d0>) Done
[2734404:MainThread](2026-02-26 22:21:55,588) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
[2734404:MainThread](2026-02-26 22:21:56,302) INFO - qlib.timer - [log.py:117] - Time cost: 0.713s | fetch_df_by_index Done
[2734404:MainThread](2026-02-26 22:21:56,309) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
turnover free_turnover log_size con_rating_strength
datetime instrument
2019-11-25 SH600000 0.0895 0.1639 17.339552 0.8214
SH600004 0.9386 1.9173 15.039255 0.8125
SH600006 0.2566 0.6479 13.680836 1.0000
SH600007 0.1647 0.8513 14.335590 0.7500
SH600008 0.1813 0.3362 14.435625 0.6875
... ... ... ... ...
2026-02-26 SZ301658 6.0785 6.0785 11.788368 NaN
SZ301662 12.5950 12.5950 12.681215 NaN
SZ301665 14.0077 14.0077 11.719415 NaN
SZ301678 6.6518 6.6518 12.799973 NaN
SZ302132 1.3868 3.0296 15.359885 NaN
[7623242 rows x 4 columns]
[2734404:MainThread](2026-02-26 22:21:56,310) INFO - qlib.timer - [log.py:117] - Time cost: 0.722s | Fetching dataframe Done
[2734404:MainThread](2026-02-26 22:21:56,327) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,IsZt,IsDt,IsN,IsXD,IsXR,IsDR from
loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657',
[2734404:MainThread](2026-02-26 22:22:17,215) INFO - qlib.timer - [log.py:117] - Time cost: 20.899s | DDB query: Done
[2734404:MainThread](2026-02-26 22:22:17,952) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2734404:MainThread](2026-02-26 22:22:18,463) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,open_limit,close_limit,low_limit,open_stop,close_stop,high_stop from
loadTable("dfs://daily_stock_run", "stg_1day_wind_market_flag")
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','
[2734404:MainThread](2026-02-26 22:22:38,963) INFO - qlib.timer - [log.py:117] - Time cost: 20.509s | DDB query: Done
[2734404:MainThread](2026-02-26 22:22:39,774) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2734404:MainThread](2026-02-26 22:24:07,744) INFO - qlib.timer - [log.py:117] - Time cost: 84.654s | Instruments filter: Done
[2734404:MainThread](2026-02-26 22:24:08,702) INFO - qlib.timer - [log.py:117] - Time cost: 132.391s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_market_flag_handler.DDBMarketFlagHandler object at 0x71c5e0847cb0>) Done
[2734404:MainThread](2026-02-26 22:24:08,703) INFO - qlib.timer - [log.py:117] - Time cost: 132.392s | Init data (<qlib.contrib.data.ddb_handlers.ddb_market_flag_handler.DDBMarketFlagHandler object at 0x71c5e0847cb0>) Done
[2734404:MainThread](2026-02-26 22:24:08,704) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
[2734404:MainThread](2026-02-26 22:24:09,549) INFO - qlib.timer - [log.py:117] - Time cost: 0.844s | fetch_df_by_index Done
[2734404:MainThread](2026-02-26 22:24:09,561) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
IsZt IsDt IsN ... open_stop close_stop high_stop
datetime instrument ...
2019-11-25 SH600000 False False False ... False False False
SH600004 False False False ... False False False
SH600006 False False False ... False False False
SH600007 False False False ... False False False
SH600008 False False False ... False False False
... ... ... ... ... ... ... ...
2026-02-26 SZ301658 False False False ... False False False
SZ301662 False False False ... False False False
SZ301665 False False False ... False False False
SZ301678 False False False ... False False False
SZ302132 False False False ... False False False
[6925320 rows x 12 columns]
[2734404:MainThread](2026-02-26 22:24:09,562) INFO - qlib.timer - [log.py:117] - Time cost: 0.858s | Fetching dataframe Done
[2734404:MainThread](2026-02-26 22:24:09,760) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,gds_CC10,gds_CC11,gds_CC12,gds_CC20,gds_CC21,gds_CC22,gds_CC23,gds_CC24,gds_CC25,gds_CC26,gds_CC27,gds_CC28,gds_CC30,gds_CC31,gds_CC32,gds_CC33,gds_CC34,gds_CC35,gds_CC36,gds_CC37,gds_CC40,gds_CC41,gds_CC42,gds_CC43,gds_CC50,gds_CC60,gds_CC61,gds_CC62,gds_CC63,gds_CC70 from
loadTable("dfs://daily_stock_run", "stg_1day_gds_indus_flag_cc1")
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','S
[2734404:MainThread](2026-02-26 22:24:11,809) INFO - qlib.timer - [log.py:117] - Time cost: 2.238s | DDB query: Done
[2734404:MainThread](2026-02-26 22:24:11,822) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2734404:MainThread](2026-02-26 22:25:28,259) INFO - qlib.timer - [log.py:117] - Time cost: 76.433s | Instruments filter: Done
[2734404:MainThread](2026-02-26 22:25:28,285) INFO - qlib.timer - [log.py:117] - Time cost: 78.720s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler.DDBIndusFlagHandler object at 0x71c139b2af60>) Done
[2734404:MainThread](2026-02-26 22:25:28,285) INFO - qlib.timer - [log.py:117] - Time cost: 78.720s | Init data (<qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler.DDBIndusFlagHandler object at 0x71c139b2af60>) Done
[2734404:MainThread](2026-02-26 22:25:28,286) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
[2734404:MainThread](2026-02-26 22:25:28,290) INFO - qlib.timer - [log.py:117] - Time cost: 0.003s | fetch_df_by_index Done
[2734404:MainThread](2026-02-26 22:25:28,310) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
gds_CC10 gds_CC11 ... gds_CC63 gds_CC70
datetime instrument ...
2026-02-09 SH600000 False False ... False False
SH600004 False False ... False False
SH600006 False False ... False False
SH600007 False False ... False False
SH600008 False False ... False False
... ... ... ... ... ...
2026-02-26 SZ301658 False False ... False False
SZ301662 False False ... False False
SZ301665 False False ... False False
SZ301678 False False ... False False
SZ302132 False False ... False False
[41168 rows x 30 columns]
[2734404:MainThread](2026-02-26 22:25:28,311) INFO - qlib.timer - [log.py:117] - Time cost: 0.025s | Fetching dataframe Done
[2734404:MainThread](2026-02-26 22:25:28,470) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,ST_Y,ST_S,ST_T,ST_L,ST_Z,ST_X from
loadTable("dfs://daily_stock_run", "stg_1day_wind_st_flag")
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002
[2734404:MainThread](2026-02-26 22:25:58,108) INFO - qlib.timer - [log.py:117] - Time cost: 29.791s | DDB query: Done
[2734404:MainThread](2026-02-26 22:25:58,818) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2734404:MainThread](2026-02-26 22:27:21,291) INFO - qlib.timer - [log.py:117] - Time cost: 81.957s | Instruments filter: Done
[2734404:MainThread](2026-02-26 22:27:21,828) INFO - qlib.timer - [log.py:117] - Time cost: 113.516s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_st_flag_handler.DDBStFlagHandler object at 0x71c5e0981ca0>) Done
[2734404:MainThread](2026-02-26 22:27:21,829) INFO - qlib.timer - [log.py:117] - Time cost: 113.517s | Init data (<qlib.contrib.data.ddb_handlers.ddb_st_flag_handler.DDBStFlagHandler object at 0x71c5e0981ca0>) Done
[2734404:MainThread](2026-02-26 22:27:21,830) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
[2734404:MainThread](2026-02-26 22:27:22,439) INFO - qlib.timer - [log.py:117] - Time cost: 0.608s | fetch_df_by_index Done
[2734404:MainThread](2026-02-26 22:27:22,448) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
ST_Y ST_S ST_T ST_L ST_Z ST_X
datetime instrument
2019-11-25 SH600000 False False False False False False
SH600004 False False False False False False
SH600006 False False False False False False
SH600007 False False False False False False
SH600008 False False False False False False
... ... ... ... ... ... ...
2026-02-26 SZ301658 False False False False False False
SZ301662 False False False False False False
SZ301665 False False False False False False
SZ301678 False False False False False False
SZ302132 False False False False False False
[6925323 rows x 6 columns]
[2734404:MainThread](2026-02-26 22:27:22,448) INFO - qlib.timer - [log.py:117] - Time cost: 0.618s | Fetching dataframe Done
/home/guofu/.venv/alpha2/lib/python3.12/site-packages/qlib/contrib/utils/paral.py:22: FutureWarning: 'M' is deprecated and will be removed in a future version, please use 'ME' instead.
group_list = [_df.resample("M", level="datetime")\
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x71c5e4d5d940>
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x71c5e4d5d940>
Query config:
#concepts: 2;
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x71c13b9b8b60>
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x71c13b9b8b60>
Field list: ['gds_CC10', 'gds_CC11', 'gds_CC12', 'gds_CC20', 'gds_CC21', 'gds_CC22', 'gds_CC23', 'gds_CC24', 'gds_CC25', 'gds_CC26', 'gds_CC27', 'gds_CC28', 'gds_CC30', 'gds_CC31', 'gds_CC32', 'gds_CC33', 'gds_CC34', 'gds_CC35', 'gds_CC36', 'gds_CC37', 'gds_CC40', 'gds_CC41', 'gds_CC42', 'gds_CC43', 'gds_CC50', 'gds_CC60', 'gds_CC61', 'gds_CC62', 'gds_CC63', 'gds_CC70']
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x71c139b28aa0>
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x71c5e07e8f20>
[2734404:MainThread](2026-02-26 22:27:25,764) INFO - qlib.timer - [log.py:117] - Time cost: 3.315s | Concat index: Done
[2734404:MainThread](2026-02-26 22:27:25,766) INFO - qlib.timer - [log.py:117] - Time cost: 0.001s | Sort index: Done
[2734404:MainThread](2026-02-26 22:27:29,485) INFO - qlib.timer - [log.py:117] - Time cost: 3.718s | Creating SepDataFrame: Done
[2734404:MainThread](2026-02-26 22:27:30,310) INFO - qlib.timer - [log.py:117] - Time cost: 1038.675s | Loading data (<qlib.contrib.data.agg_handler.AggHandler object at 0x71c5e17ec230>) Done
[2734404:MainThread](2026-02-26 22:27:30,311) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
[]
[2734404:MainThread](2026-02-26 22:27:30,313) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
[2734404:MainThread](2026-02-26 22:27:30,318) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
[]
[2734404:MainThread](2026-02-26 22:27:30,319) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
[2734404:MainThread](2026-02-26 22:27:30,322) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
[]
[2734404:MainThread](2026-02-26 22:27:30,323) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
[2734404:MainThread](2026-02-26 22:27:30,326) INFO - qlib.timer - [log.py:117] - Time cost: 0.015s | fit & process data Done
[2734404:MainThread](2026-02-26 22:27:30,327) INFO - qlib.timer - [log.py:117] - Time cost: 1038.692s | Init data (<qlib.contrib.data.agg_handler.AggHandler object at 0x71c5e17ec230>) Done
All processors are readonly
All processors are readonly
All processors are readonly
ERROR: Failed to load data from Qlib pipeline: 'SepDataFrame' object has no attribute 'shape'

@ -1,321 +0,0 @@
[2739486:MainThread](2026-02-26 22:59:30,849) INFO - qlib.Initialization - [config.py:413] - default_conf: client.
[2739486:MainThread](2026-02-26 22:59:30,854) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[2739486:MainThread](2026-02-26 22:59:30,855) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/qlib/default/data_ops/target')}
================================================================================
DUMP GOLD-STANDARD DATA FROM QLIB PIPELINE
================================================================================
Date Range: 2020-01-02 to 2020-01-10
Output Directory: /home/guofu/Workspaces/alpha_lab/stock_1d/d033/alpha158_beta/data
Qlib Dataset Path: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/
Step 1: Loading data from Qlib pipeline...
Loading since_date=2020-01-02
Loading data with handler (load_start=2019-12-13)...
Will use `placehorder_value` from module: qlib.contrib.data.config
Will init handler object from config:
{'data_handler_config': {'ddb_config': {'host': '192.168.1.146',
'password': '123456',
'port': 8848,
'username': 'admin'},
'end_time': datetime.date(2026, 2, 26),
'handler_list': [{'class': 'DDBAlpha158Handler',
'kwargs': {'col_set': 'feature',
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': 'alpha158',
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
{'class': 'DDBMarketExtHandler',
'kwargs': {'col_set': 'feature_ext',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['Turnover '
'as '
'turnover',
'FreeTurnover '
'as '
'free_turnover',
'log(MarketValue) '
'as '
'log_size'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['con_rating_strength'],
'table_name': 'stg_1day_gds_con_rating'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
{'class': 'DDBMarketFlagHandler',
'kwargs': {'col_set': 'feature_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['IsZt',
'IsDt',
'IsN',
'IsXD',
'IsXR',
'IsDR'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['open_limit',
'close_limit',
'low_limit',
'open_stop',
'close_stop',
'high_stop'],
'table_name': 'stg_1day_wind_market_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
{'class': 'DDBIndusFlagHandler',
'kwargs': {'col_set': 'indus_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': 'industry_code_cc.csv',
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
{'class': 'DDBStFlagHandler',
'kwargs': {'col_set': 'st_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['ST_Y',
'ST_S',
'ST_T',
'ST_L',
'ST_Z',
'ST_X'],
'table_name': 'stg_1day_wind_st_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
'instruments': 'csiallx',
'start_time': Timestamp('2019-11-23 00:00:00')},
'ddb_config': {'host': '192.168.1.146',
'password': '123456',
'port': 8848,
'username': 'admin'},
'handler': {'class': 'AggHandler',
'kwargs': {'ddb_config': {'host': '192.168.1.146',
'password': '123456',
'port': 8848,
'username': 'admin'},
[2739486:MainThread](2026-02-26 22:59:30,878) INFO - qlib.AggHandler - [agg_handler.py:79] - Will use AggHandler
[2739486:MainThread](2026-02-26 22:59:30,878) WARNING - qlib.DataLoaderDH - [agg_handler.py:191] - instruments[csiallx] is ignored
[2739486:MainThread](2026-02-26 22:59:30,938) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,HIGH0,LOW0,VWAP0,ROC5,ROC10,ROC20,ROC30,ROC60,MA5,MA10,MA20,MA30,MA60,STD5,STD10,STD20,STD30,STD60,BETA5,BETA10,BETA20,BETA30,BETA60,RSQR5,RSQR10,RSQR20,RSQR30,RSQR60,RESI5,RESI10,RESI20,RESI30,RESI60,MAX5,MAX10,MAX20,MAX30,MAX60,MIN5,MIN10,MIN20,MIN30,MIN60,QTLU5,QTLU10,QTLU20,QTLU30,QTLU60,QTLD5,QTLD10,QTLD20,QTLD30,QTLD60,RANK5,RANK10,RANK20,RANK30,RANK60,RSV5,RSV10,RSV20,RSV30,RSV60,IMAX5,IMAX
[2739486:MainThread](2026-02-26 23:07:16,353) INFO - qlib.timer - [log.py:117] - Time cost: 465.464s | DDB query: Done
[2739486:MainThread](2026-02-26 23:07:17,149) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
'end_time': datetime.date(2026, 2, 26),
'handler_list': [{'class': 'DDBAlpha158Handler',
'kwargs': {'col_set': 'feature',
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': 'alpha158',
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
{'class': 'DDBMarketExtHandler',
'kwargs': {'col_set': 'feature_ext',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['Turnover '
'as '
'turnover',
'FreeTurnover '
'as '
'free_turnover',
'log(MarketValue) '
'as '
'log_size'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['con_rating_strength'],
'table_name': 'stg_1day_gds_con_rating'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
{'class': 'DDBMarketFlagHandler',
'kwargs': {'col_set': 'feature_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['IsZt',
'IsDt',
'IsN',
'IsXD',
'IsXR',
'IsDR'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['open_limit',
'close_limit',
'low_limit',
'open_stop',
'close_stop',
'high_stop'],
'table_name': 'stg_1day_wind_market_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
{'class': 'DDBIndusFlagHandler',
'kwargs': {'col_set': 'indus_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': 'industry_code_cc.csv',
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
{'class': 'DDBStFlagHandler',
'kwargs': {'col_set': 'st_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['ST_Y',
'ST_S',
'ST_T',
'ST_L',
'ST_Z',
'ST_X'],
'table_name': 'stg_1day_wind_st_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
'instruments': 'csiallx',
'start_time': Timestamp('2019-11-23 00:00:00')},
'module_path': 'qlib.contrib.data.agg_handler'},
'load_end': datetime.date(2026, 2, 26),
'load_start': Timestamp('2019-11-23 00:00:00'),
'market': 'csiallx',
'qlib_init': {'provider_uri': '/home/guofu/.qlib/data_ops/target',
'region': 'cn'}}
Query config:
#alpha158: 1;
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x71847694be90>
[2739486:MainThread](2026-02-26 23:09:19,001) INFO - qlib.timer - [log.py:117] - Time cost: 112.707s | Instruments filter: Done
[2739486:MainThread](2026-02-26 23:09:26,016) INFO - qlib.timer - [log.py:117] - Time cost: 595.133s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_alpha158_handler.DDBAlpha158Handler object at 0x7184769a5fa0>) Done
[2739486:MainThread](2026-02-26 23:09:26,017) INFO - qlib.timer - [log.py:117] - Time cost: 595.135s | Init data (<qlib.contrib.data.ddb_handlers.ddb_alpha158_handler.DDBAlpha158Handler object at 0x7184769a5fa0>) Done
[2739486:MainThread](2026-02-26 23:09:26,019) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
[2739486:MainThread](2026-02-26 23:09:29,432) INFO - qlib.timer - [log.py:117] - Time cost: 3.412s | fetch_df_by_index Done
[2739486:MainThread](2026-02-26 23:09:31,228) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
KMID KLEN ... VSUMD30 VSUMD60
datetime instrument ...
2019-11-25 SH600000 0.003325 0.011638 ... -0.238055 -0.010125
SH600004 -0.013806 0.030012 ... -0.017610 0.039195
SH600006 0.009238 0.016166 ... -0.034782 -0.014306
SH600007 -0.014749 0.018879 ... -0.032427 0.034279
SH600008 0.009259 0.024691 ... -0.063490 0.003978
... ... ... ... ... ...
2026-02-26 SZ301658 -0.017231 0.025231 ... -0.018706 0.003708
SZ301662 0.060584 0.087834 ... -0.014658 -0.014613
SZ301665 -0.012899 0.040541 ... 0.083229 0.055994
SZ301678 0.018182 0.027879 ... -0.054124 0.014202
SZ302132 0.001754 0.016416 ... -0.049558 -0.038667
[6908346 rows x 158 columns]
[2739486:MainThread](2026-02-26 23:09:31,229) INFO - qlib.timer - [log.py:117] - Time cost: 5.211s | Fetching dataframe Done
[2739486:MainThread](2026-02-26 23:09:31,242) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,Turnover as turnover,FreeTurnover as free_turnover,log(MarketValue) as log_size from
loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ3
[2739486:MainThread](2026-02-26 23:09:54,142) INFO - qlib.timer - [log.py:117] - Time cost: 22.909s | DDB query: Done
[2739486:MainThread](2026-02-26 23:09:54,927) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2739486:MainThread](2026-02-26 23:09:55,507) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,con_rating_strength from
loadTable("dfs://daily_stock_run", "stg_1day_gds_con_rating")
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002833','SH
[2739486:MainThread](2026-02-26 23:10:10,691) INFO - qlib.timer - [log.py:117] - Time cost: 15.192s | DDB query: Done
[2739486:MainThread](2026-02-26 23:10:11,588) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2739486:MainThread](2026-02-26 23:11:37,528) INFO - qlib.timer - [log.py:117] - Time cost: 82.525s | Instruments filter: Done
[2739486:MainThread](2026-02-26 23:11:38,259) INFO - qlib.timer - [log.py:117] - Time cost: 127.029s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_market_ext_handler.DDBMarketExtHandler object at 0x7185777e3e90>) Done
[2739486:MainThread](2026-02-26 23:11:38,260) INFO - qlib.timer - [log.py:117] - Time cost: 127.030s | Init data (<qlib.contrib.data.ddb_handlers.ddb_market_ext_handler.DDBMarketExtHandler object at 0x7185777e3e90>) Done
[2739486:MainThread](2026-02-26 23:11:38,261) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
[2739486:MainThread](2026-02-26 23:11:39,000) INFO - qlib.timer - [log.py:117] - Time cost: 0.738s | fetch_df_by_index Done
[2739486:MainThread](2026-02-26 23:11:39,009) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
turnover free_turnover log_size con_rating_strength
datetime instrument
2019-11-25 SH600000 0.0895 0.1639 17.339552 0.8214
SH600004 0.9386 1.9173 15.039255 0.8125
SH600006 0.2566 0.6479 13.680836 1.0000
SH600007 0.1647 0.8513 14.335590 0.7500
SH600008 0.1813 0.3362 14.435625 0.6875
... ... ... ... ...
2026-02-26 SZ301658 6.0785 6.0785 11.788368 NaN
SZ301662 12.5950 12.5950 12.681215 1.0000
SZ301665 14.0077 14.0077 11.719415 1.0000
SZ301678 6.6518 6.6518 12.799973 0.7500
SZ302132 1.3868 3.0296 15.359885 0.8750
[7623255 rows x 4 columns]
[2739486:MainThread](2026-02-26 23:11:39,010) INFO - qlib.timer - [log.py:117] - Time cost: 0.749s | Fetching dataframe Done
[2739486:MainThread](2026-02-26 23:11:39,191) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,IsZt,IsDt,IsN,IsXD,IsXR,IsDR from
loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657',
[2739486:MainThread](2026-02-26 23:12:05,839) INFO - qlib.timer - [log.py:117] - Time cost: 26.825s | DDB query: Done
[2739486:MainThread](2026-02-26 23:12:06,554) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2739486:MainThread](2026-02-26 23:12:07,075) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,open_limit,close_limit,low_limit,open_stop,close_stop,high_stop from
loadTable("dfs://daily_stock_run", "stg_1day_wind_market_flag")
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','
[2739486:MainThread](2026-02-26 23:12:32,695) INFO - qlib.timer - [log.py:117] - Time cost: 25.629s | DDB query: Done
[2739486:MainThread](2026-02-26 23:12:33,566) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2739486:MainThread](2026-02-26 23:14:02,232) INFO - qlib.timer - [log.py:117] - Time cost: 85.158s | Instruments filter: Done
[2739486:MainThread](2026-02-26 23:14:03,155) INFO - qlib.timer - [log.py:117] - Time cost: 144.143s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_market_flag_handler.DDBMarketFlagHandler object at 0x718478991880>) Done
[2739486:MainThread](2026-02-26 23:14:03,156) INFO - qlib.timer - [log.py:117] - Time cost: 144.144s | Init data (<qlib.contrib.data.ddb_handlers.ddb_market_flag_handler.DDBMarketFlagHandler object at 0x718478991880>) Done
[2739486:MainThread](2026-02-26 23:14:03,156) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
[2739486:MainThread](2026-02-26 23:14:04,046) INFO - qlib.timer - [log.py:117] - Time cost: 0.889s | fetch_df_by_index Done
[2739486:MainThread](2026-02-26 23:14:04,060) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
IsZt IsDt IsN ... open_stop close_stop high_stop
datetime instrument ...
2019-11-25 SH600000 False False False ... False False False
SH600004 False False False ... False False False
SH600006 False False False ... False False False
SH600007 False False False ... False False False
SH600008 False False False ... False False False
... ... ... ... ... ... ... ...
2026-02-26 SZ301658 False False False ... False False False
SZ301662 False False False ... False False False
SZ301665 False False False ... False False False
SZ301678 False False False ... False False False
SZ302132 False False False ... False False False
[6925320 rows x 12 columns]
[2739486:MainThread](2026-02-26 23:14:04,061) INFO - qlib.timer - [log.py:117] - Time cost: 0.904s | Fetching dataframe Done
[2739486:MainThread](2026-02-26 23:14:04,079) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,gds_CC10,gds_CC11,gds_CC12,gds_CC20,gds_CC21,gds_CC22,gds_CC23,gds_CC24,gds_CC25,gds_CC26,gds_CC27,gds_CC28,gds_CC30,gds_CC31,gds_CC32,gds_CC33,gds_CC34,gds_CC35,gds_CC36,gds_CC37,gds_CC40,gds_CC41,gds_CC42,gds_CC43,gds_CC50,gds_CC60,gds_CC61,gds_CC62,gds_CC63,gds_CC70 from
loadTable("dfs://daily_stock_run", "stg_1day_gds_indus_flag_cc1")
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','S
[2739486:MainThread](2026-02-26 23:14:06,440) INFO - qlib.timer - [log.py:117] - Time cost: 2.370s | DDB query: Done
[2739486:MainThread](2026-02-26 23:14:06,448) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2739486:MainThread](2026-02-26 23:15:23,146) INFO - qlib.timer - [log.py:117] - Time cost: 76.695s | Instruments filter: Done
[2739486:MainThread](2026-02-26 23:15:23,184) INFO - qlib.timer - [log.py:117] - Time cost: 79.120s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler.DDBIndusFlagHandler object at 0x7184782fa0c0>) Done
[2739486:MainThread](2026-02-26 23:15:23,185) INFO - qlib.timer - [log.py:117] - Time cost: 79.121s | Init data (<qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler.DDBIndusFlagHandler object at 0x7184782fa0c0>) Done
[2739486:MainThread](2026-02-26 23:15:23,186) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
[2739486:MainThread](2026-02-26 23:15:23,190) INFO - qlib.timer - [log.py:117] - Time cost: 0.003s | fetch_df_by_index Done
[2739486:MainThread](2026-02-26 23:15:23,210) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
gds_CC10 gds_CC11 ... gds_CC63 gds_CC70
datetime instrument ...
2026-02-09 SH600000 False False ... False False
SH600004 False False ... False False
SH600006 False False ... False False
SH600007 False False ... False False
SH600008 False False ... False False
... ... ... ... ... ...
2026-02-26 SZ301658 False False ... False False
SZ301662 False False ... False False
SZ301665 False False ... False False
SZ301678 False False ... False False
SZ302132 False False ... False False
[41168 rows x 30 columns]
[2739486:MainThread](2026-02-26 23:15:23,210) INFO - qlib.timer - [log.py:117] - Time cost: 0.025s | Fetching dataframe Done
[2739486:MainThread](2026-02-26 23:15:23,226) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,ST_Y,ST_S,ST_T,ST_L,ST_Z,ST_X from
loadTable("dfs://daily_stock_run", "stg_1day_wind_st_flag")
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002
[2739486:MainThread](2026-02-26 23:15:53,388) INFO - qlib.timer - [log.py:117] - Time cost: 30.171s | DDB query: Done
[2739486:MainThread](2026-02-26 23:15:54,166) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00

@ -1,104 +0,0 @@
[2745445:MainThread](2026-02-26 23:18:06,410) INFO - qlib.Initialization - [config.py:413] - default_conf: client.
[2745445:MainThread](2026-02-26 23:18:06,414) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[2745445:MainThread](2026-02-26 23:18:06,415) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/qlib/default/data_ops/target')}
================================================================================
DUMP GOLD-STANDARD RAW DATA FROM QLIB PIPELINE
================================================================================
Date Range: 2020-01-02 to 2020-01-10
Output Directory: ../data/
Qlib Dataset Path: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/
Step 1: Loading raw data from Qlib pipeline...
Loading raw data from handler (load_start=2019-12-13)...
Will use `placehorder_value` from module: qlib.contrib.data.config
Will init handler object from config:
{'data_handler_config': {'ddb_config': {'host': '192.168.1.146',
'password': '123456',
'port': 8848,
'username': 'admin'},
'end_time': datetime.date(2026, 2, 26),
'handler_list': [{'class': 'DDBAlpha158Handler',
'kwargs': {'col_set': 'feature',
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': 'alpha158',
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
{'class': 'DDBMarketExtHandler',
'kwargs': {'col_set': 'feature_ext',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['Turnover '
'as '
'turnover',
'FreeTurnover '
'as '
'free_turnover',
'log(MarketValue) '
'as '
'log_size'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['con_rating_strength'],
'table_name': 'stg_1day_gds_con_rating'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
{'class': 'DDBMarketFlagHandler',
'kwargs': {'col_set': 'feature_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['IsZt',
'IsDt',
'IsN',
'IsXD',
'IsXR',
'IsDR'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['open_limit',
'close_limit',
'low_limit',
'open_stop',
'close_stop',
'high_stop'],
'table_name': 'stg_1day_wind_market_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
{'class': 'DDBIndusFlagHandler',
'kwargs': {'col_set': 'indus_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': 'industry_code_cc.csv',
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
{'class': 'DDBStFlagHandler',
'kwargs': {'col_set': 'st_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['ST_Y',
'ST_S',
'ST_T',
'ST_L',
'ST_Z',
'ST_X'],
'table_name': 'stg_1day_wind_st_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
'instruments': 'csiallx',
'start_time': Timestamp('2019-11-23 00:00:00')},
'ddb_config': {'host': '192.168.1.146',
'password': '123456',
'port': 8848,
'username': 'admin'},
'handler': {'class': 'AggHandler',
'kwargs': {'ddb_config': {'host': '192.168.1.146',
'password': '123456',
'port': 8848,
'username': 'admin'},
'end_time': datetime.date(2026, 2, 26),
'handler_list': [{[2745445:MainThread](2026-02-26 23:18:06,436) INFO - qlib.AggHandler - [agg_handler.py:79] - Will use AggHandler
[2745445:MainThread](2026-02-26 23:18:06,437) WARNING - qlib.DataLoaderDH - [agg_handler.py:191] - instruments[csiallx] is ignored
[2745445:MainThread](2026-02-26 23:18:06,492) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,HIGH0,LOW0,VWAP0,ROC5,ROC10,ROC20,ROC30,ROC60,MA5,MA10,MA20,MA30,MA60,STD5,STD10,STD20,STD30,STD60,BETA5,BETA10,BETA20,BETA30,BETA60,RSQR5,RSQR10,RSQR20,RSQR30,RSQR60,RESI5,RESI10,RESI20,RESI30,RESI60,MAX5,MAX10,MAX20,MAX30,MAX60,MIN5,MIN10,MIN20,MIN30,MIN60,QTLU5,QTLU10,QTLU20,QTLU30,QTLU60,QTLD5,QTLD10,QTLD20,QTLD30,QTLD60,RANK5,RANK10,RANK20,RANK30,RANK60,RSV5,RSV10,RSV20,RSV30,RSV60,IMAX5,IMAX

@ -1,103 +0,0 @@
[2746177:MainThread](2026-02-26 23:21:56,618) INFO - qlib.Initialization - [config.py:413] - default_conf: client.
[2746177:MainThread](2026-02-26 23:21:56,622) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[2746177:MainThread](2026-02-26 23:21:56,623) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/qlib/default/data_ops/target')}
================================================================================
DUMP GOLD-STANDARD RAW DATA FROM QLIB PIPELINE
================================================================================
Date Range: 2020-01-02 to 2020-01-10
Output Directory: ../data/
Qlib Dataset Path: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/
Step 1: Loading raw data from Qlib pipeline...
Loading raw data from handler (load_start=2019-12-13)...
Filtering instruments: ['SH600000', 'SH600004', 'SH600006', 'SH600007', 'SH600008']... (5 total)
Will use `placehorder_value` from module: qlib.contrib.data.config
Will init handler object from config:
{'data_handler_config': {'ddb_config': {'host': '192.168.1.146',
'password': '123456',
'port': 8848,
'username': 'admin'},
'end_time': datetime.date(2026, 2, 26),
'handler_list': [{'class': 'DDBAlpha158Handler',
'kwargs': {'col_set': 'feature',
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': 'alpha158',
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
{'class': 'DDBMarketExtHandler',
'kwargs': {'col_set': 'feature_ext',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['Turnover '
'as '
'turnover',
'FreeTurnover '
'as '
'free_turnover',
'log(MarketValue) '
'as '
'log_size'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['con_rating_strength'],
'table_name': 'stg_1day_gds_con_rating'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
{'class': 'DDBMarketFlagHandler',
'kwargs': {'col_set': 'feature_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['IsZt',
'IsDt',
'IsN',
'IsXD',
'IsXR',
'IsDR'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['open_limit',
'close_limit',
'low_limit',
'open_stop',
'close_stop',
'high_stop'],
'table_name': 'stg_1day_wind_market_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
{'class': 'DDBIndusFlagHandler',
'kwargs': {'col_set': 'indus_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': 'industry_code_cc.csv',
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
{'class': 'DDBStFlagHandler',
'kwargs': {'col_set': 'st_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['ST_Y',
'ST_S',
'ST_T',
'ST_L',
'ST_Z',
'ST_X'],
'table_name': 'stg_1day_wind_st_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
'instruments': 'csiallx',
'start_time': Timestamp('2019-11-23 00:00:00')},
'ddb_config': {'host': '192.168.1.146',
'password': '123456',
'port': 8848,
'username': 'admin'},
'handler': {'class': 'AggHandler',
'kwargs': {'ddb_config': {'host': '192.168.1.146',
'password': '123456',
'port': 8848,
'username': 'admin'}[2746177:MainThread](2026-02-26 23:21:56,647) INFO - qlib.AggHandler - [agg_handler.py:79] - Will use AggHandler
[2746177:MainThread](2026-02-26 23:21:56,648) WARNING - qlib.DataLoaderDH - [agg_handler.py:191] - instruments[csiallx] is ignored
[2746177:MainThread](2026-02-26 23:21:56,716) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,HIGH0,LOW0,VWAP0,ROC5,ROC10,ROC20,ROC30,ROC60,MA5,MA10,MA20,MA30,MA60,STD5,STD10,STD20,STD30,STD60,BETA5,BETA10,BETA20,BETA30,BETA60,RSQR5,RSQR10,RSQR20,RSQR30,RSQR60,RESI5,RESI10,RESI20,RESI30,RESI60,MAX5,MAX10,MAX20,MAX30,MAX60,MIN5,MIN10,MIN20,MIN30,MIN60,QTLU5,QTLU10,QTLU20,QTLU30,QTLU60,QTLD5,QTLD10,QTLD20,QTLD30,QTLD60,RANK5,RANK10,RANK20,RANK30,RANK60,RSV5,RSV10,RSV20,RSV30,RSV60,IMAX5,IMAX

@ -1,187 +0,0 @@
#!/usr/bin/env python
"""
Verify feature column order between standalone pipeline and qlib gold standard.
This script:
1. Loads a small sample using the qlib pipeline
2. Runs the same sample through the standalone generate_beta_embedding pipeline
3. Compares the column order and feature values
"""
import pickle as pkl
import ruamel.yaml as yaml
import pandas as pd
import polars as pl
import numpy as np
import sys
import os
# Patch yaml.safe_load for compatibility: route it through a single
# ruamel.yaml safe loader so downstream code can keep using the
# PyYAML-style module-level API.
_yaml = yaml.YAML(typ='safe', pure=True)


def patched_safe_load(stream):
    """Parse *stream* (a str or a file-like object) with the shared safe loader."""
    import io

    source = io.StringIO(stream) if isinstance(stream, str) else stream
    return _yaml.load(source)


yaml.safe_load = patched_safe_load
# Add scripts directory to path so that generate_beta_embedding
# (imported inside main()) can be resolved.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'scripts'))
def main():
    """Compare feature columns between the standalone pipeline and qlib.

    Loads the same small date range through both the qlib handler pipeline
    (the "gold standard") and the standalone generate_beta_embedding
    pipeline, then prints column counts per group, a side-by-side listing
    of the first 20 columns, and every position where the orders diverge.

    NOTE(review): this block was recovered without indentation; the
    nesting below is a best-effort reconstruction — confirm against the
    original script before relying on exact control flow.
    """
    print("=" * 70)
    print("VERIFY FEATURE ORDER: Standalone vs Qlib Gold Standard")
    print("=" * 70)
    # Step 1: Load processor list — the pickled chain qlib applied when the
    # dataset was originally built.
    print("\nStep 1: Loading processor list...")
    proc_path = "/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/proc_list.proc"
    with open(proc_path, "rb") as f:
        proc_list = pkl.load(f)
    print(f" Loaded {len(proc_list)} processors")
    # Step 2: Load small sample from qlib pipeline
    print("\nStep 2: Loading sample from qlib pipeline...")
    import qlib
    from qlib.config import REG_CN
    qlib.init(provider_uri='/home/guofu/.qlib/data_ops/target', region=REG_CN)
    from qlib.workflow.cli import sys_config
    from qlib.utils import fill_placeholder
    import datetime as dt
    yaml_path = "/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/handler.yaml"
    with open(yaml_path) as fin:
        config = yaml.safe_load(fin)
    sys_config(config, "qlib.contrib.data.config")
    # Second init with the settings from the handler yaml; overrides the
    # provisional init above.
    qlib.init(**config.get("qlib_init"))
    # Start 20 calendar days before the sample start so rolling-window
    # features have warm-up history.
    load_start = pd.to_datetime("2020-01-02") - dt.timedelta(days=20)
    placehorder_value = {
        "<SINCE_DATE>": load_start,
        "<TODAY>": dt.date.today()
    }
    config_filled = fill_placeholder(config, placehorder_value)
    handler = qlib.init_instance_by_config(config_filled["handler"])
    handler_data = handler._data
    # Get data from SepDataFrame (its per-group frames live in _data)
    if hasattr(handler_data, '_data'):
        df_dict = handler_data._data
        print(f" Handler groups: {list(df_dict.keys())}")
        # Concatenate groups, prefixing every column with "<group>::" so the
        # origin of each column survives the concat.
        raw_dfs = []
        for group, df in df_dict.items():
            df_copy = df.copy()
            df_copy.columns = [f"{group}::{col}" for col in df_copy.columns]
            raw_dfs.append(df_copy)
            print(f" {group}: {len(df_copy.columns)} columns")
        raw_df = pd.concat(raw_dfs, axis=1)
        print(f" Raw concatenated shape: {raw_df.shape}")
    # NOTE(review): if the hasattr check above fails, raw_df is undefined and
    # the next step raises NameError — presumably acceptable for a debug script.
    # Step 3: Apply processors to get gold standard features
    print("\nStep 3: Applying processors (qlib gold standard)...")
    from qlib.contrib.data.utils import apply_proc_list
    # Strip group prefixes for processor application (processors expect bare
    # column names).
    col_mapping = {col: col.split('::', 1)[1] for col in raw_df.columns if '::' in col}
    raw_df_stripped = raw_df.rename(columns=col_mapping)
    # Convert bool to object for processor compatibility
    bool_cols = raw_df_stripped.select_dtypes(include=['bool']).columns
    for col in bool_cols:
        raw_df_stripped[col] = raw_df_stripped[col].astype(object)
    # with_fit=False: reuse the fitted state stored in the pickled processors
    # rather than refitting on this small sample.
    df_gold = apply_proc_list(raw_df_stripped, proc_list=proc_list, with_fit=False)
    print(f" Gold standard shape after processors: {df_gold.shape}")
    # Restore group prefixes
    reverse_mapping = {v: k for k, v in col_mapping.items()}
    df_gold = df_gold.rename(columns=reverse_mapping)
    # Get gold standard column order, bucketed by group prefix
    gold_columns = list(df_gold.columns)
    print(f"\nGold standard column groups:")
    feature_cols = [c for c in gold_columns if c.startswith('feature::')]
    feature_ext_cols = [c for c in gold_columns if c.startswith('feature_ext::')]
    feature_flag_cols = [c for c in gold_columns if c.startswith('feature_flag::')]
    indus_idx_cols = [c for c in gold_columns if c.startswith('indus_idx::')]
    print(f" feature:: {len(feature_cols)} cols")
    print(f" feature_ext:: {len(feature_ext_cols)} cols")
    print(f" feature_flag:: {len(feature_flag_cols)} cols")
    print(f" indus_idx:: {len(indus_idx_cols)} cols")
    # Step 4: Now run standalone pipeline on same data
    print("\nStep 4: Running standalone pipeline...")
    # Load parquet data for same date range
    from generate_beta_embedding import load_all_data, merge_data_sources, apply_feature_pipeline
    df_alpha, df_kline, df_flag, df_industry = load_all_data("2020-01-02", "2020-01-10")
    df_standalone = merge_data_sources(df_alpha, df_kline, df_flag, df_industry)
    print(f" Standalone loaded shape: {df_standalone.shape}")
    # Apply feature pipeline
    df_processed, feature_cols_standalone = apply_feature_pipeline(df_standalone)
    print(f" Standalone processed shape: {df_processed.shape}")
    print(f" Standalone feature columns: {len(feature_cols_standalone)}")
    # Step 5: Compare column counts
    print("\n" + "=" * 70)
    print("COMPARISON SUMMARY")
    print("=" * 70)
    print(f"\nGold standard total columns: {len(gold_columns)}")
    print(f" feature:: {len(feature_cols)}")
    print(f" feature_ext:: {len(feature_ext_cols)}")
    print(f" feature_flag:: {len(feature_flag_cols)}")
    print(f" indus_idx:: {len(indus_idx_cols)}")
    print(f"\nStandalone feature columns: {len(feature_cols_standalone)}")
    # The gold standard columns (without prefix) should match standalone
    gold_feature_cols = [c.split('::', 1)[1] for c in feature_cols]
    gold_feature_ext_cols = [c.split('::', 1)[1] for c in feature_ext_cols]
    gold_feature_flag_cols = [c.split('::', 1)[1] for c in feature_flag_cols]
    gold_indus_idx_cols = [c.split('::', 1)[1] for c in indus_idx_cols]
    gold_all = gold_feature_cols + gold_feature_ext_cols + gold_feature_flag_cols + gold_indus_idx_cols
    print(f"\nGold standard (flat): {len(gold_all)} features")
    print(f"Standalone: {len(feature_cols_standalone)} features")
    if len(gold_all) != len(feature_cols_standalone):
        print(f"\nWARNING: Feature count mismatch! Difference: {len(gold_all) - len(feature_cols_standalone)}")
    # Check column order
    print("\nFirst 20 column comparison:")
    print(f"{'Idx':<5} {'Gold Standard':<40} {'Standalone':<40} {'Match':<6}")
    print("-" * 90)
    for i in range(min(20, len(gold_all), len(feature_cols_standalone))):
        # NOTE(review): both branches produce "" — the match/mismatch glyphs
        # were likely lost in a text-encoding step; verify against original.
        match = "" if gold_all[i] == feature_cols_standalone[i] else ""
        print(f"{i:<5} {gold_all[i]:<40} {feature_cols_standalone[i]:<40} {match:<6}")
    # Check if orders match
    if gold_all == feature_cols_standalone:
        print("\n✓ Column order MATCHES!")
    else:
        print("\n✗ Column order DOES NOT MATCH!")
        print("\nFinding differences...")
        # Print only the first 20 mismatched positions, but count them all.
        diff_count = 0
        for i in range(min(len(gold_all), len(feature_cols_standalone))):
            if gold_all[i] != feature_cols_standalone[i]:
                diff_count += 1
                if diff_count <= 20:
                    print(f" [{i}] Gold: {gold_all[i]} vs Standalone: {feature_cols_standalone[i]}")
        print(f"Total differences: {diff_count}")
# Script entry point: run the comparison when executed directly.
if __name__ == "__main__":
    main()
Loading…
Cancel
Save