Remove bug analysis documentation (findings incorporated into README.md): - BUG_ANALYSIS.md, BUG_ANALYSIS_FINAL.md Remove one-off debug/exploration scripts: - compare_gold_standard.py, debug_data_divergence.py - verify_feature_order.py, regenerate_sample_embedding.py - dump_qlib_gold_standard.py, dump_qlib_gold_standard_simple.py Remove temporary log files and empty __pycache__ directories Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>master
parent
ea011090f8
commit
26a694298d
@ -1,293 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Compare generated embeddings with database embeddings (0_7 version).
|
||||
Handles format conversion for datetime and instrument columns.
|
||||
|
||||
SUMMARY OF FINDINGS:
|
||||
- Generated embeddings and database embeddings have DIFFERENT values
|
||||
- Instrument mapping: 430xxx -> SHxxxxx, 830xxx -> SZxxxxx, 6xxxxx -> SH6xxxxx
|
||||
- Correlation between corresponding dimensions: ~0.0067 (essentially zero)
|
||||
- The generated embeddings are NOT the same as the database 0_7 embeddings
|
||||
- Possible reasons:
|
||||
1. Different model weights/versions used for generation
|
||||
2. Different input features or normalization
|
||||
3. Different random seed or inference configuration
|
||||
"""
|
||||
import polars as pl
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
|
||||
def instrument_int_to_code(inst_int: int) -> str:
    """Convert an integer instrument code to an exchange-prefixed string.

    Mapping used by the embedding file (best-effort — the exact historical
    mapping, e.g. 430017 -> SH600021, needs the original features file):
      - 6-digit codes starting with '6'         -> "SH" + code (Shanghai)
      - other 6-digit codes not in {'4', '8'}   -> "SZ" + code (Shenzhen)
      - 6-digit codes starting with '4' / '8'   -> SH / SZ with the leading
        digit dropped (approximate; the mapping is not 1:1)

    Args:
        inst_int: Raw integer instrument code from the parquet file.

    Returns:
        Exchange-prefixed code string (e.g. "SH600021"), or the raw string
        unchanged when the code has more than six digits.
    """
    # Zero-pad to 6 digits: integer storage drops leading zeros, so SZ
    # main-board codes like 000001 arrive as 1 and would otherwise never
    # match the database's "SZ000001" form.
    inst_str = str(inst_int).zfill(6)

    # Codes longer than 6 digits have no known mapping; pass through.
    if len(inst_str) != 6:
        return inst_str

    # 6-digit with exchange prefix digit (4=SH, 8=SZ).
    if inst_str[0] in ('4', '8'):
        exchange = 'SH' if inst_str[0] == '4' else 'SZ'
        # The mapping from 430xxx -> 600xxx is not 1:1; return the
        # stripped code as-is for matching attempts.
        return f"{exchange}{inst_str[1:]}"

    # Plain 6-digit code: 6xxxxx is Shanghai, everything else Shenzhen.
    if inst_str.startswith('6'):
        return f"SH{inst_str}"
    return f"SZ{inst_str}"
|
||||
|
||||
def load_generated_embedding(date_int: int, sample_n: int = None):
    """Load generated embeddings for one date and attach helper columns.

    Reads the wide-format parquet (embedding_0 .. embedding_N columns),
    gathers each row's embedding into a single list-valued 'values' column,
    and adds the instrument identifier in several formats to ease matching
    against the database embeddings.

    Args:
        date_int: Date as an integer (e.g. 20190102).
        sample_n: Optional cap on the number of rows to load.

    Returns:
        A polars DataFrame with 'values', 'datetime_uint32',
        'instrument_orig', 'instrument_str' and 'instrument_code' columns
        added.
    """
    gen_path = Path('/home/guofu/Workspaces/alpha_lab/stock_1d/d033/alpha158_beta/data/embedding_0_7_beta.parquet')

    query = pl.scan_parquet(gen_path).filter(pl.col('datetime') == date_int)
    if sample_n:
        query = query.head(sample_n)
    df = query.collect()

    # Wide -> list: collect embedding_* columns in numeric suffix order.
    emb_cols = sorted(
        (c for c in df.columns if c.startswith('embedding_')),
        key=lambda c: int(c.split('_')[1]),
    )
    vectors = [list(s.values()) for s in df.select(emb_cols).to_struct()]

    return df.with_columns([
        pl.Series('values', vectors),
        pl.col('datetime').cast(pl.UInt32).alias('datetime_uint32'),
        pl.col('instrument').alias('instrument_orig'),
        pl.col('instrument').cast(pl.String).alias('instrument_str'),
        pl.col('instrument').map_elements(instrument_int_to_code, return_dtype=pl.String).alias('instrument_code'),
    ])
|
||||
|
||||
def load_database_embedding(date_str: str):
    """Load the database 0_7 embedding partition for one date.

    Args:
        date_str: Date as a string (e.g. "20190102"), used to locate the
            hive-style partition directory.

    Returns:
        The partition's DataFrame with an extra Int64 'datetime_int' column
        for joining, or None when the partition file does not exist.
    """
    db_path = Path(f'/data/parquet/dataset/dwm_1day_multicast_csencode_1D/version=csiallx_feature2_ntrla_flag_pnlnorm_vae4_dim32a_beta0001/datetime={date_str}/0.parquet')

    if not db_path.exists():
        return None

    return pl.read_parquet(db_path).with_columns(
        pl.col('datetime').cast(pl.Int64).alias('datetime_int')
    )
|
||||
|
||||
def analyze_instrument_mapping(date_int: int):
    """Analyze the instrument mapping between generated and database embeddings.

    Loads both sources for one date, reports how the two instrument
    universes overlap after code conversion, and — when matches exist —
    prints per-instrument embedding difference statistics. When no match is
    found, dumps samples from both sides to help spot a mapping pattern.

    Args:
        date_int: Date as an integer (e.g. 20190102).
    """
    date_str = str(date_int)

    print(f"\n{'='*80}")
    print(f"Analyzing instrument mapping for date: {date_int}")
    print(f"{'='*80}")

    gen_df = load_generated_embedding(date_int)
    db_df = load_database_embedding(date_str)

    if db_df is None:
        print(f"ERROR: Database embedding not found for {date_str}")
        return

    print(f"\nGenerated embeddings: {gen_df.shape[0]} rows")
    print(f"Database embeddings: {db_df.shape[0]} rows")

    # Show samples from both sides for eyeball comparison.
    print("\n--- Generated Embedding Sample ---")
    sample_gen = gen_df.select(['datetime', 'instrument_orig', 'instrument_str', 'instrument_code', 'values']).head(10)
    print(sample_gen)

    print("\n--- Database Embedding Sample ---")
    print(db_df.head(10))

    # Set algebra on the converted instrument codes.
    gen_insts_set = set(gen_df['instrument_code'].to_list())
    db_insts_set = set(db_df['instrument'].to_list())

    common = gen_insts_set & db_insts_set
    gen_only = gen_insts_set - db_insts_set
    db_only = db_insts_set - gen_insts_set

    print(f"\n--- Matching Results (with code conversion) ---")
    print(f"Common instruments: {len(common)}")
    print(f"Generated only: {len(gen_only)}")
    print(f"Database only: {len(db_only)}")

    if len(common) == 0:
        print("\nNo common instruments found with code conversion!")
        print("\nTrying to find mapping patterns...")

        # Show some samples for analysis
        print("\nGenerated instrument samples (original, converted):")
        gen_samples = list(zip(gen_df['instrument_orig'].head(20).to_list(),
                               gen_df['instrument_code'].head(20).to_list()))
        for orig, conv in gen_samples:
            print(f" {orig} -> {conv}")

        print("\nDatabase instrument samples:")
        db_samples = db_df['instrument'].head(20).to_list()
        for inst in db_samples:
            print(f" {inst}")

        # Check if there's a position-based alignment possible:
        # sort both universes and compare them side by side.
        gen_sorted = sorted(gen_df['instrument_orig'].to_list())
        db_sorted = sorted([int(inst[2:]) for inst in db_df['instrument'].to_list()])

        print("\n--- Attempting position-based matching ---")
        print(f"Generated sorted (first 10): {gen_sorted[:10]}")
        print(f"Database sorted (first 10): {db_sorted[:10]}")

    else:
        # We have matches, compare embeddings
        print(f"\n--- Comparing embeddings for {len(common)} common instruments ---")

        gen_common = gen_df.filter(pl.col('instrument_code').is_in(list(common)))
        db_common = db_df.filter(pl.col('instrument').is_in(list(common)))

        # Join on the converted code so each row pairs the two embeddings.
        comparison = gen_common.join(
            db_common,
            left_on='instrument_code',
            right_on='instrument',
            how='inner',
            suffix='_db'
        )

        # FIX: hoist the column-index lookups out of the row loop — they
        # are loop-invariant, and list.index() is a linear scan per call
        # (the original did three of them on every row).
        gen_vals_idx = comparison.columns.index('values')
        db_vals_idx = comparison.columns.index('values_db')
        inst_idx = comparison.columns.index('instrument_code')

        # Calculate differences
        diffs = []
        for row in comparison.iter_rows():
            gen_emb = np.array(row[gen_vals_idx])
            db_emb = np.array(row[db_vals_idx])

            diff = gen_emb - db_emb
            diff_norm = np.linalg.norm(diff)
            # Epsilon guards against division by a zero-norm vector.
            rel_diff = diff_norm / (np.linalg.norm(db_emb) + 1e-10)

            diffs.append({
                'instrument': row[inst_idx],
                'l2_norm_diff': diff_norm,
                'relative_diff': rel_diff,
                'max_abs_diff': np.max(np.abs(diff)),
                'gen_emb_norm': np.linalg.norm(gen_emb),
                'db_emb_norm': np.linalg.norm(db_emb)
            })

        if diffs:
            diff_df = pl.DataFrame(diffs)
            print("\nDifference statistics:")
            print(diff_df.select(['l2_norm_diff', 'relative_diff', 'max_abs_diff']).describe())

            max_rel_diff = diff_df['relative_diff'].max()
            print(f"\nMax relative difference: {max_rel_diff:.6e}")

            # Interpretation thresholds: < 1e-5 numerically identical,
            # < 1% very similar, otherwise significantly different.
            if max_rel_diff < 1e-5:
                print("✓ Embeddings match within numerical precision!")
            elif max_rel_diff < 0.01:
                print("~ Embeddings are very similar")
            else:
                print("✗ Embeddings differ significantly")

            # Show some comparison samples
            print("\nSample comparison:")
            for i in range(min(5, len(diffs))):
                d = diffs[i]
                print(f" {d['instrument']}: gen_norm={d['gen_emb_norm']:.4f}, "
                      f"db_norm={d['db_emb_norm']:.4f}, rel_diff={d['relative_diff']:.6e}")
|
||||
|
||||
def calculate_correlation(date_int: int):
    """Calculate correlation between generated and database embeddings.

    For the instruments present in both sources on the given date, aligns
    rows by instrument, computes the Pearson correlation per embedding
    dimension and overall (flattened), and prints an interpretation.

    Args:
        date_int: Date as an integer (e.g. 20190102).
    """
    # FIX: removed the redundant local `import numpy as np` — numpy is
    # already imported at module level.
    date_str = str(date_int)

    print(f"\n{'='*80}")
    print(f"Correlation Analysis for date: {date_int}")
    print(f"{'='*80}")

    gen_df = load_generated_embedding(date_int)
    db_df = load_database_embedding(date_str)

    if db_df is None:
        print(f"ERROR: Database embedding not found for {date_str}")
        return

    # Find common instruments
    gen_insts = set(gen_df['instrument_code'].to_list())
    db_insts = set(db_df['instrument'].to_list())
    common = list(gen_insts & db_insts)

    print(f"\nCommon instruments: {len(common)}")

    if len(common) == 0:
        print("No common instruments found!")
        return

    # Filter to the common set and sort so rows align by instrument code.
    gen_common = gen_df.filter(pl.col('instrument_code').is_in(common)).sort('instrument_code')
    db_common = db_df.filter(pl.col('instrument').is_in(common)).sort('instrument')

    # Extract embedding matrices (rows x dims).
    gen_embs = np.array(gen_common['values'].to_list())
    db_embs = np.array(db_common['values'].to_list())

    print(f"Generated embeddings shape: {gen_embs.shape}")
    print(f"Database embeddings shape: {db_embs.shape}")

    # FIX: derive the dimension count from the data instead of hard-coding
    # 32, so the script survives an embedding-size change.
    n_dims = gen_embs.shape[1]
    correlations = []
    for i in range(n_dims):
        corr = np.corrcoef(gen_embs[:, i], db_embs[:, i])[0, 1]
        correlations.append(corr)

    print(f"\nCorrelation statistics across {n_dims} dimensions:")
    print(f" Mean: {np.mean(correlations):.4f}")
    print(f" Median: {np.median(correlations):.4f}")
    print(f" Min: {np.min(correlations):.4f}")
    print(f" Max: {np.max(correlations):.4f}")

    # Overall correlation across all dimensions at once.
    overall_corr = np.corrcoef(gen_embs.flatten(), db_embs.flatten())[0, 1]
    print(f"\nOverall correlation (all dims flattened): {overall_corr:.4f}")

    # Interpretation thresholds: |r| < 0.1 none, < 0.5 weak,
    # > 0.8 strong, otherwise moderate.
    mean_corr = np.mean(correlations)
    if abs(mean_corr) < 0.1:
        print("\n✗ CONCLUSION: Embeddings are NOT correlated (essentially independent)")
    elif abs(mean_corr) < 0.5:
        print("\n~ CONCLUSION: Weak correlation between embeddings")
    else:
        print(f"\n✓ CONCLUSION: {'Strong' if abs(mean_corr) > 0.8 else 'Moderate'} correlation")
|
||||
|
||||
if __name__ == '__main__':
    import traceback

    # Analyze a few representative trading dates; keep going past failures
    # so a single bad date does not abort the whole run.
    for date in (20190102, 20200102, 20240102):
        try:
            analyze_instrument_mapping(date)
            calculate_correlation(date)
        except Exception as e:
            print(f"\nError analyzing date {date}: {e}")
            traceback.print_exc()
|
||||
@ -0,0 +1,18 @@
|
||||
# CTA 1D Experiment Results
|
||||
|
||||
Document experiments manually here.
|
||||
|
||||
## Template
|
||||
|
||||
```markdown
|
||||
## YYYY-MM-DD: Experiment Name
|
||||
- Notebook: `../cta_1d/XX_notebook.ipynb` (cell range)
|
||||
- Data: [dates]
|
||||
- Config: key parameters
|
||||
- Metrics: IC mean/std, returns, sharpe
|
||||
- Notes: observations, next steps
|
||||
```
|
||||
|
||||
## Experiments
|
||||
|
||||
*Add entries below as you run experiments*
|
||||
@ -0,0 +1,18 @@
|
||||
# Stock 15m Experiment Results
|
||||
|
||||
Document experiments manually here.
|
||||
|
||||
## Template
|
||||
|
||||
```markdown
|
||||
## YYYY-MM-DD: Experiment Name
|
||||
- Notebook: `../stock_15m/XX_notebook.ipynb` (cell range)
|
||||
- Data: [dates]
|
||||
- Config: key parameters
|
||||
- Metrics: IC mean/std, returns, sharpe
|
||||
- Notes: observations, next steps
|
||||
```
|
||||
|
||||
## Experiments
|
||||
|
||||
*Add entries below as you run experiments*
|
||||
@ -1,123 +0,0 @@
|
||||
# Data Pipeline Bug Analysis
|
||||
|
||||
## Summary
|
||||
|
||||
The generated embeddings do not match the database 0_7 embeddings due to multiple bugs in the data pipeline migration from qlib to standalone Polars implementation.
|
||||
|
||||
---
|
||||
|
||||
## Bugs Fixed
|
||||
|
||||
### 1. Market Classification (`FlagMarketInjector`) ✓ FIXED
|
||||
|
||||
**Original (incorrect):**
|
||||
```python
|
||||
market_0 = (instrument >= 600000) # SH
|
||||
market_1 = (instrument < 600000) # SZ
|
||||
```
|
||||
|
||||
**Fixed:**
|
||||
```python
|
||||
inst_str = str(instrument).zfill(6)
|
||||
market_0 = inst_str.startswith('6') # SH: 6xxxxx
|
||||
market_1 = inst_str.startswith('0') | inst_str.startswith('3') # SZ: 0xxx, 3xxx
|
||||
market_2 = inst_str.startswith('4') | inst_str.startswith('8') # NE: 4xxx, 8xxx
|
||||
```
|
||||
|
||||
**Impact:** 167 instruments (4xxxxx, 8xxxxx - 新三板) were misclassified.
|
||||
|
||||
---
|
||||
|
||||
### 2. ColumnRemover Missing `IsN` ✓ FIXED
|
||||
|
||||
**Original (incorrect):**
|
||||
```python
|
||||
columns_to_remove = ['TotalValue_diff', 'IsZt', 'IsDt']
|
||||
```
|
||||
|
||||
**Fixed:**
|
||||
```python
|
||||
columns_to_remove = ['TotalValue_diff', 'IsN', 'IsZt', 'IsDt']
|
||||
```
|
||||
|
||||
**Impact:** Extra column caused feature dimension mismatch.
|
||||
|
||||
---
|
||||
|
||||
### 3. RobustZScoreNorm Applied to Wrong Columns ✓ FIXED
|
||||
|
||||
**Original (incorrect):**
|
||||
Applied normalization to ALL 341 features including market flags and indus_idx.
|
||||
|
||||
**Fixed:**
|
||||
Only normalize `alpha158 + alpha158_ntrl + market_ext + market_ext_ntrl` (330 features), excluding:
|
||||
- Market flags (Limit, Stopping, IsTp, IsXD, IsXR, IsDR, market_0, market_1, market_2, IsST)
|
||||
- indus_idx
|
||||
|
||||
---
|
||||
|
||||
## Critical Remaining Issue: Data Schema Mismatch
|
||||
|
||||
### `Limit` and `Stopping` Column Types Changed
|
||||
|
||||
**Original qlib pipeline expected:**
|
||||
- `Limit`: **Boolean** flag (True = limit up)
|
||||
- `Stopping`: **Boolean** flag (True = suspended trading)
|
||||
|
||||
**Current Parquet data has:**
|
||||
- `Limit`: **Float64** price change percentage (0.0 to 1301.3)
|
||||
- `Stopping`: **Float64** price change percentage
|
||||
|
||||
**Evidence:**
|
||||
```
|
||||
Limit values sample: [8.86, 9.36, 31.0, 7.32, 2.28, 6.39, 5.38, 4.03, 3.86, 9.89]
|
||||
Limit == 0: only 2 rows
|
||||
Limit > 0: 3738 rows
|
||||
```
|
||||
|
||||
This is a **fundamental data schema change**. The current Parquet files contain different data than what the original VAE model was trained on.
|
||||
|
||||
**Possible fixes:**
|
||||
1. Convert `Limit` and `Stopping` to boolean flags using a threshold
|
||||
2. Find the original data source that had boolean flags
|
||||
3. Re-train the VAE model with the new data schema
|
||||
|
||||
---
|
||||
|
||||
## Correlation Results
|
||||
|
||||
After fixing bugs 1-3, the embedding correlation with database 0_7:
|
||||
|
||||
| Metric | Value |
|
||||
|--------|-------|
|
||||
| Mean correlation (32 dims) | 0.0068 |
|
||||
| Median correlation | 0.0094 |
|
||||
| Overall correlation | 0.2330 |
|
||||
|
||||
**Conclusion:** Embeddings remain essentially uncorrelated (≈0).
|
||||
|
||||
---
|
||||
|
||||
## Root Cause
|
||||
|
||||
The **Limit/Stopping data schema change** is the most likely root cause. The VAE model learned to encode features that included binary limit/stopping flags, but the standalone pipeline feeds it continuous price change percentages instead.
|
||||
|
||||
---
|
||||
|
||||
## Next Steps
|
||||
|
||||
1. **Verify original data schema:**
|
||||
- Check if the original DolphinDB table had boolean `Limit` and `Stopping` columns
|
||||
- Compare with the current Parquet schema
|
||||
|
||||
2. **Fix the data loading:**
|
||||
- Either convert continuous values to binary flags
|
||||
- Or use the correct boolean columns (`IsZt`, `IsDt`) for limit flags
|
||||
|
||||
3. **Verify feature order:**
|
||||
- Ensure the qlib RobustZScoreNorm parameters are applied in the correct order
|
||||
- Check that `[alpha158, alpha158_ntrl, market_ext, market_ext_ntrl]` matches the 330-parameter shape
|
||||
|
||||
4. **Re-run comparison:**
|
||||
- Generate new embeddings with the corrected pipeline
|
||||
- Compare correlation with database
|
||||
@ -1,159 +0,0 @@
|
||||
# Data Pipeline Bug Analysis - Final Status
|
||||
|
||||
## Summary
|
||||
|
||||
After fixing all identified bugs, the feature count now matches (341), but the embeddings remain uncorrelated with the database 0_7 version.
|
||||
|
||||
**Latest Version**: v6
|
||||
- Feature count: 341 ✓ (matches VAE input dim)
|
||||
- Mean correlation with DB: 0.0050 (essentially zero)
|
||||
- Status: All identified bugs fixed, IsST issue documented
|
||||
- **New**: Polars-based dataset generation script added (`scripts/dump_polars_dataset.py`)
|
||||
|
||||
---
|
||||
|
||||
## Bugs Fixed
|
||||
|
||||
### 1. Market Classification (`FlagMarketInjector`) ✓ FIXED
|
||||
- **Bug**: Used `instrument >= 600000` which misclassified 新三板 instruments
|
||||
- **Fix**: Use string prefix matching with vocab_size=2 (not 3)
|
||||
- **Impact**: 167 instruments corrected
|
||||
|
||||
### 2. ColumnRemover Missing `IsN` ✓ FIXED
|
||||
- **Bug**: Only removed `IsZt, IsDt` but not `IsN`
|
||||
- **Fix**: Added `IsN` to removal list
|
||||
- **Impact**: Feature count alignment
|
||||
|
||||
### 3. RobustZScoreNorm Scope ✓ FIXED
|
||||
- **Bug**: Applied normalization to all 341 features
|
||||
- **Fix**: Only normalize 330 features (alpha158 + market_ext, both original + neutralized)
|
||||
- **Impact**: Correct normalization scope
|
||||
|
||||
### 4. Wrong Data Sources for Market Flags ✓ FIXED
|
||||
- **Bug**: Used `Limit, Stopping` (Float64) from kline_adjusted
|
||||
- **Fix**: Load from correct sources:
|
||||
- kline_adjusted: `IsZt, IsDt, IsN, IsXD, IsXR, IsDR` (Boolean)
|
||||
- market_flag: `open_limit, close_limit, low_limit, high_stop` (Boolean, 4 cols)
|
||||
- **Impact**: Correct boolean flag data
|
||||
|
||||
### 5. Feature Count Mismatch ✓ FIXED
|
||||
- **Bug**: 344 features (3 extra)
|
||||
- **Fix**: vocab_size=2 + 4 market_flag cols = 341 features
|
||||
- **Impact**: VAE input dimension matches
|
||||
|
||||
### 6. Fixed* Processors Not Adding Required Columns ✓ FIXED
|
||||
- **Bug**: `FixedFlagMarketInjector` only converted dtype but didn't add `market_0`, `market_1` columns
|
||||
- **Bug**: `FixedFlagSTInjector` only converted dtype but didn't create `IsST` column from `ST_S`, `ST_Y`
|
||||
- **Fix**:
|
||||
- `FixedFlagMarketInjector`: Now adds `market_0` (SH60xxx, SZ00xxx) and `market_1` (SH688xxx, SH689xxx, SZ300xxx, SZ301xxx)
|
||||
- `FixedFlagSTInjector`: Now creates `IsST = ST_S | ST_Y`
|
||||
- **Impact**: Processed data now has 408 columns (was 405), matching original qlib output
|
||||
|
||||
---
|
||||
|
||||
## Important Discovery: IsST Column Issue in Gold-Standard Code
|
||||
|
||||
### Problem Description
|
||||
|
||||
The `FlagSTInjector` processor in the original qlib proc_list is supposed to create an `IsST` column in the `feature_flag` group from the `ST_S` and `ST_Y` columns in the `st_flag` group. However, this processor **fails silently** even in the gold-standard qlib code.
|
||||
|
||||
### Root Cause
|
||||
|
||||
The `FlagSTInjector` processor attempts to access columns using a format that doesn't match the actual column structure in the data:
|
||||
|
||||
1. **Expected format**: The processor expects columns like `st_flag::ST_S` and `st_flag::ST_Y` (string format with `::` separator)
|
||||
2. **Actual format**: The qlib handler produces MultiIndex tuple columns like `('st_flag', 'ST_S')` and `('st_flag', 'ST_Y')`
|
||||
|
||||
This format mismatch causes the processor to fail to find the ST flag columns, and thus no `IsST` column is created.
|
||||
|
||||
### Evidence
|
||||
|
||||
```python
|
||||
# Check proc_list
|
||||
import pickle as pkl
|
||||
with open('proc_list.proc', 'rb') as f:
|
||||
proc_list = pkl.load(f)
|
||||
|
||||
# FlagSTInjector config
|
||||
flag_st = proc_list[2]
|
||||
print(f"fields_group: {flag_st.fields_group}") # 'feature_flag'
|
||||
print(f"col_name: {flag_st.col_name}") # 'IsST'
|
||||
print(f"st_group: {flag_st.st_group}") # 'st_flag'
|
||||
|
||||
# Check if IsST exists in processed data
|
||||
with open('processed_data.pkl', 'rb') as f:
|
||||
df = pkl.load(f)
|
||||
|
||||
feature_flag_cols = [c[1] for c in df.columns if c[0] == 'feature_flag']
|
||||
print('IsST' in feature_flag_cols) # False!
|
||||
```
|
||||
|
||||
### Impact
|
||||
|
||||
- **VAE training**: The VAE model was trained on data **without** the `IsST` column
|
||||
- **VAE input dimension**: 341 features (excluding IsST), not 342
|
||||
- **Polars pipeline**: Should also skip `IsST` to maintain compatibility
|
||||
|
||||
### Resolution
|
||||
|
||||
The polars-based pipeline (`dump_polars_dataset.py`) now correctly **skips** the `FlagSTInjector` step to match the gold-standard behavior:
|
||||
|
||||
```python
|
||||
# Step 3: FlagSTInjector - SKIPPED (fails even in gold-standard)
|
||||
print("[3] Skipping FlagSTInjector (as per gold-standard behavior)...")
|
||||
market_flag_with_st = market_flag_with_market # No IsST added
|
||||
```
|
||||
|
||||
### Lessons Learned
|
||||
|
||||
1. **Verify processor execution**: Don't assume all processors in the proc_list executed successfully. Check the output data to verify expected columns exist.
|
||||
|
||||
2. **Column format matters**: The qlib processors were designed for specific column formats (MultiIndex tuples vs `::` separator strings). Format mismatches can cause silent failures.
|
||||
|
||||
3. **Match the gold-standard bugs**: When replicating a pipeline, sometimes you need to replicate the bugs too. The VAE was trained on data without `IsST`, so our pipeline must also exclude it.
|
||||
|
||||
4. **Debug by comparing intermediate outputs**: Use scripts like `debug_data_divergence.py` to compare raw and processed data between the gold-standard and polars pipelines.
|
||||
|
||||
---
|
||||
|
||||
## Correlation Results (v5)
|
||||
|
||||
| Metric | Value |
|
||||
|--------|-------|
|
||||
| Mean correlation (32 dims) | 0.0050 |
|
||||
| Median correlation | 0.0079 |
|
||||
| Min | -0.0420 |
|
||||
| Max | 0.0372 |
|
||||
| Overall (flattened) | 0.2225 |
|
||||
|
||||
**Conclusion**: Embeddings remain essentially uncorrelated with database.
|
||||
|
||||
---
|
||||
|
||||
## Possible Remaining Issues
|
||||
|
||||
1. **Different input data values**: The alpha158_0_7_beta Parquet files may contain different values than the original DolphinDB data used to train the VAE.
|
||||
|
||||
2. **Feature ordering mismatch**: The 330 RobustZScoreNorm parameters must be applied in the exact order:
|
||||
- [0:158] = alpha158 original
|
||||
- [158:316] = alpha158_ntrl
|
||||
- [316:323] = market_ext original (7 cols)
|
||||
- [323:330] = market_ext_ntrl (7 cols)
|
||||
|
||||
3. **Industry neutralization differences**: Our `IndusNtrlInjector` implementation may differ from qlib's.
|
||||
|
||||
4. **Missing transformations**: There may be additional preprocessing steps not captured in handler.yaml.
|
||||
|
||||
5. **VAE model mismatch**: The VAE model may have been trained with different data than what handler.yaml specifies.
|
||||
|
||||
---
|
||||
|
||||
## Recommended Next Steps
|
||||
|
||||
1. **Compare intermediate features**: Run both the qlib pipeline and our pipeline on the same input data and compare outputs at each step.
|
||||
|
||||
2. **Verify RobustZScoreNorm parameter order**: Check if our feature ordering matches the order used during VAE training.
|
||||
|
||||
3. **Compare predictions, not embeddings**: Instead of comparing VAE embeddings, compare the final d033 model predictions with the original 0_7 predictions.
|
||||
|
||||
4. **Check alpha158 data source**: Verify that `stg_1day_wind_alpha158_0_7_beta_1D` contains the same data as the original DolphinDB `stg_1day_wind_alpha158_0_7_beta` table.
|
||||
@ -1,129 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Compare generated embeddings with gold standard embeddings from DolphinDB.
|
||||
"""
|
||||
|
||||
import polars as pl
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
|
||||
DATA_DIR = Path(__file__).parent / "../data"
|
||||
|
||||
|
||||
def compare_embeddings():
    """Compare generated and gold standard embeddings.

    Reports, per date: instrument-universe overlap, per-instrument
    embedding difference statistics, and per-dimension correlations.
    Finishes with whole-dataset summary statistics for both sources.
    """
    # Load data
    gold_path = DATA_DIR / "embedding_0_7_beta_gold_standard.parquet"
    gen_path = DATA_DIR / "embedding_0_7_beta_sample.parquet"

    print("=" * 60)
    print("Loading embeddings")
    print("=" * 60)

    gold = pl.read_parquet(gold_path)
    gen = pl.read_parquet(gen_path)

    print(f"Gold standard: {gold.shape}")
    print(f"Generated: {gen.shape}")

    # Get embedding columns
    emb_cols = [f"embedding_{i}" for i in range(32)]

    # Compare by date
    dates = sorted(gold["datetime"].unique().to_list())

    print("\n" + "=" * 60)
    print("Comparison by date")
    print("=" * 60)

    for dt in dates:
        gold_dt = gold.filter(pl.col("datetime") == dt)
        gen_dt = gen.filter(pl.col("datetime") == dt)

        print(f"\nDate: {dt}")
        print(f" Gold instruments: {gold_dt.height}, Generated instruments: {gen_dt.height}")
        print(f" Gold instrument sample: {gold_dt['instrument'].head(5).to_list()}")
        print(f" Gen instrument sample: {gen_dt['instrument'].head(5).to_list()}")

        # Check for common instruments
        gold_insts = set(gold_dt["instrument"].to_list())
        gen_insts = set(gen_dt["instrument"].to_list())
        common = gold_insts & gen_insts

        print(f" Common instruments: {len(common)}")

        if len(common) > 0:
            # Align rows by instrument before element-wise comparison.
            gold_common = gold_dt.filter(pl.col("instrument").is_in(list(common))).sort("instrument")
            gen_common = gen_dt.filter(pl.col("instrument").is_in(list(common))).sort("instrument")

            # FIX: extract both matrices ONCE as (rows, 32) arrays. The
            # original pulled every element individually with DataFrame
            # indexing inside nested Python loops, then re-extracted the
            # same matrices a second time for the correlation step —
            # O(rows * dims) scalar lookups replaced by two conversions.
            gold_mat = gold_common.select(emb_cols).to_numpy()
            gen_mat = gen_common.select(emb_cols).to_numpy()

            # Calculate embedding differences
            diffs = []
            for gold_emb, gen_emb in zip(gold_mat, gen_mat):
                diff = gold_emb - gen_emb
                l2_norm = np.linalg.norm(diff)
                # Epsilon guards against division by a zero-norm vector.
                rel_diff = l2_norm / (np.linalg.norm(gold_emb) + 1e-8)
                max_abs_diff = np.max(np.abs(diff))

                diffs.append({
                    "l2_norm": l2_norm,
                    "rel_diff": rel_diff,
                    "max_abs_diff": max_abs_diff,
                    "gold_norm": np.linalg.norm(gold_emb),
                    "gen_norm": np.linalg.norm(gen_emb)
                })

            diff_df = pl.DataFrame(diffs)
            print(f"\n Embedding comparison:")
            print(f" Mean L2 norm diff: {diff_df['l2_norm'].mean():.4f}")
            print(f" Mean rel diff: {diff_df['rel_diff'].mean():.4%}")
            print(f" Mean max abs diff: {diff_df['max_abs_diff'].mean():.4f}")
            print(f" Gold emb norm (mean): {diff_df['gold_norm'].mean():.4f}")
            print(f" Gen emb norm (mean): {diff_df['gen_norm'].mean():.4f}")

            # Correlation analysis (reuses the matrices extracted above).
            correlations = []
            for d in range(32):
                corr = np.corrcoef(gold_mat[:, d], gen_mat[:, d])[0, 1]
                correlations.append(corr)

            print(f"\n Correlation by dimension:")
            print(f" Mean: {np.mean(correlations):.4f}")
            print(f" Median: {np.median(correlations):.4f}")
            print(f" Min: {np.min(correlations):.4f}")
            print(f" Max: {np.max(correlations):.4f}")

            # Overall correlation
            overall_corr = np.corrcoef(gold_mat.flatten(), gen_mat.flatten())[0, 1]
            print(f" Overall (flattened): {overall_corr:.4f}")

    print("\n" + "=" * 60)
    print("Summary Statistics")
    print("=" * 60)

    # Gold standard stats
    gold_embs = gold.select(emb_cols).to_numpy()
    print("\nGold standard embeddings:")
    print(f" Mean: {np.mean(gold_embs):.6f}")
    print(f" Std: {np.std(gold_embs):.6f}")
    print(f" Min: {np.min(gold_embs):.6f}")
    print(f" Max: {np.max(gold_embs):.6f}")

    # Generated stats
    gen_embs = gen.select(emb_cols).to_numpy()
    print("\nGenerated embeddings:")
    print(f" Mean: {np.mean(gen_embs):.6f}")
    print(f" Std: {np.std(gen_embs):.6f}")
    print(f" Min: {np.min(gen_embs):.6f}")
    print(f" Max: {np.max(gen_embs):.6f}")
|
||||
|
||||
|
||||
# Script entry point: run the full gold-standard vs generated comparison.
if __name__ == "__main__":
    compare_embeddings()
|
||||
@ -1,254 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Debug script to compare gold-standard qlib data vs polars-based pipeline.
|
||||
|
||||
This script helps identify where the data loading and processing pipeline
|
||||
starts to diverge from the gold-standard qlib output.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import pickle as pkl
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import polars as pl
|
||||
from pathlib import Path
|
||||
|
||||
# Paths
|
||||
GOLD_RAW_PATH = "/home/guofu/Workspaces/alpha_lab/stock_1d/d033/alpha158_beta/data/raw_data_20190101_20190131.pkl"
|
||||
GOLD_PROC_PATH = "/home/guofu/Workspaces/alpha_lab/stock_1d/d033/alpha158_beta/data/processed_data_20190101_20190131.pkl"
|
||||
PROC_LIST_PATH = "/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/proc_list.proc"
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "scripts"))
|
||||
|
||||
def compare_raw_data():
    """Load the gold-standard raw DataFrame and print its structure.

    Reports shape, index names, per-group column counts and a few sample
    values so the polars pipeline's raw output can be checked against it.

    Returns:
        The gold-standard raw DataFrame (pandas, MultiIndex columns).
    """
    print("=" * 80)
    print("STEP 1: Compare RAW DATA (before proc_list)")
    print("=" * 80)

    # Load gold standard raw data
    with open(GOLD_RAW_PATH, "rb") as f:
        gold_raw = pkl.load(f)

    print(f"\nGold standard raw data:")
    print(f" Shape: {gold_raw.shape}")
    print(f" Index: {gold_raw.index.names}")

    # Hoist the level-0 values once; used for both the group list and
    # the per-group counts.
    level0 = gold_raw.columns.get_level_values(0)
    groups = level0.unique().tolist()
    print(f" Column groups: {groups}")

    # Count columns per group
    for grp in groups:
        count = (level0 == grp).sum()
        print(f" {grp}: {count} columns")

    # Spot-check a handful of known columns.
    print("\n Sample values (first 3 rows):")
    for col in (('feature', 'KMID'), ('feature_ext', 'turnover'), ('feature_ext', 'log_size')):
        if col in gold_raw.columns:
            print(f" {col}: {gold_raw[col].iloc[:3].tolist()}")

    return gold_raw
|
||||
|
||||
|
||||
def compare_processed_data():
    """Print a summary of the gold-standard processed data and return it.

    Loads the pickled post-processing DataFrame from GOLD_PROC_PATH and
    reports its shape, index names, column groups, per-group column
    counts, and sample values (including the *_ntrl variants).
    """
    print("\n" + "=" * 80)
    print("STEP 2: Compare PROCESSED DATA (after proc_list)")
    print("=" * 80)

    # Load gold-standard processed data.
    with open(GOLD_PROC_PATH, "rb") as fh:
        gold_proc = pkl.load(fh)

    print("\nGold standard processed data:")
    print(f" Shape: {gold_proc.shape}")
    print(f" Index: {gold_proc.index.names}")

    # Top-level column groups (computed once, reused for the counts below).
    top_level = gold_proc.columns.get_level_values(0)
    groups = top_level.unique().tolist()
    print(f" Column groups: {groups}")

    for group in groups:
        n_cols = (top_level == group).sum()
        print(f" {group}: {n_cols} columns")

    # Show sample values for key columns plus their neutralized twins.
    print("\n Sample values (first 3 rows):")
    sample_cols = (
        ('feature', 'KMID'),
        ('feature', 'KMID_ntrl'),
        ('feature_ext', 'turnover'),
        ('feature_ext', 'turnover_ntrl'),
    )
    for col in sample_cols:
        if col in gold_proc.columns:
            print(f" {col}: {gold_proc[col].iloc[:3].tolist()}")

    return gold_proc
|
||||
|
||||
|
||||
def analyze_processor_pipeline(gold_raw, gold_proc):
    """Analyze what transformations happened in the proc_list.

    Loads the pickled processor list from PROC_LIST_PATH and reports how
    the column set changed between the raw and processed frames.

    Args:
        gold_raw: Gold-standard DataFrame before the processor list.
        gold_proc: Gold-standard DataFrame after the processor list.
    """
    print("\n" + "=" * 80)
    print("STEP 3: Analyze Processor Transformations")
    print("=" * 80)

    # Load proc_list (a pickled list of processor objects).
    with open(PROC_LIST_PATH, "rb") as f:
        proc_list = pkl.load(f)

    print(f"\nProcessor pipeline ({len(proc_list)} processors):")
    for i, proc in enumerate(proc_list):
        print(f" [{i}] {type(proc).__name__}")

    # Column-count delta.  Use an explicit sign in the format spec: the
    # previous hard-coded "+" prefix printed "+-N" when columns were removed.
    delta = gold_proc.shape[1] - gold_raw.shape[1]
    print("\nColumn count changes:")
    print(f" Before: {gold_raw.shape[1]} columns")
    print(f" After: {gold_proc.shape[1]} columns")
    print(f" Change: {delta:+} columns")

    # Which columns the processors added or removed.
    gold_raw_cols = set(gold_raw.columns)
    gold_proc_cols = set(gold_proc.columns)

    added_cols = gold_proc_cols - gold_raw_cols
    removed_cols = gold_raw_cols - gold_proc_cols

    print(f"\n Added columns: {len(added_cols)}")
    print(f" Removed columns: {len(removed_cols)}")

    if removed_cols:
        print(f" Removed: {list(removed_cols)[:10]}...")

    # Feature column patterns: the processed frame carries *_ntrl variants
    # alongside the original feature columns.
    print("\nFeature column patterns in processed data:")
    feature_cols = [c for c in gold_proc.columns if c[0] == 'feature']
    ntrl_cols = [c for c in feature_cols if c[1].endswith('_ntrl')]
    raw_cols = [c for c in feature_cols if not c[1].endswith('_ntrl')]
    print(f" Total feature columns: {len(feature_cols)}")
    print(f" _ntrl columns: {len(ntrl_cols)}")
    print(f" raw columns: {len(raw_cols)}")
|
||||
|
||||
|
||||
def check_polars_pipeline():
    """Run the polars-based pipeline and compare its output to the gold standard.

    Loads the same date range through the standalone polars pipeline,
    compares column names against the gold-standard raw data, and spot-checks
    summary statistics (min/max/mean) for the first few common columns.
    Failures are printed rather than raised — this is a diagnostic script.
    """
    print("\n" + "=" * 80)
    print("STEP 4: Generate data using Polars pipeline")
    print("=" * 80)

    try:
        from generate_beta_embedding import (
            load_all_data, merge_data_sources, apply_feature_pipeline,
            filter_stock_universe
        )

        # Load data using the polars pipeline.
        print("\nLoading data with polars pipeline...")
        df_alpha, df_kline, df_flag, df_industry = load_all_data(
            "2019-01-01", "2019-01-31"
        )

        print("\nPolars data sources loaded:")
        print(f" Alpha158: {df_alpha.shape}")
        print(f" Kline (market_ext): {df_kline.shape}")
        print(f" Flags: {df_flag.shape}")
        print(f" Industry: {df_industry.shape}")

        # Merge all sources into one frame.
        df_merged = merge_data_sources(df_alpha, df_kline, df_flag, df_industry)
        print(f"\nAfter merge: {df_merged.shape}")

        # Convert to pandas for easier comparison against the gold standard.
        df_pandas = df_merged.to_pandas()
        df_pandas = df_pandas.set_index(['datetime', 'instrument'])

        print(f"\nAfter converting to pandas MultiIndex: {df_pandas.shape}")

        with open(GOLD_RAW_PATH, "rb") as f:
            gold_raw = pkl.load(f)

        print("\n" + "=" * 80)
        print("STEP 5: Compare Column Names (Gold vs Polars)")
        print("=" * 80)

        gold_cols = set(str(c) for c in gold_raw.columns)
        polars_cols = set(str(c) for c in df_pandas.columns)

        common_cols = gold_cols & polars_cols
        only_in_gold = gold_cols - polars_cols
        only_in_polars = polars_cols - gold_cols

        print(f"\n Common columns: {len(common_cols)}")
        print(f" Only in gold standard: {len(only_in_gold)}")
        print(f" Only in polars: {len(only_in_polars)}")

        if only_in_gold:
            print("\n Columns only in gold standard (first 20):")
            for col in list(only_in_gold)[:20]:
                print(f" {col}")

        if only_in_polars:
            print("\n Columns only in polars (first 20):")
            for col in list(only_in_polars)[:20]:
                print(f" {col}")

        print("\n" + "=" * 80)
        print("STEP 6: Compare Values for Common Columns")
        print("=" * 80)

        # Build a str -> column lookup once (O(n)) instead of the previous
        # nested scan over both column sets (O(n^2)).
        polars_by_str = {str(pc): pc for pc in df_pandas.columns}
        common_tuples = [
            (gc, polars_by_str[str(gc)])
            for gc in gold_raw.columns
            if str(gc) in polars_by_str
        ]

        print(f"\nComparing {len(common_tuples)} common columns...")

        # Spot-check the first 20 common columns by min/max/mean.
        matching_count = 0
        diff_count = 0
        for gc, pc in common_tuples[:20]:
            gold_vals = gold_raw[gc].dropna().values
            polars_vals = df_pandas[pc].dropna().values

            if len(gold_vals) > 0 and len(polars_vals) > 0:
                if np.allclose([gold_vals.min(), gold_vals.max(), gold_vals.mean()],
                               [polars_vals.min(), polars_vals.max(), polars_vals.mean()],
                               rtol=1e-5):
                    matching_count += 1
                else:
                    diff_count += 1
                    if diff_count <= 3:
                        print(f" DIFF: {gc}")
                        print(f" Gold: min={gold_vals.min():.6f}, max={gold_vals.max():.6f}, mean={gold_vals.mean():.6f}")
                        print(f" Polars: min={polars_vals.min():.6f}, max={polars_vals.max():.6f}, mean={polars_vals.mean():.6f}")

        print(f"\n Matching columns: {matching_count}")
        print(f" Different columns: {diff_count}")

    except Exception as e:
        # Diagnostic script: report and continue rather than crash the run.
        print(f"\nError running polars pipeline: {e}")
        import traceback
        traceback.print_exc()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    banner = "=" * 80
    print(banner)
    print("DATA DIVERGENCE DEBUG SCRIPT")
    print("Comparing gold-standard qlib output vs polars-based pipeline")
    print(banner)

    # Step 1: summarize the raw gold-standard data.
    gold_raw = compare_raw_data()

    # Step 2: summarize the processed gold-standard data.
    gold_proc = compare_processed_data()

    # Step 3: analyze what the processor list changed.
    analyze_processor_pipeline(gold_raw, gold_proc)

    # Steps 4-6: run the polars pipeline and compare.
    check_polars_pipeline()

    print("\n" + banner)
    print("DEBUG COMPLETE")
    print(banner)
|
||||
@ -1,421 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Dump Gold-Standard Data from Qlib Pipeline
|
||||
|
||||
This script exports processed feature data from the original Qlib pipeline
|
||||
in multiple formats for debugging and comparison with the standalone Polars implementation.
|
||||
|
||||
Usage:
|
||||
python dump_qlib_gold_standard.py --start-date 2020-01-02 --end-date 2020-01-10 --output-dir ../data/
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import pickle as pkl
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import polars as pl
|
||||
import numpy as np
|
||||
|
||||
# Patch NumPy 2.0 compatibility: np.NaN was removed, use np.nan
|
||||
if not hasattr(np, 'NaN'):
|
||||
np.NaN = np.nan
|
||||
|
||||
|
||||
def parse_args():
    """Parse command-line options for the gold-standard export script."""
    parser = argparse.ArgumentParser(
        description="Dump gold-standard data from Qlib pipeline"
    )

    # (flag, default, help) triples; every option is a plain string.
    string_options = (
        ("--start-date", "2020-01-02",
         "Start date for data export (YYYY-MM-DD)"),
        ("--end-date", "2020-01-10",
         "End date for data export (YYYY-MM-DD)"),
        ("--output-dir", "../data/",
         "Output directory for exported files"),
        ("--qlib-dataset-path",
         "/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/",
         "Path to Qlib dataset module"),
    )
    for flag, default, help_text in string_options:
        parser.add_argument(flag, type=str, default=default, help=help_text)

    return parser.parse_args()
|
||||
|
||||
|
||||
def load_qlib_data(qlib_dataset_path, since_date):
    """Load and process data through the original Qlib pipeline.

    Imports the dataset package directly from its ``__init__.py``, pulls the
    raw handler output (a SepDataFrame of column groups, or a plain
    DataFrame), applies the pickled processor list, and returns a single
    DataFrame filtered to start at ``since_date``.

    Args:
        qlib_dataset_path: Path to the Qlib dataset module directory.
        since_date: First date to keep in the result (YYYY-MM-DD).

    Returns:
        pd.DataFrame: Processed frame with "<group>::" column prefixes.
    """
    import importlib.util
    import datetime as dt

    # The legacy pipeline calls yaml.safe_load, which newer ruamel.yaml
    # releases removed; shim it onto the YAML(typ='safe') API.
    import ruamel.yaml as yaml

    safe_yaml = yaml.YAML(typ='safe', pure=True)

    def patched_safe_load(stream):
        import io
        if isinstance(stream, str):
            stream = io.StringIO(stream)
        return safe_yaml.load(stream)

    yaml.safe_load = patched_safe_load

    # Import the dataset package straight from disk.
    spec = importlib.util.spec_from_file_location(
        "qlib_dataset",
        os.path.join(qlib_dataset_path, "__init__.py")
    )
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)

    since_date_dt = pd.to_datetime(since_date)
    # Load 20 extra days of history so Diff-style processors have context.
    load_start = (since_date_dt - dt.timedelta(days=20)).strftime("%Y-%m-%d")

    print(f" Loading data with handler (load_start={load_start})...")

    # _load_from_yaml returns the raw handler data (SepDataFrame).
    handler_data = module._load_from_yaml(
        os.path.join(qlib_dataset_path, "handler.yaml"),
        load_start
    )

    if hasattr(handler_data, '_data') or hasattr(handler_data, '_df_dict'):
        # SepDataFrame from AggHandler: concatenate its column groups,
        # prefixing every column with "<group>::".
        df_dict = getattr(handler_data, '_data', None) or getattr(handler_data, '_df_dict', {})
        group_names = list(df_dict.keys())
        print(f" Handler returned SepDataFrame with groups: {group_names}")

        prefixed_frames = []
        for group in group_names:
            group_df = df_dict[group]
            if group_df is not None and len(group_df.columns) > 0:
                prefixed = group_df.copy()
                prefixed.columns = [f"{group}::{col}" for col in prefixed.columns]
                prefixed_frames.append(prefixed)
                print(f" Group '{group}': {prefixed.shape}")

        raw_df = pd.concat(prefixed_frames, axis=1)
        print(f" Concatenated raw data shape: {raw_df.shape}")
    else:
        raw_df = handler_data
        print(f" Raw data shape: {raw_df.shape}")

    # Load the pickled processor list.
    proc_path = os.path.join(qlib_dataset_path, "proc_list.proc")
    print(f" Loading processor list from: {proc_path}")
    with open(proc_path, "rb") as f:
        proc_list = pkl.load(f)
    print(f" Processor list has {len(proc_list)} processors")
    for i, proc in enumerate(proc_list):
        print(f" {i+1}. {type(proc).__name__}")

    from qlib.contrib.data.utils import apply_proc_list
    print(" Applying processor list (with_fit=False)...")

    # The processors expect the original (unprefixed) column names, so strip
    # the "<group>::" prefix before applying and restore it afterwards.
    col_mapping = {
        col: col.split('::', 1)[1]
        for col in raw_df.columns
        if '::' in col
    }

    raw_df_renamed = raw_df.rename(columns=col_mapping)
    print(f" Renamed columns for processor compatibility. Shape: {raw_df_renamed.shape}")

    # Cast boolean columns to object so NaNs introduced downstream don't
    # force a bool -> int conversion.
    bool_cols = raw_df_renamed.select_dtypes(include=['bool']).columns
    print(f" Converting {len(bool_cols)} boolean columns to object dtype")
    for col in bool_cols:
        raw_df_renamed[col] = raw_df_renamed[col].astype(object)

    df = apply_proc_list(raw_df_renamed, proc_list=proc_list, with_fit=False)
    print(f" Applied processor list. Result shape: {df.shape}")

    # NOTE(review): inverting the mapping assumes unprefixed names are unique
    # across groups; a duplicate name in two groups would collapse here —
    # confirm against the handler's column layout.
    new_col_mapping = {v: k for k, v in col_mapping.items()}
    df = df.rename(columns=new_col_mapping)
    print(f" Restored column group prefixes. Shape: {df.shape}")

    # Drop the extra history rows loaded for the processors.
    df = df.loc(axis=0)[slice(since_date_dt, None)]
    print(f" Filtered to since_date={since_date}. Final shape: {df.shape}")

    return df
|
||||
|
||||
|
||||
def export_column_groups(df, output_dir, prefix="gold_standard"):
    """Export separate parquet files for different column groups.

    Column groups (selected by name pattern):
        - feature:       columns prefixed "feature::" (alpha158 + alpha158_ntrl)
        - feature_ext:   columns prefixed "feature_ext::" (extended features)
        - feature_flag:  columns prefixed "feature_flag::" (market flags)
        - indus_idx:     columns prefixed "indus_idx::" (industry index)
        - feature_ntrl:  any column ending in "_ntrl" (exported separately,
                         overlapping with the groups above)

    Args:
        df: DataFrame whose string column names carry the group prefixes.
        output_dir: Directory to write the parquet files into.
        prefix: Filename prefix for each exported file.

    Returns:
        dict: group name -> path of the written parquet file (empty groups
        are skipped, matching the previous behavior).
    """
    # Data-driven replacement for five copy-pasted export stanzas; the
    # order matches the original so console output is identical.
    group_specs = (
        ("feature", lambda c: c.startswith("feature::")),
        ("feature_ext", lambda c: c.startswith("feature_ext::")),
        ("feature_flag", lambda c: c.startswith("feature_flag::")),
        ("indus_idx", lambda c: c.startswith("indus_idx::")),
        ("feature_ntrl", lambda c: c.endswith("_ntrl")),
    )

    export_paths = {}
    for group, selector in group_specs:
        cols = [c for c in df.columns if selector(c)]
        if not cols:
            continue
        path = os.path.join(output_dir, f"{prefix}_{group}.parquet")
        df[cols].to_parquet(path)
        export_paths[group] = path
        print(f" Exported {group} columns ({len(cols)}): {path}")

    return export_paths
|
||||
|
||||
|
||||
def export_metadata(df, output_dir, prefix="gold_standard", proc_list_path=None):
    """Write a human-readable metadata report for the exported dataset.

    The report covers shape, date range, instruments, column groups, dtypes,
    NaN statistics, the full column list, and (optionally) the processor
    list used to produce the data.

    Args:
        df: DataFrame with a (datetime, instrument) MultiIndex and string
            column names carrying "<group>::" prefixes.
        output_dir: Directory to write the report into.
        prefix: Filename prefix for the report.
        proc_list_path: Optional path to a pickled processor list to describe.

    Returns:
        str: Path of the written metadata file.
    """
    metadata_path = os.path.join(output_dir, f"{prefix}_metadata.txt")

    with open(metadata_path, "w") as f:
        f.write("=" * 80 + "\n")
        f.write("GOLD-STANDARD QLIB PIPELINE OUTPUT - METADATA\n")
        f.write("=" * 80 + "\n\n")

        f.write(f"Export Date: {datetime.now().isoformat()}\n\n")

        f.write("DATAFRAME SHAPE\n")
        f.write("-" * 40 + "\n")
        f.write(f"Shape: {df.shape}\n")
        f.write(f"Rows: {len(df)}\n")
        f.write(f"Columns: {len(df.columns)}\n\n")

        f.write("DATE RANGE\n")
        f.write("-" * 40 + "\n")
        dates = df.index.get_level_values("datetime").unique()
        f.write(f"Min Date: {dates.min()}\n")
        f.write(f"Max Date: {dates.max()}\n")
        f.write(f"Unique Dates: {len(dates)}\n\n")

        f.write("INSTRUMENTS\n")
        f.write("-" * 40 + "\n")
        instruments = df.index.get_level_values("instrument").unique()
        f.write(f"Unique Instruments: {len(instruments)}\n")
        f.write(f"Sample Instruments: {list(instruments[:10])}\n\n")

        f.write("COLUMN GROUPS\n")
        f.write("-" * 40 + "\n")

        # Categorize columns by "<group>::" prefix / "_ntrl" suffix.
        feature_cols = [c for c in df.columns if c.startswith("feature::")]
        feature_ext_cols = [c for c in df.columns if c.startswith("feature_ext::")]
        feature_flag_cols = [c for c in df.columns if c.startswith("feature_flag::")]
        indus_idx_cols = [c for c in df.columns if c.startswith("indus_idx::")]
        feature_ntrl_cols = [c for c in df.columns if c.endswith("_ntrl")]

        f.write(f"feature:: columns: {len(feature_cols)}\n")
        f.write(f"feature_ext:: columns: {len(feature_ext_cols)}\n")
        f.write(f"feature_flag:: columns: {len(feature_flag_cols)}\n")
        f.write(f"indus_idx:: columns: {len(indus_idx_cols)}\n")
        f.write(f"*_ntrl columns: {len(feature_ntrl_cols)}\n\n")

        f.write("COLUMN DTYPES\n")
        f.write("-" * 40 + "\n")
        dtype_counts = df.dtypes.value_counts()
        for dtype, count in dtype_counts.items():
            f.write(f"{dtype}: {count}\n")
        f.write("\n")

        f.write("NAN STATISTICS\n")
        f.write("-" * 40 + "\n")
        # Compute per-column NaN counts once and reuse them for the total
        # (the previous version recomputed df.isna().sum() a second time).
        nan_counts = df.isna().sum()
        cols_with_nan = nan_counts[nan_counts > 0]
        f.write(f"Columns with NaN: {len(cols_with_nan)}\n")
        f.write(f"Total NaN values: {nan_counts.sum()}\n\n")

        if len(cols_with_nan) > 0:
            f.write("NaN per column (top 20):\n")
            for col, cnt in cols_with_nan.nlargest(20).items():
                f.write(f" {col}: {cnt} ({100*cnt/len(df):.2f}%)\n")
            f.write("\n")

        f.write("ALL COLUMN NAMES\n")
        f.write("-" * 40 + "\n")
        for i, col in enumerate(df.columns):
            f.write(f" {i+1}. {col}\n")
        f.write("\n")

        if proc_list_path and os.path.exists(proc_list_path):
            f.write("PROCESSOR LIST\n")
            f.write("-" * 40 + "\n")
            f.write(f"Source: {proc_list_path}\n")
            try:
                with open(proc_list_path, "rb") as pf:
                    proc_list = pkl.load(pf)
                f.write(f"Number of processors: {len(proc_list)}\n\n")
                for i, proc in enumerate(proc_list):
                    f.write(f" {i+1}. {proc}\n")
            except Exception as e:
                # Best-effort: a corrupt proc_list must not kill the export.
                f.write(f"Could not load processor list: {e}\n")
            f.write("\n")

    print(f"Exported metadata: {metadata_path}")
    return metadata_path
|
||||
|
||||
|
||||
def main():
    """Script entry point: load, filter, and export gold-standard data."""
    args = parse_args()

    # Parse dates once; keep both the timestamps and their display strings.
    start_ts = pd.to_datetime(args.start_date)
    end_ts = pd.to_datetime(args.end_date)
    start_str = start_ts.strftime("%Y-%m-%d")
    end_str = end_ts.strftime("%Y-%m-%d")

    # Ensure the output directory exists.
    out_dir = Path(args.output_dir).resolve()
    out_dir.mkdir(parents=True, exist_ok=True)

    banner = "=" * 80
    print(banner)
    print("DUMP GOLD-STANDARD DATA FROM QLIB PIPELINE")
    print(banner)
    print(f"Date Range: {start_str} to {end_str}")
    print(f"Output Directory: {out_dir}")
    print(f"Qlib Dataset Path: {args.qlib_dataset_path}")
    print()

    # Step 1: load data through the Qlib pipeline.
    print("Step 1: Loading data from Qlib pipeline...")
    print(f" Loading since_date={start_str}")

    try:
        df = load_qlib_data(args.qlib_dataset_path, start_str)
        print(f" Loaded DataFrame with shape: {df.shape}")
    except Exception as e:
        print(f" ERROR: Failed to load data from Qlib pipeline: {e}")
        sys.exit(1)

    # Step 2: trim to the requested window.
    print("\nStep 2: Filtering to requested date range...")
    df = df.loc(axis=0)[slice(start_ts, end_ts)]
    print(f" Filtered shape: {df.shape}")

    # Step 3: full-frame exports (parquet + pickle).
    print("\nStep 3: Exporting full DataFrame...")
    prefix = f"gold_standard_{start_ts.strftime('%Y%m%d')}_{end_ts.strftime('%Y%m%d')}"

    parquet_path = out_dir / f"{prefix}.parquet"
    df.to_parquet(parquet_path)
    print(f" Exported parquet: {parquet_path}")

    pkl_path = out_dir / f"{prefix}.pkl"
    df.to_pickle(pkl_path)
    print(f" Exported pickle: {pkl_path}")

    # Step 4: per-group exports.
    print("\nStep 4: Exporting column groups...")
    export_paths = export_column_groups(df, str(out_dir), prefix=prefix)

    # Step 5: metadata report.
    print("\nStep 5: Exporting metadata...")
    proc_list_path = os.path.join(args.qlib_dataset_path, "proc_list.proc")
    export_metadata(df, str(out_dir), prefix=prefix, proc_list_path=proc_list_path)

    # Summary.
    print("\n" + banner)
    print("EXPORT SUMMARY")
    print(banner)
    print(f"Date range: {start_str} to {end_str}")
    print(f"Output directory: {out_dir}")
    print(f"Total rows: {len(df)}")
    print(f"Total columns: {len(df.columns)}")
    print("\nFiles exported:")
    print(f" - {prefix}.parquet (full DataFrame)")
    print(f" - {prefix}.pkl (pickle, preserves dtypes)")
    print(f" - {prefix}_metadata.txt (column info, statistics)")
    for group, path in export_paths.items():
        print(f" - {os.path.basename(path)} ({group} columns)")
    print("\nDone!")


if __name__ == "__main__":
    main()
|
||||
@ -1,270 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Dump Gold-Standard Data from Qlib Pipeline (Simple Version)
|
||||
|
||||
This script exports the RAW feature data from the Qlib pipeline BEFORE
|
||||
any processors are applied. This is useful for debugging and comparison.
|
||||
|
||||
NOTE: This script loads ALL data from DolphinDB and then filters to the
|
||||
requested date range. For large date ranges, this may require significant memory.
|
||||
|
||||
Usage:
|
||||
python dump_qlib_gold_standard_simple.py --start-date 2020-01-02 --end-date 2020-01-10
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import pickle as pkl
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
# Patch NumPy 2.0 compatibility: np.NaN was removed, use np.nan
|
||||
if not hasattr(np, 'NaN'):
|
||||
np.NaN = np.nan
|
||||
|
||||
|
||||
def parse_args():
    """Parse command-line options for the raw gold-standard export script."""
    parser = argparse.ArgumentParser(
        description="Dump gold-standard raw data from Qlib pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Export a few days for debugging (recommended)
  python dump_qlib_gold_standard_simple.py --start-date 2020-01-02 --end-date 2020-01-10

  # Export with custom output directory
  python dump_qlib_gold_standard_simple.py --start-date 2020-01-02 --end-date 2020-01-10 --output-dir /path/to/output
"""
    )

    # (flag, default, help) triples; every option is a plain string.
    string_options = (
        ("--start-date", "2020-01-02",
         "Start date for data export (YYYY-MM-DD)"),
        ("--end-date", "2020-01-10",
         "End date for data export (YYYY-MM-DD)"),
        ("--output-dir", "../data/",
         "Output directory for exported files"),
        ("--qlib-dataset-path",
         "/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/",
         "Path to Qlib dataset module"),
        ("--instruments", None,
         "Comma-separated list of instrument codes to export (default: all)"),
    )
    for flag, default, help_text in string_options:
        parser.add_argument(flag, type=str, default=default, help=help_text)

    return parser.parse_args()
|
||||
|
||||
|
||||
def load_raw_data(qlib_dataset_path, since_date, instruments=None):
    """Load RAW data from the Qlib pipeline (before processors are applied).

    Args:
        qlib_dataset_path: Path to the Qlib dataset module directory.
        since_date: Start date for loading (extra history is loaded before
            it so Diff-style processors would have context).
        instruments: Optional list of instrument codes to keep.

    Returns:
        tuple: (dict of group name -> DataFrame, handler index).  A plain
        DataFrame result is wrapped under the key "default".
    """
    import importlib.util

    # The legacy pipeline calls yaml.safe_load, which newer ruamel.yaml
    # releases removed; shim it onto the YAML(typ='safe') API.
    import ruamel.yaml as yaml

    safe_yaml = yaml.YAML(typ='safe', pure=True)

    def patched_safe_load(stream):
        import io
        if isinstance(stream, str):
            stream = io.StringIO(stream)
        return safe_yaml.load(stream)

    yaml.safe_load = patched_safe_load

    # Import the dataset package straight from its __init__.py.
    spec = importlib.util.spec_from_file_location(
        "qlib_dataset",
        os.path.join(qlib_dataset_path, "__init__.py")
    )
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)

    since_date_dt = pd.to_datetime(since_date)
    # Load 20 extra days of history for downstream Diff processors.
    load_start = (since_date_dt - timedelta(days=20)).strftime("%Y-%m-%d")

    print(f" Loading raw data from handler (load_start={load_start})...")
    if instruments:
        print(f" Filtering instruments: {instruments[:5]}... ({len(instruments)} total)")

    # _load_from_yaml returns the raw handler data (SepDataFrame).
    handler_data = module._load_from_yaml(
        os.path.join(qlib_dataset_path, "handler.yaml"),
        load_start
    )

    if not (hasattr(handler_data, '_data') or hasattr(handler_data, '_df_dict')):
        # Plain DataFrame: wrap it so callers always get a dict.
        print(f" Handler returned DataFrame: shape={handler_data.shape}")
        return {"default": handler_data}, handler_data.index

    # SepDataFrame: extract its per-group frames.
    df_dict = getattr(handler_data, '_data', None) or getattr(handler_data, '_df_dict', {})
    group_names = list(df_dict.keys())
    print(f" Handler returned SepDataFrame with groups: {group_names}")

    if instruments:
        # Keep only the requested instruments in each group.
        print(" Filtering to specified instruments...")
        for group in group_names:
            if df_dict[group] is not None:
                group_df = df_dict[group]
                if isinstance(group_df.index, pd.MultiIndex):
                    keep = group_df.index.get_level_values('instrument').isin(instruments)
                    df_dict[group] = group_df[keep]
                    print(f" Group '{group}': {df_dict[group].shape} (filtered)")

    for group in group_names:
        group_df = df_dict[group]
        if group_df is not None:
            print(f" Group '{group}': shape={group_df.shape}, columns={len(group_df.columns)}")

    # NOTE(review): the returned index comes from the unfiltered handler
    # data even when an instrument filter was applied — confirm callers
    # only use it for date filtering.
    return df_dict, handler_data.index
|
||||
|
||||
|
||||
def export_data(df_dict, index, output_dir, start_date, end_date):
    """Export per-group data to parquet/pickle plus a metadata report.

    Args:
        df_dict: dict of group name -> DataFrame (values may be None).
        index: Handler index used to select rows in the date window.
        output_dir: Directory to write the exported files into.
        start_date / end_date: Inclusive date window (parseable strings).

    Returns:
        list[str]: Paths of every file written (parquet, pickle, metadata).
    """
    out_dir = Path(output_dir).resolve()
    out_dir.mkdir(parents=True, exist_ok=True)

    start_ts = pd.to_datetime(start_date)
    end_ts = pd.to_datetime(end_date)

    # Restrict the handler index to the requested window.
    in_window = (index >= start_ts) & (index <= end_ts)
    filtered_index = index[in_window]

    print(f"\nExporting data for date range: {start_ts.strftime('%Y-%m-%d')} to {end_ts.strftime('%Y-%m-%d')}")
    print(f" Filtered index has {len(filtered_index)} dates")

    prefix = f"gold_standard_raw_{start_ts.strftime('%Y%m%d')}_{end_ts.strftime('%Y%m%d')}"

    written = []

    # One parquet + one pickle per non-empty group.
    for group, group_df in df_dict.items():
        if group_df is None or len(group_df.columns) == 0:
            print(f" Skipping empty group '{group}'")
            continue

        subset = group_df.loc[group_df.index.isin(filtered_index)]
        print(f" Group '{group}': {subset.shape}")

        parquet_path = out_dir / f"{prefix}_{group}.parquet"
        subset.to_parquet(parquet_path)
        written.append(str(parquet_path))
        print(f" -> {parquet_path}")

        # Pickle preserves dtypes exactly.
        pkl_path = out_dir / f"{prefix}_{group}.pkl"
        subset.to_pickle(pkl_path)
        written.append(str(pkl_path))

    # Metadata report.
    metadata_path = out_dir / f"{prefix}_metadata.txt"
    with open(metadata_path, "w") as f:
        f.write("=" * 80 + "\n")
        f.write("GOLD-STANDARD RAW DATA - METADATA\n")
        f.write("=" * 80 + "\n\n")
        f.write(f"Export Date: {datetime.now().isoformat()}\n")
        f.write(f"Date Range: {start_ts.strftime('%Y-%m-%d')} to {end_ts.strftime('%Y-%m-%d')}\n")
        f.write(f"Total Dates: {len(filtered_index)}\n\n")

        f.write("COLUMN GROUPS:\n")
        f.write("-" * 40 + "\n")
        for group, group_df in df_dict.items():
            if group_df is not None:
                f.write(f" {group}:\n")
                f.write(f" Shape: {group_df.shape}\n")
                f.write(f" Columns: {len(group_df.columns)}\n")
                f.write(f" Sample columns: {list(group_df.columns[:5])}...\n\n")

        f.write("\nPROCESSOR LIST (for reference):\n")
        f.write("-" * 40 + "\n")
        proc_path = "/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/proc_list.proc"
        if os.path.exists(proc_path):
            with open(proc_path, "rb") as pf:
                proc_list = pkl.load(pf)
            f.write(f"Number of processors: {len(proc_list)}\n\n")
            for i, proc in enumerate(proc_list):
                f.write(f" {i+1}. {type(proc).__module__}.{type(proc).__name__}\n")
        else:
            f.write(f"Processor list not found: {proc_path}\n")

    written.append(str(metadata_path))

    return written
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: load raw data from the Qlib pipeline and export it.

    Reads CLI arguments, loads the raw data (optionally restricted to a
    comma-separated instrument list), exports it via ``export_data``, and
    prints a summary of the written files. Exits with status 1 if loading fails.
    """
    args = parse_args()

    banner = "=" * 80
    print(banner)
    print("DUMP GOLD-STANDARD RAW DATA FROM QLIB PIPELINE")
    print(banner)
    print(f"Date Range: {args.start_date} to {args.end_date}")
    print(f"Output Directory: {args.output_dir}")
    print(f"Qlib Dataset Path: {args.qlib_dataset_path}")
    print()

    # Step 1: pull the raw frames out of the Qlib pipeline.
    print("Step 1: Loading raw data from Qlib pipeline...")
    try:
        # `--instruments` arrives as a comma-separated string; None means "all".
        instruments = args.instruments.split(',') if args.instruments else None
        df_dict, index = load_raw_data(args.qlib_dataset_path, args.start_date, instruments=instruments)
    except Exception as e:
        print(f" ERROR: Failed to load data: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)

    # Step 2: write parquet/pickle/metadata files to the output directory.
    print("\nStep 2: Exporting data...")
    exported_files = export_data(df_dict, index, args.output_dir, args.start_date, args.end_date)

    # Final summary of everything that was written.
    print("\n" + banner)
    print("EXPORT SUMMARY")
    print(banner)
    print(f"Date range: {args.start_date} to {args.end_date}")
    print(f"Output directory: {Path(args.output_dir).resolve()}")
    print(f"\nFiles exported ({len(exported_files)}):")
    for exported in exported_files:
        print(f" - {exported}")
    print("\nDone!")
|
||||
|
||||
|
||||
# Script entry point: run the full dump when executed directly.
if __name__ == "__main__":
    main()
|
||||
@ -1,186 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Regenerate beta embeddings for a few days of sample data.
|
||||
|
||||
This script generates embeddings for a small date range to test the pipeline.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import pickle as pkl
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import List, Dict, Optional
|
||||
|
||||
# Add parent directory to path
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
# Import from the main generate script
|
||||
from generate_beta_embedding import (
|
||||
load_all_data,
|
||||
merge_data_sources,
|
||||
apply_feature_pipeline,
|
||||
prepare_vae_features,
|
||||
load_vae_model,
|
||||
encode_with_vae,
|
||||
load_qlib_processor_params,
|
||||
VAE_INPUT_DIM,
|
||||
OUTPUT_DIR,
|
||||
)
|
||||
|
||||
# Sample dates for testing (5 consecutive trading days)
|
||||
SAMPLE_DATES = [
|
||||
"2019-01-02",
|
||||
"2019-01-03",
|
||||
"2019-01-04",
|
||||
"2019-01-07",
|
||||
"2019-01-08",
|
||||
]
|
||||
|
||||
VAE_MODEL_PATH = "/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/model/csiallx_feature2_ntrla_flag_pnlnorm_vae4_dim32a_beta0001/module.pt"
|
||||
|
||||
|
||||
def generate_sample_embeddings(
    dates: List[str] = SAMPLE_DATES,
    output_file: str = "embedding_0_7_beta_sample.parquet",
    use_vae: bool = True
) -> pl.DataFrame:
    """
    Generate embeddings for a sample of dates.

    Loads the raw data sources, filters them to ``dates``, runs the feature
    pipeline, encodes with the VAE (or falls back to deterministic random
    embeddings), and writes the result to ``output_file`` as parquet.

    Args:
        dates: List of dates in YYYY-MM-DD format
        output_file: Output parquet file path
        use_vae: Whether to use VAE for encoding (or random embeddings)

    Returns:
        Polars DataFrame with ``datetime``, ``instrument`` and
        ``embedding_0`` ... ``embedding_31`` columns.
    """
    def _random_embeddings(n_rows: int) -> np.ndarray:
        # Deterministic fallback used both when the VAE is disabled and when
        # VAE encoding raises; previously duplicated in two branches.
        np.random.seed(42)
        return np.random.randn(n_rows, 32).astype(np.float32)

    start_date = dates[0]
    end_date = dates[-1]

    print("=" * 60)
    print("Generating Sample Beta Embeddings")
    print(f"Dates: {dates}")
    print(f"Use VAE: {use_vae}")
    print("=" * 60)

    # Load all data sources for the full [start_date, end_date] span.
    df_alpha, df_kline, df_flag, df_industry = load_all_data(start_date, end_date)

    print(f"\nLoaded data:")
    print(f" Alpha158: {df_alpha.shape}")
    print(f" Kline: {df_kline.shape}")
    print(f" Flags: {df_flag.shape}")
    print(f" Industry: {df_industry.shape}")

    # Filter to only the sample dates; `datetime` is stored as int YYYYMMDD.
    date_ints = [int(d.replace("-", "")) for d in dates]
    df_alpha = df_alpha.filter(pl.col("datetime").is_in(date_ints))
    df_kline = df_kline.filter(pl.col("datetime").is_in(date_ints))
    df_flag = df_flag.filter(pl.col("datetime").is_in(date_ints))
    df_industry = df_industry.filter(pl.col("datetime").is_in(date_ints))

    print(f"\nAfter filtering to sample dates:")
    print(f" Alpha158: {df_alpha.shape}")
    print(f" Kline: {df_kline.shape}")
    print(f" Flags: {df_flag.shape}")
    print(f" Industry: {df_industry.shape}")

    # Merge data sources into a single frame.
    df = merge_data_sources(df_alpha, df_kline, df_flag, df_industry)
    print(f"\nMerged data shape: {df.shape}")

    # Save datetime and instrument before processing.
    # NOTE(review): this assumes apply_feature_pipeline preserves the row
    # count and order of `df`; if it drops or reorders rows, these columns
    # will misalign with `embeddings` below -- confirm in generate_beta_embedding.
    datetime_col = df["datetime"].clone()
    instrument_col = df["instrument"].clone()

    # Apply feature transformation pipeline.
    df_processed, feature_cols, norm_feature_cols, market_flag_for_vae = apply_feature_pipeline(df)

    # Prepare the (n_rows, n_features) matrix for the VAE encoder.
    features = prepare_vae_features(
        df_processed, feature_cols,
        norm_feature_cols=norm_feature_cols,
        market_flag_for_vae=market_flag_for_vae
    )

    print(f"\nFeature matrix shape: {features.shape}")

    # Encode with VAE; fall back to random embeddings on failure.
    if use_vae:
        try:
            model = load_vae_model(VAE_MODEL_PATH)
            embeddings = encode_with_vae(features, model)
            print(f"\nVAE encoding successful!")
        except Exception as e:
            print(f"\nVAE encoding failed: {e}")
            import traceback
            traceback.print_exc()
            print("\nFalling back to random embeddings...")
            embeddings = _random_embeddings(features.shape[0])
    else:
        print("\nUsing random embeddings (VAE disabled)...")
        embeddings = _random_embeddings(features.shape[0])

    # Create output DataFrame: keys plus one column per embedding dimension.
    embedding_cols = [f"embedding_{i}" for i in range(embeddings.shape[1])]

    result_data = {
        "datetime": datetime_col.to_list(),
        "instrument": instrument_col.to_list(),
        **{col_name: embeddings[:, i].tolist() for i, col_name in enumerate(embedding_cols)}
    }

    df_result = pl.DataFrame(result_data)

    # Ensure output directory exists before writing.
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Save to parquet.
    df_result.write_parquet(output_path)
    print(f"\nEmbeddings saved to: {output_path}")
    print(f"Output shape: {df_result.shape}")
    print(f"\nSample output:")
    print(df_result.head(10))

    # Print summary statistics.
    print("\n" + "=" * 60)
    print("Summary Statistics")
    print("=" * 60)
    print(f"Total samples: {len(df_result)}")
    print(f"Embedding dimension: {embeddings.shape[1]}")
    print(f"Date range: {df_result['datetime'].min()} to {df_result['datetime'].max()}")
    print(f"Instruments: {df_result['instrument'].n_unique()}")
    print(f"Embedding mean: {np.mean(embeddings):.6f}")
    print(f"Embedding std: {np.std(embeddings):.6f}")
    print(f"Embedding min: {np.min(embeddings):.6f}")
    print(f"Embedding max: {np.max(embeddings):.6f}")

    return df_result
|
||||
|
||||
|
||||
# CLI entry point: parse arguments and generate the sample embeddings.
if __name__ == "__main__":
    import argparse

    cli = argparse.ArgumentParser(description="Generate sample beta embeddings")
    cli.add_argument("--dates", nargs="+", default=SAMPLE_DATES,
                     help="List of dates (YYYY-MM-DD)")
    cli.add_argument("--output", type=str, default="embedding_0_7_beta_sample.parquet",
                     help="Output parquet file")
    cli.add_argument("--no-vae", action="store_true",
                     help="Skip VAE encoding (use random embeddings)")
    opts = cli.parse_args()

    generate_sample_embeddings(
        dates=opts.dates,
        output_file=opts.output,
        use_vae=not opts.no_vae
    )

    print("\nDone!")
|
||||
@ -1,394 +0,0 @@
|
||||
[2715583:MainThread](2026-02-26 19:58:16,674) INFO - qlib.Initialization - [config.py:413] - default_conf: client.
|
||||
[2715583:MainThread](2026-02-26 19:58:16,680) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
|
||||
[2715583:MainThread](2026-02-26 19:58:16,681) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/qlib/default/data_ops/target')}
|
||||
================================================================================
|
||||
DUMP GOLD-STANDARD DATA FROM QLIB PIPELINE
|
||||
================================================================================
|
||||
Date Range: 2020-01-02 to 2020-01-10
|
||||
Output Directory: /home/guofu/Workspaces/alpha_lab/stock_1d/d033/alpha158_beta/data
|
||||
Qlib Dataset Path: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/
|
||||
|
||||
Step 1: Loading data from Qlib pipeline...
|
||||
Loading since_date=2020-01-02
|
||||
Will use `placehorder_value` from module: qlib.contrib.data.config
|
||||
Will init handler object from config:
|
||||
{'data_handler_config': {'ddb_config': {'host': '192.168.1.146',
|
||||
'password': '123456',
|
||||
'port': 8848,
|
||||
'username': 'admin'},
|
||||
'end_time': datetime.date(2026, 2, 26),
|
||||
'handler_list': [{'class': 'DDBAlpha158Handler',
|
||||
'kwargs': {'col_set': 'feature',
|
||||
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
|
||||
'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': 'alpha158',
|
||||
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
|
||||
{'class': 'DDBMarketExtHandler',
|
||||
'kwargs': {'col_set': 'feature_ext',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['Turnover '
|
||||
'as '
|
||||
'turnover',
|
||||
'FreeTurnover '
|
||||
'as '
|
||||
'free_turnover',
|
||||
'log(MarketValue) '
|
||||
'as '
|
||||
'log_size'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['con_rating_strength'],
|
||||
'table_name': 'stg_1day_gds_con_rating'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
|
||||
{'class': 'DDBMarketFlagHandler',
|
||||
'kwargs': {'col_set': 'feature_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['IsZt',
|
||||
'IsDt',
|
||||
'IsN',
|
||||
'IsXD',
|
||||
'IsXR',
|
||||
'IsDR'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['open_limit',
|
||||
'close_limit',
|
||||
'low_limit',
|
||||
'open_stop',
|
||||
'close_stop',
|
||||
'high_stop'],
|
||||
'table_name': 'stg_1day_wind_market_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
|
||||
{'class': 'DDBIndusFlagHandler',
|
||||
'kwargs': {'col_set': 'indus_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': 'industry_code_cc.csv',
|
||||
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
|
||||
{'class': 'DDBStFlagHandler',
|
||||
'kwargs': {'col_set': 'st_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['ST_Y',
|
||||
'ST_S',
|
||||
'ST_T',
|
||||
'ST_L',
|
||||
'ST_Z',
|
||||
'ST_X'],
|
||||
'table_name': 'stg_1day_wind_st_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
|
||||
'instruments': 'csiallx',
|
||||
'start_time': Timestamp('2019-12-03 00:00:00')},
|
||||
'ddb_config': {'host': '192.168.1.146',
|
||||
'password': '123456',
|
||||
'port': 8848,
|
||||
'username': 'admin'},
|
||||
'handler': {'class': 'AggHandler',
|
||||
'kwargs': {'ddb_config': {'host': '192.168.1.146',
|
||||
'password': '123456',
|
||||
'port': 8848,
|
||||
'username': 'admin'},
|
||||
'end_time': datetime.date(2026, 2, 26)[2715583:MainThread](2026-02-26 19:58:16,707) INFO - qlib.AggHandler - [agg_handler.py:79] - Will use AggHandler
|
||||
[2715583:MainThread](2026-02-26 19:58:16,707) WARNING - qlib.DataLoaderDH - [agg_handler.py:191] - instruments[csiallx] is ignored
|
||||
[2715583:MainThread](2026-02-26 19:58:17,067) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,HIGH0,LOW0,VWAP0,ROC5,ROC10,ROC20,ROC30,ROC60,MA5,MA10,MA20,MA30,MA60,STD5,STD10,STD20,STD30,STD60,BETA5,BETA10,BETA20,BETA30,BETA60,RSQR5,RSQR10,RSQR20,RSQR30,RSQR60,RESI5,RESI10,RESI20,RESI30,RESI60,MAX5,MAX10,MAX20,MAX30,MAX60,MIN5,MIN10,MIN20,MIN30,MIN60,QTLU5,QTLU10,QTLU20,QTLU30,QTLU60,QTLD5,QTLD10,QTLD20,QTLD30,QTLD60,RANK5,RANK10,RANK20,RANK30,RANK60,RSV5,RSV10,RSV20,RSV30,RSV60,IMAX5,IMAX
|
||||
[2715583:MainThread](2026-02-26 20:05:39,665) INFO - qlib.timer - [log.py:117] - Time cost: 442.946s | DDB query: Done
|
||||
[2715583:MainThread](2026-02-26 20:05:40,469) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
,
|
||||
'handler_list': [{'class': 'DDBAlpha158Handler',
|
||||
'kwargs': {'col_set': 'feature',
|
||||
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
|
||||
'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': 'alpha158',
|
||||
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
|
||||
{'class': 'DDBMarketExtHandler',
|
||||
'kwargs': {'col_set': 'feature_ext',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['Turnover '
|
||||
'as '
|
||||
'turnover',
|
||||
'FreeTurnover '
|
||||
'as '
|
||||
'free_turnover',
|
||||
'log(MarketValue) '
|
||||
'as '
|
||||
'log_size'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['con_rating_strength'],
|
||||
'table_name': 'stg_1day_gds_con_rating'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
|
||||
{'class': 'DDBMarketFlagHandler',
|
||||
'kwargs': {'col_set': 'feature_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['IsZt',
|
||||
'IsDt',
|
||||
'IsN',
|
||||
'IsXD',
|
||||
'IsXR',
|
||||
'IsDR'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['open_limit',
|
||||
'close_limit',
|
||||
'low_limit',
|
||||
'open_stop',
|
||||
'close_stop',
|
||||
'high_stop'],
|
||||
'table_name': 'stg_1day_wind_market_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
|
||||
{'class': 'DDBIndusFlagHandler',
|
||||
'kwargs': {'col_set': 'indus_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': 'industry_code_cc.csv',
|
||||
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
|
||||
{'class': 'DDBStFlagHandler',
|
||||
'kwargs': {'col_set': 'st_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['ST_Y',
|
||||
'ST_S',
|
||||
'ST_T',
|
||||
'ST_L',
|
||||
'ST_Z',
|
||||
'ST_X'],
|
||||
'table_name': 'stg_1day_wind_st_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
|
||||
'instruments': 'csiallx',
|
||||
'start_time': Timestamp('2019-12-03 00:00:00')},
|
||||
'module_path': 'qlib.contrib.data.agg_handler'},
|
||||
'load_end': datetime.date(2026, 2, 26),
|
||||
'load_start': Timestamp('2019-12-03 00:00:00'),
|
||||
'market': 'csiallx',
|
||||
'qlib_init': {'provider_uri': '/home/guofu/.qlib/data_ops/target',
|
||||
'region': 'cn'}}
|
||||
Query config:
|
||||
#alpha158: 1;
|
||||
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x70e88b0587d0>
|
||||
[2715583:MainThread](2026-02-26 20:07:46,118) INFO - qlib.timer - [log.py:117] - Time cost: 115.964s | Instruments filter: Done
|
||||
[2715583:MainThread](2026-02-26 20:07:53,273) INFO - qlib.timer - [log.py:117] - Time cost: 576.561s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_alpha158_handler.DDBAlpha158Handler object at 0x70e88b086d80>) Done
|
||||
[2715583:MainThread](2026-02-26 20:07:53,274) INFO - qlib.timer - [log.py:117] - Time cost: 576.562s | Init data (<qlib.contrib.data.ddb_handlers.ddb_alpha158_handler.DDBAlpha158Handler object at 0x70e88b086d80>) Done
|
||||
[2715583:MainThread](2026-02-26 20:07:53,276) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
|
||||
[2715583:MainThread](2026-02-26 20:07:56,700) INFO - qlib.timer - [log.py:117] - Time cost: 3.423s | fetch_df_by_index Done
|
||||
[2715583:MainThread](2026-02-26 20:07:58,185) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-03 00:00:00'), datetime.date(2026, 2, 26), None)
|
||||
KMID KLEN ... VSUMD30 VSUMD60
|
||||
datetime instrument ...
|
||||
2019-12-03 SH600000 0.004234 0.011008 ... -0.031454 -0.009671
|
||||
SH600004 0.015467 0.031529 ... -0.004401 0.007701
|
||||
SH600006 0.022573 0.033860 ... 0.060561 -0.000159
|
||||
SH600007 0.012129 0.025470 ... 0.008489 -0.054056
|
||||
SH600008 0.006173 0.009259 ... -0.088065 -0.080770
|
||||
... ... ... ... ... ...
|
||||
2026-02-26 SZ301658 -0.017231 0.025231 ... -0.018706 0.003708
|
||||
SZ301662 0.060584 0.087834 ... -0.014658 -0.014613
|
||||
SZ301665 -0.012899 0.040541 ... 0.083229 0.055994
|
||||
SZ301678 0.018182 0.027879 ... -0.054124 0.014202
|
||||
SZ302132 0.001754 0.016416 ... -0.049558 -0.038667
|
||||
|
||||
[6886779 rows x 158 columns]
|
||||
[2715583:MainThread](2026-02-26 20:07:58,186) INFO - qlib.timer - [log.py:117] - Time cost: 4.911s | Fetching dataframe Done
|
||||
[2715583:MainThread](2026-02-26 20:07:58,203) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,Turnover as turnover,FreeTurnover as free_turnover,log(MarketValue) as log_size from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
|
||||
where m_nDate>=2019.12.03 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ3
|
||||
[2715583:MainThread](2026-02-26 20:08:15,182) INFO - qlib.timer - [log.py:117] - Time cost: 16.990s | DDB query: Done
|
||||
[2715583:MainThread](2026-02-26 20:08:15,974) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2715583:MainThread](2026-02-26 20:08:16,548) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,con_rating_strength from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_gds_con_rating")
|
||||
where m_nDate>=2019.12.03 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002833','SH
|
||||
[2715583:MainThread](2026-02-26 20:08:27,838) INFO - qlib.timer - [log.py:117] - Time cost: 11.299s | DDB query: Done
|
||||
[2715583:MainThread](2026-02-26 20:08:28,690) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-25 00:00:00
|
||||
[2715583:MainThread](2026-02-26 20:09:53,616) INFO - qlib.timer - [log.py:117] - Time cost: 81.815s | Instruments filter: Done
|
||||
[2715583:MainThread](2026-02-26 20:09:54,168) INFO - qlib.timer - [log.py:117] - Time cost: 115.981s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_market_ext_handler.DDBMarketExtHandler object at 0x70ea4ba85f40>) Done
|
||||
[2715583:MainThread](2026-02-26 20:09:54,169) INFO - qlib.timer - [log.py:117] - Time cost: 115.982s | Init data (<qlib.contrib.data.ddb_handlers.ddb_market_ext_handler.DDBMarketExtHandler object at 0x70ea4ba85f40>) Done
|
||||
[2715583:MainThread](2026-02-26 20:09:54,170) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
|
||||
[2715583:MainThread](2026-02-26 20:09:54,893) INFO - qlib.timer - [log.py:117] - Time cost: 0.723s | fetch_df_by_index Done
|
||||
[2715583:MainThread](2026-02-26 20:09:54,901) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-03 00:00:00'), datetime.date(2026, 2, 26), None)
|
||||
turnover free_turnover log_size con_rating_strength
|
||||
datetime instrument
|
||||
2019-12-03 SH600000 0.0696 0.1275 17.322001 0.6618
|
||||
SH600004 0.6009 1.2276 15.077468 0.8269
|
||||
SH600006 0.5976 1.5087 13.716795 1.0000
|
||||
SH600007 0.0961 0.4969 14.334991 0.7500
|
||||
SH600008 0.0967 0.1793 14.432563 0.6591
|
||||
... ... ... ... ...
|
||||
2026-02-26 SZ301658 6.0785 6.0785 11.788368 NaN
|
||||
SZ301662 12.5950 12.5950 12.681215 NaN
|
||||
SZ301665 14.0077 14.0077 11.719415 NaN
|
||||
SZ301678 6.6518 6.6518 12.799973 NaN
|
||||
SZ302132 1.3868 3.0296 15.359885 NaN
|
||||
|
||||
[7601552 rows x 4 columns]
|
||||
[2715583:MainThread](2026-02-26 20:09:54,902) INFO - qlib.timer - [log.py:117] - Time cost: 0.732s | Fetching dataframe Done
|
||||
[2715583:MainThread](2026-02-26 20:09:54,917) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,IsZt,IsDt,IsN,IsXD,IsXR,IsDR from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
|
||||
where m_nDate>=2019.12.03 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657',
|
||||
[2715583:MainThread](2026-02-26 20:10:15,465) INFO - qlib.timer - [log.py:117] - Time cost: 20.556s | DDB query: Done
|
||||
[2715583:MainThread](2026-02-26 20:10:16,265) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2715583:MainThread](2026-02-26 20:10:16,775) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,open_limit,close_limit,low_limit,open_stop,close_stop,high_stop from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_wind_market_flag")
|
||||
where m_nDate>=2019.12.03 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','
|
||||
[2715583:MainThread](2026-02-26 20:10:36,740) INFO - qlib.timer - [log.py:117] - Time cost: 19.975s | DDB query: Done
|
||||
[2715583:MainThread](2026-02-26 20:10:37,558) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2715583:MainThread](2026-02-26 20:12:04,978) INFO - qlib.timer - [log.py:117] - Time cost: 84.148s | Instruments filter: Done
|
||||
[2715583:MainThread](2026-02-26 20:12:05,899) INFO - qlib.timer - [log.py:117] - Time cost: 130.996s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_market_flag_handler.DDBMarketFlagHandler object at 0x70e88d06acc0>) Done
|
||||
[2715583:MainThread](2026-02-26 20:12:05,900) INFO - qlib.timer - [log.py:117] - Time cost: 130.997s | Init data (<qlib.contrib.data.ddb_handlers.ddb_market_flag_handler.DDBMarketFlagHandler object at 0x70e88d06acc0>) Done
|
||||
[2715583:MainThread](2026-02-26 20:12:05,902) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
|
||||
[2715583:MainThread](2026-02-26 20:12:06,745) INFO - qlib.timer - [log.py:117] - Time cost: 0.842s | fetch_df_by_index Done
|
||||
[2715583:MainThread](2026-02-26 20:12:06,758) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-03 00:00:00'), datetime.date(2026, 2, 26), None)
|
||||
IsZt IsDt IsN ... open_stop close_stop high_stop
|
||||
datetime instrument ...
|
||||
2019-12-03 SH600000 False False False ... False False False
|
||||
SH600004 False False False ... False False False
|
||||
SH600006 False False False ... False False False
|
||||
SH600007 False False False ... False False False
|
||||
SH600008 False False False ... False False False
|
||||
... ... ... ... ... ... ... ...
|
||||
2026-02-26 SZ301658 False False False ... False False False
|
||||
SZ301662 False False False ... False False False
|
||||
SZ301665 False False False ... False False False
|
||||
SZ301678 False False False ... False False False
|
||||
SZ302132 False False False ... False False False
|
||||
|
||||
[6903684 rows x 12 columns]
|
||||
[2715583:MainThread](2026-02-26 20:12:06,759) INFO - qlib.timer - [log.py:117] - Time cost: 0.857s | Fetching dataframe Done
|
||||
[2715583:MainThread](2026-02-26 20:12:06,777) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,gds_CC10,gds_CC11,gds_CC12,gds_CC20,gds_CC21,gds_CC22,gds_CC23,gds_CC24,gds_CC25,gds_CC26,gds_CC27,gds_CC28,gds_CC30,gds_CC31,gds_CC32,gds_CC33,gds_CC34,gds_CC35,gds_CC36,gds_CC37,gds_CC40,gds_CC41,gds_CC42,gds_CC43,gds_CC50,gds_CC60,gds_CC61,gds_CC62,gds_CC63,gds_CC70 from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_gds_indus_flag_cc1")
|
||||
where m_nDate>=2019.12.03 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','S
|
||||
[2715583:MainThread](2026-02-26 20:12:08,840) INFO - qlib.timer - [log.py:117] - Time cost: 2.073s | DDB query: Done
|
||||
[2715583:MainThread](2026-02-26 20:12:08,849) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2715583:MainThread](2026-02-26 20:13:26,572) INFO - qlib.timer - [log.py:117] - Time cost: 77.719s | Instruments filter: Done
|
||||
[2715583:MainThread](2026-02-26 20:13:26,601) INFO - qlib.timer - [log.py:117] - Time cost: 79.839s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler.DDBIndusFlagHandler object at 0x70e67060acc0>) Done
|
||||
[2715583:MainThread](2026-02-26 20:13:26,602) INFO - qlib.timer - [log.py:117] - Time cost: 79.840s | Init data (<qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler.DDBIndusFlagHandler object at 0x70e67060acc0>) Done
|
||||
[2715583:MainThread](2026-02-26 20:13:26,603) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
|
||||
[2715583:MainThread](2026-02-26 20:13:26,612) INFO - qlib.timer - [log.py:117] - Time cost: 0.008s | fetch_df_by_index Done
|
||||
[2715583:MainThread](2026-02-26 20:13:26,633) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-03 00:00:00'), datetime.date(2026, 2, 26), None)
|
||||
gds_CC10 gds_CC11 ... gds_CC63 gds_CC70
|
||||
datetime instrument ...
|
||||
2026-02-09 SH600000 False False ... False False
|
||||
SH600004 False False ... False False
|
||||
SH600006 False False ... False False
|
||||
SH600007 False False ... False False
|
||||
SH600008 False False ... False False
|
||||
... ... ... ... ... ...
|
||||
2026-02-26 SZ301658 False False ... False False
|
||||
SZ301662 False False ... False False
|
||||
SZ301665 False False ... False False
|
||||
SZ301678 False False ... False False
|
||||
SZ302132 False False ... False False
|
||||
|
||||
[41168 rows x 30 columns]
|
||||
[2715583:MainThread](2026-02-26 20:13:26,634) INFO - qlib.timer - [log.py:117] - Time cost: 0.031s | Fetching dataframe Done
|
||||
[2715583:MainThread](2026-02-26 20:13:26,652) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,ST_Y,ST_S,ST_T,ST_L,ST_Z,ST_X from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_wind_st_flag")
|
||||
where m_nDate>=2019.12.03 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002
|
||||
[2715583:MainThread](2026-02-26 20:13:55,744) INFO - qlib.timer - [log.py:117] - Time cost: 29.102s | DDB query: Done
|
||||
[2715583:MainThread](2026-02-26 20:13:56,520) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2715583:MainThread](2026-02-26 20:15:27,625) INFO - qlib.timer - [log.py:117] - Time cost: 90.586s | Instruments filter: Done
|
||||
[2715583:MainThread](2026-02-26 20:15:28,257) INFO - qlib.timer - [log.py:117] - Time cost: 121.621s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_st_flag_handler.DDBStFlagHandler object at 0x70e88c9710d0>) Done
|
||||
[2715583:MainThread](2026-02-26 20:15:28,257) INFO - qlib.timer - [log.py:117] - Time cost: 121.622s | Init data (<qlib.contrib.data.ddb_handlers.ddb_st_flag_handler.DDBStFlagHandler object at 0x70e88c9710d0>) Done
|
||||
[2715583:MainThread](2026-02-26 20:15:28,258) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
|
||||
[2715583:MainThread](2026-02-26 20:15:28,867) INFO - qlib.timer - [log.py:117] - Time cost: 0.608s | fetch_df_by_index Done
|
||||
[2715583:MainThread](2026-02-26 20:15:28,875) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-03 00:00:00'), datetime.date(2026, 2, 26), None)
|
||||
ST_Y ST_S ST_T ST_L ST_Z ST_X
|
||||
datetime instrument
|
||||
2019-12-03 SH600000 False False False False False False
|
||||
SH600004 False False False False False False
|
||||
SH600006 False False False False False False
|
||||
SH600007 False False False False False False
|
||||
SH600008 False False False False False False
|
||||
... ... ... ... ... ... ...
|
||||
2026-02-26 SZ301658 False False False False False False
|
||||
SZ301662 False False False False False False
|
||||
SZ301665 False False False False False False
|
||||
SZ301678 False False False False False False
|
||||
SZ302132 False False False False False False
|
||||
|
||||
[6903687 rows x 6 columns]
|
||||
[2715583:MainThread](2026-02-26 20:15:28,876) INFO - qlib.timer - [log.py:117] - Time cost: 0.617s | Fetching dataframe Done
|
||||
/home/guofu/.venv/alpha2/lib/python3.12/site-packages/qlib/contrib/utils/paral.py:22: FutureWarning: 'M' is deprecated and will be removed in a future version, please use 'ME' instead.
|
||||
group_list = [_df.resample("M", level="datetime")\
|
||||
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x70e88bd2a000>
|
||||
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x70e88bd2a000>
|
||||
Query config:
|
||||
#concepts: 2;
|
||||
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x70e88c9cf6e0>
|
||||
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x70e88c9cf6e0>
|
||||
Field list: ['gds_CC10', 'gds_CC11', 'gds_CC12', 'gds_CC20', 'gds_CC21', 'gds_CC22', 'gds_CC23', 'gds_CC24', 'gds_CC25', 'gds_CC26', 'gds_CC27', 'gds_CC28', 'gds_CC30', 'gds_CC31', 'gds_CC32', 'gds_CC33', 'gds_CC34', 'gds_CC35', 'gds_CC36', 'gds_CC37', 'gds_CC40', 'gds_CC41', 'gds_CC42', 'gds_CC43', 'gds_CC50', 'gds_CC60', 'gds_CC61', 'gds_CC62', 'gds_CC63', 'gds_CC70']
|
||||
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x70e6706082f0>
|
||||
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x70e65fdafd40>
|
||||
[2715583:MainThread](2026-02-26 20:15:32,735) INFO - qlib.timer - [log.py:117] - Time cost: 3.858s | Concat index: Done
|
||||
[2715583:MainThread](2026-02-26 20:15:32,737) INFO - qlib.timer - [log.py:117] - Time cost: 0.001s | Sort index: Done
|
||||
[2715583:MainThread](2026-02-26 20:15:36,349) INFO - qlib.timer - [log.py:117] - Time cost: 3.611s | Creating SepDataFrame: Done
|
||||
[2715583:MainThread](2026-02-26 20:15:37,245) INFO - qlib.timer - [log.py:117] - Time cost: 1040.537s | Loading data (<qlib.contrib.data.agg_handler.AggHandler object at 0x70e88b12b3e0>) Done
|
||||
[2715583:MainThread](2026-02-26 20:15:37,246) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
|
||||
[]
|
||||
[2715583:MainThread](2026-02-26 20:15:37,248) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
|
||||
[2715583:MainThread](2026-02-26 20:15:37,265) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
|
||||
[]
|
||||
[2715583:MainThread](2026-02-26 20:15:37,266) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
|
||||
[2715583:MainThread](2026-02-26 20:15:37,277) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
|
||||
[]
|
||||
[2715583:MainThread](2026-02-26 20:15:37,277) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
|
||||
[2715583:MainThread](2026-02-26 20:15:37,293) INFO - qlib.timer - [log.py:117] - Time cost: 0.047s | fit & process data Done
|
||||
[2715583:MainThread](2026-02-26 20:15:37,294) INFO - qlib.timer - [log.py:117] - Time cost: 1040.587s | Init data (<qlib.contrib.data.agg_handler.AggHandler object at 0x70e88b12b3e0>) Done
|
||||
[2715583:MainThread](2026-02-26 20:15:37,963) INFO - qlib.DataHandlerLP - [handler.py:487] - Will apply processor <qlib.contrib.data.processor.common.Diff object at 0x70e88bf4af30>
|
||||
[2715583:MainThread](2026-02-26 20:15:40,135) INFO - qlib.timer - [log.py:117] - Time cost: 2.171s | Diff Done
|
||||
[2715583:MainThread](2026-02-26 20:15:40,136) INFO - qlib.DataHandlerLP - [handler.py:487] - Will apply processor <qlib.contrib.data.processor.flag.FlagMarketInjector object at 0x70e88cd8fd40>
|
||||
All processors are readonly
|
||||
All processors are readonly
|
||||
All processors are readonly
|
||||
Did load data from config: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/handler.yaml
|
||||
Did load norm from: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/proc_list.proc
|
||||
Will assign `feature_ext` with
|
||||
turnover ... con_rating_strength_diff
|
||||
datetime instrument ...
|
||||
2026-02-09 SH600000 0.1837 ... 0.0
|
||||
SH600004 0.6948 ... 0.0
|
||||
SH600006 0.5542 ... 0.0
|
||||
SH600007 0.2057 ... 0.0
|
||||
SH600008 0.9809 ... 0.0
|
||||
... ... ... ...
|
||||
2026-02-26 SZ301658 6.0785 ... 0.0
|
||||
SZ301662 12.5950 ... 0.0
|
||||
SZ301665 14.0077 ... 0.0
|
||||
SZ301678 6.6518 ... 0.0
|
||||
SZ302132 1.3868 ... 0.0
|
||||
|
||||
[41085 rows x 8 columns]
|
||||
---
|
||||
ERROR: Failed to load data from Qlib pipeline: Cannot convert non-finite values (NA or inf) to integer
|
||||
@ -1,373 +0,0 @@
|
||||
[2730312:MainThread](2026-02-26 21:28:33,675) INFO - qlib.Initialization - [config.py:413] - default_conf: client.
|
||||
[2730312:MainThread](2026-02-26 21:28:33,679) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
|
||||
[2730312:MainThread](2026-02-26 21:28:33,680) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/qlib/default/data_ops/target')}
|
||||
================================================================================
|
||||
DUMP GOLD-STANDARD DATA FROM QLIB PIPELINE
|
||||
================================================================================
|
||||
Date Range: 2020-01-02 to 2020-01-10
|
||||
Output Directory: /home/guofu/Workspaces/alpha_lab/stock_1d/d033/alpha158_beta/data
|
||||
Qlib Dataset Path: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/
|
||||
|
||||
Step 1: Loading data from Qlib pipeline...
|
||||
Loading since_date=2020-01-02
|
||||
Loading raw data from handler.yaml...
|
||||
Will use `placehorder_value` from module: qlib.contrib.data.config
|
||||
Will init handler object from config:
|
||||
{'data_handler_config': {'ddb_config': {'host': '192.168.1.146',
|
||||
'password': '123456',
|
||||
'port': 8848,
|
||||
'username': 'admin'},
|
||||
'end_time': datetime.date(2026, 2, 26),
|
||||
'handler_list': [{'class': 'DDBAlpha158Handler',
|
||||
'kwargs': {'col_set': 'feature',
|
||||
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
|
||||
'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': 'alpha158',
|
||||
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
|
||||
{'class': 'DDBMarketExtHandler',
|
||||
'kwargs': {'col_set': 'feature_ext',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['Turnover '
|
||||
'as '
|
||||
'turnover',
|
||||
'FreeTurnover '
|
||||
'as '
|
||||
'free_turnover',
|
||||
'log(MarketValue) '
|
||||
'as '
|
||||
'log_size'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['con_rating_strength'],
|
||||
'table_name': 'stg_1day_gds_con_rating'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
|
||||
{'class': 'DDBMarketFlagHandler',
|
||||
'kwargs': {'col_set': 'feature_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['IsZt',
|
||||
'IsDt',
|
||||
'IsN',
|
||||
'IsXD',
|
||||
'IsXR',
|
||||
'IsDR'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['open_limit',
|
||||
'close_limit',
|
||||
'low_limit',
|
||||
'open_stop',
|
||||
'close_stop',
|
||||
'high_stop'],
|
||||
'table_name': 'stg_1day_wind_market_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
|
||||
{'class': 'DDBIndusFlagHandler',
|
||||
'kwargs': {'col_set': 'indus_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': 'industry_code_cc.csv',
|
||||
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
|
||||
{'class': 'DDBStFlagHandler',
|
||||
'kwargs': {'col_set': 'st_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['ST_Y',
|
||||
'ST_S',
|
||||
'ST_T',
|
||||
'ST_L',
|
||||
'ST_Z',
|
||||
'ST_X'],
|
||||
'table_name': 'stg_1day_wind_st_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
|
||||
'instruments': 'csiallx',
|
||||
'start_time': Timestamp('2019-12-13 00:00:00')},
|
||||
'ddb_config': {'host': '192.168.1.146',
|
||||
'password': '123456',
|
||||
'port': 8848,
|
||||
'username': 'admin'},
|
||||
'handler': {'class': 'AggHandler',
|
||||
'kwargs': {'ddb_config': {'host': '192.168.1.146',
|
||||
'password': '123456',
|
||||
'port': 8848,
|
||||
'username': 'admin'},
|
||||
'end_time': [2730312:MainThread](2026-02-26 21:28:33,704) INFO - qlib.AggHandler - [agg_handler.py:79] - Will use AggHandler
|
||||
[2730312:MainThread](2026-02-26 21:28:33,704) WARNING - qlib.DataLoaderDH - [agg_handler.py:191] - instruments[csiallx] is ignored
|
||||
[2730312:MainThread](2026-02-26 21:28:34,011) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,HIGH0,LOW0,VWAP0,ROC5,ROC10,ROC20,ROC30,ROC60,MA5,MA10,MA20,MA30,MA60,STD5,STD10,STD20,STD30,STD60,BETA5,BETA10,BETA20,BETA30,BETA60,RSQR5,RSQR10,RSQR20,RSQR30,RSQR60,RESI5,RESI10,RESI20,RESI30,RESI60,MAX5,MAX10,MAX20,MAX30,MAX60,MIN5,MIN10,MIN20,MIN30,MIN60,QTLU5,QTLU10,QTLU20,QTLU30,QTLU60,QTLD5,QTLD10,QTLD20,QTLD30,QTLD60,RANK5,RANK10,RANK20,RANK30,RANK60,RSV5,RSV10,RSV20,RSV30,RSV60,IMAX5,IMAX
|
||||
[2730312:MainThread](2026-02-26 21:36:00,317) INFO - qlib.timer - [log.py:117] - Time cost: 446.602s | DDB query: Done
|
||||
[2730312:MainThread](2026-02-26 21:36:01,106) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
datetime.date(2026, 2, 26),
|
||||
'handler_list': [{'class': 'DDBAlpha158Handler',
|
||||
'kwargs': {'col_set': 'feature',
|
||||
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
|
||||
'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': 'alpha158',
|
||||
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
|
||||
{'class': 'DDBMarketExtHandler',
|
||||
'kwargs': {'col_set': 'feature_ext',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['Turnover '
|
||||
'as '
|
||||
'turnover',
|
||||
'FreeTurnover '
|
||||
'as '
|
||||
'free_turnover',
|
||||
'log(MarketValue) '
|
||||
'as '
|
||||
'log_size'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['con_rating_strength'],
|
||||
'table_name': 'stg_1day_gds_con_rating'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
|
||||
{'class': 'DDBMarketFlagHandler',
|
||||
'kwargs': {'col_set': 'feature_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['IsZt',
|
||||
'IsDt',
|
||||
'IsN',
|
||||
'IsXD',
|
||||
'IsXR',
|
||||
'IsDR'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['open_limit',
|
||||
'close_limit',
|
||||
'low_limit',
|
||||
'open_stop',
|
||||
'close_stop',
|
||||
'high_stop'],
|
||||
'table_name': 'stg_1day_wind_market_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
|
||||
{'class': 'DDBIndusFlagHandler',
|
||||
'kwargs': {'col_set': 'indus_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': 'industry_code_cc.csv',
|
||||
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
|
||||
{'class': 'DDBStFlagHandler',
|
||||
'kwargs': {'col_set': 'st_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['ST_Y',
|
||||
'ST_S',
|
||||
'ST_T',
|
||||
'ST_L',
|
||||
'ST_Z',
|
||||
'ST_X'],
|
||||
'table_name': 'stg_1day_wind_st_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
|
||||
'instruments': 'csiallx',
|
||||
'start_time': Timestamp('2019-12-13 00:00:00')},
|
||||
'module_path': 'qlib.contrib.data.agg_handler'},
|
||||
'load_end': datetime.date(2026, 2, 26),
|
||||
'load_start': Timestamp('2019-12-13 00:00:00'),
|
||||
'market': 'csiallx',
|
||||
'qlib_init': {'provider_uri': '/home/guofu/.qlib/data_ops/target',
|
||||
'region': 'cn'}}
|
||||
Query config:
|
||||
#alpha158: 1;
|
||||
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x761f37e75a60>
|
||||
[2730312:MainThread](2026-02-26 21:38:13,636) INFO - qlib.timer - [log.py:117] - Time cost: 123.423s | Instruments filter: Done
|
||||
[2730312:MainThread](2026-02-26 21:38:20,733) INFO - qlib.timer - [log.py:117] - Time cost: 587.024s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_alpha158_handler.DDBAlpha158Handler object at 0x761f366b3bc0>) Done
|
||||
[2730312:MainThread](2026-02-26 21:38:20,734) INFO - qlib.timer - [log.py:117] - Time cost: 587.026s | Init data (<qlib.contrib.data.ddb_handlers.ddb_alpha158_handler.DDBAlpha158Handler object at 0x761f366b3bc0>) Done
|
||||
[2730312:MainThread](2026-02-26 21:38:20,736) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
|
||||
[2730312:MainThread](2026-02-26 21:38:24,302) INFO - qlib.timer - [log.py:117] - Time cost: 3.564s | fetch_df_by_index Done
|
||||
[2730312:MainThread](2026-02-26 21:38:25,946) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-13 00:00:00'), datetime.date(2026, 2, 26), None)
|
||||
KMID KLEN ... VSUMD30 VSUMD60
|
||||
datetime instrument ...
|
||||
2019-12-13 SH600000 0.011686 0.015025 ... -0.011573 0.039735
|
||||
SH600004 0.000000 0.009169 ... -0.146051 0.024757
|
||||
SH600006 -0.004329 0.015152 ... 0.136883 0.024626
|
||||
SH600007 0.005590 0.019005 ... -0.012912 0.017215
|
||||
SH600008 0.012270 0.012270 ... 0.039878 -0.013888
|
||||
... ... ... ... ... ...
|
||||
2026-02-26 SZ301658 -0.017231 0.025231 ... -0.018706 0.003708
|
||||
SZ301662 0.060584 0.087834 ... -0.014658 -0.014613
|
||||
SZ301665 -0.012899 0.040541 ... 0.083229 0.055994
|
||||
SZ301678 0.018182 0.027879 ... -0.054124 0.014202
|
||||
SZ302132 0.001754 0.016416 ... -0.049558 -0.038667
|
||||
|
||||
[6858048 rows x 158 columns]
|
||||
[2730312:MainThread](2026-02-26 21:38:25,947) INFO - qlib.timer - [log.py:117] - Time cost: 5.212s | Fetching dataframe Done
|
||||
[2730312:MainThread](2026-02-26 21:38:25,965) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,Turnover as turnover,FreeTurnover as free_turnover,log(MarketValue) as log_size from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
|
||||
where m_nDate>=2019.12.13 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ3
|
||||
[2730312:MainThread](2026-02-26 21:38:43,081) INFO - qlib.timer - [log.py:117] - Time cost: 17.127s | DDB query: Done
|
||||
[2730312:MainThread](2026-02-26 21:38:43,874) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2730312:MainThread](2026-02-26 21:38:44,458) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,con_rating_strength from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_gds_con_rating")
|
||||
where m_nDate>=2019.12.13 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002833','SH
|
||||
[2730312:MainThread](2026-02-26 21:38:55,720) INFO - qlib.timer - [log.py:117] - Time cost: 11.271s | DDB query: Done
|
||||
[2730312:MainThread](2026-02-26 21:38:56,586) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-25 00:00:00
|
||||
[2730312:MainThread](2026-02-26 21:40:21,007) INFO - qlib.timer - [log.py:117] - Time cost: 81.315s | Instruments filter: Done
|
||||
[2730312:MainThread](2026-02-26 21:40:21,576) INFO - qlib.timer - [log.py:117] - Time cost: 115.627s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_market_ext_handler.DDBMarketExtHandler object at 0x7620fb822930>) Done
|
||||
[2730312:MainThread](2026-02-26 21:40:21,576) INFO - qlib.timer - [log.py:117] - Time cost: 115.628s | Init data (<qlib.contrib.data.ddb_handlers.ddb_market_ext_handler.DDBMarketExtHandler object at 0x7620fb822930>) Done
|
||||
[2730312:MainThread](2026-02-26 21:40:21,577) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
|
||||
[2730312:MainThread](2026-02-26 21:40:22,309) INFO - qlib.timer - [log.py:117] - Time cost: 0.731s | fetch_df_by_index Done
|
||||
[2730312:MainThread](2026-02-26 21:40:22,317) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-13 00:00:00'), datetime.date(2026, 2, 26), None)
|
||||
turnover free_turnover log_size con_rating_strength
|
||||
datetime instrument
|
||||
2019-12-13 SH600000 0.2118 0.3879 17.343685 0.7143
|
||||
SH600004 0.7518 1.5357 15.099485 0.8214
|
||||
SH600006 0.7827 1.9762 13.732129 1.0000
|
||||
SH600007 0.1368 0.7071 14.409998 0.7500
|
||||
SH600008 0.2152 0.3990 14.444757 0.7500
|
||||
... ... ... ... ...
|
||||
2026-02-26 SZ301658 6.0785 6.0785 11.788368 NaN
|
||||
SZ301662 12.5950 12.5950 12.681215 NaN
|
||||
SZ301665 14.0077 14.0077 11.719415 NaN
|
||||
SZ301678 6.6518 6.6518 12.799973 NaN
|
||||
SZ302132 1.3868 3.0296 15.359885 NaN
|
||||
|
||||
[7572626 rows x 4 columns]
|
||||
[2730312:MainThread](2026-02-26 21:40:22,318) INFO - qlib.timer - [log.py:117] - Time cost: 0.741s | Fetching dataframe Done
|
||||
[2730312:MainThread](2026-02-26 21:40:22,334) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,IsZt,IsDt,IsN,IsXD,IsXR,IsDR from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
|
||||
where m_nDate>=2019.12.13 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657',
|
||||
[2730312:MainThread](2026-02-26 21:40:43,075) INFO - qlib.timer - [log.py:117] - Time cost: 20.751s | DDB query: Done
|
||||
[2730312:MainThread](2026-02-26 21:40:43,889) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2730312:MainThread](2026-02-26 21:40:44,394) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,open_limit,close_limit,low_limit,open_stop,close_stop,high_stop from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_wind_market_flag")
|
||||
where m_nDate>=2019.12.13 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','
|
||||
[2730312:MainThread](2026-02-26 21:41:04,632) INFO - qlib.timer - [log.py:117] - Time cost: 20.246s | DDB query: Done
|
||||
[2730312:MainThread](2026-02-26 21:41:05,434) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2730312:MainThread](2026-02-26 21:42:33,029) INFO - qlib.timer - [log.py:117] - Time cost: 84.294s | Instruments filter: Done
|
||||
[2730312:MainThread](2026-02-26 21:42:34,049) INFO - qlib.timer - [log.py:117] - Time cost: 131.730s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_market_flag_handler.DDBMarketFlagHandler object at 0x761f37e74470>) Done
|
||||
[2730312:MainThread](2026-02-26 21:42:34,050) INFO - qlib.timer - [log.py:117] - Time cost: 131.731s | Init data (<qlib.contrib.data.ddb_handlers.ddb_market_flag_handler.DDBMarketFlagHandler object at 0x761f37e74470>) Done
|
||||
[2730312:MainThread](2026-02-26 21:42:34,051) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
|
||||
[2730312:MainThread](2026-02-26 21:42:34,895) INFO - qlib.timer - [log.py:117] - Time cost: 0.843s | fetch_df_by_index Done
|
||||
[2730312:MainThread](2026-02-26 21:42:34,907) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-13 00:00:00'), datetime.date(2026, 2, 26), None)
|
||||
IsZt IsDt IsN ... open_stop close_stop high_stop
|
||||
datetime instrument ...
|
||||
2019-12-13 SH600000 False False False ... False False False
|
||||
SH600004 False False False ... False False False
|
||||
SH600006 False False False ... False False False
|
||||
SH600007 False False False ... False False False
|
||||
SH600008 False False False ... False False False
|
||||
... ... ... ... ... ... ... ...
|
||||
2026-02-26 SZ301658 False False False ... False False False
|
||||
SZ301662 False False False ... False False False
|
||||
SZ301665 False False False ... False False False
|
||||
SZ301678 False False False ... False False False
|
||||
SZ302132 False False False ... False False False
|
||||
|
||||
[6874830 rows x 12 columns]
|
||||
[2730312:MainThread](2026-02-26 21:42:34,908) INFO - qlib.timer - [log.py:117] - Time cost: 0.857s | Fetching dataframe Done
|
||||
[2730312:MainThread](2026-02-26 21:42:34,927) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,gds_CC10,gds_CC11,gds_CC12,gds_CC20,gds_CC21,gds_CC22,gds_CC23,gds_CC24,gds_CC25,gds_CC26,gds_CC27,gds_CC28,gds_CC30,gds_CC31,gds_CC32,gds_CC33,gds_CC34,gds_CC35,gds_CC36,gds_CC37,gds_CC40,gds_CC41,gds_CC42,gds_CC43,gds_CC50,gds_CC60,gds_CC61,gds_CC62,gds_CC63,gds_CC70 from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_gds_indus_flag_cc1")
|
||||
where m_nDate>=2019.12.13 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','S
|
||||
[2730312:MainThread](2026-02-26 21:42:36,986) INFO - qlib.timer - [log.py:117] - Time cost: 2.069s | DDB query: Done
|
||||
[2730312:MainThread](2026-02-26 21:42:36,996) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2730312:MainThread](2026-02-26 21:43:53,198) INFO - qlib.timer - [log.py:117] - Time cost: 76.199s | Instruments filter: Done
|
||||
[2730312:MainThread](2026-02-26 21:43:53,230) INFO - qlib.timer - [log.py:117] - Time cost: 78.318s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler.DDBIndusFlagHandler object at 0x761f381e1c40>) Done
|
||||
[2730312:MainThread](2026-02-26 21:43:53,231) INFO - qlib.timer - [log.py:117] - Time cost: 78.319s | Init data (<qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler.DDBIndusFlagHandler object at 0x761f381e1c40>) Done
|
||||
[2730312:MainThread](2026-02-26 21:43:53,231) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
|
||||
[2730312:MainThread](2026-02-26 21:43:53,239) INFO - qlib.timer - [log.py:117] - Time cost: 0.007s | fetch_df_by_index Done
|
||||
[2730312:MainThread](2026-02-26 21:43:53,257) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-13 00:00:00'), datetime.date(2026, 2, 26), None)
|
||||
gds_CC10 gds_CC11 ... gds_CC63 gds_CC70
|
||||
datetime instrument ...
|
||||
2026-02-09 SH600000 False False ... False False
|
||||
SH600004 False False ... False False
|
||||
SH600006 False False ... False False
|
||||
SH600007 False False ... False False
|
||||
SH600008 False False ... False False
|
||||
... ... ... ... ... ...
|
||||
2026-02-26 SZ301658 False False ... False False
|
||||
SZ301662 False False ... False False
|
||||
SZ301665 False False ... False False
|
||||
SZ301678 False False ... False False
|
||||
SZ302132 False False ... False False
|
||||
|
||||
[41168 rows x 30 columns]
|
||||
[2730312:MainThread](2026-02-26 21:43:53,258) INFO - qlib.timer - [log.py:117] - Time cost: 0.027s | Fetching dataframe Done
|
||||
[2730312:MainThread](2026-02-26 21:43:53,274) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,ST_Y,ST_S,ST_T,ST_L,ST_Z,ST_X from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_wind_st_flag")
|
||||
where m_nDate>=2019.12.13 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002
|
||||
[2730312:MainThread](2026-02-26 21:44:44,876) INFO - qlib.timer - [log.py:117] - Time cost: 51.611s | DDB query: Done
|
||||
[2730312:MainThread](2026-02-26 21:44:45,602) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2730312:MainThread](2026-02-26 21:46:07,184) INFO - qlib.timer - [log.py:117] - Time cost: 81.056s | Instruments filter: Done
|
||||
[2730312:MainThread](2026-02-26 21:46:07,747) INFO - qlib.timer - [log.py:117] - Time cost: 134.487s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_st_flag_handler.DDBStFlagHandler object at 0x761f387b3080>) Done
|
||||
[2730312:MainThread](2026-02-26 21:46:07,748) INFO - qlib.timer - [log.py:117] - Time cost: 134.488s | Init data (<qlib.contrib.data.ddb_handlers.ddb_st_flag_handler.DDBStFlagHandler object at 0x761f387b3080>) Done
|
||||
[2730312:MainThread](2026-02-26 21:46:07,748) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
|
||||
[2730312:MainThread](2026-02-26 21:46:08,349) INFO - qlib.timer - [log.py:117] - Time cost: 0.600s | fetch_df_by_index Done
|
||||
[2730312:MainThread](2026-02-26 21:46:08,358) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-13 00:00:00'), datetime.date(2026, 2, 26), None)
|
||||
ST_Y ST_S ST_T ST_L ST_Z ST_X
|
||||
datetime instrument
|
||||
2019-12-13 SH600000 False False False False False False
|
||||
SH600004 False False False False False False
|
||||
SH600006 False False False False False False
|
||||
SH600007 False False False False False False
|
||||
SH600008 False False False False False False
|
||||
... ... ... ... ... ... ...
|
||||
2026-02-26 SZ301658 False False False False False False
|
||||
SZ301662 False False False False False False
|
||||
SZ301665 False False False False False False
|
||||
SZ301678 False False False False False False
|
||||
SZ302132 False False False False False False
|
||||
|
||||
[6874833 rows x 6 columns]
|
||||
[2730312:MainThread](2026-02-26 21:46:08,359) INFO - qlib.timer - [log.py:117] - Time cost: 0.610s | Fetching dataframe Done
|
||||
/home/guofu/.venv/alpha2/lib/python3.12/site-packages/qlib/contrib/utils/paral.py:22: FutureWarning: 'M' is deprecated and will be removed in a future version, please use 'ME' instead.
|
||||
group_list = [_df.resample("M", level="datetime")\
|
||||
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x76203dfa91f0>
|
||||
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x76203dfa91f0>
|
||||
Query config:
|
||||
#concepts: 2;
|
||||
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x761f346e7aa0>
|
||||
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x761f346e7aa0>
|
||||
Field list: ['gds_CC10', 'gds_CC11', 'gds_CC12', 'gds_CC20', 'gds_CC21', 'gds_CC22', 'gds_CC23', 'gds_CC24', 'gds_CC25', 'gds_CC26', 'gds_CC27', 'gds_CC28', 'gds_CC30', 'gds_CC31', 'gds_CC32', 'gds_CC33', 'gds_CC34', 'gds_CC35', 'gds_CC36', 'gds_CC37', 'gds_CC40', 'gds_CC41', 'gds_CC42', 'gds_CC43', 'gds_CC50', 'gds_CC60', 'gds_CC61', 'gds_CC62', 'gds_CC63', 'gds_CC70']
|
||||
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x761cc3995760>
|
||||
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x761a968d1d00>
|
||||
[2730312:MainThread](2026-02-26 21:46:11,623) INFO - qlib.timer - [log.py:117] - Time cost: 3.264s | Concat index: Done
|
||||
[2730312:MainThread](2026-02-26 21:46:11,625) INFO - qlib.timer - [log.py:117] - Time cost: 0.001s | Sort index: Done
|
||||
[2730312:MainThread](2026-02-26 21:46:15,058) INFO - qlib.timer - [log.py:117] - Time cost: 3.433s | Creating SepDataFrame: Done
|
||||
[2730312:MainThread](2026-02-26 21:46:15,928) INFO - qlib.timer - [log.py:117] - Time cost: 1062.224s | Loading data (<qlib.contrib.data.agg_handler.AggHandler object at 0x761f36612720>) Done
|
||||
[2730312:MainThread](2026-02-26 21:46:15,929) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
|
||||
[]
|
||||
[2730312:MainThread](2026-02-26 21:46:15,931) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
|
||||
[2730312:MainThread](2026-02-26 21:46:15,935) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
|
||||
[]
|
||||
[2730312:MainThread](2026-02-26 21:46:15,936) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
|
||||
[2730312:MainThread](2026-02-26 21:46:15,939) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
|
||||
[]
|
||||
[2730312:MainThread](2026-02-26 21:46:15,940) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
|
||||
[2730312:MainThread](2026-02-26 21:46:15,943) INFO - qlib.timer - [log.py:117] - Time cost: 0.014s | fit & process data Done
|
||||
[2730312:MainThread](2026-02-26 21:46:15,943) INFO - qlib.timer - [log.py:117] - Time cost: 1062.239s | Init data (<qlib.contrib.data.agg_handler.AggHandler object at 0x761f36612720>) Done
|
||||
All processors are readonly
|
||||
All processors are readonly
|
||||
All processors are readonly
|
||||
ERROR: Failed to load data from Qlib pipeline: 'SepDataFrame' object has no attribute 'shape'
|
||||
@ -1,373 +0,0 @@
|
||||
[2734404:MainThread](2026-02-26 22:10:11,609) INFO - qlib.Initialization - [config.py:413] - default_conf: client.
|
||||
[2734404:MainThread](2026-02-26 22:10:11,613) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
|
||||
[2734404:MainThread](2026-02-26 22:10:11,613) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/qlib/default/data_ops/target')}
|
||||
================================================================================
|
||||
DUMP GOLD-STANDARD DATA FROM QLIB PIPELINE
|
||||
================================================================================
|
||||
Date Range: 2020-01-02 to 2020-01-10
|
||||
Output Directory: /home/guofu/Workspaces/alpha_lab/stock_1d/d033/alpha158_beta/data
|
||||
Qlib Dataset Path: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/
|
||||
|
||||
Step 1: Loading data from Qlib pipeline...
|
||||
Loading since_date=2020-01-02
|
||||
Loading data with handler (load_start=2019-12-13)...
|
||||
Will use `placehorder_value` from module: qlib.contrib.data.config
|
||||
Will init handler object from config:
|
||||
{'data_handler_config': {'ddb_config': {'host': '192.168.1.146',
|
||||
'password': '123456',
|
||||
'port': 8848,
|
||||
'username': 'admin'},
|
||||
'end_time': datetime.date(2026, 2, 26),
|
||||
'handler_list': [{'class': 'DDBAlpha158Handler',
|
||||
'kwargs': {'col_set': 'feature',
|
||||
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
|
||||
'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': 'alpha158',
|
||||
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
|
||||
{'class': 'DDBMarketExtHandler',
|
||||
'kwargs': {'col_set': 'feature_ext',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['Turnover '
|
||||
'as '
|
||||
'turnover',
|
||||
'FreeTurnover '
|
||||
'as '
|
||||
'free_turnover',
|
||||
'log(MarketValue) '
|
||||
'as '
|
||||
'log_size'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['con_rating_strength'],
|
||||
'table_name': 'stg_1day_gds_con_rating'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
|
||||
{'class': 'DDBMarketFlagHandler',
|
||||
'kwargs': {'col_set': 'feature_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['IsZt',
|
||||
'IsDt',
|
||||
'IsN',
|
||||
'IsXD',
|
||||
'IsXR',
|
||||
'IsDR'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['open_limit',
|
||||
'close_limit',
|
||||
'low_limit',
|
||||
'open_stop',
|
||||
'close_stop',
|
||||
'high_stop'],
|
||||
'table_name': 'stg_1day_wind_market_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
|
||||
{'class': 'DDBIndusFlagHandler',
|
||||
'kwargs': {'col_set': 'indus_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': 'industry_code_cc.csv',
|
||||
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
|
||||
{'class': 'DDBStFlagHandler',
|
||||
'kwargs': {'col_set': 'st_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['ST_Y',
|
||||
'ST_S',
|
||||
'ST_T',
|
||||
'ST_L',
|
||||
'ST_Z',
|
||||
'ST_X'],
|
||||
'table_name': 'stg_1day_wind_st_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
|
||||
'instruments': 'csiallx',
|
||||
'start_time': Timestamp('2019-11-23 00:00:00')},
|
||||
'ddb_config': {'host': '192.168.1.146',
|
||||
'password': '123456',
|
||||
'port': 8848,
|
||||
'username': 'admin'},
|
||||
'handler': {'class': 'AggHandler',
|
||||
'kwargs': {'ddb_config': {'host': '192.168.1.146',
|
||||
'password': '123456',
|
||||
'port': 8848,
|
||||
'username': 'admin'},
|
||||
[2734404:MainThread](2026-02-26 22:10:11,634) INFO - qlib.AggHandler - [agg_handler.py:79] - Will use AggHandler
|
||||
[2734404:MainThread](2026-02-26 22:10:11,634) WARNING - qlib.DataLoaderDH - [agg_handler.py:191] - instruments[csiallx] is ignored
|
||||
[2734404:MainThread](2026-02-26 22:10:11,842) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,HIGH0,LOW0,VWAP0,ROC5,ROC10,ROC20,ROC30,ROC60,MA5,MA10,MA20,MA30,MA60,STD5,STD10,STD20,STD30,STD60,BETA5,BETA10,BETA20,BETA30,BETA60,RSQR5,RSQR10,RSQR20,RSQR30,RSQR60,RESI5,RESI10,RESI20,RESI30,RESI60,MAX5,MAX10,MAX20,MAX30,MAX60,MIN5,MIN10,MIN20,MIN30,MIN60,QTLU5,QTLU10,QTLU20,QTLU30,QTLU60,QTLD5,QTLD10,QTLD20,QTLD30,QTLD60,RANK5,RANK10,RANK20,RANK30,RANK60,RSV5,RSV10,RSV20,RSV30,RSV60,IMAX5,IMAX
|
||||
[2734404:MainThread](2026-02-26 22:17:41,432) INFO - qlib.timer - [log.py:117] - Time cost: 449.788s | DDB query: Done
|
||||
[2734404:MainThread](2026-02-26 22:17:42,271) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
'end_time': datetime.date(2026, 2, 26),
|
||||
'handler_list': [{'class': 'DDBAlpha158Handler',
|
||||
'kwargs': {'col_set': 'feature',
|
||||
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
|
||||
'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': 'alpha158',
|
||||
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
|
||||
{'class': 'DDBMarketExtHandler',
|
||||
'kwargs': {'col_set': 'feature_ext',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['Turnover '
|
||||
'as '
|
||||
'turnover',
|
||||
'FreeTurnover '
|
||||
'as '
|
||||
'free_turnover',
|
||||
'log(MarketValue) '
|
||||
'as '
|
||||
'log_size'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['con_rating_strength'],
|
||||
'table_name': 'stg_1day_gds_con_rating'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
|
||||
{'class': 'DDBMarketFlagHandler',
|
||||
'kwargs': {'col_set': 'feature_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['IsZt',
|
||||
'IsDt',
|
||||
'IsN',
|
||||
'IsXD',
|
||||
'IsXR',
|
||||
'IsDR'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['open_limit',
|
||||
'close_limit',
|
||||
'low_limit',
|
||||
'open_stop',
|
||||
'close_stop',
|
||||
'high_stop'],
|
||||
'table_name': 'stg_1day_wind_market_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
|
||||
{'class': 'DDBIndusFlagHandler',
|
||||
'kwargs': {'col_set': 'indus_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': 'industry_code_cc.csv',
|
||||
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
|
||||
{'class': 'DDBStFlagHandler',
|
||||
'kwargs': {'col_set': 'st_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['ST_Y',
|
||||
'ST_S',
|
||||
'ST_T',
|
||||
'ST_L',
|
||||
'ST_Z',
|
||||
'ST_X'],
|
||||
'table_name': 'stg_1day_wind_st_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
|
||||
'instruments': 'csiallx',
|
||||
'start_time': Timestamp('2019-11-23 00:00:00')},
|
||||
'module_path': 'qlib.contrib.data.agg_handler'},
|
||||
'load_end': datetime.date(2026, 2, 26),
|
||||
'load_start': Timestamp('2019-11-23 00:00:00'),
|
||||
'market': 'csiallx',
|
||||
'qlib_init': {'provider_uri': '/home/guofu/.qlib/data_ops/target',
|
||||
'region': 'cn'}}
|
||||
Query config:
|
||||
#alpha158: 1;
|
||||
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x71c5e04773e0>
|
||||
[2734404:MainThread](2026-02-26 22:19:46,550) INFO - qlib.timer - [log.py:117] - Time cost: 115.118s | Instruments filter: Done
|
||||
[2734404:MainThread](2026-02-26 22:19:53,556) INFO - qlib.timer - [log.py:117] - Time cost: 581.918s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_alpha158_handler.DDBAlpha158Handler object at 0x71c5e0475d60>) Done
|
||||
[2734404:MainThread](2026-02-26 22:19:53,557) INFO - qlib.timer - [log.py:117] - Time cost: 581.920s | Init data (<qlib.contrib.data.ddb_handlers.ddb_alpha158_handler.DDBAlpha158Handler object at 0x71c5e0475d60>) Done
|
||||
[2734404:MainThread](2026-02-26 22:19:53,560) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
|
||||
[2734404:MainThread](2026-02-26 22:19:57,060) INFO - qlib.timer - [log.py:117] - Time cost: 3.499s | fetch_df_by_index Done
|
||||
[2734404:MainThread](2026-02-26 22:19:58,834) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
|
||||
KMID KLEN ... VSUMD30 VSUMD60
|
||||
datetime instrument ...
|
||||
2019-11-25 SH600000 0.003325 0.011638 ... -0.238055 -0.010125
|
||||
SH600004 -0.013806 0.030012 ... -0.017610 0.039195
|
||||
SH600006 0.009238 0.016166 ... -0.034782 -0.014306
|
||||
SH600007 -0.014749 0.018879 ... -0.032427 0.034279
|
||||
SH600008 0.009259 0.024691 ... -0.063490 0.003978
|
||||
... ... ... ... ... ...
|
||||
2026-02-26 SZ301658 -0.017231 0.025231 ... -0.018706 0.003708
|
||||
SZ301662 0.060584 0.087834 ... -0.014658 -0.014613
|
||||
SZ301665 -0.012899 0.040541 ... 0.083229 0.055994
|
||||
SZ301678 0.018182 0.027879 ... -0.054124 0.014202
|
||||
SZ302132 0.001754 0.016416 ... -0.049558 -0.038667
|
||||
|
||||
[6908346 rows x 158 columns]
|
||||
[2734404:MainThread](2026-02-26 22:19:58,835) INFO - qlib.timer - [log.py:117] - Time cost: 5.276s | Fetching dataframe Done
|
||||
[2734404:MainThread](2026-02-26 22:19:59,042) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,Turnover as turnover,FreeTurnover as free_turnover,log(MarketValue) as log_size from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
|
||||
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ3
|
||||
[2734404:MainThread](2026-02-26 22:20:16,326) INFO - qlib.timer - [log.py:117] - Time cost: 17.485s | DDB query: Done
|
||||
[2734404:MainThread](2026-02-26 22:20:17,102) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2734404:MainThread](2026-02-26 22:20:17,676) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,con_rating_strength from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_gds_con_rating")
|
||||
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002833','SH
|
||||
[2734404:MainThread](2026-02-26 22:20:29,343) INFO - qlib.timer - [log.py:117] - Time cost: 11.676s | DDB query: Done
|
||||
[2734404:MainThread](2026-02-26 22:20:30,245) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-25 00:00:00
|
||||
[2734404:MainThread](2026-02-26 22:21:55,033) INFO - qlib.timer - [log.py:117] - Time cost: 81.592s | Instruments filter: Done
|
||||
[2734404:MainThread](2026-02-26 22:21:55,586) INFO - qlib.timer - [log.py:117] - Time cost: 116.751s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_market_ext_handler.DDBMarketExtHandler object at 0x71c7a492c4d0>) Done
|
||||
[2734404:MainThread](2026-02-26 22:21:55,587) INFO - qlib.timer - [log.py:117] - Time cost: 116.752s | Init data (<qlib.contrib.data.ddb_handlers.ddb_market_ext_handler.DDBMarketExtHandler object at 0x71c7a492c4d0>) Done
|
||||
[2734404:MainThread](2026-02-26 22:21:55,588) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
|
||||
[2734404:MainThread](2026-02-26 22:21:56,302) INFO - qlib.timer - [log.py:117] - Time cost: 0.713s | fetch_df_by_index Done
|
||||
[2734404:MainThread](2026-02-26 22:21:56,309) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
|
||||
turnover free_turnover log_size con_rating_strength
|
||||
datetime instrument
|
||||
2019-11-25 SH600000 0.0895 0.1639 17.339552 0.8214
|
||||
SH600004 0.9386 1.9173 15.039255 0.8125
|
||||
SH600006 0.2566 0.6479 13.680836 1.0000
|
||||
SH600007 0.1647 0.8513 14.335590 0.7500
|
||||
SH600008 0.1813 0.3362 14.435625 0.6875
|
||||
... ... ... ... ...
|
||||
2026-02-26 SZ301658 6.0785 6.0785 11.788368 NaN
|
||||
SZ301662 12.5950 12.5950 12.681215 NaN
|
||||
SZ301665 14.0077 14.0077 11.719415 NaN
|
||||
SZ301678 6.6518 6.6518 12.799973 NaN
|
||||
SZ302132 1.3868 3.0296 15.359885 NaN
|
||||
|
||||
[7623242 rows x 4 columns]
|
||||
[2734404:MainThread](2026-02-26 22:21:56,310) INFO - qlib.timer - [log.py:117] - Time cost: 0.722s | Fetching dataframe Done
|
||||
[2734404:MainThread](2026-02-26 22:21:56,327) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,IsZt,IsDt,IsN,IsXD,IsXR,IsDR from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
|
||||
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657',
|
||||
[2734404:MainThread](2026-02-26 22:22:17,215) INFO - qlib.timer - [log.py:117] - Time cost: 20.899s | DDB query: Done
|
||||
[2734404:MainThread](2026-02-26 22:22:17,952) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2734404:MainThread](2026-02-26 22:22:18,463) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,open_limit,close_limit,low_limit,open_stop,close_stop,high_stop from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_wind_market_flag")
|
||||
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','
|
||||
[2734404:MainThread](2026-02-26 22:22:38,963) INFO - qlib.timer - [log.py:117] - Time cost: 20.509s | DDB query: Done
|
||||
[2734404:MainThread](2026-02-26 22:22:39,774) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2734404:MainThread](2026-02-26 22:24:07,744) INFO - qlib.timer - [log.py:117] - Time cost: 84.654s | Instruments filter: Done
|
||||
[2734404:MainThread](2026-02-26 22:24:08,702) INFO - qlib.timer - [log.py:117] - Time cost: 132.391s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_market_flag_handler.DDBMarketFlagHandler object at 0x71c5e0847cb0>) Done
|
||||
[2734404:MainThread](2026-02-26 22:24:08,703) INFO - qlib.timer - [log.py:117] - Time cost: 132.392s | Init data (<qlib.contrib.data.ddb_handlers.ddb_market_flag_handler.DDBMarketFlagHandler object at 0x71c5e0847cb0>) Done
|
||||
[2734404:MainThread](2026-02-26 22:24:08,704) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
|
||||
[2734404:MainThread](2026-02-26 22:24:09,549) INFO - qlib.timer - [log.py:117] - Time cost: 0.844s | fetch_df_by_index Done
|
||||
[2734404:MainThread](2026-02-26 22:24:09,561) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
|
||||
IsZt IsDt IsN ... open_stop close_stop high_stop
|
||||
datetime instrument ...
|
||||
2019-11-25 SH600000 False False False ... False False False
|
||||
SH600004 False False False ... False False False
|
||||
SH600006 False False False ... False False False
|
||||
SH600007 False False False ... False False False
|
||||
SH600008 False False False ... False False False
|
||||
... ... ... ... ... ... ... ...
|
||||
2026-02-26 SZ301658 False False False ... False False False
|
||||
SZ301662 False False False ... False False False
|
||||
SZ301665 False False False ... False False False
|
||||
SZ301678 False False False ... False False False
|
||||
SZ302132 False False False ... False False False
|
||||
|
||||
[6925320 rows x 12 columns]
|
||||
[2734404:MainThread](2026-02-26 22:24:09,562) INFO - qlib.timer - [log.py:117] - Time cost: 0.858s | Fetching dataframe Done
|
||||
[2734404:MainThread](2026-02-26 22:24:09,760) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,gds_CC10,gds_CC11,gds_CC12,gds_CC20,gds_CC21,gds_CC22,gds_CC23,gds_CC24,gds_CC25,gds_CC26,gds_CC27,gds_CC28,gds_CC30,gds_CC31,gds_CC32,gds_CC33,gds_CC34,gds_CC35,gds_CC36,gds_CC37,gds_CC40,gds_CC41,gds_CC42,gds_CC43,gds_CC50,gds_CC60,gds_CC61,gds_CC62,gds_CC63,gds_CC70 from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_gds_indus_flag_cc1")
|
||||
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','S
|
||||
[2734404:MainThread](2026-02-26 22:24:11,809) INFO - qlib.timer - [log.py:117] - Time cost: 2.238s | DDB query: Done
|
||||
[2734404:MainThread](2026-02-26 22:24:11,822) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2734404:MainThread](2026-02-26 22:25:28,259) INFO - qlib.timer - [log.py:117] - Time cost: 76.433s | Instruments filter: Done
|
||||
[2734404:MainThread](2026-02-26 22:25:28,285) INFO - qlib.timer - [log.py:117] - Time cost: 78.720s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler.DDBIndusFlagHandler object at 0x71c139b2af60>) Done
|
||||
[2734404:MainThread](2026-02-26 22:25:28,285) INFO - qlib.timer - [log.py:117] - Time cost: 78.720s | Init data (<qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler.DDBIndusFlagHandler object at 0x71c139b2af60>) Done
|
||||
[2734404:MainThread](2026-02-26 22:25:28,286) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
|
||||
[2734404:MainThread](2026-02-26 22:25:28,290) INFO - qlib.timer - [log.py:117] - Time cost: 0.003s | fetch_df_by_index Done
|
||||
[2734404:MainThread](2026-02-26 22:25:28,310) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
|
||||
gds_CC10 gds_CC11 ... gds_CC63 gds_CC70
|
||||
datetime instrument ...
|
||||
2026-02-09 SH600000 False False ... False False
|
||||
SH600004 False False ... False False
|
||||
SH600006 False False ... False False
|
||||
SH600007 False False ... False False
|
||||
SH600008 False False ... False False
|
||||
... ... ... ... ... ...
|
||||
2026-02-26 SZ301658 False False ... False False
|
||||
SZ301662 False False ... False False
|
||||
SZ301665 False False ... False False
|
||||
SZ301678 False False ... False False
|
||||
SZ302132 False False ... False False
|
||||
|
||||
[41168 rows x 30 columns]
|
||||
[2734404:MainThread](2026-02-26 22:25:28,311) INFO - qlib.timer - [log.py:117] - Time cost: 0.025s | Fetching dataframe Done
|
||||
[2734404:MainThread](2026-02-26 22:25:28,470) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,ST_Y,ST_S,ST_T,ST_L,ST_Z,ST_X from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_wind_st_flag")
|
||||
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002
|
||||
[2734404:MainThread](2026-02-26 22:25:58,108) INFO - qlib.timer - [log.py:117] - Time cost: 29.791s | DDB query: Done
|
||||
[2734404:MainThread](2026-02-26 22:25:58,818) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2734404:MainThread](2026-02-26 22:27:21,291) INFO - qlib.timer - [log.py:117] - Time cost: 81.957s | Instruments filter: Done
|
||||
[2734404:MainThread](2026-02-26 22:27:21,828) INFO - qlib.timer - [log.py:117] - Time cost: 113.516s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_st_flag_handler.DDBStFlagHandler object at 0x71c5e0981ca0>) Done
|
||||
[2734404:MainThread](2026-02-26 22:27:21,829) INFO - qlib.timer - [log.py:117] - Time cost: 113.517s | Init data (<qlib.contrib.data.ddb_handlers.ddb_st_flag_handler.DDBStFlagHandler object at 0x71c5e0981ca0>) Done
|
||||
[2734404:MainThread](2026-02-26 22:27:21,830) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
|
||||
[2734404:MainThread](2026-02-26 22:27:22,439) INFO - qlib.timer - [log.py:117] - Time cost: 0.608s | fetch_df_by_index Done
|
||||
[2734404:MainThread](2026-02-26 22:27:22,448) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
|
||||
ST_Y ST_S ST_T ST_L ST_Z ST_X
|
||||
datetime instrument
|
||||
2019-11-25 SH600000 False False False False False False
|
||||
SH600004 False False False False False False
|
||||
SH600006 False False False False False False
|
||||
SH600007 False False False False False False
|
||||
SH600008 False False False False False False
|
||||
... ... ... ... ... ... ...
|
||||
2026-02-26 SZ301658 False False False False False False
|
||||
SZ301662 False False False False False False
|
||||
SZ301665 False False False False False False
|
||||
SZ301678 False False False False False False
|
||||
SZ302132 False False False False False False
|
||||
|
||||
[6925323 rows x 6 columns]
|
||||
[2734404:MainThread](2026-02-26 22:27:22,448) INFO - qlib.timer - [log.py:117] - Time cost: 0.618s | Fetching dataframe Done
|
||||
/home/guofu/.venv/alpha2/lib/python3.12/site-packages/qlib/contrib/utils/paral.py:22: FutureWarning: 'M' is deprecated and will be removed in a future version, please use 'ME' instead.
|
||||
group_list = [_df.resample("M", level="datetime")\
|
||||
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x71c5e4d5d940>
|
||||
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x71c5e4d5d940>
|
||||
Query config:
|
||||
#concepts: 2;
|
||||
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x71c13b9b8b60>
|
||||
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x71c13b9b8b60>
|
||||
Field list: ['gds_CC10', 'gds_CC11', 'gds_CC12', 'gds_CC20', 'gds_CC21', 'gds_CC22', 'gds_CC23', 'gds_CC24', 'gds_CC25', 'gds_CC26', 'gds_CC27', 'gds_CC28', 'gds_CC30', 'gds_CC31', 'gds_CC32', 'gds_CC33', 'gds_CC34', 'gds_CC35', 'gds_CC36', 'gds_CC37', 'gds_CC40', 'gds_CC41', 'gds_CC42', 'gds_CC43', 'gds_CC50', 'gds_CC60', 'gds_CC61', 'gds_CC62', 'gds_CC63', 'gds_CC70']
|
||||
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x71c139b28aa0>
|
||||
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x71c5e07e8f20>
|
||||
[2734404:MainThread](2026-02-26 22:27:25,764) INFO - qlib.timer - [log.py:117] - Time cost: 3.315s | Concat index: Done
|
||||
[2734404:MainThread](2026-02-26 22:27:25,766) INFO - qlib.timer - [log.py:117] - Time cost: 0.001s | Sort index: Done
|
||||
[2734404:MainThread](2026-02-26 22:27:29,485) INFO - qlib.timer - [log.py:117] - Time cost: 3.718s | Creating SepDataFrame: Done
|
||||
[2734404:MainThread](2026-02-26 22:27:30,310) INFO - qlib.timer - [log.py:117] - Time cost: 1038.675s | Loading data (<qlib.contrib.data.agg_handler.AggHandler object at 0x71c5e17ec230>) Done
|
||||
[2734404:MainThread](2026-02-26 22:27:30,311) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
|
||||
[]
|
||||
[2734404:MainThread](2026-02-26 22:27:30,313) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
|
||||
[2734404:MainThread](2026-02-26 22:27:30,318) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
|
||||
[]
|
||||
[2734404:MainThread](2026-02-26 22:27:30,319) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
|
||||
[2734404:MainThread](2026-02-26 22:27:30,322) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
|
||||
[]
|
||||
[2734404:MainThread](2026-02-26 22:27:30,323) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
|
||||
[2734404:MainThread](2026-02-26 22:27:30,326) INFO - qlib.timer - [log.py:117] - Time cost: 0.015s | fit & process data Done
|
||||
[2734404:MainThread](2026-02-26 22:27:30,327) INFO - qlib.timer - [log.py:117] - Time cost: 1038.692s | Init data (<qlib.contrib.data.agg_handler.AggHandler object at 0x71c5e17ec230>) Done
|
||||
All processors are readonly
|
||||
All processors are readonly
|
||||
All processors are readonly
|
||||
ERROR: Failed to load data from Qlib pipeline: 'SepDataFrame' object has no attribute 'shape'
|
||||
@ -1,321 +0,0 @@
|
||||
[2739486:MainThread](2026-02-26 22:59:30,849) INFO - qlib.Initialization - [config.py:413] - default_conf: client.
|
||||
[2739486:MainThread](2026-02-26 22:59:30,854) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
|
||||
[2739486:MainThread](2026-02-26 22:59:30,855) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/qlib/default/data_ops/target')}
|
||||
================================================================================
|
||||
DUMP GOLD-STANDARD DATA FROM QLIB PIPELINE
|
||||
================================================================================
|
||||
Date Range: 2020-01-02 to 2020-01-10
|
||||
Output Directory: /home/guofu/Workspaces/alpha_lab/stock_1d/d033/alpha158_beta/data
|
||||
Qlib Dataset Path: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/
|
||||
|
||||
Step 1: Loading data from Qlib pipeline...
|
||||
Loading since_date=2020-01-02
|
||||
Loading data with handler (load_start=2019-12-13)...
|
||||
Will use `placehorder_value` from module: qlib.contrib.data.config
|
||||
Will init handler object from config:
|
||||
{'data_handler_config': {'ddb_config': {'host': '192.168.1.146',
|
||||
'password': '123456',
|
||||
'port': 8848,
|
||||
'username': 'admin'},
|
||||
'end_time': datetime.date(2026, 2, 26),
|
||||
'handler_list': [{'class': 'DDBAlpha158Handler',
|
||||
'kwargs': {'col_set': 'feature',
|
||||
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
|
||||
'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': 'alpha158',
|
||||
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
|
||||
{'class': 'DDBMarketExtHandler',
|
||||
'kwargs': {'col_set': 'feature_ext',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['Turnover '
|
||||
'as '
|
||||
'turnover',
|
||||
'FreeTurnover '
|
||||
'as '
|
||||
'free_turnover',
|
||||
'log(MarketValue) '
|
||||
'as '
|
||||
'log_size'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['con_rating_strength'],
|
||||
'table_name': 'stg_1day_gds_con_rating'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
|
||||
{'class': 'DDBMarketFlagHandler',
|
||||
'kwargs': {'col_set': 'feature_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['IsZt',
|
||||
'IsDt',
|
||||
'IsN',
|
||||
'IsXD',
|
||||
'IsXR',
|
||||
'IsDR'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['open_limit',
|
||||
'close_limit',
|
||||
'low_limit',
|
||||
'open_stop',
|
||||
'close_stop',
|
||||
'high_stop'],
|
||||
'table_name': 'stg_1day_wind_market_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
|
||||
{'class': 'DDBIndusFlagHandler',
|
||||
'kwargs': {'col_set': 'indus_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': 'industry_code_cc.csv',
|
||||
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
|
||||
{'class': 'DDBStFlagHandler',
|
||||
'kwargs': {'col_set': 'st_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['ST_Y',
|
||||
'ST_S',
|
||||
'ST_T',
|
||||
'ST_L',
|
||||
'ST_Z',
|
||||
'ST_X'],
|
||||
'table_name': 'stg_1day_wind_st_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
|
||||
'instruments': 'csiallx',
|
||||
'start_time': Timestamp('2019-11-23 00:00:00')},
|
||||
'ddb_config': {'host': '192.168.1.146',
|
||||
'password': '123456',
|
||||
'port': 8848,
|
||||
'username': 'admin'},
|
||||
'handler': {'class': 'AggHandler',
|
||||
'kwargs': {'ddb_config': {'host': '192.168.1.146',
|
||||
'password': '123456',
|
||||
'port': 8848,
|
||||
'username': 'admin'},
|
||||
[2739486:MainThread](2026-02-26 22:59:30,878) INFO - qlib.AggHandler - [agg_handler.py:79] - Will use AggHandler
|
||||
[2739486:MainThread](2026-02-26 22:59:30,878) WARNING - qlib.DataLoaderDH - [agg_handler.py:191] - instruments[csiallx] is ignored
|
||||
[2739486:MainThread](2026-02-26 22:59:30,938) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,HIGH0,LOW0,VWAP0,ROC5,ROC10,ROC20,ROC30,ROC60,MA5,MA10,MA20,MA30,MA60,STD5,STD10,STD20,STD30,STD60,BETA5,BETA10,BETA20,BETA30,BETA60,RSQR5,RSQR10,RSQR20,RSQR30,RSQR60,RESI5,RESI10,RESI20,RESI30,RESI60,MAX5,MAX10,MAX20,MAX30,MAX60,MIN5,MIN10,MIN20,MIN30,MIN60,QTLU5,QTLU10,QTLU20,QTLU30,QTLU60,QTLD5,QTLD10,QTLD20,QTLD30,QTLD60,RANK5,RANK10,RANK20,RANK30,RANK60,RSV5,RSV10,RSV20,RSV30,RSV60,IMAX5,IMAX
|
||||
[2739486:MainThread](2026-02-26 23:07:16,353) INFO - qlib.timer - [log.py:117] - Time cost: 465.464s | DDB query: Done
|
||||
[2739486:MainThread](2026-02-26 23:07:17,149) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
'end_time': datetime.date(2026, 2, 26),
|
||||
'handler_list': [{'class': 'DDBAlpha158Handler',
|
||||
'kwargs': {'col_set': 'feature',
|
||||
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
|
||||
'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': 'alpha158',
|
||||
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
|
||||
{'class': 'DDBMarketExtHandler',
|
||||
'kwargs': {'col_set': 'feature_ext',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['Turnover '
|
||||
'as '
|
||||
'turnover',
|
||||
'FreeTurnover '
|
||||
'as '
|
||||
'free_turnover',
|
||||
'log(MarketValue) '
|
||||
'as '
|
||||
'log_size'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['con_rating_strength'],
|
||||
'table_name': 'stg_1day_gds_con_rating'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
|
||||
{'class': 'DDBMarketFlagHandler',
|
||||
'kwargs': {'col_set': 'feature_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['IsZt',
|
||||
'IsDt',
|
||||
'IsN',
|
||||
'IsXD',
|
||||
'IsXR',
|
||||
'IsDR'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['open_limit',
|
||||
'close_limit',
|
||||
'low_limit',
|
||||
'open_stop',
|
||||
'close_stop',
|
||||
'high_stop'],
|
||||
'table_name': 'stg_1day_wind_market_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
|
||||
{'class': 'DDBIndusFlagHandler',
|
||||
'kwargs': {'col_set': 'indus_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': 'industry_code_cc.csv',
|
||||
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
|
||||
{'class': 'DDBStFlagHandler',
|
||||
'kwargs': {'col_set': 'st_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['ST_Y',
|
||||
'ST_S',
|
||||
'ST_T',
|
||||
'ST_L',
|
||||
'ST_Z',
|
||||
'ST_X'],
|
||||
'table_name': 'stg_1day_wind_st_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
|
||||
'instruments': 'csiallx',
|
||||
'start_time': Timestamp('2019-11-23 00:00:00')},
|
||||
'module_path': 'qlib.contrib.data.agg_handler'},
|
||||
'load_end': datetime.date(2026, 2, 26),
|
||||
'load_start': Timestamp('2019-11-23 00:00:00'),
|
||||
'market': 'csiallx',
|
||||
'qlib_init': {'provider_uri': '/home/guofu/.qlib/data_ops/target',
|
||||
'region': 'cn'}}
|
||||
Query config:
|
||||
#alpha158: 1;
|
||||
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x71847694be90>
|
||||
[2739486:MainThread](2026-02-26 23:09:19,001) INFO - qlib.timer - [log.py:117] - Time cost: 112.707s | Instruments filter: Done
|
||||
[2739486:MainThread](2026-02-26 23:09:26,016) INFO - qlib.timer - [log.py:117] - Time cost: 595.133s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_alpha158_handler.DDBAlpha158Handler object at 0x7184769a5fa0>) Done
|
||||
[2739486:MainThread](2026-02-26 23:09:26,017) INFO - qlib.timer - [log.py:117] - Time cost: 595.135s | Init data (<qlib.contrib.data.ddb_handlers.ddb_alpha158_handler.DDBAlpha158Handler object at 0x7184769a5fa0>) Done
|
||||
[2739486:MainThread](2026-02-26 23:09:26,019) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
|
||||
[2739486:MainThread](2026-02-26 23:09:29,432) INFO - qlib.timer - [log.py:117] - Time cost: 3.412s | fetch_df_by_index Done
|
||||
[2739486:MainThread](2026-02-26 23:09:31,228) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
|
||||
KMID KLEN ... VSUMD30 VSUMD60
|
||||
datetime instrument ...
|
||||
2019-11-25 SH600000 0.003325 0.011638 ... -0.238055 -0.010125
|
||||
SH600004 -0.013806 0.030012 ... -0.017610 0.039195
|
||||
SH600006 0.009238 0.016166 ... -0.034782 -0.014306
|
||||
SH600007 -0.014749 0.018879 ... -0.032427 0.034279
|
||||
SH600008 0.009259 0.024691 ... -0.063490 0.003978
|
||||
... ... ... ... ... ...
|
||||
2026-02-26 SZ301658 -0.017231 0.025231 ... -0.018706 0.003708
|
||||
SZ301662 0.060584 0.087834 ... -0.014658 -0.014613
|
||||
SZ301665 -0.012899 0.040541 ... 0.083229 0.055994
|
||||
SZ301678 0.018182 0.027879 ... -0.054124 0.014202
|
||||
SZ302132 0.001754 0.016416 ... -0.049558 -0.038667
|
||||
|
||||
[6908346 rows x 158 columns]
|
||||
[2739486:MainThread](2026-02-26 23:09:31,229) INFO - qlib.timer - [log.py:117] - Time cost: 5.211s | Fetching dataframe Done
|
||||
[2739486:MainThread](2026-02-26 23:09:31,242) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,Turnover as turnover,FreeTurnover as free_turnover,log(MarketValue) as log_size from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
|
||||
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ3
|
||||
[2739486:MainThread](2026-02-26 23:09:54,142) INFO - qlib.timer - [log.py:117] - Time cost: 22.909s | DDB query: Done
|
||||
[2739486:MainThread](2026-02-26 23:09:54,927) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2739486:MainThread](2026-02-26 23:09:55,507) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,con_rating_strength from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_gds_con_rating")
|
||||
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002833','SH
|
||||
[2739486:MainThread](2026-02-26 23:10:10,691) INFO - qlib.timer - [log.py:117] - Time cost: 15.192s | DDB query: Done
|
||||
[2739486:MainThread](2026-02-26 23:10:11,588) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2739486:MainThread](2026-02-26 23:11:37,528) INFO - qlib.timer - [log.py:117] - Time cost: 82.525s | Instruments filter: Done
|
||||
[2739486:MainThread](2026-02-26 23:11:38,259) INFO - qlib.timer - [log.py:117] - Time cost: 127.029s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_market_ext_handler.DDBMarketExtHandler object at 0x7185777e3e90>) Done
|
||||
[2739486:MainThread](2026-02-26 23:11:38,260) INFO - qlib.timer - [log.py:117] - Time cost: 127.030s | Init data (<qlib.contrib.data.ddb_handlers.ddb_market_ext_handler.DDBMarketExtHandler object at 0x7185777e3e90>) Done
|
||||
[2739486:MainThread](2026-02-26 23:11:38,261) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
|
||||
[2739486:MainThread](2026-02-26 23:11:39,000) INFO - qlib.timer - [log.py:117] - Time cost: 0.738s | fetch_df_by_index Done
|
||||
[2739486:MainThread](2026-02-26 23:11:39,009) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
|
||||
turnover free_turnover log_size con_rating_strength
|
||||
datetime instrument
|
||||
2019-11-25 SH600000 0.0895 0.1639 17.339552 0.8214
|
||||
SH600004 0.9386 1.9173 15.039255 0.8125
|
||||
SH600006 0.2566 0.6479 13.680836 1.0000
|
||||
SH600007 0.1647 0.8513 14.335590 0.7500
|
||||
SH600008 0.1813 0.3362 14.435625 0.6875
|
||||
... ... ... ... ...
|
||||
2026-02-26 SZ301658 6.0785 6.0785 11.788368 NaN
|
||||
SZ301662 12.5950 12.5950 12.681215 1.0000
|
||||
SZ301665 14.0077 14.0077 11.719415 1.0000
|
||||
SZ301678 6.6518 6.6518 12.799973 0.7500
|
||||
SZ302132 1.3868 3.0296 15.359885 0.8750
|
||||
|
||||
[7623255 rows x 4 columns]
|
||||
[2739486:MainThread](2026-02-26 23:11:39,010) INFO - qlib.timer - [log.py:117] - Time cost: 0.749s | Fetching dataframe Done
|
||||
[2739486:MainThread](2026-02-26 23:11:39,191) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,IsZt,IsDt,IsN,IsXD,IsXR,IsDR from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
|
||||
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657',
|
||||
[2739486:MainThread](2026-02-26 23:12:05,839) INFO - qlib.timer - [log.py:117] - Time cost: 26.825s | DDB query: Done
|
||||
[2739486:MainThread](2026-02-26 23:12:06,554) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2739486:MainThread](2026-02-26 23:12:07,075) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,open_limit,close_limit,low_limit,open_stop,close_stop,high_stop from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_wind_market_flag")
|
||||
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','
|
||||
[2739486:MainThread](2026-02-26 23:12:32,695) INFO - qlib.timer - [log.py:117] - Time cost: 25.629s | DDB query: Done
|
||||
[2739486:MainThread](2026-02-26 23:12:33,566) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2739486:MainThread](2026-02-26 23:14:02,232) INFO - qlib.timer - [log.py:117] - Time cost: 85.158s | Instruments filter: Done
|
||||
[2739486:MainThread](2026-02-26 23:14:03,155) INFO - qlib.timer - [log.py:117] - Time cost: 144.143s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_market_flag_handler.DDBMarketFlagHandler object at 0x718478991880>) Done
|
||||
[2739486:MainThread](2026-02-26 23:14:03,156) INFO - qlib.timer - [log.py:117] - Time cost: 144.144s | Init data (<qlib.contrib.data.ddb_handlers.ddb_market_flag_handler.DDBMarketFlagHandler object at 0x718478991880>) Done
|
||||
[2739486:MainThread](2026-02-26 23:14:03,156) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
|
||||
[2739486:MainThread](2026-02-26 23:14:04,046) INFO - qlib.timer - [log.py:117] - Time cost: 0.889s | fetch_df_by_index Done
|
||||
[2739486:MainThread](2026-02-26 23:14:04,060) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
|
||||
IsZt IsDt IsN ... open_stop close_stop high_stop
|
||||
datetime instrument ...
|
||||
2019-11-25 SH600000 False False False ... False False False
|
||||
SH600004 False False False ... False False False
|
||||
SH600006 False False False ... False False False
|
||||
SH600007 False False False ... False False False
|
||||
SH600008 False False False ... False False False
|
||||
... ... ... ... ... ... ... ...
|
||||
2026-02-26 SZ301658 False False False ... False False False
|
||||
SZ301662 False False False ... False False False
|
||||
SZ301665 False False False ... False False False
|
||||
SZ301678 False False False ... False False False
|
||||
SZ302132 False False False ... False False False
|
||||
|
||||
[6925320 rows x 12 columns]
|
||||
[2739486:MainThread](2026-02-26 23:14:04,061) INFO - qlib.timer - [log.py:117] - Time cost: 0.904s | Fetching dataframe Done
|
||||
[2739486:MainThread](2026-02-26 23:14:04,079) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,gds_CC10,gds_CC11,gds_CC12,gds_CC20,gds_CC21,gds_CC22,gds_CC23,gds_CC24,gds_CC25,gds_CC26,gds_CC27,gds_CC28,gds_CC30,gds_CC31,gds_CC32,gds_CC33,gds_CC34,gds_CC35,gds_CC36,gds_CC37,gds_CC40,gds_CC41,gds_CC42,gds_CC43,gds_CC50,gds_CC60,gds_CC61,gds_CC62,gds_CC63,gds_CC70 from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_gds_indus_flag_cc1")
|
||||
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','S
|
||||
[2739486:MainThread](2026-02-26 23:14:06,440) INFO - qlib.timer - [log.py:117] - Time cost: 2.370s | DDB query: Done
|
||||
[2739486:MainThread](2026-02-26 23:14:06,448) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2739486:MainThread](2026-02-26 23:15:23,146) INFO - qlib.timer - [log.py:117] - Time cost: 76.695s | Instruments filter: Done
|
||||
[2739486:MainThread](2026-02-26 23:15:23,184) INFO - qlib.timer - [log.py:117] - Time cost: 79.120s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler.DDBIndusFlagHandler object at 0x7184782fa0c0>) Done
|
||||
[2739486:MainThread](2026-02-26 23:15:23,185) INFO - qlib.timer - [log.py:117] - Time cost: 79.121s | Init data (<qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler.DDBIndusFlagHandler object at 0x7184782fa0c0>) Done
|
||||
[2739486:MainThread](2026-02-26 23:15:23,186) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
|
||||
[2739486:MainThread](2026-02-26 23:15:23,190) INFO - qlib.timer - [log.py:117] - Time cost: 0.003s | fetch_df_by_index Done
|
||||
[2739486:MainThread](2026-02-26 23:15:23,210) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
|
||||
gds_CC10 gds_CC11 ... gds_CC63 gds_CC70
|
||||
datetime instrument ...
|
||||
2026-02-09 SH600000 False False ... False False
|
||||
SH600004 False False ... False False
|
||||
SH600006 False False ... False False
|
||||
SH600007 False False ... False False
|
||||
SH600008 False False ... False False
|
||||
... ... ... ... ... ...
|
||||
2026-02-26 SZ301658 False False ... False False
|
||||
SZ301662 False False ... False False
|
||||
SZ301665 False False ... False False
|
||||
SZ301678 False False ... False False
|
||||
SZ302132 False False ... False False
|
||||
|
||||
[41168 rows x 30 columns]
|
||||
[2739486:MainThread](2026-02-26 23:15:23,210) INFO - qlib.timer - [log.py:117] - Time cost: 0.025s | Fetching dataframe Done
|
||||
[2739486:MainThread](2026-02-26 23:15:23,226) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,ST_Y,ST_S,ST_T,ST_L,ST_Z,ST_X from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_wind_st_flag")
|
||||
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002
|
||||
[2739486:MainThread](2026-02-26 23:15:53,388) INFO - qlib.timer - [log.py:117] - Time cost: 30.171s | DDB query: Done
|
||||
[2739486:MainThread](2026-02-26 23:15:54,166) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
@ -1,104 +0,0 @@
|
||||
[2745445:MainThread](2026-02-26 23:18:06,410) INFO - qlib.Initialization - [config.py:413] - default_conf: client.
|
||||
[2745445:MainThread](2026-02-26 23:18:06,414) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
|
||||
[2745445:MainThread](2026-02-26 23:18:06,415) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/qlib/default/data_ops/target')}
|
||||
================================================================================
|
||||
DUMP GOLD-STANDARD RAW DATA FROM QLIB PIPELINE
|
||||
================================================================================
|
||||
Date Range: 2020-01-02 to 2020-01-10
|
||||
Output Directory: ../data/
|
||||
Qlib Dataset Path: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/
|
||||
|
||||
Step 1: Loading raw data from Qlib pipeline...
|
||||
Loading raw data from handler (load_start=2019-12-13)...
|
||||
Will use `placehorder_value` from module: qlib.contrib.data.config
|
||||
Will init handler object from config:
|
||||
{'data_handler_config': {'ddb_config': {'host': '192.168.1.146',
|
||||
'password': '123456',
|
||||
'port': 8848,
|
||||
'username': 'admin'},
|
||||
'end_time': datetime.date(2026, 2, 26),
|
||||
'handler_list': [{'class': 'DDBAlpha158Handler',
|
||||
'kwargs': {'col_set': 'feature',
|
||||
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
|
||||
'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': 'alpha158',
|
||||
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
|
||||
{'class': 'DDBMarketExtHandler',
|
||||
'kwargs': {'col_set': 'feature_ext',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['Turnover '
|
||||
'as '
|
||||
'turnover',
|
||||
'FreeTurnover '
|
||||
'as '
|
||||
'free_turnover',
|
||||
'log(MarketValue) '
|
||||
'as '
|
||||
'log_size'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['con_rating_strength'],
|
||||
'table_name': 'stg_1day_gds_con_rating'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
|
||||
{'class': 'DDBMarketFlagHandler',
|
||||
'kwargs': {'col_set': 'feature_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['IsZt',
|
||||
'IsDt',
|
||||
'IsN',
|
||||
'IsXD',
|
||||
'IsXR',
|
||||
'IsDR'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['open_limit',
|
||||
'close_limit',
|
||||
'low_limit',
|
||||
'open_stop',
|
||||
'close_stop',
|
||||
'high_stop'],
|
||||
'table_name': 'stg_1day_wind_market_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
|
||||
{'class': 'DDBIndusFlagHandler',
|
||||
'kwargs': {'col_set': 'indus_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': 'industry_code_cc.csv',
|
||||
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
|
||||
{'class': 'DDBStFlagHandler',
|
||||
'kwargs': {'col_set': 'st_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['ST_Y',
|
||||
'ST_S',
|
||||
'ST_T',
|
||||
'ST_L',
|
||||
'ST_Z',
|
||||
'ST_X'],
|
||||
'table_name': 'stg_1day_wind_st_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
|
||||
'instruments': 'csiallx',
|
||||
'start_time': Timestamp('2019-11-23 00:00:00')},
|
||||
'ddb_config': {'host': '192.168.1.146',
|
||||
'password': '123456',
|
||||
'port': 8848,
|
||||
'username': 'admin'},
|
||||
'handler': {'class': 'AggHandler',
|
||||
'kwargs': {'ddb_config': {'host': '192.168.1.146',
|
||||
'password': '123456',
|
||||
'port': 8848,
|
||||
'username': 'admin'},
|
||||
'end_time': datetime.date(2026, 2, 26),
|
||||
'handler_list': [{[2745445:MainThread](2026-02-26 23:18:06,436) INFO - qlib.AggHandler - [agg_handler.py:79] - Will use AggHandler
|
||||
[2745445:MainThread](2026-02-26 23:18:06,437) WARNING - qlib.DataLoaderDH - [agg_handler.py:191] - instruments[csiallx] is ignored
|
||||
[2745445:MainThread](2026-02-26 23:18:06,492) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,HIGH0,LOW0,VWAP0,ROC5,ROC10,ROC20,ROC30,ROC60,MA5,MA10,MA20,MA30,MA60,STD5,STD10,STD20,STD30,STD60,BETA5,BETA10,BETA20,BETA30,BETA60,RSQR5,RSQR10,RSQR20,RSQR30,RSQR60,RESI5,RESI10,RESI20,RESI30,RESI60,MAX5,MAX10,MAX20,MAX30,MAX60,MIN5,MIN10,MIN20,MIN30,MIN60,QTLU5,QTLU10,QTLU20,QTLU30,QTLU60,QTLD5,QTLD10,QTLD20,QTLD30,QTLD60,RANK5,RANK10,RANK20,RANK30,RANK60,RSV5,RSV10,RSV20,RSV30,RSV60,IMAX5,IMAX
|
||||
@ -1,103 +0,0 @@
|
||||
[2746177:MainThread](2026-02-26 23:21:56,618) INFO - qlib.Initialization - [config.py:413] - default_conf: client.
|
||||
[2746177:MainThread](2026-02-26 23:21:56,622) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
|
||||
[2746177:MainThread](2026-02-26 23:21:56,623) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/qlib/default/data_ops/target')}
|
||||
================================================================================
|
||||
DUMP GOLD-STANDARD RAW DATA FROM QLIB PIPELINE
|
||||
================================================================================
|
||||
Date Range: 2020-01-02 to 2020-01-10
|
||||
Output Directory: ../data/
|
||||
Qlib Dataset Path: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/
|
||||
|
||||
Step 1: Loading raw data from Qlib pipeline...
|
||||
Loading raw data from handler (load_start=2019-12-13)...
|
||||
Filtering instruments: ['SH600000', 'SH600004', 'SH600006', 'SH600007', 'SH600008']... (5 total)
|
||||
Will use `placehorder_value` from module: qlib.contrib.data.config
|
||||
Will init handler object from config:
|
||||
{'data_handler_config': {'ddb_config': {'host': '192.168.1.146',
|
||||
'password': '123456',
|
||||
'port': 8848,
|
||||
'username': 'admin'},
|
||||
'end_time': datetime.date(2026, 2, 26),
|
||||
'handler_list': [{'class': 'DDBAlpha158Handler',
|
||||
'kwargs': {'col_set': 'feature',
|
||||
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
|
||||
'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': 'alpha158',
|
||||
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
|
||||
{'class': 'DDBMarketExtHandler',
|
||||
'kwargs': {'col_set': 'feature_ext',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['Turnover '
|
||||
'as '
|
||||
'turnover',
|
||||
'FreeTurnover '
|
||||
'as '
|
||||
'free_turnover',
|
||||
'log(MarketValue) '
|
||||
'as '
|
||||
'log_size'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['con_rating_strength'],
|
||||
'table_name': 'stg_1day_gds_con_rating'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
|
||||
{'class': 'DDBMarketFlagHandler',
|
||||
'kwargs': {'col_set': 'feature_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['IsZt',
|
||||
'IsDt',
|
||||
'IsN',
|
||||
'IsXD',
|
||||
'IsXR',
|
||||
'IsDR'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['open_limit',
|
||||
'close_limit',
|
||||
'low_limit',
|
||||
'open_stop',
|
||||
'close_stop',
|
||||
'high_stop'],
|
||||
'table_name': 'stg_1day_wind_market_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
|
||||
{'class': 'DDBIndusFlagHandler',
|
||||
'kwargs': {'col_set': 'indus_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': 'industry_code_cc.csv',
|
||||
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
|
||||
{'class': 'DDBStFlagHandler',
|
||||
'kwargs': {'col_set': 'st_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['ST_Y',
|
||||
'ST_S',
|
||||
'ST_T',
|
||||
'ST_L',
|
||||
'ST_Z',
|
||||
'ST_X'],
|
||||
'table_name': 'stg_1day_wind_st_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
|
||||
'instruments': 'csiallx',
|
||||
'start_time': Timestamp('2019-11-23 00:00:00')},
|
||||
'ddb_config': {'host': '192.168.1.146',
|
||||
'password': '123456',
|
||||
'port': 8848,
|
||||
'username': 'admin'},
|
||||
'handler': {'class': 'AggHandler',
|
||||
'kwargs': {'ddb_config': {'host': '192.168.1.146',
|
||||
'password': '123456',
|
||||
'port': 8848,
|
||||
'username': 'admin'}[2746177:MainThread](2026-02-26 23:21:56,647) INFO - qlib.AggHandler - [agg_handler.py:79] - Will use AggHandler
|
||||
[2746177:MainThread](2026-02-26 23:21:56,648) WARNING - qlib.DataLoaderDH - [agg_handler.py:191] - instruments[csiallx] is ignored
|
||||
[2746177:MainThread](2026-02-26 23:21:56,716) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,HIGH0,LOW0,VWAP0,ROC5,ROC10,ROC20,ROC30,ROC60,MA5,MA10,MA20,MA30,MA60,STD5,STD10,STD20,STD30,STD60,BETA5,BETA10,BETA20,BETA30,BETA60,RSQR5,RSQR10,RSQR20,RSQR30,RSQR60,RESI5,RESI10,RESI20,RESI30,RESI60,MAX5,MAX10,MAX20,MAX30,MAX60,MIN5,MIN10,MIN20,MIN30,MIN60,QTLU5,QTLU10,QTLU20,QTLU30,QTLU60,QTLD5,QTLD10,QTLD20,QTLD30,QTLD60,RANK5,RANK10,RANK20,RANK30,RANK60,RSV5,RSV10,RSV20,RSV30,RSV60,IMAX5,IMAX
|
||||
@ -1,187 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Verify feature column order between standalone pipeline and qlib gold standard.
|
||||
|
||||
This script:
|
||||
1. Loads a small sample using the qlib pipeline
|
||||
2. Runs the same sample through the standalone generate_beta_embedding pipeline
|
||||
3. Compares the column order and feature values
|
||||
"""
|
||||
|
||||
import pickle as pkl
|
||||
import ruamel.yaml as yaml
|
||||
import pandas as pd
|
||||
import polars as pl
|
||||
import numpy as np
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Patch yaml.safe_load for compatibility
|
||||
_yaml = yaml.YAML(typ='safe', pure=True)
|
||||
def patched_safe_load(stream):
|
||||
import io
|
||||
if isinstance(stream, str):
|
||||
stream = io.StringIO(stream)
|
||||
return _yaml.load(stream)
|
||||
yaml.safe_load = patched_safe_load
|
||||
|
||||
# Add scripts directory to path
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'scripts'))
|
||||
|
||||
def main():
|
||||
print("=" * 70)
|
||||
print("VERIFY FEATURE ORDER: Standalone vs Qlib Gold Standard")
|
||||
print("=" * 70)
|
||||
|
||||
# Step 1: Load processor list
|
||||
print("\nStep 1: Loading processor list...")
|
||||
proc_path = "/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/proc_list.proc"
|
||||
with open(proc_path, "rb") as f:
|
||||
proc_list = pkl.load(f)
|
||||
print(f" Loaded {len(proc_list)} processors")
|
||||
|
||||
# Step 2: Load small sample from qlib pipeline
|
||||
print("\nStep 2: Loading sample from qlib pipeline...")
|
||||
|
||||
import qlib
|
||||
from qlib.config import REG_CN
|
||||
qlib.init(provider_uri='/home/guofu/.qlib/data_ops/target', region=REG_CN)
|
||||
|
||||
from qlib.workflow.cli import sys_config
|
||||
from qlib.utils import fill_placeholder
|
||||
import datetime as dt
|
||||
|
||||
yaml_path = "/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/handler.yaml"
|
||||
with open(yaml_path) as fin:
|
||||
config = yaml.safe_load(fin)
|
||||
|
||||
sys_config(config, "qlib.contrib.data.config")
|
||||
qlib.init(**config.get("qlib_init"))
|
||||
|
||||
load_start = pd.to_datetime("2020-01-02") - dt.timedelta(days=20)
|
||||
placehorder_value = {
|
||||
"<SINCE_DATE>": load_start,
|
||||
"<TODAY>": dt.date.today()
|
||||
}
|
||||
|
||||
config_filled = fill_placeholder(config, placehorder_value)
|
||||
handler = qlib.init_instance_by_config(config_filled["handler"])
|
||||
handler_data = handler._data
|
||||
|
||||
# Get data from SepDataFrame
|
||||
if hasattr(handler_data, '_data'):
|
||||
df_dict = handler_data._data
|
||||
print(f" Handler groups: {list(df_dict.keys())}")
|
||||
|
||||
# Concatenate groups
|
||||
raw_dfs = []
|
||||
for group, df in df_dict.items():
|
||||
df_copy = df.copy()
|
||||
df_copy.columns = [f"{group}::{col}" for col in df_copy.columns]
|
||||
raw_dfs.append(df_copy)
|
||||
print(f" {group}: {len(df_copy.columns)} columns")
|
||||
|
||||
raw_df = pd.concat(raw_dfs, axis=1)
|
||||
print(f" Raw concatenated shape: {raw_df.shape}")
|
||||
|
||||
# Step 3: Apply processors to get gold standard features
|
||||
print("\nStep 3: Applying processors (qlib gold standard)...")
|
||||
from qlib.contrib.data.utils import apply_proc_list
|
||||
|
||||
# Strip group prefixes for processor application
|
||||
col_mapping = {col: col.split('::', 1)[1] for col in raw_df.columns if '::' in col}
|
||||
raw_df_stripped = raw_df.rename(columns=col_mapping)
|
||||
|
||||
# Convert bool to object for processor compatibility
|
||||
bool_cols = raw_df_stripped.select_dtypes(include=['bool']).columns
|
||||
for col in bool_cols:
|
||||
raw_df_stripped[col] = raw_df_stripped[col].astype(object)
|
||||
|
||||
df_gold = apply_proc_list(raw_df_stripped, proc_list=proc_list, with_fit=False)
|
||||
print(f" Gold standard shape after processors: {df_gold.shape}")
|
||||
|
||||
# Restore group prefixes
|
||||
reverse_mapping = {v: k for k, v in col_mapping.items()}
|
||||
df_gold = df_gold.rename(columns=reverse_mapping)
|
||||
|
||||
# Get gold standard column order
|
||||
gold_columns = list(df_gold.columns)
|
||||
print(f"\nGold standard column groups:")
|
||||
|
||||
feature_cols = [c for c in gold_columns if c.startswith('feature::')]
|
||||
feature_ext_cols = [c for c in gold_columns if c.startswith('feature_ext::')]
|
||||
feature_flag_cols = [c for c in gold_columns if c.startswith('feature_flag::')]
|
||||
indus_idx_cols = [c for c in gold_columns if c.startswith('indus_idx::')]
|
||||
|
||||
print(f" feature:: {len(feature_cols)} cols")
|
||||
print(f" feature_ext:: {len(feature_ext_cols)} cols")
|
||||
print(f" feature_flag:: {len(feature_flag_cols)} cols")
|
||||
print(f" indus_idx:: {len(indus_idx_cols)} cols")
|
||||
|
||||
# Step 4: Now run standalone pipeline on same data
|
||||
print("\nStep 4: Running standalone pipeline...")
|
||||
|
||||
# Load parquet data for same date range
|
||||
from generate_beta_embedding import load_all_data, merge_data_sources, apply_feature_pipeline
|
||||
|
||||
df_alpha, df_kline, df_flag, df_industry = load_all_data("2020-01-02", "2020-01-10")
|
||||
df_standalone = merge_data_sources(df_alpha, df_kline, df_flag, df_industry)
|
||||
|
||||
print(f" Standalone loaded shape: {df_standalone.shape}")
|
||||
|
||||
# Apply feature pipeline
|
||||
df_processed, feature_cols_standalone = apply_feature_pipeline(df_standalone)
|
||||
print(f" Standalone processed shape: {df_processed.shape}")
|
||||
print(f" Standalone feature columns: {len(feature_cols_standalone)}")
|
||||
|
||||
# Step 5: Compare column counts
|
||||
print("\n" + "=" * 70)
|
||||
print("COMPARISON SUMMARY")
|
||||
print("=" * 70)
|
||||
|
||||
print(f"\nGold standard total columns: {len(gold_columns)}")
|
||||
print(f" feature:: {len(feature_cols)}")
|
||||
print(f" feature_ext:: {len(feature_ext_cols)}")
|
||||
print(f" feature_flag:: {len(feature_flag_cols)}")
|
||||
print(f" indus_idx:: {len(indus_idx_cols)}")
|
||||
|
||||
print(f"\nStandalone feature columns: {len(feature_cols_standalone)}")
|
||||
|
||||
# The gold standard columns (without prefix) should match standalone
|
||||
gold_feature_cols = [c.split('::', 1)[1] for c in feature_cols]
|
||||
gold_feature_ext_cols = [c.split('::', 1)[1] for c in feature_ext_cols]
|
||||
gold_feature_flag_cols = [c.split('::', 1)[1] for c in feature_flag_cols]
|
||||
gold_indus_idx_cols = [c.split('::', 1)[1] for c in indus_idx_cols]
|
||||
|
||||
gold_all = gold_feature_cols + gold_feature_ext_cols + gold_feature_flag_cols + gold_indus_idx_cols
|
||||
|
||||
print(f"\nGold standard (flat): {len(gold_all)} features")
|
||||
print(f"Standalone: {len(feature_cols_standalone)} features")
|
||||
|
||||
if len(gold_all) != len(feature_cols_standalone):
|
||||
print(f"\nWARNING: Feature count mismatch! Difference: {len(gold_all) - len(feature_cols_standalone)}")
|
||||
|
||||
# Check column order
|
||||
print("\nFirst 20 column comparison:")
|
||||
print(f"{'Idx':<5} {'Gold Standard':<40} {'Standalone':<40} {'Match':<6}")
|
||||
print("-" * 90)
|
||||
for i in range(min(20, len(gold_all), len(feature_cols_standalone))):
|
||||
match = "✓" if gold_all[i] == feature_cols_standalone[i] else "✗"
|
||||
print(f"{i:<5} {gold_all[i]:<40} {feature_cols_standalone[i]:<40} {match:<6}")
|
||||
|
||||
# Check if orders match
|
||||
if gold_all == feature_cols_standalone:
|
||||
print("\n✓ Column order MATCHES!")
|
||||
else:
|
||||
print("\n✗ Column order DOES NOT MATCH!")
|
||||
print("\nFinding differences...")
|
||||
diff_count = 0
|
||||
for i in range(min(len(gold_all), len(feature_cols_standalone))):
|
||||
if gold_all[i] != feature_cols_standalone[i]:
|
||||
diff_count += 1
|
||||
if diff_count <= 20:
|
||||
print(f" [{i}] Gold: {gold_all[i]} vs Standalone: {feature_cols_standalone[i]}")
|
||||
print(f"Total differences: {diff_count}")
|
||||
|
||||
# Entry-point guard: run the gold-standard comparison only when this file is
# executed as a script (not when imported as a module).
if __name__ == "__main__":
    main()
|
||||
Loading…
Reference in new issue