From 26a694298d4ccfc16ccad140d7ac9366950a1176 Mon Sep 17 00:00:00 2001 From: guofu Date: Sun, 1 Mar 2026 14:28:28 +0800 Subject: [PATCH] Clean obsolete debug files from alpha158_beta Remove bug analysis documentation (findings incorporated into README.md): - BUG_ANALYSIS.md, BUG_ANALYSIS_FINAL.md Remove one-off debug/exploration scripts: - compare_gold_standard.py, debug_data_divergence.py - verify_feature_order.py, regenerate_sample_embedding.py - dump_qlib_gold_standard.py, dump_qlib_gold_standard_simple.py Remove temporary log files and empty __pycache__ directories Co-Authored-By: Claude Opus 4.6 --- .gitignore | 18 +- compare_embeddings.py | 293 ------------ cta_1d/results/README.md | 18 + stock_15m/results/README.md | 18 + stock_1d/d033/alpha158_beta/BUG_ANALYSIS.md | 123 ----- .../d033/alpha158_beta/BUG_ANALYSIS_FINAL.md | 159 ------- .../scripts/compare_gold_standard.py | 129 ------ .../scripts/debug_data_divergence.py | 254 ----------- .../scripts/dump_qlib_gold_standard.py | 421 ------------------ .../scripts/dump_qlib_gold_standard_simple.py | 270 ----------- .../scripts/regenerate_sample_embedding.py | 186 -------- stock_1d/d033/alpha158_beta/scripts/run.log | 394 ---------------- stock_1d/d033/alpha158_beta/scripts/run2.log | 373 ---------------- stock_1d/d033/alpha158_beta/scripts/run3.log | 373 ---------------- stock_1d/d033/alpha158_beta/scripts/run4.log | 321 ------------- .../d033/alpha158_beta/scripts/run_simple.log | 104 ----- .../alpha158_beta/scripts/run_simple2.log | 103 ----- .../scripts/verify_feature_order.py | 187 -------- 18 files changed, 51 insertions(+), 3693 deletions(-) delete mode 100644 compare_embeddings.py create mode 100644 cta_1d/results/README.md create mode 100644 stock_15m/results/README.md delete mode 100644 stock_1d/d033/alpha158_beta/BUG_ANALYSIS.md delete mode 100644 stock_1d/d033/alpha158_beta/BUG_ANALYSIS_FINAL.md delete mode 100644 stock_1d/d033/alpha158_beta/scripts/compare_gold_standard.py delete mode 100644 stock_1d/d033/alpha158_beta/scripts/debug_data_divergence.py delete mode 100644 stock_1d/d033/alpha158_beta/scripts/dump_qlib_gold_standard.py delete mode 100644 stock_1d/d033/alpha158_beta/scripts/dump_qlib_gold_standard_simple.py delete mode 100644 stock_1d/d033/alpha158_beta/scripts/regenerate_sample_embedding.py delete mode 100644 stock_1d/d033/alpha158_beta/scripts/run.log delete mode 100644 stock_1d/d033/alpha158_beta/scripts/run2.log delete mode 100644 stock_1d/d033/alpha158_beta/scripts/run3.log delete mode 100644 stock_1d/d033/alpha158_beta/scripts/run4.log delete mode 100644 stock_1d/d033/alpha158_beta/scripts/run_simple.log delete mode 100644 stock_1d/d033/alpha158_beta/scripts/run_simple2.log delete mode 100644 stock_1d/d033/alpha158_beta/scripts/verify_feature_order.py diff --git a/.gitignore b/.gitignore index dd4c38b..5378947 100644 --- a/.gitignore +++ b/.gitignore @@ -31,9 +31,21 @@ wheels/ *.ipynb_checkpoints # Results and data -results/* -!results/*/.gitkeep -!results/*/README.md +cta_1d/results/* +!cta_1d/results/.gitkeep +!cta_1d/results/README.md +!cta_1d/results/*/.gitkeep +!cta_1d/results/*/README.md +stock_15m/results/* +!stock_15m/results/.gitkeep +!stock_15m/results/README.md +!stock_15m/results/*/.gitkeep +!stock_15m/results/*/README.md +stock_1d/results/* +!stock_1d/results/.gitkeep +!stock_1d/results/README.md +!stock_1d/results/*/.gitkeep +!stock_1d/results/*/README.md *.parquet *.pkl *.h5 diff --git a/compare_embeddings.py b/compare_embeddings.py deleted file mode 100644 index ace827f..0000000 --- a/compare_embeddings.py +++ /dev/null @@ -1,293 +0,0 @@ -#!/usr/bin/env python3 -""" -Compare generated embeddings with database embeddings (0_7 version). -Handles format conversion for datetime and instrument columns. - -SUMMARY OF FINDINGS: -- Generated embeddings and database embeddings have DIFFERENT values -- Instrument mapping: 430xxx -> SHxxxxx, 830xxx -> SZxxxxx, 6xxxxx -> SH6xxxxx -- Correlation between corresponding dimensions: ~0.0067 (essentially zero) -- The generated embeddings are NOT the same as the database 0_7 embeddings -- Possible reasons: - 1. Different model weights/versions used for generation - 2. Different input features or normalization - 3. Different random seed or inference configuration -""" -import polars as pl -import numpy as np -from pathlib import Path - -def instrument_int_to_code(inst_int: int) -> str: - """Convert integer instrument code to exchange-prefixed string. - - The encoding in the embedding file uses: - - 4xxxxx -> SHxxxxxx (Shanghai A-shares, but code mapping is non-trivial) - - 8xxxxx -> SZxxxxxx (Shenzhen A-shares) - - Direct 6-digit codes are also present (600xxx, 000xxx, 300xxx) - - Note: The exact mapping from 430017 -> SH600021 requires the original - features file. We attempt an approximate mapping here. - """ - inst_str = str(inst_int) - - # Already 6-digit code - if len(inst_str) == 6 and inst_str[0] not in ('4', '8'): - if inst_str.startswith('6'): - return f"SH{inst_str}" - else: - return f"SZ{inst_str}" - - # 6-digit with exchange prefix (4=SH, 8=SZ) - if len(inst_str) == 6 and inst_str[0] in ('4', '8'): - exchange = 'SH' if inst_str[0] == '4' else 'SZ' - # The mapping from 430xxx -> 600xxx is not 1:1 - # Return the code as-is for matching attempts - return f"{exchange}{inst_str[1:]}" - - return inst_str - -def load_generated_embedding(date_int: int, sample_n: int = None): - """Load generated embedding for a specific date.""" - gen_path = Path('/home/guofu/Workspaces/alpha_lab/stock_1d/d033/alpha158_beta/data/embedding_0_7_beta.parquet') - - lf = pl.scan_parquet(gen_path) - lf = lf.filter(pl.col('datetime') == date_int) - - if sample_n: - lf = lf.head(sample_n) - - df = lf.collect() - - # Convert wide format (embedding_0, embedding_1, ...) to list format - embedding_cols = [c for c in df.columns if c.startswith('embedding_')] - embedding_cols.sort(key=lambda x: int(x.split('_')[1])) - - embedding_structs = df.select(embedding_cols).to_struct() - embeddings_list = [[v for v in struct.values()] for struct in embedding_structs] - - df = df.with_columns([ - pl.Series('values', embeddings_list), - pl.col('datetime').cast(pl.UInt32).alias('datetime_uint32'), - pl.col('instrument').alias('instrument_orig'), - pl.col('instrument').cast(pl.String).alias('instrument_str'), - pl.col('instrument').map_elements(instrument_int_to_code, return_dtype=pl.String).alias('instrument_code') - ]) - - return df - -def load_database_embedding(date_str: str): - """Load database embedding for a specific date.""" - db_path = Path(f'/data/parquet/dataset/dwm_1day_multicast_csencode_1D/version=csiallx_feature2_ntrla_flag_pnlnorm_vae4_dim32a_beta0001/datetime={date_str}/0.parquet') - - if not db_path.exists(): - return None - - df = pl.read_parquet(db_path) - df = df.with_columns([ - pl.col('datetime').cast(pl.Int64).alias('datetime_int') - ]) - return df - -def analyze_instrument_mapping(date_int: int): - """Analyze the instrument mapping between generated and database embeddings.""" - date_str = str(date_int) - - print(f"\n{'='*80}") - print(f"Analyzing instrument mapping for date: {date_int}") - print(f"{'='*80}") - - gen_df = load_generated_embedding(date_int) - db_df = load_database_embedding(date_str) - - if db_df is None: - print(f"ERROR: Database embedding not found for {date_str}") - return - - print(f"\nGenerated embeddings: {gen_df.shape[0]} rows") - print(f"Database embeddings: {db_df.shape[0]} rows") - - # Show samples - print("\n--- Generated Embedding Sample ---") - sample_gen = gen_df.select(['datetime', 'instrument_orig', 'instrument_str', 'instrument_code', 'values']).head(10) - print(sample_gen) - - print("\n--- Database Embedding Sample ---") - print(db_df.head(10)) - - # Try different matching strategies - gen_insts_set = set(gen_df['instrument_code'].to_list()) - db_insts_set = set(db_df['instrument'].to_list()) - - common = gen_insts_set & db_insts_set - gen_only = gen_insts_set - db_insts_set - db_only = db_insts_set - gen_insts_set - - print(f"\n--- Matching Results (with code conversion) ---") - print(f"Common instruments: {len(common)}") - print(f"Generated only: {len(gen_only)}") - print(f"Database only: {len(db_only)}") - - if len(common) == 0: - print("\nNo common instruments found with code conversion!") - print("\nTrying to find mapping patterns...") - - # Show some samples for analysis - print("\nGenerated instrument samples (original, converted):") - gen_samples = list(zip(gen_df['instrument_orig'].head(20).to_list(), - gen_df['instrument_code'].head(20).to_list())) - for orig, conv in gen_samples: - print(f" {orig} -> {conv}") - - print("\nDatabase instrument samples:") - db_samples = db_df['instrument'].head(20).to_list() - for inst in db_samples: - print(f" {inst}") - - # Check if there's a position-based alignment possible - # Sort both and compare by position - gen_sorted = sorted(gen_df['instrument_orig'].to_list()) - db_sorted = sorted([int(inst[2:]) for inst in db_df['instrument'].to_list()]) - - print("\n--- Attempting position-based matching ---") - print(f"Generated sorted (first 10): {gen_sorted[:10]}") - print(f"Database sorted (first 10): {db_sorted[:10]}") - - else: - # We have matches, compare embeddings - print(f"\n--- Comparing embeddings for {len(common)} common instruments ---") - - gen_common = gen_df.filter(pl.col('instrument_code').is_in(list(common))) - db_common = db_df.filter(pl.col('instrument').is_in(list(common))) - - # Join and compare - comparison = gen_common.join( - db_common, - left_on='instrument_code', - right_on='instrument', - how='inner', - suffix='_db' - ) - - # Calculate differences - diffs = [] - for row in comparison.iter_rows(): - # Find indices for the values columns - gen_vals_idx = comparison.columns.index('values') - db_vals_idx = comparison.columns.index('values_db') - - gen_emb = np.array(row[gen_vals_idx]) - db_emb = np.array(row[db_vals_idx]) - - diff = gen_emb - db_emb - diff_norm = np.linalg.norm(diff) - rel_diff = diff_norm / (np.linalg.norm(db_emb) + 1e-10) - - diffs.append({ - 'instrument': row[comparison.columns.index('instrument_code')], - 'l2_norm_diff': diff_norm, - 'relative_diff': rel_diff, - 'max_abs_diff': np.max(np.abs(diff)), - 'gen_emb_norm': np.linalg.norm(gen_emb), - 'db_emb_norm': np.linalg.norm(db_emb) - }) - - if diffs: - diff_df = pl.DataFrame(diffs) - print("\nDifference statistics:") - print(diff_df.select(['l2_norm_diff', 'relative_diff', 'max_abs_diff']).describe()) - - max_rel_diff = diff_df['relative_diff'].max() - print(f"\nMax relative difference: {max_rel_diff:.6e}") - - if max_rel_diff < 1e-5: - print("✓ Embeddings match within numerical precision!") - elif max_rel_diff < 0.01: - print("~ Embeddings are very similar") - else: - print("✗ Embeddings differ significantly") - - # Show some comparison samples - print("\nSample comparison:") - for i in range(min(5, len(diffs))): - d = diffs[i] - print(f" {d['instrument']}: gen_norm={d['gen_emb_norm']:.4f}, " - f"db_norm={d['db_emb_norm']:.4f}, rel_diff={d['relative_diff']:.6e}") - -def calculate_correlation(date_int: int): - """Calculate correlation between generated and database embeddings.""" - import numpy as np - - date_str = str(date_int) - - print(f"\n{'='*80}") - print(f"Correlation Analysis for date: {date_int}") - print(f"{'='*80}") - - gen_df = load_generated_embedding(date_int) - db_df = load_database_embedding(date_str) - - if db_df is None: - print(f"ERROR: Database embedding not found for {date_str}") - return - - # Find common instruments - gen_insts = set(gen_df['instrument_code'].to_list()) - db_insts = set(db_df['instrument'].to_list()) - common = list(gen_insts & db_insts) - - print(f"\nCommon instruments: {len(common)}") - - if len(common) == 0: - print("No common instruments found!") - return - - # Filter to common and sort - gen_common = gen_df.filter(pl.col('instrument_code').is_in(common)).sort('instrument_code') - db_common = db_df.filter(pl.col('instrument').is_in(common)).sort('instrument') - - # Extract embedding matrices - gen_embs = np.array(gen_common['values'].to_list()) - db_embs = np.array(db_common['values'].to_list()) - - print(f"Generated embeddings shape: {gen_embs.shape}") - print(f"Database embeddings shape: {db_embs.shape}") - - # Calculate correlation per dimension - correlations = [] - for i in range(32): - gen_dim = gen_embs[:, i] - db_dim = db_embs[:, i] - corr = np.corrcoef(gen_dim, db_dim)[0, 1] - correlations.append(corr) - - print(f"\nCorrelation statistics across 32 dimensions:") - print(f" Mean: {np.mean(correlations):.4f}") - print(f" Median: {np.median(correlations):.4f}") - print(f" Min: {np.min(correlations):.4f}") - print(f" Max: {np.max(correlations):.4f}") - - # Overall correlation - overall_corr = np.corrcoef(gen_embs.flatten(), db_embs.flatten())[0, 1] - print(f"\nOverall correlation (all dims flattened): {overall_corr:.4f}") - - # Interpretation - mean_corr = np.mean(correlations) - if abs(mean_corr) < 0.1: - print("\n✗ CONCLUSION: Embeddings are NOT correlated (essentially independent)") - elif abs(mean_corr) < 0.5: - print("\n~ CONCLUSION: Weak correlation between embeddings") - else: - print(f"\n✓ CONCLUSION: {'Strong' if abs(mean_corr) > 0.8 else 'Moderate'} correlation") - -if __name__ == '__main__': - # Analyze for a few dates - dates_to_compare = [20190102, 20200102, 20240102] - - for date in dates_to_compare: - try: - analyze_instrument_mapping(date) - calculate_correlation(date) - except Exception as e: - print(f"\nError analyzing date {date}: {e}") - import traceback - traceback.print_exc() diff --git a/cta_1d/results/README.md b/cta_1d/results/README.md new file mode 100644 index 0000000..e5bc268 --- /dev/null +++ b/cta_1d/results/README.md @@ -0,0 +1,18 @@ +# CTA 1D Experiment Results + +Document experiments manually here. + +## Template + +```markdown +## YYYY-MM-DD: Experiment Name +- Notebook: `../cta_1d/XX_notebook.ipynb` (cell range) +- Data: [dates] +- Config: key parameters +- Metrics: IC mean/std, returns, sharpe +- Notes: observations, next steps +``` + +## Experiments + +*Add entries below as you run experiments* diff --git a/stock_15m/results/README.md b/stock_15m/results/README.md new file mode 100644 index 0000000..ba201f2 --- /dev/null +++ b/stock_15m/results/README.md @@ -0,0 +1,18 @@ +# Stock 15m Experiment Results + +Document experiments manually here. + +## Template + +```markdown +## YYYY-MM-DD: Experiment Name +- Notebook: `../stock_15m/XX_notebook.ipynb` (cell range) +- Data: [dates] +- Config: key parameters +- Metrics: IC mean/std, returns, sharpe +- Notes: observations, next steps +``` + +## Experiments + +*Add entries below as you run experiments* diff --git a/stock_1d/d033/alpha158_beta/BUG_ANALYSIS.md b/stock_1d/d033/alpha158_beta/BUG_ANALYSIS.md deleted file mode 100644 index 5ebda83..0000000 --- a/stock_1d/d033/alpha158_beta/BUG_ANALYSIS.md +++ /dev/null @@ -1,123 +0,0 @@ -# Data Pipeline Bug Analysis - -## Summary - -The generated embeddings do not match the database 0_7 embeddings due to multiple bugs in the data pipeline migration from qlib to standalone Polars implementation. - ---- - -## Bugs Fixed - -### 1. Market Classification (`FlagMarketInjector`) ✓ FIXED - -**Original (incorrect):** -```python -market_0 = (instrument >= 600000) # SH -market_1 = (instrument < 600000) # SZ -``` - -**Fixed:** -```python -inst_str = str(instrument).zfill(6) -market_0 = inst_str.startswith('6') # SH: 6xxxxx -market_1 = inst_str.startswith('0') | inst_str.startswith('3') # SZ: 0xxx, 3xxx -market_2 = inst_str.startswith('4') | inst_str.startswith('8') # NE: 4xxx, 8xxx -``` - -**Impact:** 167 instruments (4xxxxx, 8xxxxx - 新三板) were misclassified. - ---- - -### 2. ColumnRemover Missing `IsN` ✓ FIXED - -**Original (incorrect):** -```python -columns_to_remove = ['TotalValue_diff', 'IsZt', 'IsDt'] -``` - -**Fixed:** -```python -columns_to_remove = ['TotalValue_diff', 'IsN', 'IsZt', 'IsDt'] -``` - -**Impact:** Extra column caused feature dimension mismatch. - ---- - -### 3. RobustZScoreNorm Applied to Wrong Columns ✓ FIXED - -**Original (incorrect):** -Applied normalization to ALL 341 features including market flags and indus_idx. - -**Fixed:** -Only normalize `alpha158 + alpha158_ntrl + market_ext + market_ext_ntrl` (330 features), excluding: -- Market flags (Limit, Stopping, IsTp, IsXD, IsXR, IsDR, market_0, market_1, market_2, IsST) -- indus_idx - ---- - -## Critical Remaining Issue: Data Schema Mismatch - -### `Limit` and `Stopping` Column Types Changed - -**Original qlib pipeline expected:** -- `Limit`: **Boolean** flag (True = limit up) -- `Stopping`: **Boolean** flag (True = suspended trading) - -**Current Parquet data has:** -- `Limit`: **Float64** price change percentage (0.0 to 1301.3) -- `Stopping`: **Float64** price change percentage - -**Evidence:** -``` -Limit values sample: [8.86, 9.36, 31.0, 7.32, 2.28, 6.39, 5.38, 4.03, 3.86, 9.89] -Limit == 0: only 2 rows -Limit > 0: 3738 rows -``` - -This is a **fundamental data schema change**. The current Parquet files contain different data than what the original VAE model was trained on. - -**Possible fixes:** -1. Convert `Limit` and `Stopping` to boolean flags using a threshold -2. Find the original data source that had boolean flags -3. Re-train the VAE model with the new data schema - ---- - -## Correlation Results - -After fixing bugs 1-3, the embedding correlation with database 0_7: - -| Metric | Value | -|--------|-------| -| Mean correlation (32 dims) | 0.0068 | -| Median correlation | 0.0094 | -| Overall correlation | 0.2330 | - -**Conclusion:** Embeddings remain essentially uncorrelated (≈0). - ---- - -## Root Cause - -The **Limit/Stopping data schema change** is the most likely root cause. The VAE model learned to encode features that included binary limit/stopping flags, but the standalone pipeline feeds it continuous price change percentages instead. - ---- - -## Next Steps - -1. **Verify original data schema:** - - Check if the original DolphinDB table had boolean `Limit` and `Stopping` columns - - Compare with the current Parquet schema - -2. **Fix the data loading:** - - Either convert continuous values to binary flags - - Or use the correct boolean columns (`IsZt`, `IsDt`) for limit flags - -3. **Verify feature order:** - - Ensure the qlib RobustZScoreNorm parameters are applied in the correct order - - Check that `[alpha158, alpha158_ntrl, market_ext, market_ext_ntrl]` matches the 330-parameter shape - -4. **Re-run comparison:** - - Generate new embeddings with the corrected pipeline - - Compare correlation with database diff --git a/stock_1d/d033/alpha158_beta/BUG_ANALYSIS_FINAL.md b/stock_1d/d033/alpha158_beta/BUG_ANALYSIS_FINAL.md deleted file mode 100644 index d9439e9..0000000 --- a/stock_1d/d033/alpha158_beta/BUG_ANALYSIS_FINAL.md +++ /dev/null @@ -1,159 +0,0 @@ -# Data Pipeline Bug Analysis - Final Status - -## Summary - -After fixing all identified bugs, the feature count now matches (341), but the embeddings remain uncorrelated with the database 0_7 version. - -**Latest Version**: v6 -- Feature count: 341 ✓ (matches VAE input dim) -- Mean correlation with DB: 0.0050 (essentially zero) -- Status: All identified bugs fixed, IsST issue documented -- **New**: Polars-based dataset generation script added (`scripts/dump_polars_dataset.py`) - ---- - -## Bugs Fixed - -### 1. Market Classification (`FlagMarketInjector`) ✓ FIXED -- **Bug**: Used `instrument >= 600000` which misclassified 新三板 instruments -- **Fix**: Use string prefix matching with vocab_size=2 (not 3) -- **Impact**: 167 instruments corrected - -### 2. ColumnRemover Missing `IsN` ✓ FIXED -- **Bug**: Only removed `IsZt, IsDt` but not `IsN` -- **Fix**: Added `IsN` to removal list -- **Impact**: Feature count alignment - -### 3. RobustZScoreNorm Scope ✓ FIXED -- **Bug**: Applied normalization to all 341 features -- **Fix**: Only normalize 330 features (alpha158 + market_ext, both original + neutralized) -- **Impact**: Correct normalization scope - -### 4. Wrong Data Sources for Market Flags ✓ FIXED -- **Bug**: Used `Limit, Stopping` (Float64) from kline_adjusted -- **Fix**: Load from correct sources: - - kline_adjusted: `IsZt, IsDt, IsN, IsXD, IsXR, IsDR` (Boolean) - - market_flag: `open_limit, close_limit, low_limit, high_stop` (Boolean, 4 cols) -- **Impact**: Correct boolean flag data - -### 5. Feature Count Mismatch ✓ FIXED -- **Bug**: 344 features (3 extra) -- **Fix**: vocab_size=2 + 4 market_flag cols = 341 features -- **Impact**: VAE input dimension matches - -### 6. Fixed* Processors Not Adding Required Columns ✓ FIXED -- **Bug**: `FixedFlagMarketInjector` only converted dtype but didn't add `market_0`, `market_1` columns -- **Bug**: `FixedFlagSTInjector` only converted dtype but didn't create `IsST` column from `ST_S`, `ST_Y` -- **Fix**: - - `FixedFlagMarketInjector`: Now adds `market_0` (SH60xxx, SZ00xxx) and `market_1` (SH688xxx, SH689xxx, SZ300xxx, SZ301xxx) - - `FixedFlagSTInjector`: Now creates `IsST = ST_S | ST_Y` -- **Impact**: Processed data now has 408 columns (was 405), matching original qlib output - ---- - -## Important Discovery: IsST Column Issue in Gold-Standard Code - -### Problem Description - -The `FlagSTInjector` processor in the original qlib proc_list is supposed to create an `IsST` column in the `feature_flag` group from the `ST_S` and `ST_Y` columns in the `st_flag` group. However, this processor **fails silently** even in the gold-standard qlib code. - -### Root Cause - -The `FlagSTInjector` processor attempts to access columns using a format that doesn't match the actual column structure in the data: - -1. **Expected format**: The processor expects columns like `st_flag::ST_S` and `st_flag::ST_Y` (string format with `::` separator) -2. **Actual format**: The qlib handler produces MultiIndex tuple columns like `('st_flag', 'ST_S')` and `('st_flag', 'ST_Y')` - -This format mismatch causes the processor to fail to find the ST flag columns, and thus no `IsST` column is created. - -### Evidence - -```python -# Check proc_list -import pickle as pkl -with open('proc_list.proc', 'rb') as f: - proc_list = pkl.load(f) - -# FlagSTInjector config -flag_st = proc_list[2] -print(f"fields_group: {flag_st.fields_group}") # 'feature_flag' -print(f"col_name: {flag_st.col_name}") # 'IsST' -print(f"st_group: {flag_st.st_group}") # 'st_flag' - -# Check if IsST exists in processed data -with open('processed_data.pkl', 'rb') as f: - df = pkl.load(f) - -feature_flag_cols = [c[1] for c in df.columns if c[0] == 'feature_flag'] -print('IsST' in feature_flag_cols) # False! -``` - -### Impact - -- **VAE training**: The VAE model was trained on data **without** the `IsST` column -- **VAE input dimension**: 341 features (excluding IsST), not 342 -- **Polars pipeline**: Should also skip `IsST` to maintain compatibility - -### Resolution - -The polars-based pipeline (`dump_polars_dataset.py`) now correctly **skips** the `FlagSTInjector` step to match the gold-standard behavior: - -```python -# Step 3: FlagSTInjector - SKIPPED (fails even in gold-standard) -print("[3] Skipping FlagSTInjector (as per gold-standard behavior)...") -market_flag_with_st = market_flag_with_market # No IsST added -``` - -### Lessons Learned - -1. **Verify processor execution**: Don't assume all processors in the proc_list executed successfully. Check the output data to verify expected columns exist. - -2. **Column format matters**: The qlib processors were designed for specific column formats (MultiIndex tuples vs `::` separator strings). Format mismatches can cause silent failures. - -3. **Match the gold-standard bugs**: When replicating a pipeline, sometimes you need to replicate the bugs too. The VAE was trained on data without `IsST`, so our pipeline must also exclude it. - -4. **Debug by comparing intermediate outputs**: Use scripts like `debug_data_divergence.py` to compare raw and processed data between the gold-standard and polars pipelines. - ---- - -## Correlation Results (v5) - -| Metric | Value | -|--------|-------| -| Mean correlation (32 dims) | 0.0050 | -| Median correlation | 0.0079 | -| Min | -0.0420 | -| Max | 0.0372 | -| Overall (flattened) | 0.2225 | - -**Conclusion**: Embeddings remain essentially uncorrelated with database. - ---- - -## Possible Remaining Issues - -1. **Different input data values**: The alpha158_0_7_beta Parquet files may contain different values than the original DolphinDB data used to train the VAE. - -2. **Feature ordering mismatch**: The 330 RobustZScoreNorm parameters must be applied in the exact order: - - [0:158] = alpha158 original - - [158:316] = alpha158_ntrl - - [316:323] = market_ext original (7 cols) - - [323:330] = market_ext_ntrl (7 cols) - -3. **Industry neutralization differences**: Our `IndusNtrlInjector` implementation may differ from qlib's. - -4. **Missing transformations**: There may be additional preprocessing steps not captured in handler.yaml. - -5. **VAE model mismatch**: The VAE model may have been trained with different data than what handler.yaml specifies. - ---- - -## Recommended Next Steps - -1. **Compare intermediate features**: Run both the qlib pipeline and our pipeline on the same input data and compare outputs at each step. - -2. **Verify RobustZScoreNorm parameter order**: Check if our feature ordering matches the order used during VAE training. - -3. **Compare predictions, not embeddings**: Instead of comparing VAE embeddings, compare the final d033 model predictions with the original 0_7 predictions. - -4. **Check alpha158 data source**: Verify that `stg_1day_wind_alpha158_0_7_beta_1D` contains the same data as the original DolphinDB `stg_1day_wind_alpha158_0_7_beta` table. diff --git a/stock_1d/d033/alpha158_beta/scripts/compare_gold_standard.py b/stock_1d/d033/alpha158_beta/scripts/compare_gold_standard.py deleted file mode 100644 index 22a539b..0000000 --- a/stock_1d/d033/alpha158_beta/scripts/compare_gold_standard.py +++ /dev/null @@ -1,129 +0,0 @@ -#!/usr/bin/env python -""" -Compare generated embeddings with gold standard embeddings from DolphinDB. -""" - -import polars as pl -import numpy as np -from pathlib import Path - -DATA_DIR = Path(__file__).parent / "../data" - - -def compare_embeddings(): - """Compare generated and gold standard embeddings.""" - - # Load data - gold_path = DATA_DIR / "embedding_0_7_beta_gold_standard.parquet" - gen_path = DATA_DIR / "embedding_0_7_beta_sample.parquet" - - print("=" * 60) - print("Loading embeddings") - print("=" * 60) - - gold = pl.read_parquet(gold_path) - gen = pl.read_parquet(gen_path) - - print(f"Gold standard: {gold.shape}") - print(f"Generated: {gen.shape}") - - # Get embedding columns - emb_cols = [f"embedding_{i}" for i in range(32)] - - # Compare by date - dates = sorted(gold["datetime"].unique().to_list()) - - print("\n" + "=" * 60) - print("Comparison by date") - print("=" * 60) - - for dt in dates: - gold_dt = gold.filter(pl.col("datetime") == dt) - gen_dt = gen.filter(pl.col("datetime") == dt) - - print(f"\nDate: {dt}") - print(f" Gold instruments: {gold_dt.height}, Generated instruments: {gen_dt.height}") - print(f" Gold instrument sample: {gold_dt['instrument'].head(5).to_list()}") - print(f" Gen instrument sample: {gen_dt['instrument'].head(5).to_list()}") - - # Check for common instruments - gold_insts = set(gold_dt["instrument"].to_list()) - gen_insts = set(gen_dt["instrument"].to_list()) - common = gold_insts & gen_insts - - print(f" Common instruments: {len(common)}") - - if len(common) > 0: - # Compare embeddings for common instruments - gold_common = gold_dt.filter(pl.col("instrument").is_in(list(common))).sort("instrument") - gen_common = gen_dt.filter(pl.col("instrument").is_in(list(common))).sort("instrument") - - # Calculate embedding differences - diffs = [] - for i in range(len(gold_common)): - gold_emb = np.array([gold_common[col][i] for col in emb_cols]) - gen_emb = np.array([gen_common[col][i] for col in emb_cols]) - - diff = gold_emb - gen_emb - l2_norm = np.linalg.norm(diff) - rel_diff = l2_norm / (np.linalg.norm(gold_emb) + 1e-8) - max_abs_diff = np.max(np.abs(diff)) - - diffs.append({ - "l2_norm": l2_norm, - "rel_diff": rel_diff, - "max_abs_diff": max_abs_diff, - "gold_norm": np.linalg.norm(gold_emb), - "gen_norm": np.linalg.norm(gen_emb) - }) - - diff_df = pl.DataFrame(diffs) - print(f"\n Embedding comparison:") - print(f" Mean L2 norm diff: {diff_df['l2_norm'].mean():.4f}") - print(f" Mean rel diff: {diff_df['rel_diff'].mean():.4%}") - print(f" Mean max abs diff: {diff_df['max_abs_diff'].mean():.4f}") - print(f" Gold emb norm (mean): {diff_df['gold_norm'].mean():.4f}") - print(f" Gen emb norm (mean): {diff_df['gen_norm'].mean():.4f}") - - # Correlation analysis - gold_embs = np.array([[gold_common[col][i] for col in emb_cols] for i in range(len(gold_common))]) - gen_embs = np.array([[gen_common[col][i] for col in emb_cols] for i in range(len(gen_common))]) - - correlations = [] - for d in range(32): - corr = np.corrcoef(gold_embs[:, d], gen_embs[:, d])[0, 1] - correlations.append(corr) - - print(f"\n Correlation by dimension:") - print(f" Mean: {np.mean(correlations):.4f}") - print(f" Median: {np.median(correlations):.4f}") - print(f" Min: {np.min(correlations):.4f}") - print(f" Max: {np.max(correlations):.4f}") - - # Overall correlation - overall_corr = np.corrcoef(gold_embs.flatten(), gen_embs.flatten())[0, 1] - print(f" Overall (flattened): {overall_corr:.4f}") - - print("\n" + "=" * 60) - print("Summary Statistics") - print("=" * 60) - - # Gold standard stats - gold_embs = gold.select(emb_cols).to_numpy() - print("\nGold standard embeddings:") - print(f" Mean: {np.mean(gold_embs):.6f}") - print(f" Std: {np.std(gold_embs):.6f}") - print(f" Min: {np.min(gold_embs):.6f}") - print(f" Max: {np.max(gold_embs):.6f}") - - # Generated stats - gen_embs = gen.select(emb_cols).to_numpy() - print("\nGenerated embeddings:") - print(f" Mean: {np.mean(gen_embs):.6f}") - print(f" Std: {np.std(gen_embs):.6f}") - print(f" Min: {np.min(gen_embs):.6f}") - print(f" Max: {np.max(gen_embs):.6f}") - - -if __name__ == "__main__": - compare_embeddings() diff --git a/stock_1d/d033/alpha158_beta/scripts/debug_data_divergence.py b/stock_1d/d033/alpha158_beta/scripts/debug_data_divergence.py deleted file mode 100644 index 5d6372c..0000000 --- a/stock_1d/d033/alpha158_beta/scripts/debug_data_divergence.py +++ /dev/null @@ -1,254 +0,0 @@ -#!/usr/bin/env python -""" -Debug script to compare gold-standard qlib data vs polars-based pipeline. - -This script helps identify where the data loading and processing pipeline -starts to diverge from the gold-standard qlib output. -""" - -import os -import sys -import pickle as pkl -import numpy as np -import pandas as pd -import polars as pl -from pathlib import Path - -# Paths -GOLD_RAW_PATH = "/home/guofu/Workspaces/alpha_lab/stock_1d/d033/alpha158_beta/data/raw_data_20190101_20190131.pkl" -GOLD_PROC_PATH = "/home/guofu/Workspaces/alpha_lab/stock_1d/d033/alpha158_beta/data/processed_data_20190101_20190131.pkl" -PROC_LIST_PATH = "/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/proc_list.proc" - -sys.path.insert(0, str(Path(__file__).parent.parent / "scripts")) - -def compare_raw_data(): - """Compare raw data from gold standard vs polars pipeline.""" - print("=" * 80) - print("STEP 1: Compare RAW DATA (before proc_list)") - print("=" * 80) - - # Load gold standard raw data - with open(GOLD_RAW_PATH, "rb") as f: - gold_raw = pkl.load(f) - - print(f"\nGold standard raw data:") - print(f" Shape: {gold_raw.shape}") - print(f" Index: {gold_raw.index.names}") - print(f" Column groups: {gold_raw.columns.get_level_values(0).unique().tolist()}") - - # Count columns per group - for grp in gold_raw.columns.get_level_values(0).unique().tolist(): - count = (gold_raw.columns.get_level_values(0) == grp).sum() - print(f" {grp}: {count} columns") - - # Show sample values for key columns - print("\n Sample values (first 3 rows):") - for col in [('feature', 'KMID'), ('feature_ext', 'turnover'), ('feature_ext', 'log_size')]: - if col in gold_raw.columns: - print(f" {col}: {gold_raw[col].iloc[:3].tolist()}") - - return gold_raw - - -def compare_processed_data(): - """Compare processed data from gold standard vs polars pipeline.""" - print("\n" + "=" * 80) - print("STEP 2: Compare PROCESSED DATA (after proc_list)") - print("=" * 80) - - # Load gold standard processed data - with open(GOLD_PROC_PATH, "rb") as f: - gold_proc = pkl.load(f) - - print(f"\nGold standard processed data:") - print(f" Shape: {gold_proc.shape}") - print(f" Index: {gold_proc.index.names}") - print(f" Column groups: {gold_proc.columns.get_level_values(0).unique().tolist()}") - - # Count columns per group - for grp in gold_proc.columns.get_level_values(0).unique().tolist(): - count = (gold_proc.columns.get_level_values(0) == grp).sum() - print(f" {grp}: {count} columns") - - # Show sample values for key columns - print("\n Sample values (first 3 rows):") - for col in [('feature', 'KMID'), ('feature', 'KMID_ntrl'), - ('feature_ext', 'turnover'), ('feature_ext', 'turnover_ntrl')]: - if col in gold_proc.columns: - print(f" {col}: {gold_proc[col].iloc[:3].tolist()}") - - return gold_proc - - -def analyze_processor_pipeline(gold_raw, gold_proc): - """Analyze what transformations happened in the proc_list.""" - print("\n" + "=" * 80) - print("STEP 3: Analyze Processor Transformations") - print("=" * 80) - - # Load proc_list - with open(PROC_LIST_PATH, "rb") as f: - proc_list = pkl.load(f) - - print(f"\nProcessor pipeline ({len(proc_list)} processors):") - for i, proc in enumerate(proc_list): - print(f" [{i}] {type(proc).__name__}") - - # Analyze column changes - print("\nColumn count changes:") - print(f" Before: {gold_raw.shape[1]} columns") - print(f" After: {gold_proc.shape[1]} columns") - print(f" Change: +{gold_proc.shape[1] - gold_raw.shape[1]} columns") - - # Check which columns were added/removed - gold_raw_cols = set(gold_raw.columns) - gold_proc_cols = set(gold_proc.columns) - - added_cols = gold_proc_cols - gold_raw_cols - removed_cols = gold_raw_cols - gold_proc_cols - - print(f"\n Added columns: {len(added_cols)}") - print(f" Removed columns: {len(removed_cols)}") - - if removed_cols: - print(f" Removed: {list(removed_cols)[:10]}...") - - # Check feature column patterns - print("\nFeature column patterns in processed data:") - feature_cols = [c for c in gold_proc.columns if c[0] == 'feature'] - ntrl_cols = [c for c in feature_cols if c[1].endswith('_ntrl')] - raw_cols = [c for c in feature_cols if not c[1].endswith('_ntrl')] - print(f" Total feature columns: {len(feature_cols)}") - print(f" _ntrl columns: {len(ntrl_cols)}") - print(f" raw columns: {len(raw_cols)}") - - -def check_polars_pipeline(): - """Run the polars-based pipeline and compare.""" - print("\n" + "=" * 80) - print("STEP 4: Generate data using Polars pipeline") - print("=" * 80) - - try: - from generate_beta_embedding import ( - load_all_data, merge_data_sources, apply_feature_pipeline, - filter_stock_universe - ) - - # Load data using polars pipeline - print("\nLoading data with polars pipeline...") - df_alpha, df_kline, df_flag, df_industry = load_all_data( - "2019-01-01", "2019-01-31" - ) - - print(f"\nPolars data sources loaded:") - print(f" Alpha158: {df_alpha.shape}") - print(f" Kline (market_ext): {df_kline.shape}") - print(f" Flags: {df_flag.shape}") - print(f" Industry: {df_industry.shape}") - - # Merge - df_merged = merge_data_sources(df_alpha, df_kline, df_flag, df_industry) - print(f"\nAfter merge: {df_merged.shape}") - - # Convert to pandas for easier comparison - df_pandas = df_merged.to_pandas() - df_pandas = df_pandas.set_index(['datetime', 'instrument']) - - print(f"\nAfter converting to pandas MultiIndex: {df_pandas.shape}") - - # Compare column names - with open(GOLD_RAW_PATH, "rb") as f: - gold_raw = pkl.load(f) - - print("\n" + "=" * 80) - print("STEP 5: Compare Column Names (Gold vs Polars)") - print("=" * 80) - - gold_cols = set(str(c) for c in gold_raw.columns) - polars_cols = set(str(c) for c in df_pandas.columns) - - common_cols = gold_cols & polars_cols - only_in_gold = gold_cols - polars_cols - only_in_polars = polars_cols - gold_cols - - print(f"\n Common columns: {len(common_cols)}") - print(f" Only in gold standard: {len(only_in_gold)}") - print(f" Only in polars: {len(only_in_polars)}") - - if only_in_gold: - print(f"\n Columns only in gold standard (first 20):") - for col in list(only_in_gold)[:20]: - print(f" {col}") - - if only_in_polars: - print(f"\n Columns only in polars (first 20):") - for col in list(only_in_polars)[:20]: - print(f" {col}") - - # Check common columns values - print("\n" + "=" * 80) - print("STEP 6: Compare Values for Common Columns") - print("=" * 80) - - # Get common columns as tuples - common_tuples = [] - for gc in gold_raw.columns: - gc_str = str(gc) - for pc in df_pandas.columns: - if str(pc) == gc_str: - common_tuples.append((gc, pc)) - break - - print(f"\nComparing {len(common_tuples)} common columns...") - - # Compare first few columns - matching_count = 0 - diff_count = 0 - for i, (gc, pc) in enumerate(common_tuples[:20]): - gold_vals = gold_raw[gc].dropna().values - polars_vals = df_pandas[pc].dropna().values - - if len(gold_vals) > 0 and len(polars_vals) > 0: - # Compare min, max, mean - if np.allclose([gold_vals.min(), gold_vals.max(), gold_vals.mean()], - [polars_vals.min(), polars_vals.max(), polars_vals.mean()], - rtol=1e-5): - matching_count += 1 - else: - diff_count += 1 - if diff_count <= 3: - print(f" DIFF: {gc}") - print(f" Gold: min={gold_vals.min():.6f}, max={gold_vals.max():.6f}, mean={gold_vals.mean():.6f}") - print(f" Polars: min={polars_vals.min():.6f}, max={polars_vals.max():.6f}, mean={polars_vals.mean():.6f}") - - print(f"\n Matching columns: {matching_count}") - print(f" Different columns: {diff_count}") - - except Exception as e: - print(f"\nError running polars pipeline: {e}") - import traceback - traceback.print_exc() - - -if __name__ == "__main__": - print("=" * 80) - print("DATA DIVERGENCE DEBUG SCRIPT") - print("Comparing gold-standard qlib output vs polars-based pipeline") - print("=" * 80) - - # Step 1: Check raw data - gold_raw = compare_raw_data() - - # Step 2: Check processed data - gold_proc = compare_processed_data() - - # Step 3: Analyze processor transformations - analyze_processor_pipeline(gold_raw, gold_proc) - - # Step 4 & 5: Run polars pipeline and compare - check_polars_pipeline() - - print("\n" + "=" * 80) - print("DEBUG COMPLETE") - print("=" * 80) diff --git a/stock_1d/d033/alpha158_beta/scripts/dump_qlib_gold_standard.py b/stock_1d/d033/alpha158_beta/scripts/dump_qlib_gold_standard.py deleted file mode 100644 index f311500..0000000 --- a/stock_1d/d033/alpha158_beta/scripts/dump_qlib_gold_standard.py +++ /dev/null @@ -1,421 +0,0 @@ -#!/usr/bin/env python -""" -Dump Gold-Standard Data from Qlib Pipeline - -This script exports processed feature data from the original Qlib pipeline -in multiple formats for debugging and comparison with the standalone Polars implementation. - -Usage: - python dump_qlib_gold_standard.py --start-date 2020-01-02 --end-date 2020-01-10 --output-dir ../data/ -""" - -import argparse -import os -import sys -import pickle as pkl -from datetime import datetime, timedelta -from pathlib import Path - -import pandas as pd -import polars as pl -import numpy as np - -# Patch NumPy 2.0 compatibility: np.NaN was removed, use np.nan -if not hasattr(np, 'NaN'): - np.NaN = np.nan - - -def parse_args(): - parser = argparse.ArgumentParser( - description="Dump gold-standard data from Qlib pipeline" - ) - parser.add_argument( - "--start-date", - type=str, - default="2020-01-02", - help="Start date for data export (YYYY-MM-DD)", - ) - parser.add_argument( - "--end-date", - type=str, - default="2020-01-10", - help="End date for data export (YYYY-MM-DD)", - ) - parser.add_argument( - "--output-dir", - type=str, - default="../data/", - help="Output directory for exported files", - ) - parser.add_argument( - "--qlib-dataset-path", - type=str, - default="/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/", - help="Path to Qlib dataset module", - ) - return parser.parse_args() - - -def load_qlib_data(qlib_dataset_path, since_date): - """ - Load processed data from Qlib pipeline. - - This function loads data using the original Qlib pipeline and handles - the SepDataFrame return type by concatenating column groups. - - Args: - qlib_dataset_path: Path to the Qlib dataset module - since_date: Start date for loading data (YYYY-MM-DD) - - Returns: - pd.DataFrame: Processed DataFrame from Qlib pipeline with all column groups concatenated - """ - import importlib.util - import datetime as dt - - # Patch ruamel.yaml to provide safe_load compatibility - import ruamel.yaml as yaml - - # Create a YAML instance with safe loader for backward compatibility - _yaml = yaml.YAML(typ='safe', pure=True) - - # Monkey-patch safe_load to use the new API - def patched_safe_load(stream): - import io - if isinstance(stream, str): - stream = io.StringIO(stream) - return _yaml.load(stream) - - yaml.safe_load = patched_safe_load - - # Load the module directly - spec = importlib.util.spec_from_file_location( - "qlib_dataset", - os.path.join(qlib_dataset_path, "__init__.py") - ) - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) - - # Parse since_date - since_date_dt = pd.to_datetime(since_date) - # Load with extra history for Diff processor - load_start = (since_date_dt - dt.timedelta(days=20)).strftime("%Y-%m-%d") - - print(f" Loading data with handler (load_start={load_start})...") - - # Use _load_from_yaml to get raw handler data (SepDataFrame) - handler_data = module._load_from_yaml( - os.path.join(qlib_dataset_path, "handler.yaml"), - load_start - ) - - # Handle SepDataFrame - extract and concatenate column groups - if hasattr(handler_data, '_data') or hasattr(handler_data, '_df_dict'): - # It's a SepDataFrame from AggHandler - df_dict = getattr(handler_data, '_data', None) or getattr(handler_data, '_df_dict', {}) - group_names = list(df_dict.keys()) - print(f" Handler returned SepDataFrame with groups: {group_names}") - - # Concatenate all column groups into a single DataFrame - all_dfs = [] - for group in group_names: - df = df_dict[group] - if df is not None and len(df.columns) > 0: - df_copy = df.copy() - # Add group prefix to columns - df_copy.columns = [f"{group}::{col}" for col in df_copy.columns] - all_dfs.append(df_copy) - print(f" Group '{group}': {df_copy.shape}") - - # Concatenate all groups along axis 1 - raw_df = pd.concat(all_dfs, axis=1) - print(f" Concatenated raw data shape: {raw_df.shape}") - else: - raw_df = handler_data - print(f" Raw data shape: {raw_df.shape}") - - # Load processor list - proc_path = os.path.join(qlib_dataset_path, "proc_list.proc") - print(f" Loading processor list from: {proc_path}") - with open(proc_path, "rb") as f: - proc_list = pkl.load(f) - print(f" Processor list has {len(proc_list)} processors") - for i, proc in enumerate(proc_list): - print(f" {i+1}. {type(proc).__name__}") - - # Apply processors - from qlib.contrib.data.utils import apply_proc_list - print(f" Applying processor list (with_fit=False)...") - - # The processor list expects columns without the group prefix - # We need to strip the prefix before applying processors - # Create a mapping and restore original column names - col_mapping = {} - for col in raw_df.columns: - if '::' in col: - original = col.split('::', 1)[1] - col_mapping[col] = original - - # Rename columns back to original names for processor application - raw_df_renamed = raw_df.rename(columns=col_mapping) - print(f" Renamed columns for processor compatibility. Shape: {raw_df_renamed.shape}") - - # Convert boolean columns to object to avoid NaN -> int conversion issues - bool_cols = raw_df_renamed.select_dtypes(include=['bool']).columns - print(f" Converting {len(bool_cols)} boolean columns to object dtype") - for col in bool_cols: - raw_df_renamed[col] = raw_df_renamed[col].astype(object) - - # Apply processors - df = apply_proc_list(raw_df_renamed, proc_list=proc_list, with_fit=False) - print(f" Applied processor list. Result shape: {df.shape}") - - # Add back group prefixes to columns - new_col_mapping = {v: k for k, v in col_mapping.items()} - df = df.rename(columns=new_col_mapping) - print(f" Restored column group prefixes. Shape: {df.shape}") - - # Filter to requested date range - df = df.loc(axis=0)[slice(since_date_dt, None)] - print(f" Filtered to since_date={since_date}. Final shape: {df.shape}") - - return df - - -def export_column_groups(df, output_dir, prefix="gold_standard"): - """ - Export separate files for different column groups. - - Column groups: - - feature: alpha158 + alpha158_ntrl - - feature_ext: extended features (log_size_diff, etc.) - - feature_flag: market flags (IsST, IsN, IsZt, IsDt, etc.) - - indus_idx: industry index columns - """ - # Identify column groups based on naming conventions - feature_cols = [c for c in df.columns if c.startswith("feature::")] - feature_ext_cols = [c for c in df.columns if c.startswith("feature_ext::")] - feature_flag_cols = [c for c in df.columns if c.startswith("feature_flag::")] - indus_idx_cols = [c for c in df.columns if c.startswith("indus_idx::")] - - # Also include the ntrl suffixed columns - feature_ntrl_cols = [c for c in df.columns if c.endswith("_ntrl")] - - export_paths = {} - - # Export feature columns (alpha158 + alpha158_ntrl) - if feature_cols: - feature_df = df[feature_cols] - path = os.path.join(output_dir, f"{prefix}_feature.parquet") - feature_df.to_parquet(path) - export_paths["feature"] = path - print(f" Exported feature columns ({len(feature_cols)}): {path}") - - # Export feature_ext columns - if feature_ext_cols: - feature_ext_df = df[feature_ext_cols] - path = os.path.join(output_dir, f"{prefix}_feature_ext.parquet") - feature_ext_df.to_parquet(path) - export_paths["feature_ext"] = path - print(f" Exported feature_ext columns ({len(feature_ext_cols)}): {path}") - - # Export feature_flag columns - if feature_flag_cols: - feature_flag_df = df[feature_flag_cols] - path = os.path.join(output_dir, f"{prefix}_feature_flag.parquet") - feature_flag_df.to_parquet(path) - export_paths["feature_flag"] = path - print(f" Exported feature_flag columns ({len(feature_flag_cols)}): {path}") - - # Export indus_idx columns - if indus_idx_cols: - indus_idx_df = df[indus_idx_cols] - path = os.path.join(output_dir, f"{prefix}_indus_idx.parquet") - indus_idx_df.to_parquet(path) - export_paths["indus_idx"] = path - print(f" Exported indus_idx columns ({len(indus_idx_cols)}): {path}") - - # Export feature_ntrl columns separately - if feature_ntrl_cols: - feature_ntrl_df = df[feature_ntrl_cols] - path = os.path.join(output_dir, f"{prefix}_feature_ntrl.parquet") - feature_ntrl_df.to_parquet(path) - export_paths["feature_ntrl"] = path - print(f" Exported feature_ntrl columns ({len(feature_ntrl_cols)}): {path}") - - return export_paths - - -def export_metadata(df, output_dir, prefix="gold_standard", proc_list_path=None): - """ - Export metadata about the dataset. - - Includes: - - Column names and shapes - - Processor list configuration - - Date range coverage - - NaN value statistics - """ - metadata_path = os.path.join(output_dir, f"{prefix}_metadata.txt") - - with open(metadata_path, "w") as f: - f.write("=" * 80 + "\n") - f.write("GOLD-STANDARD QLIB PIPELINE OUTPUT - METADATA\n") - f.write("=" * 80 + "\n\n") - - f.write(f"Export Date: {datetime.now().isoformat()}\n\n") - - f.write("DATAFRAME SHAPE\n") - f.write("-" * 40 + "\n") - f.write(f"Shape: {df.shape}\n") - f.write(f"Rows: {len(df)}\n") - f.write(f"Columns: {len(df.columns)}\n\n") - - f.write("DATE RANGE\n") - f.write("-" * 40 + "\n") - dates = df.index.get_level_values("datetime").unique() - f.write(f"Min Date: {dates.min()}\n") - f.write(f"Max Date: {dates.max()}\n") - f.write(f"Unique Dates: {len(dates)}\n\n") - - f.write("INSTRUMENTS\n") - f.write("-" * 40 + "\n") - instruments = df.index.get_level_values("instrument").unique() - f.write(f"Unique Instruments: {len(instruments)}\n") - f.write(f"Sample Instruments: {list(instruments[:10])}\n\n") - - f.write("COLUMN GROUPS\n") - f.write("-" * 40 + "\n") - - # Categorize columns - feature_cols = [c for c in df.columns if c.startswith("feature::")] - feature_ext_cols = [c for c in df.columns if c.startswith("feature_ext::")] - feature_flag_cols = [c for c in df.columns if c.startswith("feature_flag::")] - indus_idx_cols = [c for c in df.columns if c.startswith("indus_idx::")] - feature_ntrl_cols = [c for c in df.columns if c.endswith("_ntrl")] - - f.write(f"feature:: columns: {len(feature_cols)}\n") - f.write(f"feature_ext:: columns: {len(feature_ext_cols)}\n") - f.write(f"feature_flag:: columns: {len(feature_flag_cols)}\n") - f.write(f"indus_idx:: columns: {len(indus_idx_cols)}\n") - f.write(f"*_ntrl columns: {len(feature_ntrl_cols)}\n\n") - - f.write("COLUMN DTYPES\n") - f.write("-" * 40 + "\n") - dtype_counts = df.dtypes.value_counts() - for dtype, count in dtype_counts.items(): - f.write(f"{dtype}: {count}\n") - f.write("\n") - - f.write("NAN STATISTICS\n") - f.write("-" * 40 + "\n") - nan_counts = df.isna().sum() - cols_with_nan = nan_counts[nan_counts > 0] - f.write(f"Columns with NaN: {len(cols_with_nan)}\n") - f.write(f"Total NaN values: {df.isna().sum().sum()}\n\n") - - if len(cols_with_nan) > 0: - f.write("NaN per column (top 20):\n") - for col, cnt in cols_with_nan.nlargest(20).items(): - f.write(f" {col}: {cnt} ({100*cnt/len(df):.2f}%)\n") - f.write("\n") - - f.write("ALL COLUMN NAMES\n") - f.write("-" * 40 + "\n") - for i, col in enumerate(df.columns): - f.write(f" {i+1}. {col}\n") - f.write("\n") - - if proc_list_path and os.path.exists(proc_list_path): - f.write("PROCESSOR LIST\n") - f.write("-" * 40 + "\n") - f.write(f"Source: {proc_list_path}\n") - try: - with open(proc_list_path, "rb") as pf: - proc_list = pkl.load(pf) - f.write(f"Number of processors: {len(proc_list)}\n\n") - for i, proc in enumerate(proc_list): - f.write(f" {i+1}. {proc}\n") - except Exception as e: - f.write(f"Could not load processor list: {e}\n") - f.write("\n") - - print(f"Exported metadata: {metadata_path}") - return metadata_path - - -def main(): - args = parse_args() - - # Parse dates - start_date = pd.to_datetime(args.start_date) - end_date = pd.to_datetime(args.end_date) - - # Create output directory if it doesn't exist - output_dir = Path(args.output_dir).resolve() - output_dir.mkdir(parents=True, exist_ok=True) - - print("=" * 80) - print("DUMP GOLD-STANDARD DATA FROM QLIB PIPELINE") - print("=" * 80) - print(f"Date Range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}") - print(f"Output Directory: {output_dir}") - print(f"Qlib Dataset Path: {args.qlib_dataset_path}") - print() - - # Load data from Qlib pipeline - print("Step 1: Loading data from Qlib pipeline...") - print(f" Loading since_date={start_date.strftime('%Y-%m-%d')}") - - try: - df = load_qlib_data(args.qlib_dataset_path, start_date.strftime("%Y-%m-%d")) - print(f" Loaded DataFrame with shape: {df.shape}") - except Exception as e: - print(f" ERROR: Failed to load data from Qlib pipeline: {e}") - sys.exit(1) - - # Filter to requested date range - print("\nStep 2: Filtering to requested date range...") - df = df.loc(axis=0)[slice(start_date, end_date)] - print(f" Filtered shape: {df.shape}") - - # Export full DataFrame - print("\nStep 3: Exporting full DataFrame...") - prefix = f"gold_standard_{start_date.strftime('%Y%m%d')}_{end_date.strftime('%Y%m%d')}" - - parquet_path = output_dir / f"{prefix}.parquet" - df.to_parquet(parquet_path) - print(f" Exported parquet: {parquet_path}") - - pkl_path = output_dir / f"{prefix}.pkl" - df.to_pickle(pkl_path) - print(f" Exported pickle: {pkl_path}") - - # Export column groups - print("\nStep 4: Exporting column groups...") - export_paths = export_column_groups(df, str(output_dir), prefix=prefix) - - # Export metadata - print("\nStep 5: Exporting metadata...") - proc_list_path = os.path.join(args.qlib_dataset_path, "proc_list.proc") - export_metadata(df, str(output_dir), prefix=prefix, proc_list_path=proc_list_path) - - # Summary - print("\n" + "=" * 80) - print("EXPORT SUMMARY") - print("=" * 80) - print(f"Date range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}") - print(f"Output directory: {output_dir}") - print(f"Total rows: {len(df)}") - print(f"Total columns: {len(df.columns)}") - print(f"\nFiles exported:") - print(f" - {prefix}.parquet (full DataFrame)") - print(f" - {prefix}.pkl (pickle, preserves dtypes)") - print(f" - {prefix}_metadata.txt (column info, statistics)") - for group, path in export_paths.items(): - print(f" - {os.path.basename(path)} ({group} columns)") - print("\nDone!") - - -if __name__ == "__main__": - main() diff --git a/stock_1d/d033/alpha158_beta/scripts/dump_qlib_gold_standard_simple.py b/stock_1d/d033/alpha158_beta/scripts/dump_qlib_gold_standard_simple.py deleted file mode 100644 index d475d30..0000000 --- a/stock_1d/d033/alpha158_beta/scripts/dump_qlib_gold_standard_simple.py +++ /dev/null @@ -1,270 +0,0 @@ -#!/usr/bin/env python -""" -Dump Gold-Standard Data from Qlib Pipeline (Simple Version) - -This script exports the RAW feature data from the Qlib pipeline BEFORE -any processors are applied. This is useful for debugging and comparison. - -NOTE: This script loads ALL data from DolphinDB and then filters to the -requested date range. For large date ranges, this may require significant memory. - -Usage: - python dump_qlib_gold_standard_simple.py --start-date 2020-01-02 --end-date 2020-01-10 -""" - -import argparse -import os -import sys -import pickle as pkl -from datetime import datetime, timedelta -from pathlib import Path - -import pandas as pd -import numpy as np - -# Patch NumPy 2.0 compatibility: np.NaN was removed, use np.nan -if not hasattr(np, 'NaN'): - np.NaN = np.nan - - -def parse_args(): - parser = argparse.ArgumentParser( - description="Dump gold-standard raw data from Qlib pipeline", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=""" -Examples: - # Export a few days for debugging (recommended) - python dump_qlib_gold_standard_simple.py --start-date 2020-01-02 --end-date 2020-01-10 - - # Export with custom output directory - python dump_qlib_gold_standard_simple.py --start-date 2020-01-02 --end-date 2020-01-10 --output-dir /path/to/output - """ - ) - parser.add_argument( - "--start-date", - type=str, - default="2020-01-02", - help="Start date for data export (YYYY-MM-DD)", - ) - parser.add_argument( - "--end-date", - type=str, - default="2020-01-10", - help="End date for data export (YYYY-MM-DD)", - ) - parser.add_argument( - "--output-dir", - type=str, - default="../data/", - help="Output directory for exported files", - ) - parser.add_argument( - "--qlib-dataset-path", - type=str, - default="/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/", - help="Path to Qlib dataset module", - ) - parser.add_argument( - "--instruments", - type=str, - default=None, - help="Comma-separated list of instrument codes to export (default: all)", - ) - return parser.parse_args() - - -def load_raw_data(qlib_dataset_path, since_date, instruments=None): - """ - Load RAW data from Qlib pipeline (before processor list is applied). - - Returns a dict of DataFrames, one per column group. - - Args: - qlib_dataset_path: Path to Qlib dataset module - since_date: Start date for loading (needs history before for Diff) - instruments: Optional list of instrument codes to filter - """ - import importlib.util - import ruamel.yaml as yaml - - # Create a YAML instance with safe loader for backward compatibility - _yaml = yaml.YAML(typ='safe', pure=True) - - def patched_safe_load(stream): - import io - if isinstance(stream, str): - stream = io.StringIO(stream) - return _yaml.load(stream) - - yaml.safe_load = patched_safe_load - - # Load the module directly - spec = importlib.util.spec_from_file_location( - "qlib_dataset", - os.path.join(qlib_dataset_path, "__init__.py") - ) - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) - - # Parse since_date - since_date_dt = pd.to_datetime(since_date) - # Load with extra history for Diff processor - load_start = (since_date_dt - timedelta(days=20)).strftime("%Y-%m-%d") - - print(f" Loading raw data from handler (load_start={load_start})...") - if instruments: - print(f" Filtering instruments: {instruments[:5]}... ({len(instruments)} total)") - - # Use _load_from_yaml to get raw handler data (SepDataFrame) - handler_data = module._load_from_yaml( - os.path.join(qlib_dataset_path, "handler.yaml"), - load_start - ) - - # Handle SepDataFrame - extract column groups - if hasattr(handler_data, '_data') or hasattr(handler_data, '_df_dict'): - df_dict = getattr(handler_data, '_data', None) or getattr(handler_data, '_df_dict', {}) - group_names = list(df_dict.keys()) - print(f" Handler returned SepDataFrame with groups: {group_names}") - - # Filter instruments if specified - if instruments: - print(f" Filtering to specified instruments...") - for group in group_names: - if df_dict[group] is not None: - df = df_dict[group] - # Filter by instrument level - if isinstance(df.index, pd.MultiIndex): - mask = df.index.get_level_values('instrument').isin(instruments) - df_dict[group] = df[mask] - print(f" Group '{group}': {df_dict[group].shape} (filtered)") - - for group in group_names: - df = df_dict[group] - if df is not None: - print(f" Group '{group}': shape={df.shape}, columns={len(df.columns)}") - - return df_dict, handler_data.index - else: - print(f" Handler returned DataFrame: shape={handler_data.shape}") - return {"default": handler_data}, handler_data.index - - -def export_data(df_dict, index, output_dir, start_date, end_date): - """Export data to parquet and pickle files.""" - output_dir = Path(output_dir).resolve() - output_dir.mkdir(parents=True, exist_ok=True) - - start_date = pd.to_datetime(start_date) - end_date = pd.to_datetime(end_date) - - # Filter index - mask = (index >= start_date) & (index <= end_date) - filtered_index = index[mask] - - print(f"\nExporting data for date range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}") - print(f" Filtered index has {len(filtered_index)} dates") - - prefix = f"gold_standard_raw_{start_date.strftime('%Y%m%d')}_{end_date.strftime('%Y%m%d')}" - - exported_files = [] - - # Export each group separately - for group, df in df_dict.items(): - if df is None or len(df.columns) == 0: - print(f" Skipping empty group '{group}'") - continue - - # Filter by date - df_filtered = df.loc[df.index.isin(filtered_index)] - print(f" Group '{group}': {df_filtered.shape}") - - # Export to parquet - parquet_path = output_dir / f"{prefix}_{group}.parquet" - df_filtered.to_parquet(parquet_path) - exported_files.append(str(parquet_path)) - print(f" -> {parquet_path}") - - # Export to pickle (preserves dtypes) - pkl_path = output_dir / f"{prefix}_{group}.pkl" - df_filtered.to_pickle(pkl_path) - exported_files.append(str(pkl_path)) - - # Also create a metadata file - metadata_path = output_dir / f"{prefix}_metadata.txt" - with open(metadata_path, "w") as f: - f.write("=" * 80 + "\n") - f.write("GOLD-STANDARD RAW DATA - METADATA\n") - f.write("=" * 80 + "\n\n") - f.write(f"Export Date: {datetime.now().isoformat()}\n") - f.write(f"Date Range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}\n") - f.write(f"Total Dates: {len(filtered_index)}\n\n") - - f.write("COLUMN GROUPS:\n") - f.write("-" * 40 + "\n") - for group, df in df_dict.items(): - if df is not None: - f.write(f" {group}:\n") - f.write(f" Shape: {df.shape}\n") - f.write(f" Columns: {len(df.columns)}\n") - f.write(f" Sample columns: {list(df.columns[:5])}...\n\n") - - f.write("\nPROCESSOR LIST (for reference):\n") - f.write("-" * 40 + "\n") - proc_path = "/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/proc_list.proc" - if os.path.exists(proc_path): - with open(proc_path, "rb") as pf: - proc_list = pkl.load(pf) - f.write(f"Number of processors: {len(proc_list)}\n\n") - for i, proc in enumerate(proc_list): - f.write(f" {i+1}. {type(proc).__module__}.{type(proc).__name__}\n") - else: - f.write(f"Processor list not found: {proc_path}\n") - - exported_files.append(str(metadata_path)) - - return exported_files - - -def main(): - args = parse_args() - - print("=" * 80) - print("DUMP GOLD-STANDARD RAW DATA FROM QLIB PIPELINE") - print("=" * 80) - print(f"Date Range: {args.start_date} to {args.end_date}") - print(f"Output Directory: {args.output_dir}") - print(f"Qlib Dataset Path: {args.qlib_dataset_path}") - print() - - # Load raw data - print("Step 1: Loading raw data from Qlib pipeline...") - try: - instruments = None - if args.instruments: - instruments = args.instruments.split(',') - df_dict, index = load_raw_data(args.qlib_dataset_path, args.start_date, instruments=instruments) - except Exception as e: - print(f" ERROR: Failed to load data: {e}") - import traceback - traceback.print_exc() - sys.exit(1) - - # Export data - print("\nStep 2: Exporting data...") - exported_files = export_data(df_dict, index, args.output_dir, args.start_date, args.end_date) - - # Summary - print("\n" + "=" * 80) - print("EXPORT SUMMARY") - print("=" * 80) - print(f"Date range: {args.start_date} to {args.end_date}") - print(f"Output directory: {Path(args.output_dir).resolve()}") - print(f"\nFiles exported ({len(exported_files)}):") - for f in exported_files: - print(f" - {f}") - print("\nDone!") - - -if __name__ == "__main__": - main() diff --git a/stock_1d/d033/alpha158_beta/scripts/regenerate_sample_embedding.py b/stock_1d/d033/alpha158_beta/scripts/regenerate_sample_embedding.py deleted file mode 100644 index 1ffe4d5..0000000 --- a/stock_1d/d033/alpha158_beta/scripts/regenerate_sample_embedding.py +++ /dev/null @@ -1,186 +0,0 @@ -#!/usr/bin/env python -""" -Regenerate beta embeddings for a few days of sample data. - -This script generates embeddings for a small date range to test the pipeline. -""" - -import os -import sys -import pickle as pkl -import numpy as np -import polars as pl -import torch -import torch.nn as nn -from pathlib import Path -from datetime import datetime -from typing import List, Dict, Optional - -# Add parent directory to path -sys.path.insert(0, str(Path(__file__).parent)) - -# Import from the main generate script -from generate_beta_embedding import ( - load_all_data, - merge_data_sources, - apply_feature_pipeline, - prepare_vae_features, - load_vae_model, - encode_with_vae, - load_qlib_processor_params, - VAE_INPUT_DIM, - OUTPUT_DIR, -) - -# Sample dates for testing (5 consecutive trading days) -SAMPLE_DATES = [ - "2019-01-02", - "2019-01-03", - "2019-01-04", - "2019-01-07", - "2019-01-08", -] - -VAE_MODEL_PATH = "/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/model/csiallx_feature2_ntrla_flag_pnlnorm_vae4_dim32a_beta0001/module.pt" - - -def generate_sample_embeddings( - dates: List[str] = SAMPLE_DATES, - output_file: str = "embedding_0_7_beta_sample.parquet", - use_vae: bool = True -) -> pl.DataFrame: - """ - Generate embeddings for a sample of dates. - - Args: - dates: List of dates in YYYY-MM-DD format - output_file: Output parquet file path - use_vae: Whether to use VAE for encoding (or random embeddings) - """ - start_date = dates[0] - end_date = dates[-1] - - print("=" * 60) - print("Generating Sample Beta Embeddings") - print(f"Dates: {dates}") - print(f"Use VAE: {use_vae}") - print("=" * 60) - - # Load all data sources - df_alpha, df_kline, df_flag, df_industry = load_all_data(start_date, end_date) - - print(f"\nLoaded data:") - print(f" Alpha158: {df_alpha.shape}") - print(f" Kline: {df_kline.shape}") - print(f" Flags: {df_flag.shape}") - print(f" Industry: {df_industry.shape}") - - # Filter to only the sample dates - date_ints = [int(d.replace("-", "")) for d in dates] - df_alpha = df_alpha.filter(pl.col("datetime").is_in(date_ints)) - df_kline = df_kline.filter(pl.col("datetime").is_in(date_ints)) - df_flag = df_flag.filter(pl.col("datetime").is_in(date_ints)) - df_industry = df_industry.filter(pl.col("datetime").is_in(date_ints)) - - print(f"\nAfter filtering to sample dates:") - print(f" Alpha158: {df_alpha.shape}") - print(f" Kline: {df_kline.shape}") - print(f" Flags: {df_flag.shape}") - print(f" Industry: {df_industry.shape}") - - # Merge data sources - df = merge_data_sources(df_alpha, df_kline, df_flag, df_industry) - print(f"\nMerged data shape: {df.shape}") - - # Save datetime and instrument before processing - datetime_col = df["datetime"].clone() - instrument_col = df["instrument"].clone() - - # Apply feature transformation pipeline - df_processed, feature_cols, norm_feature_cols, market_flag_for_vae = apply_feature_pipeline(df) - - # Prepare features for VAE - features = prepare_vae_features( - df_processed, feature_cols, - norm_feature_cols=norm_feature_cols, - market_flag_for_vae=market_flag_for_vae - ) - - print(f"\nFeature matrix shape: {features.shape}") - - # Encode with VAE - if use_vae: - try: - model = load_vae_model(VAE_MODEL_PATH) - embeddings = encode_with_vae(features, model) - print(f"\nVAE encoding successful!") - except Exception as e: - print(f"\nVAE encoding failed: {e}") - import traceback - traceback.print_exc() - print("\nFalling back to random embeddings...") - np.random.seed(42) - embeddings = np.random.randn(features.shape[0], 32).astype(np.float32) - else: - print("\nUsing random embeddings (VAE disabled)...") - np.random.seed(42) - embeddings = np.random.randn(features.shape[0], 32).astype(np.float32) - - # Create output DataFrame - embedding_cols = [f"embedding_{i}" for i in range(embeddings.shape[1])] - - result_data = { - "datetime": datetime_col.to_list(), - "instrument": instrument_col.to_list(), - **{col_name: embeddings[:, i].tolist() for i, col_name in enumerate(embedding_cols)} - } - - df_result = pl.DataFrame(result_data) - - # Ensure output directory exists - output_path = Path(output_file) - output_path.parent.mkdir(parents=True, exist_ok=True) - - # Save to parquet - df_result.write_parquet(output_path) - print(f"\nEmbeddings saved to: {output_path}") - print(f"Output shape: {df_result.shape}") - print(f"\nSample output:") - print(df_result.head(10)) - - # Print summary statistics - print("\n" + "=" * 60) - print("Summary Statistics") - print("=" * 60) - print(f"Total samples: {len(df_result)}") - print(f"Embedding dimension: {embeddings.shape[1]}") - print(f"Date range: {df_result['datetime'].min()} to {df_result['datetime'].max()}") - print(f"Instruments: {df_result['instrument'].n_unique()}") - print(f"Embedding mean: {np.mean(embeddings):.6f}") - print(f"Embedding std: {np.std(embeddings):.6f}") - print(f"Embedding min: {np.min(embeddings):.6f}") - print(f"Embedding max: {np.max(embeddings):.6f}") - - return df_result - - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser(description="Generate sample beta embeddings") - parser.add_argument("--dates", nargs="+", default=SAMPLE_DATES, - help="List of dates (YYYY-MM-DD)") - parser.add_argument("--output", type=str, default="embedding_0_7_beta_sample.parquet", - help="Output parquet file") - parser.add_argument("--no-vae", action="store_true", - help="Skip VAE encoding (use random embeddings)") - - args = parser.parse_args() - - generate_sample_embeddings( - dates=args.dates, - output_file=args.output, - use_vae=not args.no_vae - ) - - print("\nDone!") diff --git a/stock_1d/d033/alpha158_beta/scripts/run.log b/stock_1d/d033/alpha158_beta/scripts/run.log deleted file mode 100644 index 8d58b7c..0000000 --- a/stock_1d/d033/alpha158_beta/scripts/run.log +++ /dev/null @@ -1,394 +0,0 @@ -[2715583:MainThread](2026-02-26 19:58:16,674) INFO - qlib.Initialization - [config.py:413] - default_conf: client. -[2715583:MainThread](2026-02-26 19:58:16,680) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings. -[2715583:MainThread](2026-02-26 19:58:16,681) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/qlib/default/data_ops/target')} -================================================================================ -DUMP GOLD-STANDARD DATA FROM QLIB PIPELINE -================================================================================ -Date Range: 2020-01-02 to 2020-01-10 -Output Directory: /home/guofu/Workspaces/alpha_lab/stock_1d/d033/alpha158_beta/data -Qlib Dataset Path: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/ - -Step 1: Loading data from Qlib pipeline... - Loading since_date=2020-01-02 -Will use `placehorder_value` from module: qlib.contrib.data.config -Will init handler object from config: -{'data_handler_config': {'ddb_config': {'host': '192.168.1.146', - 'password': '123456', - 'port': 8848, - 'username': 'admin'}, - 'end_time': datetime.date(2026, 2, 26), - 'handler_list': [{'class': 'DDBAlpha158Handler', - 'kwargs': {'col_set': 'feature', - 'query_config': [{'alpha158_config': 'alpha158_expr.csv', - 'db_path': 'dfs://daily_stock_run', - 'dtype': 'float32', - 'field_list': 'alpha158', - 'table_name': 'stg_1day_wind_alpha158_0_7'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'}, - {'class': 'DDBMarketExtHandler', - 'kwargs': {'col_set': 'feature_ext', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'float32', - 'field_list': ['Turnover ' - 'as ' - 'turnover', - 'FreeTurnover ' - 'as ' - 'free_turnover', - 'log(MarketValue) ' - 'as ' - 'log_size'], - 'table_name': 'stg_1day_wind_kline_adjusted'}, - {'db_path': 'dfs://daily_stock_run', - 'dtype': 'float32', - 'field_list': ['con_rating_strength'], - 'table_name': 'stg_1day_gds_con_rating'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'}, - {'class': 'DDBMarketFlagHandler', - 'kwargs': {'col_set': 'feature_flag', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': ['IsZt', - 'IsDt', - 'IsN', - 'IsXD', - 'IsXR', - 'IsDR'], - 'table_name': 'stg_1day_wind_kline_adjusted'}, - {'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': ['open_limit', - 'close_limit', - 'low_limit', - 'open_stop', - 'close_stop', - 'high_stop'], - 'table_name': 'stg_1day_wind_market_flag'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'}, - {'class': 'DDBIndusFlagHandler', - 'kwargs': {'col_set': 'indus_flag', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': 'industry_code_cc.csv', - 'table_name': 'stg_1day_gds_indus_flag_cc1'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'}, - {'class': 'DDBStFlagHandler', - 'kwargs': {'col_set': 'st_flag', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': ['ST_Y', - 'ST_S', - 'ST_T', - 'ST_L', - 'ST_Z', - 'ST_X'], - 'table_name': 'stg_1day_wind_st_flag'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}], - 'instruments': 'csiallx', - 'start_time': Timestamp('2019-12-03 00:00:00')}, - 'ddb_config': {'host': '192.168.1.146', - 'password': '123456', - 'port': 8848, - 'username': 'admin'}, - 'handler': {'class': 'AggHandler', - 'kwargs': {'ddb_config': {'host': '192.168.1.146', - 'password': '123456', - 'port': 8848, - 'username': 'admin'}, - 'end_time': datetime.date(2026, 2, 26)[2715583:MainThread](2026-02-26 19:58:16,707) INFO - qlib.AggHandler - [agg_handler.py:79] - Will use AggHandler -[2715583:MainThread](2026-02-26 19:58:16,707) WARNING - qlib.DataLoaderDH - [agg_handler.py:191] - instruments[csiallx] is ignored -[2715583:MainThread](2026-02-26 19:58:17,067) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146: - - use mytt; - select code,m_nDate,KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,HIGH0,LOW0,VWAP0,ROC5,ROC10,ROC20,ROC30,ROC60,MA5,MA10,MA20,MA30,MA60,STD5,STD10,STD20,STD30,STD60,BETA5,BETA10,BETA20,BETA30,BETA60,RSQR5,RSQR10,RSQR20,RSQR30,RSQR60,RESI5,RESI10,RESI20,RESI30,RESI60,MAX5,MAX10,MAX20,MAX30,MAX60,MIN5,MIN10,MIN20,MIN30,MIN60,QTLU5,QTLU10,QTLU20,QTLU30,QTLU60,QTLD5,QTLD10,QTLD20,QTLD30,QTLD60,RANK5,RANK10,RANK20,RANK30,RANK60,RSV5,RSV10,RSV20,RSV30,RSV60,IMAX5,IMAX -[2715583:MainThread](2026-02-26 20:05:39,665) INFO - qlib.timer - [log.py:117] - Time cost: 442.946s | DDB query: Done -[2715583:MainThread](2026-02-26 20:05:40,469) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00 -, - 'handler_list': [{'class': 'DDBAlpha158Handler', - 'kwargs': {'col_set': 'feature', - 'query_config': [{'alpha158_config': 'alpha158_expr.csv', - 'db_path': 'dfs://daily_stock_run', - 'dtype': 'float32', - 'field_list': 'alpha158', - 'table_name': 'stg_1day_wind_alpha158_0_7'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'}, - {'class': 'DDBMarketExtHandler', - 'kwargs': {'col_set': 'feature_ext', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'float32', - 'field_list': ['Turnover ' - 'as ' - 'turnover', - 'FreeTurnover ' - 'as ' - 'free_turnover', - 'log(MarketValue) ' - 'as ' - 'log_size'], - 'table_name': 'stg_1day_wind_kline_adjusted'}, - {'db_path': 'dfs://daily_stock_run', - 'dtype': 'float32', - 'field_list': ['con_rating_strength'], - 'table_name': 'stg_1day_gds_con_rating'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'}, - {'class': 'DDBMarketFlagHandler', - 'kwargs': {'col_set': 'feature_flag', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': ['IsZt', - 'IsDt', - 'IsN', - 'IsXD', - 'IsXR', - 'IsDR'], - 'table_name': 'stg_1day_wind_kline_adjusted'}, - {'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': ['open_limit', - 'close_limit', - 'low_limit', - 'open_stop', - 'close_stop', - 'high_stop'], - 'table_name': 'stg_1day_wind_market_flag'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'}, - {'class': 'DDBIndusFlagHandler', - 'kwargs': {'col_set': 'indus_flag', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': 'industry_code_cc.csv', - 'table_name': 'stg_1day_gds_indus_flag_cc1'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'}, - {'class': 'DDBStFlagHandler', - 'kwargs': {'col_set': 'st_flag', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': ['ST_Y', - 'ST_S', - 'ST_T', - 'ST_L', - 'ST_Z', - 'ST_X'], - 'table_name': 'stg_1day_wind_st_flag'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}], - 'instruments': 'csiallx', - 'start_time': Timestamp('2019-12-03 00:00:00')}, - 'module_path': 'qlib.contrib.data.agg_handler'}, - 'load_end': datetime.date(2026, 2, 26), - 'load_start': Timestamp('2019-12-03 00:00:00'), - 'market': 'csiallx', - 'qlib_init': {'provider_uri': '/home/guofu/.qlib/data_ops/target', - 'region': 'cn'}} -Query config: -#alpha158: 1; -Will use float32 for -[2715583:MainThread](2026-02-26 20:07:46,118) INFO - qlib.timer - [log.py:117] - Time cost: 115.964s | Instruments filter: Done -[2715583:MainThread](2026-02-26 20:07:53,273) INFO - qlib.timer - [log.py:117] - Time cost: 576.561s | Loading data () Done -[2715583:MainThread](2026-02-26 20:07:53,274) INFO - qlib.timer - [log.py:117] - Time cost: 576.562s | Init data () Done -[2715583:MainThread](2026-02-26 20:07:53,276) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done -[2715583:MainThread](2026-02-26 20:07:56,700) INFO - qlib.timer - [log.py:117] - Time cost: 3.423s | fetch_df_by_index Done -[2715583:MainThread](2026-02-26 20:07:58,185) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-03 00:00:00'), datetime.date(2026, 2, 26), None) - KMID KLEN ... VSUMD30 VSUMD60 -datetime instrument ... -2019-12-03 SH600000 0.004234 0.011008 ... -0.031454 -0.009671 - SH600004 0.015467 0.031529 ... -0.004401 0.007701 - SH600006 0.022573 0.033860 ... 0.060561 -0.000159 - SH600007 0.012129 0.025470 ... 0.008489 -0.054056 - SH600008 0.006173 0.009259 ... -0.088065 -0.080770 -... ... ... ... ... ... -2026-02-26 SZ301658 -0.017231 0.025231 ... -0.018706 0.003708 - SZ301662 0.060584 0.087834 ... -0.014658 -0.014613 - SZ301665 -0.012899 0.040541 ... 0.083229 0.055994 - SZ301678 0.018182 0.027879 ... -0.054124 0.014202 - SZ302132 0.001754 0.016416 ... -0.049558 -0.038667 - -[6886779 rows x 158 columns] -[2715583:MainThread](2026-02-26 20:07:58,186) INFO - qlib.timer - [log.py:117] - Time cost: 4.911s | Fetching dataframe Done -[2715583:MainThread](2026-02-26 20:07:58,203) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146: - - use mytt; - select code,m_nDate,Turnover as turnover,FreeTurnover as free_turnover,log(MarketValue) as log_size from - loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted") - where m_nDate>=2019.12.03 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ3 -[2715583:MainThread](2026-02-26 20:08:15,182) INFO - qlib.timer - [log.py:117] - Time cost: 16.990s | DDB query: Done -[2715583:MainThread](2026-02-26 20:08:15,974) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00 -[2715583:MainThread](2026-02-26 20:08:16,548) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146: - - use mytt; - select code,m_nDate,con_rating_strength from - loadTable("dfs://daily_stock_run", "stg_1day_gds_con_rating") - where m_nDate>=2019.12.03 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002833','SH -[2715583:MainThread](2026-02-26 20:08:27,838) INFO - qlib.timer - [log.py:117] - Time cost: 11.299s | DDB query: Done -[2715583:MainThread](2026-02-26 20:08:28,690) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-25 00:00:00 -[2715583:MainThread](2026-02-26 20:09:53,616) INFO - qlib.timer - [log.py:117] - Time cost: 81.815s | Instruments filter: Done -[2715583:MainThread](2026-02-26 20:09:54,168) INFO - qlib.timer - [log.py:117] - Time cost: 115.981s | Loading data () Done -[2715583:MainThread](2026-02-26 20:09:54,169) INFO - qlib.timer - [log.py:117] - Time cost: 115.982s | Init data () Done -[2715583:MainThread](2026-02-26 20:09:54,170) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done -[2715583:MainThread](2026-02-26 20:09:54,893) INFO - qlib.timer - [log.py:117] - Time cost: 0.723s | fetch_df_by_index Done -[2715583:MainThread](2026-02-26 20:09:54,901) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-03 00:00:00'), datetime.date(2026, 2, 26), None) - turnover free_turnover log_size con_rating_strength -datetime instrument -2019-12-03 SH600000 0.0696 0.1275 17.322001 0.6618 - SH600004 0.6009 1.2276 15.077468 0.8269 - SH600006 0.5976 1.5087 13.716795 1.0000 - SH600007 0.0961 0.4969 14.334991 0.7500 - SH600008 0.0967 0.1793 14.432563 0.6591 -... ... ... ... ... -2026-02-26 SZ301658 6.0785 6.0785 11.788368 NaN - SZ301662 12.5950 12.5950 12.681215 NaN - SZ301665 14.0077 14.0077 11.719415 NaN - SZ301678 6.6518 6.6518 12.799973 NaN - SZ302132 1.3868 3.0296 15.359885 NaN - -[7601552 rows x 4 columns] -[2715583:MainThread](2026-02-26 20:09:54,902) INFO - qlib.timer - [log.py:117] - Time cost: 0.732s | Fetching dataframe Done -[2715583:MainThread](2026-02-26 20:09:54,917) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146: - - use mytt; - select code,m_nDate,IsZt,IsDt,IsN,IsXD,IsXR,IsDR from - loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted") - where m_nDate>=2019.12.03 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657', -[2715583:MainThread](2026-02-26 20:10:15,465) INFO - qlib.timer - [log.py:117] - Time cost: 20.556s | DDB query: Done -[2715583:MainThread](2026-02-26 20:10:16,265) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00 -[2715583:MainThread](2026-02-26 20:10:16,775) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146: - - use mytt; - select code,m_nDate,open_limit,close_limit,low_limit,open_stop,close_stop,high_stop from - loadTable("dfs://daily_stock_run", "stg_1day_wind_market_flag") - where m_nDate>=2019.12.03 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281',' -[2715583:MainThread](2026-02-26 20:10:36,740) INFO - qlib.timer - [log.py:117] - Time cost: 19.975s | DDB query: Done -[2715583:MainThread](2026-02-26 20:10:37,558) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00 -[2715583:MainThread](2026-02-26 20:12:04,978) INFO - qlib.timer - [log.py:117] - Time cost: 84.148s | Instruments filter: Done -[2715583:MainThread](2026-02-26 20:12:05,899) INFO - qlib.timer - [log.py:117] - Time cost: 130.996s | Loading data () Done -[2715583:MainThread](2026-02-26 20:12:05,900) INFO - qlib.timer - [log.py:117] - Time cost: 130.997s | Init data () Done -[2715583:MainThread](2026-02-26 20:12:05,902) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done -[2715583:MainThread](2026-02-26 20:12:06,745) INFO - qlib.timer - [log.py:117] - Time cost: 0.842s | fetch_df_by_index Done -[2715583:MainThread](2026-02-26 20:12:06,758) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-03 00:00:00'), datetime.date(2026, 2, 26), None) - IsZt IsDt IsN ... open_stop close_stop high_stop -datetime instrument ... -2019-12-03 SH600000 False False False ... False False False - SH600004 False False False ... False False False - SH600006 False False False ... False False False - SH600007 False False False ... False False False - SH600008 False False False ... False False False -... ... ... ... ... ... ... ... -2026-02-26 SZ301658 False False False ... False False False - SZ301662 False False False ... False False False - SZ301665 False False False ... False False False - SZ301678 False False False ... False False False - SZ302132 False False False ... False False False - -[6903684 rows x 12 columns] -[2715583:MainThread](2026-02-26 20:12:06,759) INFO - qlib.timer - [log.py:117] - Time cost: 0.857s | Fetching dataframe Done -[2715583:MainThread](2026-02-26 20:12:06,777) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146: - - use mytt; - select code,m_nDate,gds_CC10,gds_CC11,gds_CC12,gds_CC20,gds_CC21,gds_CC22,gds_CC23,gds_CC24,gds_CC25,gds_CC26,gds_CC27,gds_CC28,gds_CC30,gds_CC31,gds_CC32,gds_CC33,gds_CC34,gds_CC35,gds_CC36,gds_CC37,gds_CC40,gds_CC41,gds_CC42,gds_CC43,gds_CC50,gds_CC60,gds_CC61,gds_CC62,gds_CC63,gds_CC70 from - loadTable("dfs://daily_stock_run", "stg_1day_gds_indus_flag_cc1") - where m_nDate>=2019.12.03 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','S -[2715583:MainThread](2026-02-26 20:12:08,840) INFO - qlib.timer - [log.py:117] - Time cost: 2.073s | DDB query: Done -[2715583:MainThread](2026-02-26 20:12:08,849) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00 -[2715583:MainThread](2026-02-26 20:13:26,572) INFO - qlib.timer - [log.py:117] - Time cost: 77.719s | Instruments filter: Done -[2715583:MainThread](2026-02-26 20:13:26,601) INFO - qlib.timer - [log.py:117] - Time cost: 79.839s | Loading data () Done -[2715583:MainThread](2026-02-26 20:13:26,602) INFO - qlib.timer - [log.py:117] - Time cost: 79.840s | Init data () Done -[2715583:MainThread](2026-02-26 20:13:26,603) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done -[2715583:MainThread](2026-02-26 20:13:26,612) INFO - qlib.timer - [log.py:117] - Time cost: 0.008s | fetch_df_by_index Done -[2715583:MainThread](2026-02-26 20:13:26,633) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-03 00:00:00'), datetime.date(2026, 2, 26), None) - gds_CC10 gds_CC11 ... gds_CC63 gds_CC70 -datetime instrument ... -2026-02-09 SH600000 False False ... False False - SH600004 False False ... False False - SH600006 False False ... False False - SH600007 False False ... False False - SH600008 False False ... False False -... ... ... ... ... ... -2026-02-26 SZ301658 False False ... False False - SZ301662 False False ... False False - SZ301665 False False ... False False - SZ301678 False False ... False False - SZ302132 False False ... False False - -[41168 rows x 30 columns] -[2715583:MainThread](2026-02-26 20:13:26,634) INFO - qlib.timer - [log.py:117] - Time cost: 0.031s | Fetching dataframe Done -[2715583:MainThread](2026-02-26 20:13:26,652) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146: - - use mytt; - select code,m_nDate,ST_Y,ST_S,ST_T,ST_L,ST_Z,ST_X from - loadTable("dfs://daily_stock_run", "stg_1day_wind_st_flag") - where m_nDate>=2019.12.03 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002 -[2715583:MainThread](2026-02-26 20:13:55,744) INFO - qlib.timer - [log.py:117] - Time cost: 29.102s | DDB query: Done -[2715583:MainThread](2026-02-26 20:13:56,520) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00 -[2715583:MainThread](2026-02-26 20:15:27,625) INFO - qlib.timer - [log.py:117] - Time cost: 90.586s | Instruments filter: Done -[2715583:MainThread](2026-02-26 20:15:28,257) INFO - qlib.timer - [log.py:117] - Time cost: 121.621s | Loading data () Done -[2715583:MainThread](2026-02-26 20:15:28,257) INFO - qlib.timer - [log.py:117] - Time cost: 121.622s | Init data () Done -[2715583:MainThread](2026-02-26 20:15:28,258) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done -[2715583:MainThread](2026-02-26 20:15:28,867) INFO - qlib.timer - [log.py:117] - Time cost: 0.608s | fetch_df_by_index Done -[2715583:MainThread](2026-02-26 20:15:28,875) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-03 00:00:00'), datetime.date(2026, 2, 26), None) - ST_Y ST_S ST_T ST_L ST_Z ST_X -datetime instrument -2019-12-03 SH600000 False False False False False False - SH600004 False False False False False False - SH600006 False False False False False False - SH600007 False False False False False False - SH600008 False False False False False False -... ... ... ... ... ... ... -2026-02-26 SZ301658 False False False False False False - SZ301662 False False False False False False - SZ301665 False False False False False False - SZ301678 False False False False False False - SZ302132 False False False False False False - -[6903687 rows x 6 columns] -[2715583:MainThread](2026-02-26 20:15:28,876) INFO - qlib.timer - [log.py:117] - Time cost: 0.617s | Fetching dataframe Done -/home/guofu/.venv/alpha2/lib/python3.12/site-packages/qlib/contrib/utils/paral.py:22: FutureWarning: 'M' is deprecated and will be removed in a future version, please use 'ME' instead. - group_list = [_df.resample("M", level="datetime")\ -Will use float32 for -Will use float32 for -Query config: -#concepts: 2; -Will use bool for -Will use bool for -Field list: ['gds_CC10', 'gds_CC11', 'gds_CC12', 'gds_CC20', 'gds_CC21', 'gds_CC22', 'gds_CC23', 'gds_CC24', 'gds_CC25', 'gds_CC26', 'gds_CC27', 'gds_CC28', 'gds_CC30', 'gds_CC31', 'gds_CC32', 'gds_CC33', 'gds_CC34', 'gds_CC35', 'gds_CC36', 'gds_CC37', 'gds_CC40', 'gds_CC41', 'gds_CC42', 'gds_CC43', 'gds_CC50', 'gds_CC60', 'gds_CC61', 'gds_CC62', 'gds_CC63', 'gds_CC70'] -Will use bool for -Will use bool for -[2715583:MainThread](2026-02-26 20:15:32,735) INFO - qlib.timer - [log.py:117] - Time cost: 3.858s | Concat index: Done -[2715583:MainThread](2026-02-26 20:15:32,737) INFO - qlib.timer - [log.py:117] - Time cost: 0.001s | Sort index: Done -[2715583:MainThread](2026-02-26 20:15:36,349) INFO - qlib.timer - [log.py:117] - Time cost: 3.611s | Creating SepDataFrame: Done -[2715583:MainThread](2026-02-26 20:15:37,245) INFO - qlib.timer - [log.py:117] - Time cost: 1040.537s | Loading data () Done -[2715583:MainThread](2026-02-26 20:15:37,246) INFO - qlib.AggHandler - [handler.py:468] - Read-only True -[] -[2715583:MainThread](2026-02-26 20:15:37,248) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame. -[2715583:MainThread](2026-02-26 20:15:37,265) INFO - qlib.AggHandler - [handler.py:468] - Read-only True -[] -[2715583:MainThread](2026-02-26 20:15:37,266) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame. -[2715583:MainThread](2026-02-26 20:15:37,277) INFO - qlib.AggHandler - [handler.py:468] - Read-only True -[] -[2715583:MainThread](2026-02-26 20:15:37,277) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame. -[2715583:MainThread](2026-02-26 20:15:37,293) INFO - qlib.timer - [log.py:117] - Time cost: 0.047s | fit & process data Done -[2715583:MainThread](2026-02-26 20:15:37,294) INFO - qlib.timer - [log.py:117] - Time cost: 1040.587s | Init data () Done -[2715583:MainThread](2026-02-26 20:15:37,963) INFO - qlib.DataHandlerLP - [handler.py:487] - Will apply processor -[2715583:MainThread](2026-02-26 20:15:40,135) INFO - qlib.timer - [log.py:117] - Time cost: 2.171s | Diff Done -[2715583:MainThread](2026-02-26 20:15:40,136) INFO - qlib.DataHandlerLP - [handler.py:487] - Will apply processor -All processors are readonly -All processors are readonly -All processors are readonly -Did load data from config: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/handler.yaml -Did load norm from: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/proc_list.proc -Will assign `feature_ext` with - turnover ... con_rating_strength_diff -datetime instrument ... -2026-02-09 SH600000 0.1837 ... 0.0 - SH600004 0.6948 ... 0.0 - SH600006 0.5542 ... 0.0 - SH600007 0.2057 ... 0.0 - SH600008 0.9809 ... 0.0 -... ... ... ... -2026-02-26 SZ301658 6.0785 ... 0.0 - SZ301662 12.5950 ... 0.0 - SZ301665 14.0077 ... 0.0 - SZ301678 6.6518 ... 0.0 - SZ302132 1.3868 ... 0.0 - -[41085 rows x 8 columns] ---- - ERROR: Failed to load data from Qlib pipeline: Cannot convert non-finite values (NA or inf) to integer diff --git a/stock_1d/d033/alpha158_beta/scripts/run2.log b/stock_1d/d033/alpha158_beta/scripts/run2.log deleted file mode 100644 index dd3e579..0000000 --- a/stock_1d/d033/alpha158_beta/scripts/run2.log +++ /dev/null @@ -1,373 +0,0 @@ -[2730312:MainThread](2026-02-26 21:28:33,675) INFO - qlib.Initialization - [config.py:413] - default_conf: client. -[2730312:MainThread](2026-02-26 21:28:33,679) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings. -[2730312:MainThread](2026-02-26 21:28:33,680) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/qlib/default/data_ops/target')} -================================================================================ -DUMP GOLD-STANDARD DATA FROM QLIB PIPELINE -================================================================================ -Date Range: 2020-01-02 to 2020-01-10 -Output Directory: /home/guofu/Workspaces/alpha_lab/stock_1d/d033/alpha158_beta/data -Qlib Dataset Path: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/ - -Step 1: Loading data from Qlib pipeline... - Loading since_date=2020-01-02 - Loading raw data from handler.yaml... -Will use `placehorder_value` from module: qlib.contrib.data.config -Will init handler object from config: -{'data_handler_config': {'ddb_config': {'host': '192.168.1.146', - 'password': '123456', - 'port': 8848, - 'username': 'admin'}, - 'end_time': datetime.date(2026, 2, 26), - 'handler_list': [{'class': 'DDBAlpha158Handler', - 'kwargs': {'col_set': 'feature', - 'query_config': [{'alpha158_config': 'alpha158_expr.csv', - 'db_path': 'dfs://daily_stock_run', - 'dtype': 'float32', - 'field_list': 'alpha158', - 'table_name': 'stg_1day_wind_alpha158_0_7'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'}, - {'class': 'DDBMarketExtHandler', - 'kwargs': {'col_set': 'feature_ext', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'float32', - 'field_list': ['Turnover ' - 'as ' - 'turnover', - 'FreeTurnover ' - 'as ' - 'free_turnover', - 'log(MarketValue) ' - 'as ' - 'log_size'], - 'table_name': 'stg_1day_wind_kline_adjusted'}, - {'db_path': 'dfs://daily_stock_run', - 'dtype': 'float32', - 'field_list': ['con_rating_strength'], - 'table_name': 'stg_1day_gds_con_rating'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'}, - {'class': 'DDBMarketFlagHandler', - 'kwargs': {'col_set': 'feature_flag', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': ['IsZt', - 'IsDt', - 'IsN', - 'IsXD', - 'IsXR', - 'IsDR'], - 'table_name': 'stg_1day_wind_kline_adjusted'}, - {'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': ['open_limit', - 'close_limit', - 'low_limit', - 'open_stop', - 'close_stop', - 'high_stop'], - 'table_name': 'stg_1day_wind_market_flag'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'}, - {'class': 'DDBIndusFlagHandler', - 'kwargs': {'col_set': 'indus_flag', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': 'industry_code_cc.csv', - 'table_name': 'stg_1day_gds_indus_flag_cc1'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'}, - {'class': 'DDBStFlagHandler', - 'kwargs': {'col_set': 'st_flag', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': ['ST_Y', - 'ST_S', - 'ST_T', - 'ST_L', - 'ST_Z', - 'ST_X'], - 'table_name': 'stg_1day_wind_st_flag'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}], - 'instruments': 'csiallx', - 'start_time': Timestamp('2019-12-13 00:00:00')}, - 'ddb_config': {'host': '192.168.1.146', - 'password': '123456', - 'port': 8848, - 'username': 'admin'}, - 'handler': {'class': 'AggHandler', - 'kwargs': {'ddb_config': {'host': '192.168.1.146', - 'password': '123456', - 'port': 8848, - 'username': 'admin'}, - 'end_time': [2730312:MainThread](2026-02-26 21:28:33,704) INFO - qlib.AggHandler - [agg_handler.py:79] - Will use AggHandler -[2730312:MainThread](2026-02-26 21:28:33,704) WARNING - qlib.DataLoaderDH - [agg_handler.py:191] - instruments[csiallx] is ignored -[2730312:MainThread](2026-02-26 21:28:34,011) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146: - - use mytt; - select code,m_nDate,KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,HIGH0,LOW0,VWAP0,ROC5,ROC10,ROC20,ROC30,ROC60,MA5,MA10,MA20,MA30,MA60,STD5,STD10,STD20,STD30,STD60,BETA5,BETA10,BETA20,BETA30,BETA60,RSQR5,RSQR10,RSQR20,RSQR30,RSQR60,RESI5,RESI10,RESI20,RESI30,RESI60,MAX5,MAX10,MAX20,MAX30,MAX60,MIN5,MIN10,MIN20,MIN30,MIN60,QTLU5,QTLU10,QTLU20,QTLU30,QTLU60,QTLD5,QTLD10,QTLD20,QTLD30,QTLD60,RANK5,RANK10,RANK20,RANK30,RANK60,RSV5,RSV10,RSV20,RSV30,RSV60,IMAX5,IMAX -[2730312:MainThread](2026-02-26 21:36:00,317) INFO - qlib.timer - [log.py:117] - Time cost: 446.602s | DDB query: Done -[2730312:MainThread](2026-02-26 21:36:01,106) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00 -datetime.date(2026, 2, 26), - 'handler_list': [{'class': 'DDBAlpha158Handler', - 'kwargs': {'col_set': 'feature', - 'query_config': [{'alpha158_config': 'alpha158_expr.csv', - 'db_path': 'dfs://daily_stock_run', - 'dtype': 'float32', - 'field_list': 'alpha158', - 'table_name': 'stg_1day_wind_alpha158_0_7'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'}, - {'class': 'DDBMarketExtHandler', - 'kwargs': {'col_set': 'feature_ext', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'float32', - 'field_list': ['Turnover ' - 'as ' - 'turnover', - 'FreeTurnover ' - 'as ' - 'free_turnover', - 'log(MarketValue) ' - 'as ' - 'log_size'], - 'table_name': 'stg_1day_wind_kline_adjusted'}, - {'db_path': 'dfs://daily_stock_run', - 'dtype': 'float32', - 'field_list': ['con_rating_strength'], - 'table_name': 'stg_1day_gds_con_rating'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'}, - {'class': 'DDBMarketFlagHandler', - 'kwargs': {'col_set': 'feature_flag', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': ['IsZt', - 'IsDt', - 'IsN', - 'IsXD', - 'IsXR', - 'IsDR'], - 'table_name': 'stg_1day_wind_kline_adjusted'}, - {'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': ['open_limit', - 'close_limit', - 'low_limit', - 'open_stop', - 'close_stop', - 'high_stop'], - 'table_name': 'stg_1day_wind_market_flag'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'}, - {'class': 'DDBIndusFlagHandler', - 'kwargs': {'col_set': 'indus_flag', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': 'industry_code_cc.csv', - 'table_name': 'stg_1day_gds_indus_flag_cc1'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'}, - {'class': 'DDBStFlagHandler', - 'kwargs': {'col_set': 'st_flag', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': ['ST_Y', - 'ST_S', - 'ST_T', - 'ST_L', - 'ST_Z', - 'ST_X'], - 'table_name': 'stg_1day_wind_st_flag'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}], - 'instruments': 'csiallx', - 'start_time': Timestamp('2019-12-13 00:00:00')}, - 'module_path': 'qlib.contrib.data.agg_handler'}, - 'load_end': datetime.date(2026, 2, 26), - 'load_start': Timestamp('2019-12-13 00:00:00'), - 'market': 'csiallx', - 'qlib_init': {'provider_uri': '/home/guofu/.qlib/data_ops/target', - 'region': 'cn'}} -Query config: -#alpha158: 1; -Will use float32 for -[2730312:MainThread](2026-02-26 21:38:13,636) INFO - qlib.timer - [log.py:117] - Time cost: 123.423s | Instruments filter: Done -[2730312:MainThread](2026-02-26 21:38:20,733) INFO - qlib.timer - [log.py:117] - Time cost: 587.024s | Loading data () Done -[2730312:MainThread](2026-02-26 21:38:20,734) INFO - qlib.timer - [log.py:117] - Time cost: 587.026s | Init data () Done -[2730312:MainThread](2026-02-26 21:38:20,736) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done -[2730312:MainThread](2026-02-26 21:38:24,302) INFO - qlib.timer - [log.py:117] - Time cost: 3.564s | fetch_df_by_index Done -[2730312:MainThread](2026-02-26 21:38:25,946) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-13 00:00:00'), datetime.date(2026, 2, 26), None) - KMID KLEN ... VSUMD30 VSUMD60 -datetime instrument ... -2019-12-13 SH600000 0.011686 0.015025 ... -0.011573 0.039735 - SH600004 0.000000 0.009169 ... -0.146051 0.024757 - SH600006 -0.004329 0.015152 ... 0.136883 0.024626 - SH600007 0.005590 0.019005 ... -0.012912 0.017215 - SH600008 0.012270 0.012270 ... 0.039878 -0.013888 -... ... ... ... ... ... -2026-02-26 SZ301658 -0.017231 0.025231 ... -0.018706 0.003708 - SZ301662 0.060584 0.087834 ... -0.014658 -0.014613 - SZ301665 -0.012899 0.040541 ... 0.083229 0.055994 - SZ301678 0.018182 0.027879 ... -0.054124 0.014202 - SZ302132 0.001754 0.016416 ... -0.049558 -0.038667 - -[6858048 rows x 158 columns] -[2730312:MainThread](2026-02-26 21:38:25,947) INFO - qlib.timer - [log.py:117] - Time cost: 5.212s | Fetching dataframe Done -[2730312:MainThread](2026-02-26 21:38:25,965) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146: - - use mytt; - select code,m_nDate,Turnover as turnover,FreeTurnover as free_turnover,log(MarketValue) as log_size from - loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted") - where m_nDate>=2019.12.13 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ3 -[2730312:MainThread](2026-02-26 21:38:43,081) INFO - qlib.timer - [log.py:117] - Time cost: 17.127s | DDB query: Done -[2730312:MainThread](2026-02-26 21:38:43,874) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00 -[2730312:MainThread](2026-02-26 21:38:44,458) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146: - - use mytt; - select code,m_nDate,con_rating_strength from - loadTable("dfs://daily_stock_run", "stg_1day_gds_con_rating") - where m_nDate>=2019.12.13 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002833','SH -[2730312:MainThread](2026-02-26 21:38:55,720) INFO - qlib.timer - [log.py:117] - Time cost: 11.271s | DDB query: Done -[2730312:MainThread](2026-02-26 21:38:56,586) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-25 00:00:00 -[2730312:MainThread](2026-02-26 21:40:21,007) INFO - qlib.timer - [log.py:117] - Time cost: 81.315s | Instruments filter: Done -[2730312:MainThread](2026-02-26 21:40:21,576) INFO - qlib.timer - [log.py:117] - Time cost: 115.627s | Loading data () Done -[2730312:MainThread](2026-02-26 21:40:21,576) INFO - qlib.timer - [log.py:117] - Time cost: 115.628s | Init data () Done -[2730312:MainThread](2026-02-26 21:40:21,577) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done -[2730312:MainThread](2026-02-26 21:40:22,309) INFO - qlib.timer - [log.py:117] - Time cost: 0.731s | fetch_df_by_index Done -[2730312:MainThread](2026-02-26 21:40:22,317) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-13 00:00:00'), datetime.date(2026, 2, 26), None) - turnover free_turnover log_size con_rating_strength -datetime instrument -2019-12-13 SH600000 0.2118 0.3879 17.343685 0.7143 - SH600004 0.7518 1.5357 15.099485 0.8214 - SH600006 0.7827 1.9762 13.732129 1.0000 - SH600007 0.1368 0.7071 14.409998 0.7500 - SH600008 0.2152 0.3990 14.444757 0.7500 -... ... ... ... ... -2026-02-26 SZ301658 6.0785 6.0785 11.788368 NaN - SZ301662 12.5950 12.5950 12.681215 NaN - SZ301665 14.0077 14.0077 11.719415 NaN - SZ301678 6.6518 6.6518 12.799973 NaN - SZ302132 1.3868 3.0296 15.359885 NaN - -[7572626 rows x 4 columns] -[2730312:MainThread](2026-02-26 21:40:22,318) INFO - qlib.timer - [log.py:117] - Time cost: 0.741s | Fetching dataframe Done -[2730312:MainThread](2026-02-26 21:40:22,334) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146: - - use mytt; - select code,m_nDate,IsZt,IsDt,IsN,IsXD,IsXR,IsDR from - loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted") - where m_nDate>=2019.12.13 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657', -[2730312:MainThread](2026-02-26 21:40:43,075) INFO - qlib.timer - [log.py:117] - Time cost: 20.751s | DDB query: Done -[2730312:MainThread](2026-02-26 21:40:43,889) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00 -[2730312:MainThread](2026-02-26 21:40:44,394) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146: - - use mytt; - select code,m_nDate,open_limit,close_limit,low_limit,open_stop,close_stop,high_stop from - loadTable("dfs://daily_stock_run", "stg_1day_wind_market_flag") - where m_nDate>=2019.12.13 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281',' -[2730312:MainThread](2026-02-26 21:41:04,632) INFO - qlib.timer - [log.py:117] - Time cost: 20.246s | DDB query: Done -[2730312:MainThread](2026-02-26 21:41:05,434) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00 -[2730312:MainThread](2026-02-26 21:42:33,029) INFO - qlib.timer - [log.py:117] - Time cost: 84.294s | Instruments filter: Done -[2730312:MainThread](2026-02-26 21:42:34,049) INFO - qlib.timer - [log.py:117] - Time cost: 131.730s | Loading data () Done -[2730312:MainThread](2026-02-26 21:42:34,050) INFO - qlib.timer - [log.py:117] - Time cost: 131.731s | Init data () Done -[2730312:MainThread](2026-02-26 21:42:34,051) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done -[2730312:MainThread](2026-02-26 21:42:34,895) INFO - qlib.timer - [log.py:117] - Time cost: 0.843s | fetch_df_by_index Done -[2730312:MainThread](2026-02-26 21:42:34,907) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-13 00:00:00'), datetime.date(2026, 2, 26), None) - IsZt IsDt IsN ... open_stop close_stop high_stop -datetime instrument ... -2019-12-13 SH600000 False False False ... False False False - SH600004 False False False ... False False False - SH600006 False False False ... False False False - SH600007 False False False ... False False False - SH600008 False False False ... False False False -... ... ... ... ... ... ... ... -2026-02-26 SZ301658 False False False ... False False False - SZ301662 False False False ... False False False - SZ301665 False False False ... False False False - SZ301678 False False False ... False False False - SZ302132 False False False ... False False False - -[6874830 rows x 12 columns] -[2730312:MainThread](2026-02-26 21:42:34,908) INFO - qlib.timer - [log.py:117] - Time cost: 0.857s | Fetching dataframe Done -[2730312:MainThread](2026-02-26 21:42:34,927) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146: - - use mytt; - select code,m_nDate,gds_CC10,gds_CC11,gds_CC12,gds_CC20,gds_CC21,gds_CC22,gds_CC23,gds_CC24,gds_CC25,gds_CC26,gds_CC27,gds_CC28,gds_CC30,gds_CC31,gds_CC32,gds_CC33,gds_CC34,gds_CC35,gds_CC36,gds_CC37,gds_CC40,gds_CC41,gds_CC42,gds_CC43,gds_CC50,gds_CC60,gds_CC61,gds_CC62,gds_CC63,gds_CC70 from - loadTable("dfs://daily_stock_run", "stg_1day_gds_indus_flag_cc1") - where m_nDate>=2019.12.13 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','S -[2730312:MainThread](2026-02-26 21:42:36,986) INFO - qlib.timer - [log.py:117] - Time cost: 2.069s | DDB query: Done -[2730312:MainThread](2026-02-26 21:42:36,996) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00 -[2730312:MainThread](2026-02-26 21:43:53,198) INFO - qlib.timer - [log.py:117] - Time cost: 76.199s | Instruments filter: Done -[2730312:MainThread](2026-02-26 21:43:53,230) INFO - qlib.timer - [log.py:117] - Time cost: 78.318s | Loading data () Done -[2730312:MainThread](2026-02-26 21:43:53,231) INFO - qlib.timer - [log.py:117] - Time cost: 78.319s | Init data () Done -[2730312:MainThread](2026-02-26 21:43:53,231) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done -[2730312:MainThread](2026-02-26 21:43:53,239) INFO - qlib.timer - [log.py:117] - Time cost: 0.007s | fetch_df_by_index Done -[2730312:MainThread](2026-02-26 21:43:53,257) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-13 00:00:00'), datetime.date(2026, 2, 26), None) - gds_CC10 gds_CC11 ... gds_CC63 gds_CC70 -datetime instrument ... -2026-02-09 SH600000 False False ... False False - SH600004 False False ... False False - SH600006 False False ... False False - SH600007 False False ... False False - SH600008 False False ... False False -... ... ... ... ... ... -2026-02-26 SZ301658 False False ... False False - SZ301662 False False ... False False - SZ301665 False False ... False False - SZ301678 False False ... False False - SZ302132 False False ... False False - -[41168 rows x 30 columns] -[2730312:MainThread](2026-02-26 21:43:53,258) INFO - qlib.timer - [log.py:117] - Time cost: 0.027s | Fetching dataframe Done -[2730312:MainThread](2026-02-26 21:43:53,274) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146: - - use mytt; - select code,m_nDate,ST_Y,ST_S,ST_T,ST_L,ST_Z,ST_X from - loadTable("dfs://daily_stock_run", "stg_1day_wind_st_flag") - where m_nDate>=2019.12.13 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002 -[2730312:MainThread](2026-02-26 21:44:44,876) INFO - qlib.timer - [log.py:117] - Time cost: 51.611s | DDB query: Done -[2730312:MainThread](2026-02-26 21:44:45,602) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00 -[2730312:MainThread](2026-02-26 21:46:07,184) INFO - qlib.timer - [log.py:117] - Time cost: 81.056s | Instruments filter: Done -[2730312:MainThread](2026-02-26 21:46:07,747) INFO - qlib.timer - [log.py:117] - Time cost: 134.487s | Loading data () Done -[2730312:MainThread](2026-02-26 21:46:07,748) INFO - qlib.timer - [log.py:117] - Time cost: 134.488s | Init data () Done -[2730312:MainThread](2026-02-26 21:46:07,748) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done -[2730312:MainThread](2026-02-26 21:46:08,349) INFO - qlib.timer - [log.py:117] - Time cost: 0.600s | fetch_df_by_index Done -[2730312:MainThread](2026-02-26 21:46:08,358) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-13 00:00:00'), datetime.date(2026, 2, 26), None) - ST_Y ST_S ST_T ST_L ST_Z ST_X -datetime instrument -2019-12-13 SH600000 False False False False False False - SH600004 False False False False False False - SH600006 False False False False False False - SH600007 False False False False False False - SH600008 False False False False False False -... ... ... ... ... ... ... -2026-02-26 SZ301658 False False False False False False - SZ301662 False False False False False False - SZ301665 False False False False False False - SZ301678 False False False False False False - SZ302132 False False False False False False - -[6874833 rows x 6 columns] -[2730312:MainThread](2026-02-26 21:46:08,359) INFO - qlib.timer - [log.py:117] - Time cost: 0.610s | Fetching dataframe Done -/home/guofu/.venv/alpha2/lib/python3.12/site-packages/qlib/contrib/utils/paral.py:22: FutureWarning: 'M' is deprecated and will be removed in a future version, please use 'ME' instead. - group_list = [_df.resample("M", level="datetime")\ -Will use float32 for -Will use float32 for -Query config: -#concepts: 2; -Will use bool for -Will use bool for -Field list: ['gds_CC10', 'gds_CC11', 'gds_CC12', 'gds_CC20', 'gds_CC21', 'gds_CC22', 'gds_CC23', 'gds_CC24', 'gds_CC25', 'gds_CC26', 'gds_CC27', 'gds_CC28', 'gds_CC30', 'gds_CC31', 'gds_CC32', 'gds_CC33', 'gds_CC34', 'gds_CC35', 'gds_CC36', 'gds_CC37', 'gds_CC40', 'gds_CC41', 'gds_CC42', 'gds_CC43', 'gds_CC50', 'gds_CC60', 'gds_CC61', 'gds_CC62', 'gds_CC63', 'gds_CC70'] -Will use bool for -Will use bool for -[2730312:MainThread](2026-02-26 21:46:11,623) INFO - qlib.timer - [log.py:117] - Time cost: 3.264s | Concat index: Done -[2730312:MainThread](2026-02-26 21:46:11,625) INFO - qlib.timer - [log.py:117] - Time cost: 0.001s | Sort index: Done -[2730312:MainThread](2026-02-26 21:46:15,058) INFO - qlib.timer - [log.py:117] - Time cost: 3.433s | Creating SepDataFrame: Done -[2730312:MainThread](2026-02-26 21:46:15,928) INFO - qlib.timer - [log.py:117] - Time cost: 1062.224s | Loading data () Done -[2730312:MainThread](2026-02-26 21:46:15,929) INFO - qlib.AggHandler - [handler.py:468] - Read-only True -[] -[2730312:MainThread](2026-02-26 21:46:15,931) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame. -[2730312:MainThread](2026-02-26 21:46:15,935) INFO - qlib.AggHandler - [handler.py:468] - Read-only True -[] -[2730312:MainThread](2026-02-26 21:46:15,936) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame. -[2730312:MainThread](2026-02-26 21:46:15,939) INFO - qlib.AggHandler - [handler.py:468] - Read-only True -[] -[2730312:MainThread](2026-02-26 21:46:15,940) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame. -[2730312:MainThread](2026-02-26 21:46:15,943) INFO - qlib.timer - [log.py:117] - Time cost: 0.014s | fit & process data Done -[2730312:MainThread](2026-02-26 21:46:15,943) INFO - qlib.timer - [log.py:117] - Time cost: 1062.239s | Init data () Done -All processors are readonly -All processors are readonly -All processors are readonly - ERROR: Failed to load data from Qlib pipeline: 'SepDataFrame' object has no attribute 'shape' diff --git a/stock_1d/d033/alpha158_beta/scripts/run3.log b/stock_1d/d033/alpha158_beta/scripts/run3.log deleted file mode 100644 index 0745991..0000000 --- a/stock_1d/d033/alpha158_beta/scripts/run3.log +++ /dev/null @@ -1,373 +0,0 @@ -[2734404:MainThread](2026-02-26 22:10:11,609) INFO - qlib.Initialization - [config.py:413] - default_conf: client. -[2734404:MainThread](2026-02-26 22:10:11,613) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings. -[2734404:MainThread](2026-02-26 22:10:11,613) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/qlib/default/data_ops/target')} -================================================================================ -DUMP GOLD-STANDARD DATA FROM QLIB PIPELINE -================================================================================ -Date Range: 2020-01-02 to 2020-01-10 -Output Directory: /home/guofu/Workspaces/alpha_lab/stock_1d/d033/alpha158_beta/data -Qlib Dataset Path: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/ - -Step 1: Loading data from Qlib pipeline... - Loading since_date=2020-01-02 - Loading data with handler (load_start=2019-12-13)... -Will use `placehorder_value` from module: qlib.contrib.data.config -Will init handler object from config: -{'data_handler_config': {'ddb_config': {'host': '192.168.1.146', - 'password': '123456', - 'port': 8848, - 'username': 'admin'}, - 'end_time': datetime.date(2026, 2, 26), - 'handler_list': [{'class': 'DDBAlpha158Handler', - 'kwargs': {'col_set': 'feature', - 'query_config': [{'alpha158_config': 'alpha158_expr.csv', - 'db_path': 'dfs://daily_stock_run', - 'dtype': 'float32', - 'field_list': 'alpha158', - 'table_name': 'stg_1day_wind_alpha158_0_7'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'}, - {'class': 'DDBMarketExtHandler', - 'kwargs': {'col_set': 'feature_ext', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'float32', - 'field_list': ['Turnover ' - 'as ' - 'turnover', - 'FreeTurnover ' - 'as ' - 'free_turnover', - 'log(MarketValue) ' - 'as ' - 'log_size'], - 'table_name': 'stg_1day_wind_kline_adjusted'}, - {'db_path': 'dfs://daily_stock_run', - 'dtype': 'float32', - 'field_list': ['con_rating_strength'], - 'table_name': 'stg_1day_gds_con_rating'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'}, - {'class': 'DDBMarketFlagHandler', - 'kwargs': {'col_set': 'feature_flag', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': ['IsZt', - 'IsDt', - 'IsN', - 'IsXD', - 'IsXR', - 'IsDR'], - 'table_name': 'stg_1day_wind_kline_adjusted'}, - {'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': ['open_limit', - 'close_limit', - 'low_limit', - 'open_stop', - 'close_stop', - 'high_stop'], - 'table_name': 'stg_1day_wind_market_flag'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'}, - {'class': 'DDBIndusFlagHandler', - 'kwargs': {'col_set': 'indus_flag', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': 'industry_code_cc.csv', - 'table_name': 'stg_1day_gds_indus_flag_cc1'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'}, - {'class': 'DDBStFlagHandler', - 'kwargs': {'col_set': 'st_flag', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': ['ST_Y', - 'ST_S', - 'ST_T', - 'ST_L', - 'ST_Z', - 'ST_X'], - 'table_name': 'stg_1day_wind_st_flag'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}], - 'instruments': 'csiallx', - 'start_time': Timestamp('2019-11-23 00:00:00')}, - 'ddb_config': {'host': '192.168.1.146', - 'password': '123456', - 'port': 8848, - 'username': 'admin'}, - 'handler': {'class': 'AggHandler', - 'kwargs': {'ddb_config': {'host': '192.168.1.146', - 'password': '123456', - 'port': 8848, - 'username': 'admin'}, - [2734404:MainThread](2026-02-26 22:10:11,634) INFO - qlib.AggHandler - [agg_handler.py:79] - Will use AggHandler -[2734404:MainThread](2026-02-26 22:10:11,634) WARNING - qlib.DataLoaderDH - [agg_handler.py:191] - instruments[csiallx] is ignored -[2734404:MainThread](2026-02-26 22:10:11,842) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146: - - use mytt; - select code,m_nDate,KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,HIGH0,LOW0,VWAP0,ROC5,ROC10,ROC20,ROC30,ROC60,MA5,MA10,MA20,MA30,MA60,STD5,STD10,STD20,STD30,STD60,BETA5,BETA10,BETA20,BETA30,BETA60,RSQR5,RSQR10,RSQR20,RSQR30,RSQR60,RESI5,RESI10,RESI20,RESI30,RESI60,MAX5,MAX10,MAX20,MAX30,MAX60,MIN5,MIN10,MIN20,MIN30,MIN60,QTLU5,QTLU10,QTLU20,QTLU30,QTLU60,QTLD5,QTLD10,QTLD20,QTLD30,QTLD60,RANK5,RANK10,RANK20,RANK30,RANK60,RSV5,RSV10,RSV20,RSV30,RSV60,IMAX5,IMAX -[2734404:MainThread](2026-02-26 22:17:41,432) INFO - qlib.timer - [log.py:117] - Time cost: 449.788s | DDB query: Done -[2734404:MainThread](2026-02-26 22:17:42,271) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00 -'end_time': datetime.date(2026, 2, 26), - 'handler_list': [{'class': 'DDBAlpha158Handler', - 'kwargs': {'col_set': 'feature', - 'query_config': [{'alpha158_config': 'alpha158_expr.csv', - 'db_path': 'dfs://daily_stock_run', - 'dtype': 'float32', - 'field_list': 'alpha158', - 'table_name': 'stg_1day_wind_alpha158_0_7'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'}, - {'class': 'DDBMarketExtHandler', - 'kwargs': {'col_set': 'feature_ext', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'float32', - 'field_list': ['Turnover ' - 'as ' - 'turnover', - 'FreeTurnover ' - 'as ' - 'free_turnover', - 'log(MarketValue) ' - 'as ' - 'log_size'], - 'table_name': 'stg_1day_wind_kline_adjusted'}, - {'db_path': 'dfs://daily_stock_run', - 'dtype': 'float32', - 'field_list': ['con_rating_strength'], - 'table_name': 'stg_1day_gds_con_rating'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'}, - {'class': 'DDBMarketFlagHandler', - 'kwargs': {'col_set': 'feature_flag', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': ['IsZt', - 'IsDt', - 'IsN', - 'IsXD', - 'IsXR', - 'IsDR'], - 'table_name': 'stg_1day_wind_kline_adjusted'}, - {'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': ['open_limit', - 'close_limit', - 'low_limit', - 'open_stop', - 'close_stop', - 'high_stop'], - 'table_name': 'stg_1day_wind_market_flag'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'}, - {'class': 'DDBIndusFlagHandler', - 'kwargs': {'col_set': 'indus_flag', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': 'industry_code_cc.csv', - 'table_name': 'stg_1day_gds_indus_flag_cc1'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'}, - {'class': 'DDBStFlagHandler', - 'kwargs': {'col_set': 'st_flag', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': ['ST_Y', - 'ST_S', - 'ST_T', - 'ST_L', - 'ST_Z', - 'ST_X'], - 'table_name': 'stg_1day_wind_st_flag'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}], - 'instruments': 'csiallx', - 'start_time': Timestamp('2019-11-23 00:00:00')}, - 'module_path': 'qlib.contrib.data.agg_handler'}, - 'load_end': datetime.date(2026, 2, 26), - 'load_start': Timestamp('2019-11-23 00:00:00'), - 'market': 'csiallx', - 'qlib_init': {'provider_uri': '/home/guofu/.qlib/data_ops/target', - 'region': 'cn'}} -Query config: -#alpha158: 1; -Will use float32 for -[2734404:MainThread](2026-02-26 22:19:46,550) INFO - qlib.timer - [log.py:117] - Time cost: 115.118s | Instruments filter: Done -[2734404:MainThread](2026-02-26 22:19:53,556) INFO - qlib.timer - [log.py:117] - Time cost: 581.918s | Loading data () Done -[2734404:MainThread](2026-02-26 22:19:53,557) INFO - qlib.timer - [log.py:117] - Time cost: 581.920s | Init data () Done -[2734404:MainThread](2026-02-26 22:19:53,560) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done -[2734404:MainThread](2026-02-26 22:19:57,060) INFO - qlib.timer - [log.py:117] - Time cost: 3.499s | fetch_df_by_index Done -[2734404:MainThread](2026-02-26 22:19:58,834) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None) - KMID KLEN ... VSUMD30 VSUMD60 -datetime instrument ... -2019-11-25 SH600000 0.003325 0.011638 ... -0.238055 -0.010125 - SH600004 -0.013806 0.030012 ... -0.017610 0.039195 - SH600006 0.009238 0.016166 ... -0.034782 -0.014306 - SH600007 -0.014749 0.018879 ... -0.032427 0.034279 - SH600008 0.009259 0.024691 ... -0.063490 0.003978 -... ... ... ... ... ... -2026-02-26 SZ301658 -0.017231 0.025231 ... -0.018706 0.003708 - SZ301662 0.060584 0.087834 ... -0.014658 -0.014613 - SZ301665 -0.012899 0.040541 ... 0.083229 0.055994 - SZ301678 0.018182 0.027879 ... -0.054124 0.014202 - SZ302132 0.001754 0.016416 ... -0.049558 -0.038667 - -[6908346 rows x 158 columns] -[2734404:MainThread](2026-02-26 22:19:58,835) INFO - qlib.timer - [log.py:117] - Time cost: 5.276s | Fetching dataframe Done -[2734404:MainThread](2026-02-26 22:19:59,042) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146: - - use mytt; - select code,m_nDate,Turnover as turnover,FreeTurnover as free_turnover,log(MarketValue) as log_size from - loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted") - where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ3 -[2734404:MainThread](2026-02-26 22:20:16,326) INFO - qlib.timer - [log.py:117] - Time cost: 17.485s | DDB query: Done -[2734404:MainThread](2026-02-26 22:20:17,102) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00 -[2734404:MainThread](2026-02-26 22:20:17,676) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146: - - use mytt; - select code,m_nDate,con_rating_strength from - loadTable("dfs://daily_stock_run", "stg_1day_gds_con_rating") - where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002833','SH -[2734404:MainThread](2026-02-26 22:20:29,343) INFO - qlib.timer - [log.py:117] - Time cost: 11.676s | DDB query: Done -[2734404:MainThread](2026-02-26 22:20:30,245) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-25 00:00:00 -[2734404:MainThread](2026-02-26 22:21:55,033) INFO - qlib.timer - [log.py:117] - Time cost: 81.592s | Instruments filter: Done -[2734404:MainThread](2026-02-26 22:21:55,586) INFO - qlib.timer - [log.py:117] - Time cost: 116.751s | Loading data () Done -[2734404:MainThread](2026-02-26 22:21:55,587) INFO - qlib.timer - [log.py:117] - Time cost: 116.752s | Init data () Done -[2734404:MainThread](2026-02-26 22:21:55,588) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done -[2734404:MainThread](2026-02-26 22:21:56,302) INFO - qlib.timer - [log.py:117] - Time cost: 0.713s | fetch_df_by_index Done -[2734404:MainThread](2026-02-26 22:21:56,309) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None) - turnover free_turnover log_size con_rating_strength -datetime instrument -2019-11-25 SH600000 0.0895 0.1639 17.339552 0.8214 - SH600004 0.9386 1.9173 15.039255 0.8125 - SH600006 0.2566 0.6479 13.680836 1.0000 - SH600007 0.1647 0.8513 14.335590 0.7500 - SH600008 0.1813 0.3362 14.435625 0.6875 -... ... ... ... ... -2026-02-26 SZ301658 6.0785 6.0785 11.788368 NaN - SZ301662 12.5950 12.5950 12.681215 NaN - SZ301665 14.0077 14.0077 11.719415 NaN - SZ301678 6.6518 6.6518 12.799973 NaN - SZ302132 1.3868 3.0296 15.359885 NaN - -[7623242 rows x 4 columns] -[2734404:MainThread](2026-02-26 22:21:56,310) INFO - qlib.timer - [log.py:117] - Time cost: 0.722s | Fetching dataframe Done -[2734404:MainThread](2026-02-26 22:21:56,327) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146: - - use mytt; - select code,m_nDate,IsZt,IsDt,IsN,IsXD,IsXR,IsDR from - loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted") - where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657', -[2734404:MainThread](2026-02-26 22:22:17,215) INFO - qlib.timer - [log.py:117] - Time cost: 20.899s | DDB query: Done -[2734404:MainThread](2026-02-26 22:22:17,952) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00 -[2734404:MainThread](2026-02-26 22:22:18,463) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146: - - use mytt; - select code,m_nDate,open_limit,close_limit,low_limit,open_stop,close_stop,high_stop from - loadTable("dfs://daily_stock_run", "stg_1day_wind_market_flag") - where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281',' -[2734404:MainThread](2026-02-26 22:22:38,963) INFO - qlib.timer - [log.py:117] - Time cost: 20.509s | DDB query: Done -[2734404:MainThread](2026-02-26 22:22:39,774) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00 -[2734404:MainThread](2026-02-26 22:24:07,744) INFO - qlib.timer - [log.py:117] - Time cost: 84.654s | Instruments filter: Done -[2734404:MainThread](2026-02-26 22:24:08,702) INFO - qlib.timer - [log.py:117] - Time cost: 132.391s | Loading data () Done -[2734404:MainThread](2026-02-26 22:24:08,703) INFO - qlib.timer - [log.py:117] - Time cost: 132.392s | Init data () Done -[2734404:MainThread](2026-02-26 22:24:08,704) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done -[2734404:MainThread](2026-02-26 22:24:09,549) INFO - qlib.timer - [log.py:117] - Time cost: 0.844s | fetch_df_by_index Done -[2734404:MainThread](2026-02-26 22:24:09,561) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None) - IsZt IsDt IsN ... open_stop close_stop high_stop -datetime instrument ... -2019-11-25 SH600000 False False False ... False False False - SH600004 False False False ... False False False - SH600006 False False False ... False False False - SH600007 False False False ... False False False - SH600008 False False False ... False False False -... ... ... ... ... ... ... ... -2026-02-26 SZ301658 False False False ... False False False - SZ301662 False False False ... False False False - SZ301665 False False False ... False False False - SZ301678 False False False ... False False False - SZ302132 False False False ... False False False - -[6925320 rows x 12 columns] -[2734404:MainThread](2026-02-26 22:24:09,562) INFO - qlib.timer - [log.py:117] - Time cost: 0.858s | Fetching dataframe Done -[2734404:MainThread](2026-02-26 22:24:09,760) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146: - - use mytt; - select code,m_nDate,gds_CC10,gds_CC11,gds_CC12,gds_CC20,gds_CC21,gds_CC22,gds_CC23,gds_CC24,gds_CC25,gds_CC26,gds_CC27,gds_CC28,gds_CC30,gds_CC31,gds_CC32,gds_CC33,gds_CC34,gds_CC35,gds_CC36,gds_CC37,gds_CC40,gds_CC41,gds_CC42,gds_CC43,gds_CC50,gds_CC60,gds_CC61,gds_CC62,gds_CC63,gds_CC70 from - loadTable("dfs://daily_stock_run", "stg_1day_gds_indus_flag_cc1") - where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','S -[2734404:MainThread](2026-02-26 22:24:11,809) INFO - qlib.timer - [log.py:117] - Time cost: 2.238s | DDB query: Done -[2734404:MainThread](2026-02-26 22:24:11,822) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00 -[2734404:MainThread](2026-02-26 22:25:28,259) INFO - qlib.timer - [log.py:117] - Time cost: 76.433s | Instruments filter: Done -[2734404:MainThread](2026-02-26 22:25:28,285) INFO - qlib.timer - [log.py:117] - Time cost: 78.720s | Loading data () Done -[2734404:MainThread](2026-02-26 22:25:28,285) INFO - qlib.timer - [log.py:117] - Time cost: 78.720s | Init data () Done -[2734404:MainThread](2026-02-26 22:25:28,286) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done -[2734404:MainThread](2026-02-26 22:25:28,290) INFO - qlib.timer - [log.py:117] - Time cost: 0.003s | fetch_df_by_index Done -[2734404:MainThread](2026-02-26 22:25:28,310) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None) - gds_CC10 gds_CC11 ... gds_CC63 gds_CC70 -datetime instrument ... -2026-02-09 SH600000 False False ... False False - SH600004 False False ... False False - SH600006 False False ... False False - SH600007 False False ... False False - SH600008 False False ... False False -... ... ... ... ... ... -2026-02-26 SZ301658 False False ... False False - SZ301662 False False ... False False - SZ301665 False False ... False False - SZ301678 False False ... False False - SZ302132 False False ... False False - -[41168 rows x 30 columns] -[2734404:MainThread](2026-02-26 22:25:28,311) INFO - qlib.timer - [log.py:117] - Time cost: 0.025s | Fetching dataframe Done -[2734404:MainThread](2026-02-26 22:25:28,470) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146: - - use mytt; - select code,m_nDate,ST_Y,ST_S,ST_T,ST_L,ST_Z,ST_X from - loadTable("dfs://daily_stock_run", "stg_1day_wind_st_flag") - where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002 -[2734404:MainThread](2026-02-26 22:25:58,108) INFO - qlib.timer - [log.py:117] - Time cost: 29.791s | DDB query: Done -[2734404:MainThread](2026-02-26 22:25:58,818) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00 -[2734404:MainThread](2026-02-26 22:27:21,291) INFO - qlib.timer - [log.py:117] - Time cost: 81.957s | Instruments filter: Done -[2734404:MainThread](2026-02-26 22:27:21,828) INFO - qlib.timer - [log.py:117] - Time cost: 113.516s | Loading data () Done -[2734404:MainThread](2026-02-26 22:27:21,829) INFO - qlib.timer - [log.py:117] - Time cost: 113.517s | Init data () Done -[2734404:MainThread](2026-02-26 22:27:21,830) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done -[2734404:MainThread](2026-02-26 22:27:22,439) INFO - qlib.timer - [log.py:117] - Time cost: 0.608s | fetch_df_by_index Done -[2734404:MainThread](2026-02-26 22:27:22,448) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None) - ST_Y ST_S ST_T ST_L ST_Z ST_X -datetime instrument -2019-11-25 SH600000 False False False False False False - SH600004 False False False False False False - SH600006 False False False False False False - SH600007 False False False False False False - SH600008 False False False False False False -... ... ... ... ... ... ... -2026-02-26 SZ301658 False False False False False False - SZ301662 False False False False False False - SZ301665 False False False False False False - SZ301678 False False False False False False - SZ302132 False False False False False False - -[6925323 rows x 6 columns] -[2734404:MainThread](2026-02-26 22:27:22,448) INFO - qlib.timer - [log.py:117] - Time cost: 0.618s | Fetching dataframe Done -/home/guofu/.venv/alpha2/lib/python3.12/site-packages/qlib/contrib/utils/paral.py:22: FutureWarning: 'M' is deprecated and will be removed in a future version, please use 'ME' instead. - group_list = [_df.resample("M", level="datetime")\ -Will use float32 for -Will use float32 for -Query config: -#concepts: 2; -Will use bool for -Will use bool for -Field list: ['gds_CC10', 'gds_CC11', 'gds_CC12', 'gds_CC20', 'gds_CC21', 'gds_CC22', 'gds_CC23', 'gds_CC24', 'gds_CC25', 'gds_CC26', 'gds_CC27', 'gds_CC28', 'gds_CC30', 'gds_CC31', 'gds_CC32', 'gds_CC33', 'gds_CC34', 'gds_CC35', 'gds_CC36', 'gds_CC37', 'gds_CC40', 'gds_CC41', 'gds_CC42', 'gds_CC43', 'gds_CC50', 'gds_CC60', 'gds_CC61', 'gds_CC62', 'gds_CC63', 'gds_CC70'] -Will use bool for -Will use bool for -[2734404:MainThread](2026-02-26 22:27:25,764) INFO - qlib.timer - [log.py:117] - Time cost: 3.315s | Concat index: Done -[2734404:MainThread](2026-02-26 22:27:25,766) INFO - qlib.timer - [log.py:117] - Time cost: 0.001s | Sort index: Done -[2734404:MainThread](2026-02-26 22:27:29,485) INFO - qlib.timer - [log.py:117] - Time cost: 3.718s | Creating SepDataFrame: Done -[2734404:MainThread](2026-02-26 22:27:30,310) INFO - qlib.timer - [log.py:117] - Time cost: 1038.675s | Loading data () Done -[2734404:MainThread](2026-02-26 22:27:30,311) INFO - qlib.AggHandler - [handler.py:468] - Read-only True -[] -[2734404:MainThread](2026-02-26 22:27:30,313) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame. -[2734404:MainThread](2026-02-26 22:27:30,318) INFO - qlib.AggHandler - [handler.py:468] - Read-only True -[] -[2734404:MainThread](2026-02-26 22:27:30,319) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame. -[2734404:MainThread](2026-02-26 22:27:30,322) INFO - qlib.AggHandler - [handler.py:468] - Read-only True -[] -[2734404:MainThread](2026-02-26 22:27:30,323) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame. -[2734404:MainThread](2026-02-26 22:27:30,326) INFO - qlib.timer - [log.py:117] - Time cost: 0.015s | fit & process data Done -[2734404:MainThread](2026-02-26 22:27:30,327) INFO - qlib.timer - [log.py:117] - Time cost: 1038.692s | Init data () Done -All processors are readonly -All processors are readonly -All processors are readonly - ERROR: Failed to load data from Qlib pipeline: 'SepDataFrame' object has no attribute 'shape' diff --git a/stock_1d/d033/alpha158_beta/scripts/run4.log b/stock_1d/d033/alpha158_beta/scripts/run4.log deleted file mode 100644 index 42eef2a..0000000 --- a/stock_1d/d033/alpha158_beta/scripts/run4.log +++ /dev/null @@ -1,321 +0,0 @@ -[2739486:MainThread](2026-02-26 22:59:30,849) INFO - qlib.Initialization - [config.py:413] - default_conf: client. -[2739486:MainThread](2026-02-26 22:59:30,854) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings. -[2739486:MainThread](2026-02-26 22:59:30,855) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/qlib/default/data_ops/target')} -================================================================================ -DUMP GOLD-STANDARD DATA FROM QLIB PIPELINE -================================================================================ -Date Range: 2020-01-02 to 2020-01-10 -Output Directory: /home/guofu/Workspaces/alpha_lab/stock_1d/d033/alpha158_beta/data -Qlib Dataset Path: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/ - -Step 1: Loading data from Qlib pipeline... - Loading since_date=2020-01-02 - Loading data with handler (load_start=2019-12-13)... -Will use `placehorder_value` from module: qlib.contrib.data.config -Will init handler object from config: -{'data_handler_config': {'ddb_config': {'host': '192.168.1.146', - 'password': '123456', - 'port': 8848, - 'username': 'admin'}, - 'end_time': datetime.date(2026, 2, 26), - 'handler_list': [{'class': 'DDBAlpha158Handler', - 'kwargs': {'col_set': 'feature', - 'query_config': [{'alpha158_config': 'alpha158_expr.csv', - 'db_path': 'dfs://daily_stock_run', - 'dtype': 'float32', - 'field_list': 'alpha158', - 'table_name': 'stg_1day_wind_alpha158_0_7'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'}, - {'class': 'DDBMarketExtHandler', - 'kwargs': {'col_set': 'feature_ext', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'float32', - 'field_list': ['Turnover ' - 'as ' - 'turnover', - 'FreeTurnover ' - 'as ' - 'free_turnover', - 'log(MarketValue) ' - 'as ' - 'log_size'], - 'table_name': 'stg_1day_wind_kline_adjusted'}, - {'db_path': 'dfs://daily_stock_run', - 'dtype': 'float32', - 'field_list': ['con_rating_strength'], - 'table_name': 'stg_1day_gds_con_rating'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'}, - {'class': 'DDBMarketFlagHandler', - 'kwargs': {'col_set': 'feature_flag', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': ['IsZt', - 'IsDt', - 'IsN', - 'IsXD', - 'IsXR', - 'IsDR'], - 'table_name': 'stg_1day_wind_kline_adjusted'}, - {'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': ['open_limit', - 'close_limit', - 'low_limit', - 'open_stop', - 'close_stop', - 'high_stop'], - 'table_name': 'stg_1day_wind_market_flag'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'}, - {'class': 'DDBIndusFlagHandler', - 'kwargs': {'col_set': 'indus_flag', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': 'industry_code_cc.csv', - 'table_name': 'stg_1day_gds_indus_flag_cc1'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'}, - {'class': 'DDBStFlagHandler', - 'kwargs': {'col_set': 'st_flag', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': ['ST_Y', - 'ST_S', - 'ST_T', - 'ST_L', - 'ST_Z', - 'ST_X'], - 'table_name': 'stg_1day_wind_st_flag'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}], - 'instruments': 'csiallx', - 'start_time': Timestamp('2019-11-23 00:00:00')}, - 'ddb_config': {'host': '192.168.1.146', - 'password': '123456', - 'port': 8848, - 'username': 'admin'}, - 'handler': {'class': 'AggHandler', - 'kwargs': {'ddb_config': {'host': '192.168.1.146', - 'password': '123456', - 'port': 8848, - 'username': 'admin'}, - [2739486:MainThread](2026-02-26 22:59:30,878) INFO - qlib.AggHandler - [agg_handler.py:79] - Will use AggHandler -[2739486:MainThread](2026-02-26 22:59:30,878) WARNING - qlib.DataLoaderDH - [agg_handler.py:191] - instruments[csiallx] is ignored -[2739486:MainThread](2026-02-26 22:59:30,938) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146: - - use mytt; - select code,m_nDate,KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,HIGH0,LOW0,VWAP0,ROC5,ROC10,ROC20,ROC30,ROC60,MA5,MA10,MA20,MA30,MA60,STD5,STD10,STD20,STD30,STD60,BETA5,BETA10,BETA20,BETA30,BETA60,RSQR5,RSQR10,RSQR20,RSQR30,RSQR60,RESI5,RESI10,RESI20,RESI30,RESI60,MAX5,MAX10,MAX20,MAX30,MAX60,MIN5,MIN10,MIN20,MIN30,MIN60,QTLU5,QTLU10,QTLU20,QTLU30,QTLU60,QTLD5,QTLD10,QTLD20,QTLD30,QTLD60,RANK5,RANK10,RANK20,RANK30,RANK60,RSV5,RSV10,RSV20,RSV30,RSV60,IMAX5,IMAX -[2739486:MainThread](2026-02-26 23:07:16,353) INFO - qlib.timer - [log.py:117] - Time cost: 465.464s | DDB query: Done -[2739486:MainThread](2026-02-26 23:07:17,149) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00 -'end_time': datetime.date(2026, 2, 26), - 'handler_list': [{'class': 'DDBAlpha158Handler', - 'kwargs': {'col_set': 'feature', - 'query_config': [{'alpha158_config': 'alpha158_expr.csv', - 'db_path': 'dfs://daily_stock_run', - 'dtype': 'float32', - 'field_list': 'alpha158', - 'table_name': 'stg_1day_wind_alpha158_0_7'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'}, - {'class': 'DDBMarketExtHandler', - 'kwargs': {'col_set': 'feature_ext', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'float32', - 'field_list': ['Turnover ' - 'as ' - 'turnover', - 'FreeTurnover ' - 'as ' - 'free_turnover', - 'log(MarketValue) ' - 'as ' - 'log_size'], - 'table_name': 'stg_1day_wind_kline_adjusted'}, - {'db_path': 'dfs://daily_stock_run', - 'dtype': 'float32', - 'field_list': ['con_rating_strength'], - 'table_name': 'stg_1day_gds_con_rating'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'}, - {'class': 'DDBMarketFlagHandler', - 'kwargs': {'col_set': 'feature_flag', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': ['IsZt', - 'IsDt', - 'IsN', - 'IsXD', - 'IsXR', - 'IsDR'], - 'table_name': 'stg_1day_wind_kline_adjusted'}, - {'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': ['open_limit', - 'close_limit', - 'low_limit', - 'open_stop', - 'close_stop', - 'high_stop'], - 'table_name': 'stg_1day_wind_market_flag'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'}, - {'class': 'DDBIndusFlagHandler', - 'kwargs': {'col_set': 'indus_flag', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': 'industry_code_cc.csv', - 'table_name': 'stg_1day_gds_indus_flag_cc1'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'}, - {'class': 'DDBStFlagHandler', - 'kwargs': {'col_set': 'st_flag', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': ['ST_Y', - 'ST_S', - 'ST_T', - 'ST_L', - 'ST_Z', - 'ST_X'], - 'table_name': 'stg_1day_wind_st_flag'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}], - 'instruments': 'csiallx', - 'start_time': Timestamp('2019-11-23 00:00:00')}, - 'module_path': 'qlib.contrib.data.agg_handler'}, - 'load_end': datetime.date(2026, 2, 26), - 'load_start': Timestamp('2019-11-23 00:00:00'), - 'market': 'csiallx', - 'qlib_init': {'provider_uri': '/home/guofu/.qlib/data_ops/target', - 'region': 'cn'}} -Query config: -#alpha158: 1; -Will use float32 for -[2739486:MainThread](2026-02-26 23:09:19,001) INFO - qlib.timer - [log.py:117] - Time cost: 112.707s | Instruments filter: Done -[2739486:MainThread](2026-02-26 23:09:26,016) INFO - qlib.timer - [log.py:117] - Time cost: 595.133s | Loading data () Done -[2739486:MainThread](2026-02-26 23:09:26,017) INFO - qlib.timer - [log.py:117] - Time cost: 595.135s | Init data () Done -[2739486:MainThread](2026-02-26 23:09:26,019) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done -[2739486:MainThread](2026-02-26 23:09:29,432) INFO - qlib.timer - [log.py:117] - Time cost: 3.412s | fetch_df_by_index Done -[2739486:MainThread](2026-02-26 23:09:31,228) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None) - KMID KLEN ... VSUMD30 VSUMD60 -datetime instrument ... -2019-11-25 SH600000 0.003325 0.011638 ... -0.238055 -0.010125 - SH600004 -0.013806 0.030012 ... -0.017610 0.039195 - SH600006 0.009238 0.016166 ... -0.034782 -0.014306 - SH600007 -0.014749 0.018879 ... -0.032427 0.034279 - SH600008 0.009259 0.024691 ... -0.063490 0.003978 -... ... ... ... ... ... -2026-02-26 SZ301658 -0.017231 0.025231 ... -0.018706 0.003708 - SZ301662 0.060584 0.087834 ... -0.014658 -0.014613 - SZ301665 -0.012899 0.040541 ... 0.083229 0.055994 - SZ301678 0.018182 0.027879 ... -0.054124 0.014202 - SZ302132 0.001754 0.016416 ... -0.049558 -0.038667 - -[6908346 rows x 158 columns] -[2739486:MainThread](2026-02-26 23:09:31,229) INFO - qlib.timer - [log.py:117] - Time cost: 5.211s | Fetching dataframe Done -[2739486:MainThread](2026-02-26 23:09:31,242) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146: - - use mytt; - select code,m_nDate,Turnover as turnover,FreeTurnover as free_turnover,log(MarketValue) as log_size from - loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted") - where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ3 -[2739486:MainThread](2026-02-26 23:09:54,142) INFO - qlib.timer - [log.py:117] - Time cost: 22.909s | DDB query: Done -[2739486:MainThread](2026-02-26 23:09:54,927) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00 -[2739486:MainThread](2026-02-26 23:09:55,507) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146: - - use mytt; - select code,m_nDate,con_rating_strength from - loadTable("dfs://daily_stock_run", "stg_1day_gds_con_rating") - where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002833','SH -[2739486:MainThread](2026-02-26 23:10:10,691) INFO - qlib.timer - [log.py:117] - Time cost: 15.192s | DDB query: Done -[2739486:MainThread](2026-02-26 23:10:11,588) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00 -[2739486:MainThread](2026-02-26 23:11:37,528) INFO - qlib.timer - [log.py:117] - Time cost: 82.525s | Instruments filter: Done -[2739486:MainThread](2026-02-26 23:11:38,259) INFO - qlib.timer - [log.py:117] - Time cost: 127.029s | Loading data () Done -[2739486:MainThread](2026-02-26 23:11:38,260) INFO - qlib.timer - [log.py:117] - Time cost: 127.030s | Init data () Done -[2739486:MainThread](2026-02-26 23:11:38,261) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done -[2739486:MainThread](2026-02-26 23:11:39,000) INFO - qlib.timer - [log.py:117] - Time cost: 0.738s | fetch_df_by_index Done -[2739486:MainThread](2026-02-26 23:11:39,009) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None) - turnover free_turnover log_size con_rating_strength -datetime instrument -2019-11-25 SH600000 0.0895 0.1639 17.339552 0.8214 - SH600004 0.9386 1.9173 15.039255 0.8125 - SH600006 0.2566 0.6479 13.680836 1.0000 - SH600007 0.1647 0.8513 14.335590 0.7500 - SH600008 0.1813 0.3362 14.435625 0.6875 -... ... ... ... ... -2026-02-26 SZ301658 6.0785 6.0785 11.788368 NaN - SZ301662 12.5950 12.5950 12.681215 1.0000 - SZ301665 14.0077 14.0077 11.719415 1.0000 - SZ301678 6.6518 6.6518 12.799973 0.7500 - SZ302132 1.3868 3.0296 15.359885 0.8750 - -[7623255 rows x 4 columns] -[2739486:MainThread](2026-02-26 23:11:39,010) INFO - qlib.timer - [log.py:117] - Time cost: 0.749s | Fetching dataframe Done -[2739486:MainThread](2026-02-26 23:11:39,191) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146: - - use mytt; - select code,m_nDate,IsZt,IsDt,IsN,IsXD,IsXR,IsDR from - loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted") - where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657', -[2739486:MainThread](2026-02-26 23:12:05,839) INFO - qlib.timer - [log.py:117] - Time cost: 26.825s | DDB query: Done -[2739486:MainThread](2026-02-26 23:12:06,554) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00 -[2739486:MainThread](2026-02-26 23:12:07,075) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146: - - use mytt; - select code,m_nDate,open_limit,close_limit,low_limit,open_stop,close_stop,high_stop from - loadTable("dfs://daily_stock_run", "stg_1day_wind_market_flag") - where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281',' -[2739486:MainThread](2026-02-26 23:12:32,695) INFO - qlib.timer - [log.py:117] - Time cost: 25.629s | DDB query: Done -[2739486:MainThread](2026-02-26 23:12:33,566) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00 -[2739486:MainThread](2026-02-26 23:14:02,232) INFO - qlib.timer - [log.py:117] - Time cost: 85.158s | Instruments filter: Done -[2739486:MainThread](2026-02-26 23:14:03,155) INFO - qlib.timer - [log.py:117] - Time cost: 144.143s | Loading data () Done -[2739486:MainThread](2026-02-26 23:14:03,156) INFO - qlib.timer - [log.py:117] - Time cost: 144.144s | Init data () Done -[2739486:MainThread](2026-02-26 23:14:03,156) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done -[2739486:MainThread](2026-02-26 23:14:04,046) INFO - qlib.timer - [log.py:117] - Time cost: 0.889s | fetch_df_by_index Done -[2739486:MainThread](2026-02-26 23:14:04,060) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None) - IsZt IsDt IsN ... open_stop close_stop high_stop -datetime instrument ... -2019-11-25 SH600000 False False False ... False False False - SH600004 False False False ... False False False - SH600006 False False False ... False False False - SH600007 False False False ... False False False - SH600008 False False False ... False False False -... ... ... ... ... ... ... ... -2026-02-26 SZ301658 False False False ... False False False - SZ301662 False False False ... False False False - SZ301665 False False False ... False False False - SZ301678 False False False ... False False False - SZ302132 False False False ... False False False - -[6925320 rows x 12 columns] -[2739486:MainThread](2026-02-26 23:14:04,061) INFO - qlib.timer - [log.py:117] - Time cost: 0.904s | Fetching dataframe Done -[2739486:MainThread](2026-02-26 23:14:04,079) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146: - - use mytt; - select code,m_nDate,gds_CC10,gds_CC11,gds_CC12,gds_CC20,gds_CC21,gds_CC22,gds_CC23,gds_CC24,gds_CC25,gds_CC26,gds_CC27,gds_CC28,gds_CC30,gds_CC31,gds_CC32,gds_CC33,gds_CC34,gds_CC35,gds_CC36,gds_CC37,gds_CC40,gds_CC41,gds_CC42,gds_CC43,gds_CC50,gds_CC60,gds_CC61,gds_CC62,gds_CC63,gds_CC70 from - loadTable("dfs://daily_stock_run", "stg_1day_gds_indus_flag_cc1") - where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','S -[2739486:MainThread](2026-02-26 23:14:06,440) INFO - qlib.timer - [log.py:117] - Time cost: 2.370s | DDB query: Done -[2739486:MainThread](2026-02-26 23:14:06,448) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00 -[2739486:MainThread](2026-02-26 23:15:23,146) INFO - qlib.timer - [log.py:117] - Time cost: 76.695s | Instruments filter: Done -[2739486:MainThread](2026-02-26 23:15:23,184) INFO - qlib.timer - [log.py:117] - Time cost: 79.120s | Loading data () Done -[2739486:MainThread](2026-02-26 23:15:23,185) INFO - qlib.timer - [log.py:117] - Time cost: 79.121s | Init data () Done -[2739486:MainThread](2026-02-26 23:15:23,186) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done -[2739486:MainThread](2026-02-26 23:15:23,190) INFO - qlib.timer - [log.py:117] - Time cost: 0.003s | fetch_df_by_index Done -[2739486:MainThread](2026-02-26 23:15:23,210) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None) - gds_CC10 gds_CC11 ... gds_CC63 gds_CC70 -datetime instrument ... -2026-02-09 SH600000 False False ... False False - SH600004 False False ... False False - SH600006 False False ... False False - SH600007 False False ... False False - SH600008 False False ... False False -... ... ... ... ... ... -2026-02-26 SZ301658 False False ... False False - SZ301662 False False ... False False - SZ301665 False False ... False False - SZ301678 False False ... False False - SZ302132 False False ... False False - -[41168 rows x 30 columns] -[2739486:MainThread](2026-02-26 23:15:23,210) INFO - qlib.timer - [log.py:117] - Time cost: 0.025s | Fetching dataframe Done -[2739486:MainThread](2026-02-26 23:15:23,226) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146: - - use mytt; - select code,m_nDate,ST_Y,ST_S,ST_T,ST_L,ST_Z,ST_X from - loadTable("dfs://daily_stock_run", "stg_1day_wind_st_flag") - where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002 -[2739486:MainThread](2026-02-26 23:15:53,388) INFO - qlib.timer - [log.py:117] - Time cost: 30.171s | DDB query: Done -[2739486:MainThread](2026-02-26 23:15:54,166) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00 diff --git a/stock_1d/d033/alpha158_beta/scripts/run_simple.log b/stock_1d/d033/alpha158_beta/scripts/run_simple.log deleted file mode 100644 index c72ec48..0000000 --- a/stock_1d/d033/alpha158_beta/scripts/run_simple.log +++ /dev/null @@ -1,104 +0,0 @@ -[2745445:MainThread](2026-02-26 23:18:06,410) INFO - qlib.Initialization - [config.py:413] - default_conf: client. -[2745445:MainThread](2026-02-26 23:18:06,414) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings. -[2745445:MainThread](2026-02-26 23:18:06,415) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/qlib/default/data_ops/target')} -================================================================================ -DUMP GOLD-STANDARD RAW DATA FROM QLIB PIPELINE -================================================================================ -Date Range: 2020-01-02 to 2020-01-10 -Output Directory: ../data/ -Qlib Dataset Path: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/ - -Step 1: Loading raw data from Qlib pipeline... - Loading raw data from handler (load_start=2019-12-13)... -Will use `placehorder_value` from module: qlib.contrib.data.config -Will init handler object from config: -{'data_handler_config': {'ddb_config': {'host': '192.168.1.146', - 'password': '123456', - 'port': 8848, - 'username': 'admin'}, - 'end_time': datetime.date(2026, 2, 26), - 'handler_list': [{'class': 'DDBAlpha158Handler', - 'kwargs': {'col_set': 'feature', - 'query_config': [{'alpha158_config': 'alpha158_expr.csv', - 'db_path': 'dfs://daily_stock_run', - 'dtype': 'float32', - 'field_list': 'alpha158', - 'table_name': 'stg_1day_wind_alpha158_0_7'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'}, - {'class': 'DDBMarketExtHandler', - 'kwargs': {'col_set': 'feature_ext', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'float32', - 'field_list': ['Turnover ' - 'as ' - 'turnover', - 'FreeTurnover ' - 'as ' - 'free_turnover', - 'log(MarketValue) ' - 'as ' - 'log_size'], - 'table_name': 'stg_1day_wind_kline_adjusted'}, - {'db_path': 'dfs://daily_stock_run', - 'dtype': 'float32', - 'field_list': ['con_rating_strength'], - 'table_name': 'stg_1day_gds_con_rating'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'}, - {'class': 'DDBMarketFlagHandler', - 'kwargs': {'col_set': 'feature_flag', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': ['IsZt', - 'IsDt', - 'IsN', - 'IsXD', - 'IsXR', - 'IsDR'], - 'table_name': 'stg_1day_wind_kline_adjusted'}, - {'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': ['open_limit', - 'close_limit', - 'low_limit', - 'open_stop', - 'close_stop', - 'high_stop'], - 'table_name': 'stg_1day_wind_market_flag'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'}, - {'class': 'DDBIndusFlagHandler', - 'kwargs': {'col_set': 'indus_flag', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': 'industry_code_cc.csv', - 'table_name': 'stg_1day_gds_indus_flag_cc1'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'}, - {'class': 'DDBStFlagHandler', - 'kwargs': {'col_set': 'st_flag', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': ['ST_Y', - 'ST_S', - 'ST_T', - 'ST_L', - 'ST_Z', - 'ST_X'], - 'table_name': 'stg_1day_wind_st_flag'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}], - 'instruments': 'csiallx', - 'start_time': Timestamp('2019-11-23 00:00:00')}, - 'ddb_config': {'host': '192.168.1.146', - 'password': '123456', - 'port': 8848, - 'username': 'admin'}, - 'handler': {'class': 'AggHandler', - 'kwargs': {'ddb_config': {'host': '192.168.1.146', - 'password': '123456', - 'port': 8848, - 'username': 'admin'}, - 'end_time': datetime.date(2026, 2, 26), - 'handler_list': [{[2745445:MainThread](2026-02-26 23:18:06,436) INFO - qlib.AggHandler - [agg_handler.py:79] - Will use AggHandler -[2745445:MainThread](2026-02-26 23:18:06,437) WARNING - qlib.DataLoaderDH - [agg_handler.py:191] - instruments[csiallx] is ignored -[2745445:MainThread](2026-02-26 23:18:06,492) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146: - - use mytt; - select code,m_nDate,KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,HIGH0,LOW0,VWAP0,ROC5,ROC10,ROC20,ROC30,ROC60,MA5,MA10,MA20,MA30,MA60,STD5,STD10,STD20,STD30,STD60,BETA5,BETA10,BETA20,BETA30,BETA60,RSQR5,RSQR10,RSQR20,RSQR30,RSQR60,RESI5,RESI10,RESI20,RESI30,RESI60,MAX5,MAX10,MAX20,MAX30,MAX60,MIN5,MIN10,MIN20,MIN30,MIN60,QTLU5,QTLU10,QTLU20,QTLU30,QTLU60,QTLD5,QTLD10,QTLD20,QTLD30,QTLD60,RANK5,RANK10,RANK20,RANK30,RANK60,RSV5,RSV10,RSV20,RSV30,RSV60,IMAX5,IMAX diff --git a/stock_1d/d033/alpha158_beta/scripts/run_simple2.log b/stock_1d/d033/alpha158_beta/scripts/run_simple2.log deleted file mode 100644 index 8a868fe..0000000 --- a/stock_1d/d033/alpha158_beta/scripts/run_simple2.log +++ /dev/null @@ -1,103 +0,0 @@ -[2746177:MainThread](2026-02-26 23:21:56,618) INFO - qlib.Initialization - [config.py:413] - default_conf: client. -[2746177:MainThread](2026-02-26 23:21:56,622) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings. -[2746177:MainThread](2026-02-26 23:21:56,623) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/qlib/default/data_ops/target')} -================================================================================ -DUMP GOLD-STANDARD RAW DATA FROM QLIB PIPELINE -================================================================================ -Date Range: 2020-01-02 to 2020-01-10 -Output Directory: ../data/ -Qlib Dataset Path: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/ - -Step 1: Loading raw data from Qlib pipeline... - Loading raw data from handler (load_start=2019-12-13)... - Filtering instruments: ['SH600000', 'SH600004', 'SH600006', 'SH600007', 'SH600008']... (5 total) -Will use `placehorder_value` from module: qlib.contrib.data.config -Will init handler object from config: -{'data_handler_config': {'ddb_config': {'host': '192.168.1.146', - 'password': '123456', - 'port': 8848, - 'username': 'admin'}, - 'end_time': datetime.date(2026, 2, 26), - 'handler_list': [{'class': 'DDBAlpha158Handler', - 'kwargs': {'col_set': 'feature', - 'query_config': [{'alpha158_config': 'alpha158_expr.csv', - 'db_path': 'dfs://daily_stock_run', - 'dtype': 'float32', - 'field_list': 'alpha158', - 'table_name': 'stg_1day_wind_alpha158_0_7'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'}, - {'class': 'DDBMarketExtHandler', - 'kwargs': {'col_set': 'feature_ext', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'float32', - 'field_list': ['Turnover ' - 'as ' - 'turnover', - 'FreeTurnover ' - 'as ' - 'free_turnover', - 'log(MarketValue) ' - 'as ' - 'log_size'], - 'table_name': 'stg_1day_wind_kline_adjusted'}, - {'db_path': 'dfs://daily_stock_run', - 'dtype': 'float32', - 'field_list': ['con_rating_strength'], - 'table_name': 'stg_1day_gds_con_rating'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'}, - {'class': 'DDBMarketFlagHandler', - 'kwargs': {'col_set': 'feature_flag', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': ['IsZt', - 'IsDt', - 'IsN', - 'IsXD', - 'IsXR', - 'IsDR'], - 'table_name': 'stg_1day_wind_kline_adjusted'}, - {'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': ['open_limit', - 'close_limit', - 'low_limit', - 'open_stop', - 'close_stop', - 'high_stop'], - 'table_name': 'stg_1day_wind_market_flag'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'}, - {'class': 'DDBIndusFlagHandler', - 'kwargs': {'col_set': 'indus_flag', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': 'industry_code_cc.csv', - 'table_name': 'stg_1day_gds_indus_flag_cc1'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'}, - {'class': 'DDBStFlagHandler', - 'kwargs': {'col_set': 'st_flag', - 'query_config': [{'db_path': 'dfs://daily_stock_run', - 'dtype': 'bool', - 'field_list': ['ST_Y', - 'ST_S', - 'ST_T', - 'ST_L', - 'ST_Z', - 'ST_X'], - 'table_name': 'stg_1day_wind_st_flag'}]}, - 'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}], - 'instruments': 'csiallx', - 'start_time': Timestamp('2019-11-23 00:00:00')}, - 'ddb_config': {'host': '192.168.1.146', - 'password': '123456', - 'port': 8848, - 'username': 'admin'}, - 'handler': {'class': 'AggHandler', - 'kwargs': {'ddb_config': {'host': '192.168.1.146', - 'password': '123456', - 'port': 8848, - 'username': 'admin'}[2746177:MainThread](2026-02-26 23:21:56,647) INFO - qlib.AggHandler - [agg_handler.py:79] - Will use AggHandler -[2746177:MainThread](2026-02-26 23:21:56,648) WARNING - qlib.DataLoaderDH - [agg_handler.py:191] - instruments[csiallx] is ignored -[2746177:MainThread](2026-02-26 23:21:56,716) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146: - - use mytt; - select code,m_nDate,KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,HIGH0,LOW0,VWAP0,ROC5,ROC10,ROC20,ROC30,ROC60,MA5,MA10,MA20,MA30,MA60,STD5,STD10,STD20,STD30,STD60,BETA5,BETA10,BETA20,BETA30,BETA60,RSQR5,RSQR10,RSQR20,RSQR30,RSQR60,RESI5,RESI10,RESI20,RESI30,RESI60,MAX5,MAX10,MAX20,MAX30,MAX60,MIN5,MIN10,MIN20,MIN30,MIN60,QTLU5,QTLU10,QTLU20,QTLU30,QTLU60,QTLD5,QTLD10,QTLD20,QTLD30,QTLD60,RANK5,RANK10,RANK20,RANK30,RANK60,RSV5,RSV10,RSV20,RSV30,RSV60,IMAX5,IMAX diff --git a/stock_1d/d033/alpha158_beta/scripts/verify_feature_order.py b/stock_1d/d033/alpha158_beta/scripts/verify_feature_order.py deleted file mode 100644 index d08a7b0..0000000 --- a/stock_1d/d033/alpha158_beta/scripts/verify_feature_order.py +++ /dev/null @@ -1,187 +0,0 @@ -#!/usr/bin/env python -""" -Verify feature column order between standalone pipeline and qlib gold standard. - -This script: -1. Loads a small sample using the qlib pipeline -2. Runs the same sample through the standalone generate_beta_embedding pipeline -3. Compares the column order and feature values -""" - -import pickle as pkl -import ruamel.yaml as yaml -import pandas as pd -import polars as pl -import numpy as np -import sys -import os - -# Patch yaml.safe_load for compatibility -_yaml = yaml.YAML(typ='safe', pure=True) -def patched_safe_load(stream): - import io - if isinstance(stream, str): - stream = io.StringIO(stream) - return _yaml.load(stream) -yaml.safe_load = patched_safe_load - -# Add scripts directory to path -sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'scripts')) - -def main(): - print("=" * 70) - print("VERIFY FEATURE ORDER: Standalone vs Qlib Gold Standard") - print("=" * 70) - - # Step 1: Load processor list - print("\nStep 1: Loading processor list...") - proc_path = "/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/proc_list.proc" - with open(proc_path, "rb") as f: - proc_list = pkl.load(f) - print(f" Loaded {len(proc_list)} processors") - - # Step 2: Load small sample from qlib pipeline - print("\nStep 2: Loading sample from qlib pipeline...") - - import qlib - from qlib.config import REG_CN - qlib.init(provider_uri='/home/guofu/.qlib/data_ops/target', region=REG_CN) - - from qlib.workflow.cli import sys_config - from qlib.utils import fill_placeholder - import datetime as dt - - yaml_path = "/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/handler.yaml" - with open(yaml_path) as fin: - config = yaml.safe_load(fin) - - sys_config(config, "qlib.contrib.data.config") - qlib.init(**config.get("qlib_init")) - - load_start = pd.to_datetime("2020-01-02") - dt.timedelta(days=20) - placehorder_value = { - "": load_start, - "": dt.date.today() - } - - config_filled = fill_placeholder(config, placehorder_value) - handler = qlib.init_instance_by_config(config_filled["handler"]) - handler_data = handler._data - - # Get data from SepDataFrame - if hasattr(handler_data, '_data'): - df_dict = handler_data._data - print(f" Handler groups: {list(df_dict.keys())}") - - # Concatenate groups - raw_dfs = [] - for group, df in df_dict.items(): - df_copy = df.copy() - df_copy.columns = [f"{group}::{col}" for col in df_copy.columns] - raw_dfs.append(df_copy) - print(f" {group}: {len(df_copy.columns)} columns") - - raw_df = pd.concat(raw_dfs, axis=1) - print(f" Raw concatenated shape: {raw_df.shape}") - - # Step 3: Apply processors to get gold standard features - print("\nStep 3: Applying processors (qlib gold standard)...") - from qlib.contrib.data.utils import apply_proc_list - - # Strip group prefixes for processor application - col_mapping = {col: col.split('::', 1)[1] for col in raw_df.columns if '::' in col} - raw_df_stripped = raw_df.rename(columns=col_mapping) - - # Convert bool to object for processor compatibility - bool_cols = raw_df_stripped.select_dtypes(include=['bool']).columns - for col in bool_cols: - raw_df_stripped[col] = raw_df_stripped[col].astype(object) - - df_gold = apply_proc_list(raw_df_stripped, proc_list=proc_list, with_fit=False) - print(f" Gold standard shape after processors: {df_gold.shape}") - - # Restore group prefixes - reverse_mapping = {v: k for k, v in col_mapping.items()} - df_gold = df_gold.rename(columns=reverse_mapping) - - # Get gold standard column order - gold_columns = list(df_gold.columns) - print(f"\nGold standard column groups:") - - feature_cols = [c for c in gold_columns if c.startswith('feature::')] - feature_ext_cols = [c for c in gold_columns if c.startswith('feature_ext::')] - feature_flag_cols = [c for c in gold_columns if c.startswith('feature_flag::')] - indus_idx_cols = [c for c in gold_columns if c.startswith('indus_idx::')] - - print(f" feature:: {len(feature_cols)} cols") - print(f" feature_ext:: {len(feature_ext_cols)} cols") - print(f" feature_flag:: {len(feature_flag_cols)} cols") - print(f" indus_idx:: {len(indus_idx_cols)} cols") - - # Step 4: Now run standalone pipeline on same data - print("\nStep 4: Running standalone pipeline...") - - # Load parquet data for same date range - from generate_beta_embedding import load_all_data, merge_data_sources, apply_feature_pipeline - - df_alpha, df_kline, df_flag, df_industry = load_all_data("2020-01-02", "2020-01-10") - df_standalone = merge_data_sources(df_alpha, df_kline, df_flag, df_industry) - - print(f" Standalone loaded shape: {df_standalone.shape}") - - # Apply feature pipeline - df_processed, feature_cols_standalone = apply_feature_pipeline(df_standalone) - print(f" Standalone processed shape: {df_processed.shape}") - print(f" Standalone feature columns: {len(feature_cols_standalone)}") - - # Step 5: Compare column counts - print("\n" + "=" * 70) - print("COMPARISON SUMMARY") - print("=" * 70) - - print(f"\nGold standard total columns: {len(gold_columns)}") - print(f" feature:: {len(feature_cols)}") - print(f" feature_ext:: {len(feature_ext_cols)}") - print(f" feature_flag:: {len(feature_flag_cols)}") - print(f" indus_idx:: {len(indus_idx_cols)}") - - print(f"\nStandalone feature columns: {len(feature_cols_standalone)}") - - # The gold standard columns (without prefix) should match standalone - gold_feature_cols = [c.split('::', 1)[1] for c in feature_cols] - gold_feature_ext_cols = [c.split('::', 1)[1] for c in feature_ext_cols] - gold_feature_flag_cols = [c.split('::', 1)[1] for c in feature_flag_cols] - gold_indus_idx_cols = [c.split('::', 1)[1] for c in indus_idx_cols] - - gold_all = gold_feature_cols + gold_feature_ext_cols + gold_feature_flag_cols + gold_indus_idx_cols - - print(f"\nGold standard (flat): {len(gold_all)} features") - print(f"Standalone: {len(feature_cols_standalone)} features") - - if len(gold_all) != len(feature_cols_standalone): - print(f"\nWARNING: Feature count mismatch! Difference: {len(gold_all) - len(feature_cols_standalone)}") - - # Check column order - print("\nFirst 20 column comparison:") - print(f"{'Idx':<5} {'Gold Standard':<40} {'Standalone':<40} {'Match':<6}") - print("-" * 90) - for i in range(min(20, len(gold_all), len(feature_cols_standalone))): - match = "✓" if gold_all[i] == feature_cols_standalone[i] else "✗" - print(f"{i:<5} {gold_all[i]:<40} {feature_cols_standalone[i]:<40} {match:<6}") - - # Check if orders match - if gold_all == feature_cols_standalone: - print("\n✓ Column order MATCHES!") - else: - print("\n✗ Column order DOES NOT MATCH!") - print("\nFinding differences...") - diff_count = 0 - for i in range(min(len(gold_all), len(feature_cols_standalone))): - if gold_all[i] != feature_cols_standalone[i]: - diff_count += 1 - if diff_count <= 20: - print(f" [{i}] Gold: {gold_all[i]} vs Standalone: {feature_cols_standalone[i]}") - print(f"Total differences: {diff_count}") - -if __name__ == "__main__": - main()