ifieryarrows committed on
Commit
f722ea0
·
verified Β·
1 Parent(s): a1bedd7

Sync from GitHub (tests passed)

Browse files
deep_learning/config.py CHANGED
@@ -73,8 +73,10 @@ class TFTModelConfig:
73
  hidden_size: int = 32
74
  # attention_head_size 4→2: fewer heads for a small, single-series dataset.
75
  attention_head_size: int = 2
76
- # dropout 0.1→0.3: 313 samples / ~900K params still demands heavy regularisation.
77
- dropout: float = 0.3
 
 
78
  hidden_continuous_size: int = 16 # was 32; paired reduction with hidden_size
79
  quantiles: tuple[float, ...] = (0.02, 0.10, 0.25, 0.50, 0.75, 0.90, 0.98)
80
  # lr 1e-3→3e-4: smaller batches produce noisier gradients; conservative LR
 
73
  hidden_size: int = 32
74
  # attention_head_size 4→2: fewer heads for a small, single-series dataset.
75
  attention_head_size: int = 2
76
+ # dropout 0.1→0.20 (REG-2026-001): 313 samples with dropout<0.20 caused
77
+ # co-adaptation and memorization. Kept ≤0.35 because higher values with
78
+ # small hidden_size collapse the output range.
79
+ dropout: float = 0.20
80
  hidden_continuous_size: int = 16 # was 32; paired reduction with hidden_size
81
  quantiles: tuple[float, ...] = (0.02, 0.10, 0.25, 0.50, 0.75, 0.90, 0.98)
82
  # lr 1e-3→3e-4: smaller batches produce noisier gradients; conservative LR
deep_learning/training/hyperopt.py CHANGED
@@ -47,10 +47,13 @@ def create_trial_config(trial, base_cfg: TFTASROConfig) -> TFTASROConfig:
47
  # compressing output distribution and preventing amplitude learning.
48
  hidden_size=trial.suggest_int("hidden_size", 32, 64, step=16),
49
  attention_head_size=trial.suggest_int("attention_head_size", 1, 4),
50
- # Cap at 0.35: dropout=0.5 with small hidden_size collapses the output
51
- # range — the model physically cannot produce large predictions.
52
- dropout=trial.suggest_float("dropout", 0.1, 0.35, step=0.05),
53
- hidden_continuous_size=trial.suggest_int("hidden_continuous_size", 8, 32, step=8),
 
 
 
54
  quantiles=base_cfg.model.quantiles,
55
  # Range [1e-4, 1e-3]: LR < 1e-4 produces near-zero pred_std (VR=0.14);
56
  # LR > 1e-3 causes 1-epoch divergence. This band is the stable zone.
@@ -71,9 +74,9 @@ def create_trial_config(trial, base_cfg: TFTASROConfig) -> TFTASROConfig:
71
  training_cfg = TrainingConfig(
72
  max_epochs=50,
73
  early_stopping_patience=8,
74
- # Include 16 which gives 19 batches/epoch (vs 4 at batch_size=64)
75
- # — more gradient steps per epoch → more stable convergence.
76
- batch_size=trial.suggest_categorical("batch_size", [16, 32, 64]),
77
  val_ratio=base_cfg.training.val_ratio,
78
  test_ratio=base_cfg.training.test_ratio,
79
  lookback_days=base_cfg.training.lookback_days,
@@ -202,13 +205,46 @@ def _objective(trial, base_cfg: TFTASROConfig, master_data: tuple) -> float:
202
 
203
  trial.set_user_attr("variance_ratio", round(vr, 4))
204
  trial.set_user_attr("pred_std", round(pred_std, 6))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
  except Exception as exc:
206
- logger.debug("Trial %d variance check failed: %s", trial.number, exc)
 
207
 
208
- score = float(val_loss) + variance_penalty
209
  logger.info(
210
- "Trial %d: val_loss=%.4f vr_penalty=%.4f β†’ score=%.4f",
211
- trial.number, float(val_loss), variance_penalty, score,
212
  )
213
  return score
214
 
 
47
  # compressing output distribution and preventing amplitude learning.
48
  hidden_size=trial.suggest_int("hidden_size", 32, 64, step=16),
49
  attention_head_size=trial.suggest_int("attention_head_size", 1, 4),
50
+ # Floor at 0.20: 313 samples with dropout<0.20 causes co-adaptation
51
+ # and memorization (REG-2026-001). Cap at 0.35: dropout>0.35 with
52
+ # small hidden_size collapses the output range.
53
+ dropout=trial.suggest_float("dropout", 0.20, 0.35, step=0.05),
54
+ # Cap at 24: hidden_cont=32 doubled the VSN parameter surface and
55
+ # contributed to overfitting in the 31-Mar regression.
56
+ hidden_continuous_size=trial.suggest_int("hidden_continuous_size", 8, 24, step=8),
57
  quantiles=base_cfg.model.quantiles,
58
  # Range [1e-4, 1e-3]: LR < 1e-4 produces near-zero pred_std (VR=0.14);
59
  # LR > 1e-3 causes 1-epoch divergence. This band is the stable zone.
 
74
  training_cfg = TrainingConfig(
75
  max_epochs=50,
76
  early_stopping_patience=8,
77
+ # 16 gives 19 batches/epoch, 32 gives ~10. 64 produced only 4
78
+ # batches/epoch with noisy gradients — removed after REG-2026-001.
79
+ batch_size=trial.suggest_categorical("batch_size", [16, 32]),
80
  val_ratio=base_cfg.training.val_ratio,
81
  test_ratio=base_cfg.training.test_ratio,
82
  lookback_days=base_cfg.training.lookback_days,
 
205
 
206
  trial.set_user_attr("variance_ratio", round(vr, 4))
207
  trial.set_user_attr("pred_std", round(pred_std, 6))
208
+
209
+ # --- Directional accuracy & Sharpe guard (REG-2026-001) ---
210
+ # Prevents Optuna from selecting configs that overfit training data
211
+ # but fail to generalise directionally.
212
+ pred_sign = np.sign(y_pred[:n])
213
+ actual_sign = np.sign(y_actual[:n])
214
+ da = float(np.mean(pred_sign == actual_sign))
215
+ trial.set_user_attr("directional_accuracy", round(da, 4))
216
+
217
+ # Strategy Sharpe on validation set
218
+ strategy_returns = np.sign(y_pred[:n]) * y_actual[:n]
219
+ sr_mean = float(strategy_returns.mean())
220
+ sr_std = float(strategy_returns.std()) + 1e-9
221
+ val_sharpe = sr_mean / sr_std
222
+ trial.set_user_attr("val_sharpe", round(val_sharpe, 4))
223
+
224
+ # Hard prune: negative Sharpe = systematically wrong direction
225
+ if val_sharpe < 0.0:
226
+ logger.warning(
227
+ "Trial %d PRUNED: negative val_sharpe=%.4f (DA=%.2f%%)",
228
+ trial.number, val_sharpe, da * 100,
229
+ )
230
+ import optuna
231
+ raise optuna.exceptions.TrialPruned()
232
+
233
+ # Soft penalty: DA below coin-flip adds heavy cost
234
+ da_penalty = 0.0
235
+ if da < 0.50:
236
+ da_penalty = 2.0 * (0.50 - da)
237
+ logger.info("Trial %d: DA=%.2f%% < 50%% β†’ da_penalty=%.4f",
238
+ trial.number, da * 100, da_penalty)
239
+
240
  except Exception as exc:
241
+ logger.debug("Trial %d variance/DA check failed: %s", trial.number, exc)
242
+ da_penalty = 0.0
243
 
244
+ score = float(val_loss) + variance_penalty + da_penalty
245
  logger.info(
246
+ "Trial %d: val_loss=%.4f vr_penalty=%.4f da_penalty=%.4f β†’ score=%.4f",
247
+ trial.number, float(val_loss), variance_penalty, da_penalty, score,
248
  )
249
  return score
250