## Production Distillation Config for RunPod (RTX A6000 48GB)
##
## 3-stage MambaInLlama pipeline with research-validated hyperparameters:
## - SSM 10x LR boost (NB11)
## - T=2.0 for KL distillation (NB09)
## - alpha=0.7 for KL/CE balance (NB09)
## - SVD split for MoE diversity (NB03)
##
## Teacher: CohereLabs/tiny-aya-global (3.35B, validated in NB01-NB11)
## Student: Aetheris HybridMambaMoE (~800M params)
## Data: ClimbMix (Stage 1-2), multilingual chat (Stage 3)
##
## Wayy Research, 2024-2026

# Teacher model (tiny-aya-global: 3.35B, 70+ langs, no gated access required)
teacher:
  name: "CohereLabs/tiny-aya-global"
  dtype: "bfloat16"
  device_map: "auto"

# Student model
student:
  config_path: "configs/student.yaml"
  dtype: "bfloat16"
  checkpoint: null  # Set to the Stage 1 checkpoint for Stage 2, etc.

# Languages (10 core languages for multilingual equity tracking)
languages: [en, es, hi, zh, ar, sw, tr, ja, id, te]

# Seed
seed: 42

# --- Stage 0: Block Conversion ---
conversion:
  strategy: "weight_map"
  a_init: "exponential_decay"
  delta_init: "uniform"
  ffn_to_moe: "svd_split"  # Best expert diversity (NB03: CKA=0.097 vs replicate=0.88)

# --- Stage 1: Layer Alignment ---
stage1:
  enabled: true
  total_steps: 10000
  lr: 1.0e-4
  warmup_steps: 500
  batch_size: 4
  gradient_accumulation: 8
  gradient_checkpointing: true
  max_seq_len: 512
  loss_type: "mse+cosine"
  cka_threshold: 0.75
  cka_check_every: 500
  save_every: 1000
  log_every: 50
  output_dir: "checkpoints/stage1_alignment"

# --- Stage 2: KL Distillation ---
stage2:
  enabled: true
  total_steps: 20000
  lr: 5.0e-5               # Base LR
  ssm_lr_multiplier: 10.0  # SSM blocks get 10x base LR (NB11: KL -26%, agreement +12x)
  warmup_steps: 500
  batch_size: 4
  gradient_accumulation: 8
  gradient_checkpointing: true
  max_seq_len: 512
  temperature: 2.0  # NB09: T=2.0 gives a good softening/signal balance
  alpha: 0.7        # NB09: KL/CE weighting (0.7 KL, 0.3 CE)
  save_every: 2000
  log_every: 50
  output_dir: "checkpoints/stage2_kl"

# --- Stage 3: SFT ---
stage3:
  enabled: true
  total_steps: 5000
  lr: 2.0e-5
  warmup_steps: 200
  batch_size: 4
  gradient_accumulation: 4
  gradient_checkpointing: true
  max_seq_len: 1024
  save_every: 500
  log_every: 25
  output_dir: "checkpoints/stage3_sft"

# --- Data ---
data:
  # Stage 1 & 2: ClimbMix (retokenized with the Aya vocab)
  climbmix:
    dataset: "nvidia/ClimbMix"
    mode: "retokenize"
    streaming: true
    buffer_size: 500
    min_tokens: 32

  # Stage 3: Multilingual chat data (aya_collection is non-gated)
  sft:
    dataset_name: "CohereForAI/aya_collection"
    streaming: true

# --- Evaluation ---
eval:
  max_new_tokens: 128
  temperature: 0.7
  top_p: 0.9
  output_dir: "results/runpod"
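
## --- Reference sketch: Stage 2 loss ---
## How `stage2.temperature` and `stage2.alpha` are expected to combine the
## distillation and CE terms. This is an illustrative sketch, not the
## training code; function and tensor names are assumptions:
##
##   import torch.nn.functional as F
##
##   def stage2_loss(student_logits, teacher_logits, labels, T=2.0, alpha=0.7):
##       # Temperature-scaled KL between softened distributions;
##       # the T^2 factor restores gradient magnitude (standard Hinton scaling).
##       kl = F.kl_div(
##           F.log_softmax(student_logits / T, dim=-1),
##           F.log_softmax(teacher_logits / T, dim=-1),
##           reduction="batchmean",
##           log_target=True,
##       ) * (T * T)
##       ce = F.cross_entropy(student_logits, labels)  # hard-label CE
##       return alpha * kl + (1.0 - alpha) * ce        # 0.7 KL + 0.3 CE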
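
## --- Reference sketch: SSM 10x LR boost ---
## How `stage2.ssm_lr_multiplier` maps onto optimizer parameter groups.
## Again a sketch only: the "mamba" substring used to select SSM-block
## parameters is an assumption about the student's module names:
##
##   import torch
##
##   base_lr = cfg["stage2"]["lr"]                       # 5.0e-5
##   mult = cfg["stage2"]["ssm_lr_multiplier"]           # 10.0
##   ssm = [p for n, p in model.named_parameters() if "mamba" in n]
##   rest = [p for n, p in model.named_parameters() if "mamba" not in n]
##   optimizer = torch.optim.AdamW([
##       {"params": rest, "lr": base_lr},
##       {"params": ssm, "lr": base_lr * mult},  # SSM blocks at 10x (NB11)
##   ])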