---
### BASELINE: CONVERGES AFTER LONG
# NOTE(review): this file arrived with all indentation collapsed onto three
# physical lines; the nesting below is reconstructed from key order and the
# single `value:` marker. In particular, RGB_ENCODER / DECODERS /
# TOKEN_INPAINTER are placed under MODEL.value — confirm against the
# consumer's config schema.
parameters:
  ### MODEL ARCHITECTURE
  MODEL:
    value:
      MODEL_CLASS: "UnReflect_Model_TokenInpainter"  # Main model class name (must match class in models.py)
      MODEL_MODULE: "models"  # Module name to import model classes from (default: "models")

      RGB_ENCODER:
        ENCODER: "facebook/dinov3-vitl16-pretrain-lvd1689m"  # DINOv3 encoder model name (HuggingFace format)
        IMAGE_SIZE: 896  # Input image size (height and width in pixels)
        RETURN_SELECTED_LAYERS: [3, 6, 9, 12]  # Transformer layer indices to extract features from (0-indexed)
        RGB_ENCODER_LR: 0.0  # Learning rate for RGB encoder (0.0 = frozen, must be explicitly set)

      DECODERS:
        diffuse:
          FEATURE_DIM: 1024  # Feature dimension for decoder (should match encoder output)
          REASSEMBLE_OUT_CHANNELS: [768, 1024, 1536, 2048]  # Output channels for each decoder stage (DPT-style reassembly)
          REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5]  # Spatial upsampling factors for each stage
          READOUT_TYPE: "ignore"  # Readout type for DPT decoder ("ignore", "project", etc.)
          # FROM_PRETRAINED: "diffuse_decoder.pt"  # Path to pretrained decoder weights (optional)
          USE_BN: false  # Use batch normalization in decoder
          DROPOUT: 0.1  # Dropout rate in decoder layers
          OUTPUT_IMAGE_SIZE: [896, 896]  # Output image resolution [height, width]
          OUTPUT_CHANNELS: 3  # Number of output channels (3 for RGB diffuse image)
          DECODER_LR: 0.0  # Custom learning rate for decoder (0.0 = frozen, 1.0 = same as base LR)
          NUM_FUSION_BLOCKS_TRAINABLE: 1  # Number of fusion blocks to train (0-4, null = train all if DECODER_LR != 0)
          TRAIN_RGB_HEAD: true  # Whether to train RGB head (true/false, null = train if DECODER_LR != 0)
        highlight:
          FEATURE_DIM: 1024  # Feature dimension for highlight decoder
          REASSEMBLE_OUT_CHANNELS: [96, 192, 384, 768]  # Output channels for each decoder stage
          REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5]  # Spatial upsampling factors for each stage
          READOUT_TYPE: "ignore"  # Readout type for DPT decoder
          # FROM_PRETRAINED: "highlight_decoder.pt"  # Path to pretrained decoder weights (optional)
          USE_BN: false  # Use batch normalization in decoder
          DROPOUT: 0.1  # Dropout rate in decoder layers
          OUTPUT_IMAGE_SIZE: [896, 896]  # Output image resolution [height, width]
          OUTPUT_CHANNELS: 1  # Number of output channels (1 for highlight mask)
          DECODER_LR: 5.0e-4  # Custom learning rate for decoder (0.0 = frozen, 1.0 = same as base LR)
          NUM_FUSION_BLOCKS_TRAINABLE: null  # Number of fusion blocks to train (0-4, null = train all if DECODER_LR != 0)

      TOKEN_INPAINTER:
        TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior"  # Token inpainter class name
        TOKEN_INPAINTER_MODULE: "token_inpainters"  # Module name to import token inpainter from
        # FROM_PRETRAINED: "token_inpainter.pth"  # Path to pretrained token inpainter weights (optional)
        TOKEN_INPAINTER_LR: 1.0e-4  # Learning rate for token inpainter (can differ from base LR)
        DEPTH: 6  # Number of transformer blocks
        HEADS: 16  # Number of attention heads
        DROP: 0.05  # Dropout rate
        USE_POSITIONAL_ENCODING: true  # Enable 2D sinusoidal positional encodings
        USE_FINAL_NORM: true  # Enable final LayerNorm before output projection
        USE_LOCAL_PRIOR: true  # Blend local mean prior for masked seeds
        LOCAL_PRIOR_WEIGHT: 0.5  # Weight for local prior blending (1.0 = only mask_token, 0.0 = only local mean)
        LOCAL_PRIOR_KERNEL: 5  # Kernel size for local prior blending (> 1)
        SEED_NOISE_STD: 0.02  # Standard deviation of noise added to masked seeds during training