---
### BASELINE: CONVERGES AFTER LONG
# NOTE(review): this file arrived with all indentation collapsed onto three
# physical lines; the nesting below is reconstructed from key order and the
# single `value:` marker. In particular, RGB_ENCODER / DECODERS /
# TOKEN_INPAINTER are placed under MODEL.value — confirm against the
# consumer's config schema.
parameters:
  ### MODEL ARCHITECTURE
  MODEL:
    value:
      MODEL_CLASS: "UnReflect_Model_TokenInpainter"  # Main model class name (must match class in models.py)
      MODEL_MODULE: "models"  # Module name to import model classes from (default: "models")

      RGB_ENCODER:
        ENCODER: "facebook/dinov3-vitl16-pretrain-lvd1689m"  # DINOv3 encoder model name (HuggingFace format)
        IMAGE_SIZE: 896  # Input image size (height and width in pixels)
        RETURN_SELECTED_LAYERS: [3, 6, 9, 12]  # Transformer layer indices to extract features from (0-indexed)
        RGB_ENCODER_LR: 0.0  # Learning rate for RGB encoder (0.0 = frozen, must be explicitly set)

      DECODERS:
        diffuse:
          FEATURE_DIM: 1024  # Feature dimension for decoder (should match encoder output)
          REASSEMBLE_OUT_CHANNELS: [768, 1024, 1536, 2048]  # Output channels for each decoder stage (DPT-style reassembly)
          REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5]  # Spatial upsampling factors for each stage
          READOUT_TYPE: "ignore"  # Readout type for DPT decoder ("ignore", "project", etc.)
          # FROM_PRETRAINED: "diffuse_decoder.pt"  # Path to pretrained decoder weights (optional)
          USE_BN: false  # Use batch normalization in decoder
          DROPOUT: 0.1  # Dropout rate in decoder layers
          OUTPUT_IMAGE_SIZE: [896, 896]  # Output image resolution [height, width]
          OUTPUT_CHANNELS: 3  # Number of output channels (3 for RGB diffuse image)
          DECODER_LR: 0.0  # Custom learning rate for decoder (0.0 = frozen, 1.0 = same as base LR)
          NUM_FUSION_BLOCKS_TRAINABLE: 1  # Number of fusion blocks to train (0-4, null = train all if DECODER_LR != 0)
          TRAIN_RGB_HEAD: true  # Whether to train RGB head (true/false, null = train if DECODER_LR != 0)
        highlight:
          FEATURE_DIM: 1024  # Feature dimension for highlight decoder
          REASSEMBLE_OUT_CHANNELS: [96, 192, 384, 768]  # Output channels for each decoder stage
          REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5]  # Spatial upsampling factors for each stage
          READOUT_TYPE: "ignore"  # Readout type for DPT decoder
          # FROM_PRETRAINED: "highlight_decoder.pt"  # Path to pretrained decoder weights (optional)
          USE_BN: false  # Use batch normalization in decoder
          DROPOUT: 0.1  # Dropout rate in decoder layers
          OUTPUT_IMAGE_SIZE: [896, 896]  # Output image resolution [height, width]
          OUTPUT_CHANNELS: 1  # Number of output channels (1 for highlight mask)
          DECODER_LR: 5.0e-4  # Custom learning rate for decoder (0.0 = frozen, 1.0 = same as base LR)
          NUM_FUSION_BLOCKS_TRAINABLE: null  # Number of fusion blocks to train (0-4, null = train all if DECODER_LR != 0)

      TOKEN_INPAINTER:
        TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior"  # Token inpainter class name
        TOKEN_INPAINTER_MODULE: "token_inpainters"  # Module name to import token inpainter from
        # FROM_PRETRAINED: "token_inpainter.pth"  # Path to pretrained token inpainter weights (optional)
        TOKEN_INPAINTER_LR: 1.0e-4  # Learning rate for token inpainter (can differ from base LR)
        DEPTH: 6  # Number of transformer blocks
        HEADS: 16  # Number of attention heads
        DROP: 0.05  # Dropout rate
        USE_POSITIONAL_ENCODING: true  # Enable 2D sinusoidal positional encodings
        USE_FINAL_NORM: true  # Enable final LayerNorm before output projection
        USE_LOCAL_PRIOR: true  # Blend local mean prior for masked seeds
        LOCAL_PRIOR_WEIGHT: 0.5  # Weight for local prior blending (1.0 = only mask_token, 0.0 = only local mean)
        LOCAL_PRIOR_KERNEL: 5  # Kernel size for local prior blending (> 1)
        SEED_NOISE_STD: 0.02  # Standard deviation of noise added to masked seeds during training