Upload weights, notebooks, sample images
Browse files- configs/end2end.yaml +15 -11
- configs/highlight_decoder_pretrain.yaml +6 -4
- configs/pretrained_config.yaml +2 -1
- configs/tokeninp_pretrain.yaml +43 -53
configs/end2end.yaml
CHANGED
|
@@ -31,7 +31,7 @@ parameters:
|
|
| 31 |
REASSEMBLE_OUT_CHANNELS: [96,192,384,768] # Output channels for each decoder stage
|
| 32 |
REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
|
| 33 |
READOUT_TYPE: "ignore" # Readout type for DPT decoder
|
| 34 |
-
|
| 35 |
USE_BN: False # Use batch normalization in decoder
|
| 36 |
DROPOUT: 0.1 # Dropout rate in decoder layers
|
| 37 |
OUTPUT_IMAGE_SIZE: [896,896] # Output image resolution [height, width]
|
|
@@ -41,20 +41,24 @@ parameters:
|
|
| 41 |
TOKEN_INPAINTER:
|
| 42 |
TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior" # Token inpainter class name
|
| 43 |
TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
|
| 44 |
-
# FROM_PRETRAINED: "token_inpainter.
|
| 45 |
-
TOKEN_INPAINTER_LR:
|
| 46 |
DEPTH: 6 # Number of transformer blocks
|
| 47 |
HEADS: 16 # Number of attention heads
|
| 48 |
DROP: 0.05 # Dropout rate
|
| 49 |
USE_POSITIONAL_ENCODING: True # Enable 2D sinusoidal positional encodings
|
| 50 |
USE_FINAL_NORM: True # Enable final LayerNorm before output projection
|
| 51 |
USE_LOCAL_PRIOR: True # Blend local mean prior for masked seeds
|
| 52 |
-
LOCAL_PRIOR_WEIGHT: 0.
|
| 53 |
LOCAL_PRIOR_KERNEL: 5 # Kernel size for local prior blending (> 1)
|
| 54 |
SEED_NOISE_STD: 0.02 # Standard deviation of noise added to masked seeds during training
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
USE_TORCH_COMPILE: # Enable PyTorch 2.0 torch.compile for faster training (experimental)
|
| 59 |
value: False
|
| 60 |
DISTRIBUTE:
|
|
@@ -128,10 +132,10 @@ parameters:
|
|
| 128 |
ALL_DATASETS:
|
| 129 |
FEW_IMAGES: False # Override FEW_IMAGES for all datasets (for quick debugging set True)
|
| 130 |
TARGET_SIZE: [896,896] # Override target image size [height, width] for all datasets
|
| 131 |
-
LOAD_RGB_ONLY: True
|
| 132 |
|
| 133 |
BATCH_SIZE: # Max batch size with img size 896 is 32
|
| 134 |
-
value:
|
| 135 |
NUM_WORKERS:
|
| 136 |
value: 12 # Number of data loading worker processes (0 = main process only, "auto" = 90% of CPU affinity)
|
| 137 |
SHUFFLE:
|
|
@@ -145,9 +149,9 @@ parameters:
|
|
| 145 |
MOGE_MODEL:
|
| 146 |
value: "Ruicheng/moge-2-vits-normal" # MoGe model name for normal estimation (HuggingFace format)
|
| 147 |
SURFACE_ROUGHNESS:
|
| 148 |
-
value:
|
| 149 |
INTENSITY:
|
| 150 |
-
value:
|
| 151 |
LIGHT_DISTANCE_RANGE:
|
| 152 |
value: [0.0, 1] # Range for light source distance sampling [min, max] (normalized)
|
| 153 |
LIGHT_LEFT_RIGHT_ANGLE:
|
|
|
|
| 31 |
REASSEMBLE_OUT_CHANNELS: [96,192,384,768] # Output channels for each decoder stage
|
| 32 |
REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
|
| 33 |
READOUT_TYPE: "ignore" # Readout type for DPT decoder
|
| 34 |
+
FROM_PRETRAINED: "highlight_decoder.pt" # Path to pretrained decoder weights (optional)
|
| 35 |
USE_BN: False # Use batch normalization in decoder
|
| 36 |
DROPOUT: 0.1 # Dropout rate in decoder layers
|
| 37 |
OUTPUT_IMAGE_SIZE: [896,896] # Output image resolution [height, width]
|
|
|
|
| 41 |
TOKEN_INPAINTER:
|
| 42 |
TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior" # Token inpainter class name
|
| 43 |
TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
|
| 44 |
+
# FROM_PRETRAINED: "token_inpainter.pth" # Path to pretrained token inpainter weights (optional)
|
| 45 |
+
TOKEN_INPAINTER_LR: 1.0e-4 # Learning rate for token inpainter (can differ from base LR)
|
| 46 |
DEPTH: 6 # Number of transformer blocks
|
| 47 |
HEADS: 16 # Number of attention heads
|
| 48 |
DROP: 0.05 # Dropout rate
|
| 49 |
USE_POSITIONAL_ENCODING: True # Enable 2D sinusoidal positional encodings
|
| 50 |
USE_FINAL_NORM: True # Enable final LayerNorm before output projection
|
| 51 |
USE_LOCAL_PRIOR: True # Blend local mean prior for masked seeds
|
| 52 |
+
LOCAL_PRIOR_WEIGHT: 0.8 # Weight for local prior blending (1.0 = only mask_token, 0.0 = only local mean)
|
| 53 |
LOCAL_PRIOR_KERNEL: 5 # Kernel size for local prior blending (> 1)
|
| 54 |
SEED_NOISE_STD: 0.02 # Standard deviation of noise added to masked seeds during training
|
| 55 |
+
|
| 56 |
+
# FORWARD PASS PARAMETERS
|
| 57 |
+
INPAINT_MASK_THRESHOLD:
|
| 58 |
+
value: 0.2 # Threshold for inpaint mask
|
| 59 |
+
INPAINT_MASK_DILATION:
|
| 60 |
+
value: 40 # Dilation kernel size (pixels) for inpaint mask - Must be odd
|
| 61 |
+
|
| 62 |
USE_TORCH_COMPILE: # Enable PyTorch 2.0 torch.compile for faster training (experimental)
|
| 63 |
value: False
|
| 64 |
DISTRIBUTE:
|
|
|
|
| 132 |
ALL_DATASETS:
|
| 133 |
FEW_IMAGES: False # Override FEW_IMAGES for all datasets (for quick debugging set True)
|
| 134 |
TARGET_SIZE: [896,896] # Override target image size [height, width] for all datasets
|
| 135 |
+
LOAD_RGB_ONLY: True
|
| 136 |
|
| 137 |
BATCH_SIZE: # Max batch size with img size 896 is 32
|
| 138 |
+
value: 4 # Number of samples per batch (adjust based on GPU memory)
|
| 139 |
NUM_WORKERS:
|
| 140 |
value: 12 # Number of data loading worker processes (0 = main process only, "auto" = 90% of CPU affinity)
|
| 141 |
SHUFFLE:
|
|
|
|
| 149 |
MOGE_MODEL:
|
| 150 |
value: "Ruicheng/moge-2-vits-normal" # MoGe model name for normal estimation (HuggingFace format)
|
| 151 |
SURFACE_ROUGHNESS:
|
| 152 |
+
value: 100.0 # Blinn-Phong surface roughness exponent (higher = sharper highlights)
|
| 153 |
INTENSITY:
|
| 154 |
+
value: 0.8 # Specular highlight intensity multiplier
|
| 155 |
LIGHT_DISTANCE_RANGE:
|
| 156 |
value: [0.0, 1] # Range for light source distance sampling [min, max] (normalized)
|
| 157 |
LIGHT_LEFT_RIGHT_ANGLE:
|
configs/highlight_decoder_pretrain.yaml
CHANGED
|
@@ -18,7 +18,7 @@ parameters:
|
|
| 18 |
REASSEMBLE_OUT_CHANNELS: [96,192,384,768] # Output channels for each decoder stage
|
| 19 |
REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
|
| 20 |
READOUT_TYPE: "ignore" # Readout type for DPT decoder
|
| 21 |
-
|
| 22 |
USE_BN: False # Use batch normalization in decoder
|
| 23 |
DROPOUT: 0.1 # Dropout rate in decoder layers
|
| 24 |
OUTPUT_IMAGE_SIZE: [896,896] # Output image resolution [height, width]
|
|
@@ -41,7 +41,9 @@ parameters:
|
|
| 41 |
LOCAL_PRIOR_KERNEL: 5 # Kernel size for local prior blending (> 1)
|
| 42 |
SEED_NOISE_STD: 0.02 # Standard deviation of noise added to masked seeds during training
|
| 43 |
INPAINT_MASK_DILATION:
|
| 44 |
-
value:
|
|
|
|
|
|
|
| 45 |
USE_TORCH_COMPILE: # Enable PyTorch 2.0 torch.compile for faster training (experimental)
|
| 46 |
value: False
|
| 47 |
DISTRIBUTE:
|
|
@@ -69,7 +71,7 @@ parameters:
|
|
| 69 |
|
| 70 |
|
| 71 |
BATCH_SIZE: # Max batch size with img size 896 is 32
|
| 72 |
-
value:
|
| 73 |
NUM_WORKERS:
|
| 74 |
value: 12 # Number of data loading worker processes (0 = main process only, "auto" = 90% of CPU affinity)
|
| 75 |
SHUFFLE:
|
|
@@ -105,7 +107,7 @@ parameters:
|
|
| 105 |
|
| 106 |
### OPTIMIZATION
|
| 107 |
EPOCHS:
|
| 108 |
-
value:
|
| 109 |
LEARNING_RATE:
|
| 110 |
value: 1.0e-4 # Base learning rate for optimizer
|
| 111 |
WEIGHT_DECAY:
|
|
|
|
| 18 |
REASSEMBLE_OUT_CHANNELS: [96,192,384,768] # Output channels for each decoder stage
|
| 19 |
REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
|
| 20 |
READOUT_TYPE: "ignore" # Readout type for DPT decoder
|
| 21 |
+
FROM_PRETRAINED: "highlight_decoder.pt" # Path to pretrained decoder weights (optional)
|
| 22 |
USE_BN: False # Use batch normalization in decoder
|
| 23 |
DROPOUT: 0.1 # Dropout rate in decoder layers
|
| 24 |
OUTPUT_IMAGE_SIZE: [896,896] # Output image resolution [height, width]
|
|
|
|
| 41 |
LOCAL_PRIOR_KERNEL: 5 # Kernel size for local prior blending (> 1)
|
| 42 |
SEED_NOISE_STD: 0.02 # Standard deviation of noise added to masked seeds during training
|
| 43 |
INPAINT_MASK_DILATION:
|
| 44 |
+
value: None # Dilation kernel size (pixels) for inpaint mask (None = compute based on image size)
|
| 45 |
+
INPAINT_MASK_THRESHOLD:
|
| 46 |
+
value: 0.2 # Inpaint selection threshold
|
| 47 |
USE_TORCH_COMPILE: # Enable PyTorch 2.0 torch.compile for faster training (experimental)
|
| 48 |
value: False
|
| 49 |
DISTRIBUTE:
|
|
|
|
| 71 |
|
| 72 |
|
| 73 |
BATCH_SIZE: # Max batch size with img size 896 is 32
|
| 74 |
+
value: 16 # Number of samples per batch (adjust based on GPU memory)
|
| 75 |
NUM_WORKERS:
|
| 76 |
value: 12 # Number of data loading worker processes (0 = main process only, "auto" = 90% of CPU affinity)
|
| 77 |
SHUFFLE:
|
|
|
|
| 107 |
|
| 108 |
### OPTIMIZATION
|
| 109 |
EPOCHS:
|
| 110 |
+
value: 40 # Maximum number of training epochs
|
| 111 |
LEARNING_RATE:
|
| 112 |
value: 1.0e-4 # Base learning rate for optimizer
|
| 113 |
WEIGHT_DECAY:
|
configs/pretrained_config.yaml
CHANGED
|
@@ -18,7 +18,7 @@ parameters:
|
|
| 18 |
REASSEMBLE_OUT_CHANNELS: [768,1024,1536,2048] # Output channels for each decoder stage (DPT-style reassembly)
|
| 19 |
REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
|
| 20 |
READOUT_TYPE: "ignore" # Readout type for DPT decoder ("ignore", "project", etc.)
|
| 21 |
-
|
| 22 |
USE_BN: False # Use batch normalization in decoder
|
| 23 |
DROPOUT: 0.1 # Dropout rate in decoder layers
|
| 24 |
OUTPUT_IMAGE_SIZE: [896,896] # Output image resolution [height, width]
|
|
@@ -31,6 +31,7 @@ parameters:
|
|
| 31 |
REASSEMBLE_OUT_CHANNELS: [96,192,384,768] # Output channels for each decoder stage
|
| 32 |
REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
|
| 33 |
READOUT_TYPE: "ignore" # Readout type for DPT decoder
|
|
|
|
| 34 |
USE_BN: False # Use batch normalization in decoder
|
| 35 |
DROPOUT: 0.1 # Dropout rate in decoder layers
|
| 36 |
OUTPUT_IMAGE_SIZE: [896,896] # Output image resolution [height, width]
|
|
|
|
| 18 |
REASSEMBLE_OUT_CHANNELS: [768,1024,1536,2048] # Output channels for each decoder stage (DPT-style reassembly)
|
| 19 |
REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
|
| 20 |
READOUT_TYPE: "ignore" # Readout type for DPT decoder ("ignore", "project", etc.)
|
| 21 |
+
FROM_PRETRAINED: "diffuse_decoder.pt" # Path to pretrained decoder weights (optional)
|
| 22 |
USE_BN: False # Use batch normalization in decoder
|
| 23 |
DROPOUT: 0.1 # Dropout rate in decoder layers
|
| 24 |
OUTPUT_IMAGE_SIZE: [896,896] # Output image resolution [height, width]
|
|
|
|
| 31 |
REASSEMBLE_OUT_CHANNELS: [96,192,384,768] # Output channels for each decoder stage
|
| 32 |
REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
|
| 33 |
READOUT_TYPE: "ignore" # Readout type for DPT decoder
|
| 34 |
+
FROM_PRETRAINED: "highlight_decoder.pt" # Path to pretrained decoder weights (optional)
|
| 35 |
USE_BN: False # Use batch normalization in decoder
|
| 36 |
DROPOUT: 0.1 # Dropout rate in decoder layers
|
| 37 |
OUTPUT_IMAGE_SIZE: [896,896] # Output image resolution [height, width]
|
configs/tokeninp_pretrain.yaml
CHANGED
|
@@ -31,6 +31,7 @@ parameters:
|
|
| 31 |
REASSEMBLE_OUT_CHANNELS: [96,192,384,768] # Output channels for each decoder stage
|
| 32 |
REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
|
| 33 |
READOUT_TYPE: "ignore" # Readout type for DPT decoder
|
|
|
|
| 34 |
USE_BN: False # Use batch normalization in decoder
|
| 35 |
DROPOUT: 0.1 # Dropout rate in decoder layers
|
| 36 |
OUTPUT_IMAGE_SIZE: [896,896] # Output image resolution [height, width]
|
|
@@ -51,8 +52,14 @@ parameters:
|
|
| 51 |
LOCAL_PRIOR_WEIGHT: 0.25 # Weight for local prior blending (1.0 = only mask_token, 0.0 = only local mean)
|
| 52 |
LOCAL_PRIOR_KERNEL: 5 # Kernel size for local prior blending (> 1)
|
| 53 |
SEED_NOISE_STD: 0.02 # Standard deviation of noise added to masked seeds during training
|
| 54 |
-
|
| 55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
USE_TORCH_COMPILE: # Enable PyTorch 2.0 torch.compile for faster training (experimental)
|
| 57 |
value: False
|
| 58 |
DISTRIBUTE:
|
|
@@ -61,89 +68,72 @@ parameters:
|
|
| 61 |
### DATA
|
| 62 |
DATASETS:
|
| 63 |
value:
|
|
|
|
| 64 |
SCRREAM:
|
| 65 |
-
VAL_SCENES: ["
|
| 66 |
-
TARGET_SIZE: [896,896] # Target image size [height, width] in pixels
|
| 67 |
RESIZE_MODE: "resize+crop" # Image resizing mode: "resize", "crop", "resize+crop", or "pad"
|
| 68 |
-
|
| 69 |
-
SAMPLE_EVERY_N: 2 # Load every Nth frame from each scene (1 = all frames, 4 = every 4th frame)
|
| 70 |
-
LOAD_RGB_ONLY: True # If True, ignore polarization data and load only RGB images
|
| 71 |
|
| 72 |
HOUSECAT6D:
|
| 73 |
VAL_SCENES: ["val_scene1","val_scene2"] # Validation scene names
|
| 74 |
-
TARGET_SIZE: [896,896] # Target image size [height, width]
|
| 75 |
RESIZE_MODE: "resize+crop" # Image resizing mode
|
| 76 |
-
|
| 77 |
-
SAMPLE_EVERY_N: 2 # Load every Nth frame
|
| 78 |
-
LOAD_RGB_ONLY: True # Ignore polarization data if True
|
| 79 |
|
| 80 |
CROMO:
|
| 81 |
TRAIN_SCENES: ["kitchen"] # Training scene names (list or string)
|
| 82 |
# VAL_SCENES: "station" # Validation scene names (optional)
|
| 83 |
-
TARGET_SIZE: [896,896] # Target image size [height, width]
|
| 84 |
RESIZE_MODE: "resize" # Image resizing mode
|
| 85 |
-
|
| 86 |
SAMPLE_EVERY_N: 2 # Load every Nth frame
|
| 87 |
-
LOAD_RGB_ONLY: True # Ignore polarization data if True
|
| 88 |
|
| 89 |
PSD:
|
| 90 |
TRAIN_SCENES: "PSD_Train" # Training scene name (string or list)
|
| 91 |
VAL_SCENES: "PSD_Val" # Validation scene name (string or list)
|
| 92 |
-
TARGET_SIZE: [896,896] # Target image size [height, width]
|
| 93 |
RESIZE_MODE: "resize+crop" # Image resizing mode
|
| 94 |
-
|
| 95 |
SAMPLE_EVERY_N: 1 # Load every Nth frame (1 = all frames)
|
| 96 |
-
LOAD_RGB_ONLY: True # Ignore polarization data if True
|
| 97 |
|
| 98 |
SCARED:
|
| 99 |
VAL_SCENES: ["v22","v23","v24","v25","v26","v27","v28","v29","v30","v31","v32","v33","v34"] # Validation scene names
|
| 100 |
-
TARGET_SIZE: [896,896] # Target image size [height, width]
|
| 101 |
RESIZE_MODE: "resize+crop" # Image resizing mode
|
| 102 |
SAMPLE_EVERY_N: 8 # Load every Nth frame
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
HIGHLIGHT_ENABLE: False # Enable highlight detection/processing in dataset
|
| 106 |
-
HIGHLIGHT_BRIGHTNESS_THRESHOLD: 0.9 # Brightness threshold for highlight detection (0-1)
|
| 107 |
-
HIGHLIGHT_RETURN_MASK: True # Return highlight mask in dataset output
|
| 108 |
-
HIGHLIGHT_RECT_SIZE: [1000, 1000] # Size of highlight rectangle region [height, width]
|
| 109 |
-
HIGHLIGHT_RETURN_RECT_AS_RGB: False # Return highlight rectangle as RGB if True
|
| 110 |
-
HIGHLIGHT_RETURN_RECT: True # Return highlight rectangle region if True
|
| 111 |
-
|
| 112 |
STEREOMIS_TRACKING:
|
| 113 |
VAL_SCENES: ["P2_2"] # Validation scene names
|
| 114 |
-
TARGET_SIZE: [896,896] # Target image size [height, width]
|
| 115 |
RESIZE_MODE: "resize+crop" # Image resizing mode
|
| 116 |
-
SAMPLE_EVERY_N:
|
| 117 |
-
LOAD_RGB_ONLY: True # Ignore polarization data if True
|
| 118 |
-
FEW_IMAGES: False # Load only first 10 images if True
|
| 119 |
-
HIGHLIGHT_ENABLE: False # Enable highlight detection/processing
|
| 120 |
-
HIGHLIGHT_BRIGHTNESS_THRESHOLD: 0.9 # Brightness threshold for highlight detection
|
| 121 |
-
HIGHLIGHT_RETURN_MASK: True # Return highlight mask in dataset output
|
| 122 |
-
HIGHLIGHT_RECT_SIZE: [800, 800] # Size of highlight rectangle region
|
| 123 |
-
HIGHLIGHT_RETURN_RECT_AS_RGB: False # Return highlight rectangle as RGB if True
|
| 124 |
-
HIGHLIGHT_RETURN_RECT: True # Return highlight rectangle region if True
|
| 125 |
|
| 126 |
CHOLEC80:
|
| 127 |
-
|
| 128 |
-
|
| 129 |
RESIZE_MODE: "resize+crop" # Image resizing mode
|
| 130 |
-
SAMPLE_EVERY_N:
|
| 131 |
-
LOAD_RGB_ONLY: True # Ignore polarization data if True
|
| 132 |
-
|
| 133 |
-
HIGHLIGHT_ENABLE: False # Enable highlight detection/processing
|
| 134 |
-
HIGHLIGHT_BRIGHTNESS_THRESHOLD: 0.9 # Brightness threshold for highlight detection
|
| 135 |
-
HIGHLIGHT_RETURN_MASK: True # Return highlight mask in dataset output
|
| 136 |
-
HIGHLIGHT_RECT_SIZE: [800, 800] # Size of highlight rectangle region
|
| 137 |
-
HIGHLIGHT_RETURN_RECT_AS_RGB: False # Return highlight rectangle as RGB if True
|
| 138 |
-
HIGHLIGHT_RETURN_RECT: True # Return highlight rectangle region if True
|
| 139 |
|
| 140 |
SUNRGBD:
|
| 141 |
VAL_SCENES: ["realsense"] # Validation scene names
|
| 142 |
-
TARGET_SIZE: [896,896] # Target image size [height, width]
|
| 143 |
RESIZE_MODE: "resize+crop" # Image resizing mode
|
| 144 |
SAMPLE_EVERY_N: 4 # Load every Nth frame
|
| 145 |
-
|
| 146 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
|
| 148 |
FEW_IMAGES_ALL_DATASETS:
|
| 149 |
value: False # If True, override all datasets' FEW_IMAGES to True (for quick debugging across all datasets)
|
|
@@ -163,9 +153,9 @@ parameters:
|
|
| 163 |
MOGE_MODEL:
|
| 164 |
value: "Ruicheng/moge-2-vits-normal" # MoGe model name for normal estimation (HuggingFace format)
|
| 165 |
SURFACE_ROUGHNESS:
|
| 166 |
-
value:
|
| 167 |
INTENSITY:
|
| 168 |
-
value:
|
| 169 |
LIGHT_DISTANCE_RANGE:
|
| 170 |
value: [0.0, 1] # Range for light source distance sampling [min, max] (normalized)
|
| 171 |
LIGHT_LEFT_RIGHT_ANGLE:
|
|
|
|
| 31 |
REASSEMBLE_OUT_CHANNELS: [96,192,384,768] # Output channels for each decoder stage
|
| 32 |
REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
|
| 33 |
READOUT_TYPE: "ignore" # Readout type for DPT decoder
|
| 34 |
+
FROM_PRETRAINED: "highlight_decoder.pt" # Path to pretrained decoder weights (optional)
|
| 35 |
USE_BN: False # Use batch normalization in decoder
|
| 36 |
DROPOUT: 0.1 # Dropout rate in decoder layers
|
| 37 |
OUTPUT_IMAGE_SIZE: [896,896] # Output image resolution [height, width]
|
|
|
|
| 52 |
LOCAL_PRIOR_WEIGHT: 0.25 # Weight for local prior blending (1.0 = only mask_token, 0.0 = only local mean)
|
| 53 |
LOCAL_PRIOR_KERNEL: 5 # Kernel size for local prior blending (> 1)
|
| 54 |
SEED_NOISE_STD: 0.02 # Standard deviation of noise added to masked seeds during training
|
| 55 |
+
|
| 56 |
+
# FORWARD PASS PARAMETERS
|
| 57 |
+
INPAINT_MASK_THRESHOLD:
|
| 58 |
+
value: 0.2 # Threshold for inpaint mask
|
| 59 |
+
INPAINT_MASK_DILATION:
|
| 60 |
+
value: -1 # Dilation kernel size (pixels) for inpaint mask - Must be odd
|
| 61 |
+
|
| 62 |
+
|
| 63 |
USE_TORCH_COMPILE: # Enable PyTorch 2.0 torch.compile for faster training (experimental)
|
| 64 |
value: False
|
| 65 |
DISTRIBUTE:
|
|
|
|
| 68 |
### DATA
|
| 69 |
DATASETS:
|
| 70 |
value:
|
| 71 |
+
# Reserved key: key-value pairs here override the same keys for every dataset (per-dataset entries still override this).
|
| 72 |
SCRREAM:
|
| 73 |
+
VAL_SCENES: ["scene10", "scene04"] # List of validation scene names
|
|
|
|
| 74 |
RESIZE_MODE: "resize+crop" # Image resizing mode: "resize", "crop", "resize+crop", or "pad"
|
| 75 |
+
SAMPLE_EVERY_N: 6 # Load every Nth frame from each scene (1 = all frames, 4 = every 4th frame)
|
|
|
|
|
|
|
| 76 |
|
| 77 |
HOUSECAT6D:
|
| 78 |
VAL_SCENES: ["val_scene1","val_scene2"] # Validation scene names
|
|
|
|
| 79 |
RESIZE_MODE: "resize+crop" # Image resizing mode
|
| 80 |
+
SAMPLE_EVERY_N: 4 # Load every Nth frame
|
|
|
|
|
|
|
| 81 |
|
| 82 |
CROMO:
|
| 83 |
TRAIN_SCENES: ["kitchen"] # Training scene names (list or string)
|
| 84 |
# VAL_SCENES: "station" # Validation scene names (optional)
|
|
|
|
| 85 |
RESIZE_MODE: "resize" # Image resizing mode
|
|
|
|
| 86 |
SAMPLE_EVERY_N: 2 # Load every Nth frame
|
|
|
|
| 87 |
|
| 88 |
PSD:
|
| 89 |
TRAIN_SCENES: "PSD_Train" # Training scene name (string or list)
|
| 90 |
VAL_SCENES: "PSD_Val" # Validation scene name (string or list)
|
|
|
|
| 91 |
RESIZE_MODE: "resize+crop" # Image resizing mode
|
|
|
|
| 92 |
SAMPLE_EVERY_N: 1 # Load every Nth frame (1 = all frames)
|
|
|
|
| 93 |
|
| 94 |
SCARED:
|
| 95 |
VAL_SCENES: ["v22","v23","v24","v25","v26","v27","v28","v29","v30","v31","v32","v33","v34"] # Validation scene names
|
|
|
|
| 96 |
RESIZE_MODE: "resize+crop" # Image resizing mode
|
| 97 |
SAMPLE_EVERY_N: 8 # Load every Nth frame
|
| 98 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
STEREOMIS_TRACKING:
|
| 100 |
VAL_SCENES: ["P2_2"] # Validation scene names
|
|
|
|
| 101 |
RESIZE_MODE: "resize+crop" # Image resizing mode
|
| 102 |
+
SAMPLE_EVERY_N: 2 # Load every Nth frame
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
|
| 104 |
CHOLEC80:
|
| 105 |
+
TRAIN_SCENES: ["train"] # Training scene names
|
| 106 |
+
VAL_SCENES: ["test"] # Validation scene names
|
| 107 |
RESIZE_MODE: "resize+crop" # Image resizing mode
|
| 108 |
+
SAMPLE_EVERY_N: 40 # Load every Nth frame
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
|
| 110 |
SUNRGBD:
|
| 111 |
VAL_SCENES: ["realsense"] # Validation scene names
|
|
|
|
| 112 |
RESIZE_MODE: "resize+crop" # Image resizing mode
|
| 113 |
SAMPLE_EVERY_N: 4 # Load every Nth frame
|
| 114 |
+
|
| 115 |
+
# SCANNET:
|
| 116 |
+
# TRAIN_SCENES: ["train"]
|
| 117 |
+
# VAL_SCENES: ["val"]
|
| 118 |
+
# RESIZE_MODE: "resize+crop"
|
| 119 |
+
# SAMPLE_EVERY_N: 5
|
| 120 |
+
|
| 121 |
+
# OPENIMAGESV7:
|
| 122 |
+
# TRAIN_SCENES: ["thescene"]
|
| 123 |
+
# # VAL_SCENES: [""]
|
| 124 |
+
# RESIZE_MODE: "resize+crop"
|
| 125 |
+
# SAMPLE_EVERY_N: 5
|
| 126 |
+
|
| 127 |
+
# ENDOSYNTH:
|
| 128 |
+
# TRAIN_SCENES: ["scene"]
|
| 129 |
+
# # VAL_SCENES: ["val"]
|
| 130 |
+
# RESIZE_MODE: "resize+crop"
|
| 131 |
+
# SAMPLE_EVERY_N: 1
|
| 132 |
+
|
| 133 |
+
ALL_DATASETS:
|
| 134 |
+
FEW_IMAGES: False # Override FEW_IMAGES for all datasets (for quick debugging set True)
|
| 135 |
+
TARGET_SIZE: [896,896] # Override target image size [height, width] for all datasets
|
| 136 |
+
LOAD_RGB_ONLY: True
|
| 137 |
|
| 138 |
FEW_IMAGES_ALL_DATASETS:
|
| 139 |
value: False # If True, override all datasets' FEW_IMAGES to True (for quick debugging across all datasets)
|
|
|
|
| 153 |
MOGE_MODEL:
|
| 154 |
value: "Ruicheng/moge-2-vits-normal" # MoGe model name for normal estimation (HuggingFace format)
|
| 155 |
SURFACE_ROUGHNESS:
|
| 156 |
+
value: 100.0 # Blinn-Phong surface roughness exponent (higher = sharper highlights)
|
| 157 |
INTENSITY:
|
| 158 |
+
value: 0.8 # Specular highlight intensity multiplier
|
| 159 |
LIGHT_DISTANCE_RANGE:
|
| 160 |
value: [0.0, 1] # Range for light source distance sampling [min, max] (normalized)
|
| 161 |
LIGHT_LEFT_RIGHT_ANGLE:
|