# ml_trainer_env/server/models_nn.py
# (Hugging Face upload metadata: uploaded via huggingface_hub, commit 8f24287 verified)
"""
Neural network architectures for the ML Training Optimizer environment.
Provides three model architectures of increasing complexity:
- SimpleMLP: For MNIST (Easy task)
- SmallCNN: For FashionMNIST (Medium task)
- DeeperCNN: For CIFAR-10 (Hard task)
All models accept a configurable dropout rate set by the agent.
"""
import torch.nn as nn
class SimpleMLP(nn.Module):
    """
    Two-hidden-layer MLP for MNIST digit classification.

    ~100k parameters; roughly 0.5-1s per epoch on CPU with 4k samples.
    Layer sizes: 784 -> 128 -> 64 -> 10, with ReLU + Dropout after each
    hidden layer. The dropout probability is chosen by the agent.
    """

    def __init__(self, dropout: float = 0.0):
        super().__init__()
        self.flatten = nn.Flatten()
        # Build the stack as a flat list, then unpack into a Sequential so
        # the submodule indices (and thus state_dict keys) stay stable.
        stack = [
            nn.Linear(784, 128), nn.ReLU(), nn.Dropout(dropout),
            nn.Linear(128, 64), nn.ReLU(), nn.Dropout(dropout),
            nn.Linear(64, 10),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        # Flatten (N, 1, 28, 28) images to (N, 784) before the MLP.
        return self.layers(self.flatten(x))
class SmallCNN(nn.Module):
    """
    Compact CNN for FashionMNIST classification.

    ~200k parameters; roughly 1-2s per epoch on CPU with 6.5k samples.
    Two Conv->ReLU->MaxPool stages (28x28 -> 14x14 -> 7x7) feed a
    128-unit fully connected head with agent-configurable dropout.
    """

    def __init__(self, dropout: float = 0.0):
        super().__init__()
        # Assemble both conv stages from a channel plan; unpacking into a
        # single Sequential keeps submodule indices (state_dict keys) identical.
        stages = []
        for in_ch, out_ch in ((1, 32), (32, 64)):
            stages += [
                nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.MaxPool2d(2),
            ]
        self.features = nn.Sequential(*stages)
        head = [
            nn.Flatten(),
            nn.Linear(64 * 7 * 7, 128),  # 7x7 spatial map after two 2x pools
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 10),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        return self.classifier(self.features(x))
class DeeperCNN(nn.Module):
    """
    Three-stage CNN for CIFAR-10 classification.

    ~500k parameters; roughly 2-3s per epoch on CPU with 8k samples.
    Each stage is Conv->BatchNorm->ReLU->MaxPool, halving the spatial
    size (32 -> 16 -> 8 -> 4) while widening channels (3 -> 32 -> 64 -> 128).
    A 256-unit FC head with agent-configurable dropout produces the logits.
    """

    def __init__(self, dropout: float = 0.0):
        super().__init__()
        # Channel plan for the three conv stages; building one flat list and
        # unpacking keeps Sequential indices (state_dict keys) unchanged.
        plan = ((3, 32), (32, 64), (64, 128))
        stages = []
        for in_ch, out_ch in plan:
            stages += [
                nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1),
                nn.BatchNorm2d(out_ch),
                nn.ReLU(),
                nn.MaxPool2d(2),
            ]
        self.features = nn.Sequential(*stages)
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128 * 4 * 4, 256),  # 4x4 spatial map after three 2x pools
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, 10),
        )

    def forward(self, x):
        return self.classifier(self.features(x))
def create_model(model_type: str, dropout: float = 0.0) -> nn.Module:
    """
    Factory: instantiate a model architecture by registry name.

    Args:
        model_type: one of "simple_mlp", "small_cnn", "deeper_cnn".
        dropout: dropout probability forwarded to the model constructor.

    Raises:
        ValueError: if `model_type` is not a known registry key.
    """
    registry = {
        "simple_mlp": SimpleMLP,
        "small_cnn": SmallCNN,
        "deeper_cnn": DeeperCNN,
    }
    try:
        model_cls = registry[model_type]
    except KeyError:
        # `from None` keeps the traceback clean; the message already names
        # the bad key and the valid choices.
        raise ValueError(
            f"Unknown model type: {model_type}. Choose from {list(registry.keys())}"
        ) from None
    return model_cls(dropout=dropout)