| import torch |
| import torch.nn as nn |
| from transformers import PreTrainedModel |
| from .configuration_pixel import TopAIImageConfig |
|
|
class ResidualBlock(nn.Module):
    """Two 3x3 conv + BatchNorm stages wrapped in an identity skip connection.

    Channel count and spatial size are preserved, so the input can be added
    directly to the transformed features: ``out = x + f(x)``.
    """

    def __init__(self, channels):
        super().__init__()
        # Keep everything inside a single ``block`` Sequential so state_dict
        # keys stay identical to earlier checkpoints of this module.
        layers = [
            nn.Conv2d(channels, channels, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(channels),
            nn.ReLU(True),
            nn.Conv2d(channels, channels, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(channels),
        ]
        self.block = nn.Sequential(*layers)

    def forward(self, x):
        """Return ``x`` plus its residual transformation."""
        residual = self.block(x)
        return x + residual
|
|
class TopAIImageGenerator(PreTrainedModel):
    """Decode a text embedding into an image via transposed-conv upsampling.

    A linear projection maps the embedding onto a 4x4 feature map with
    ``config.hidden_dim`` channels; the decoder then doubles the spatial
    size five times (4 -> 8 -> 16 -> 32 -> 64 -> 128) and emits
    ``config.image_channels`` channels squashed to [-1, 1] by Tanh.
    """

    config_class = TopAIImageConfig
    all_tied_weights_keys = {}

    def __init__(self, config):
        super().__init__(config)
        hidden = config.hidden_dim

        # Text embedding -> flattened 4x4 seed feature map.
        self.text_projection = nn.Linear(config.input_dim, 4 * 4 * hidden)

        # Preserve the exact child order of the original Sequential so the
        # numeric state_dict keys remain checkpoint-compatible.
        stages = [
            self._upsample(hidden, hidden),
            ResidualBlock(hidden),
            self._upsample(hidden, 256),
            ResidualBlock(256),
            self._upsample(256, 128),
            self._upsample(128, 64),
            nn.ConvTranspose2d(64, 32, 4, 2, 1, bias=False),
            nn.BatchNorm2d(32),
            nn.ReLU(True),
            nn.Conv2d(32, config.image_channels, kernel_size=3, padding=1),
            nn.Tanh(),
        ]
        self.decoder = nn.Sequential(*stages)

    def _upsample(self, in_channels, out_channels):
        """One 2x upsampling stage: ConvTranspose2d -> BatchNorm -> ReLU."""
        return nn.Sequential(
            nn.ConvTranspose2d(in_channels, out_channels, 4, 2, 1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(True),
        )

    def forward(self, text_embeddings):
        """Map ``text_embeddings`` to generated image tensors in [-1, 1].

        NOTE(review): assumes the last dimension is ``config.input_dim``;
        any leading dimensions are flattened into the batch axis by the
        ``view`` below — confirm callers pass (batch, input_dim).
        """
        projected = self.text_projection(text_embeddings)
        seed = projected.view(-1, self.config.hidden_dim, 4, 4)
        return self.decoder(seed)