# Smoke-test script: download the MegaBlocks kernels and run one MoE MLP forward pass on CUDA.
from collections import namedtuple
from types import SimpleNamespace

import torch
from kernels import get_kernel
# Pin both the CPU and CUDA RNG streams so every run is reproducible.
for _seed_fn in (torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(42)
# Fetch the pre-built MegaBlocks MoE kernels from the community hub repo.
# This hits the network on first use, then serves from the local cache.
megablocks = get_kernel("kernels-community/megablocks")
print("MegaBlocks kernel downloaded successfully.")
# Instantiate the MoE MLP and attach a mutable container for its expert
# parameters. The original assigned the namedtuple *class* itself (never an
# instance), and its field list did not match the attributes actually set
# later (`gate_down_proj` was never used; the two `_bias` attributes and the
# ones set below were added as class attributes). SimpleNamespace is the
# idiomatic attribute bag and supports the same get/set access pattern.
model = megablocks.layers.MegaBlocksMoeMLP()
model.experts = SimpleNamespace()
print("MegaBlocksMoeMLP instance created successfully.")
# Model dimensions used throughout the script.
ne = 128    # number of experts
hs = 1152   # hidden size (model width)
isz = 3072  # intermediate size of the fused gate+up projection
# Router: a linear layer mapping each hidden state to per-expert logits.
# Built locally, initialized, then attached to the model (same RNG call
# order as before, so the weights are bit-identical under the fixed seed).
router = torch.nn.Linear(hs, ne, device="cuda")
torch.nn.init.kaiming_uniform_(router.weight)
model.router = router
# Populate the expert weights and biases on the experts container.
# gate_up_proj appears to fuse the gate and up projections (output width
# isz), so the down projection consumes half that width after the gating
# split — derive isz // 2 (== 1536) instead of hard-coding the constant.
# NOTE(review): the gate/up split is inferred from the 1536 == 3072 / 2
# relationship; confirm against the MegaBlocks kernel's expected layout.
e = model.experts
e.gate_up_proj = torch.nn.Parameter(torch.randn(ne, hs, isz, device="cuda") * 0.02)
e.gate_up_proj_bias = torch.nn.Parameter(torch.zeros(ne, isz, device="cuda"))
e.down_proj = torch.nn.Parameter(torch.randn(ne, isz // 2, hs, device="cuda") * 0.02)
e.down_proj_bias = torch.nn.Parameter(torch.zeros(ne, hs, device="cuda"))
e.hidden_size = hs
print("Expert layers initialized successfully.")
# Single-token smoke input: batch=1, sequence length=1, width hs,
# scaled down so activations stay in a small range.
x = 0.1 * torch.randn(1, 1, hs, device="cuda")
output, expert_weights = model(x)
print("Model forward pass completed successfully.")
# Summarize the forward-pass result: shape, value range, a short preview
# of the output, and the routing-weight mass.
out_min, out_max = output.min(), output.max()
preview = output.flatten()[:10]
print(f"Output shape: {output.shape}")
print(f"Output range: [{out_min:.3f}, {out_max:.3f}]")
print(f"Output: {preview}")
print(f"Expert weights sum: {expert_weights.sum():.3f}")