| """ | |
| AETHER-Net: Main Model | |
| Adaptive Elemental Transformer-Hybrid Efficient Recurrent Network | |
| 25-layer hybrid LLM with 5Γ5 Latin orthogonal magic square layout | |
| and Oheng (δΊθ‘) MoE routing. | |
| """ | |
import torch
import torch.nn as nn
from typing import Dict, List, Optional, Tuple

from config import AetherNetConfig, ELEMENTS, LAYER_TO_ELEMENT, ELEMENT_LAYERS
from layers import RMSNorm, build_attention
from oheng_moe import OhengMoE


class AetherNetBlock(nn.Module):
    """Single AETHER-Net transformer block.

    Structure:
        x → RMSNorm → Attention → residual → RMSNorm → OhengMoE → residual → out
    """

    def __init__(self, config: AetherNetConfig, layer_idx: int):
        super().__init__()
        self.layer_idx = layer_idx
        self.layer_type = config.get_layer_type(layer_idx)
        self.element = config.get_layer_element(layer_idx)

        # Pre-norm
        self.input_layernorm = RMSNorm(config.hidden_size, config.rms_norm_eps)
        self.post_attention_layernorm = RMSNorm(config.hidden_size, config.rms_norm_eps)

        # Attention (type determined by magic square)
        self.attention = build_attention(self.layer_type, config)

        # MoE FFN with Oheng routing
        self.moe = OhengMoE(config, layer_idx)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        element_states: Optional[Dict[str, torch.Tensor]] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        # Attention block with residual
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        hidden_states = self.attention(
            hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            encoder_hidden_states=encoder_hidden_states,
        )
        hidden_states = residual + hidden_states

        # MoE FFN block with residual
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.moe(hidden_states, element_states=element_states)
        hidden_states = residual + hidden_states

        return hidden_states
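
# Usage sketch for a single block (illustrative only; the shapes are an assumption
# based on the usual [batch, seq_len, hidden_size] convention used in this file):
#   block = AetherNetBlock(config, layer_idx=0)
#   y = block(x)  # x: [B, L, hidden_size] -> y: [B, L, hidden_size]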


class AetherNetModel(nn.Module):
    """AETHER-Net Language Model.

    Architecture:
        - Embedding → 25 × AetherNetBlock → RMSNorm → LM Head
        - Blocks arranged in a 5×5 Latin orthogonal magic square
        - Oheng MoE with sangsaeng (상생, generating) and sanggeuk (상극, overcoming) connections
        - Element states flow between element groups for structural self-verification
    """

    def __init__(self, config: AetherNetConfig):
        super().__init__()
        self.config = config

        # Token embedding
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)

        # 25 transformer blocks
        self.layers = nn.ModuleList([
            AetherNetBlock(config, layer_idx=i)
            for i in range(config.num_layers)
        ])

        # Final norm
        self.norm = RMSNorm(config.hidden_size, config.rms_norm_eps)

        # LM Head
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Weight tying
        if config.tie_word_embeddings:
            self.lm_head.weight = self.embed_tokens.weight

        # Initialize
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
    ) -> Dict[str, torch.Tensor]:
        B, L = input_ids.shape

        # Position IDs
        if position_ids is None:
            position_ids = torch.arange(L, device=input_ids.device).unsqueeze(0).expand(B, -1)

        # Embed
        hidden_states = self.embed_tokens(input_ids)

        # ── Element state tracking for Oheng connections ──
        # Each element group accumulates its output for generating/overcoming
        # (sangsaeng/sanggeuk) routing
        element_states: Dict[str, torch.Tensor] = {}
        element_layer_counts: Dict[str, int] = {e: 0 for e in ELEMENTS}

        # ── Forward through 25 layers ──
        for i, layer in enumerate(self.layers):
            element = LAYER_TO_ELEMENT[i]
            hidden_states = layer(
                hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                element_states=element_states,
                encoder_hidden_states=encoder_hidden_states,
            )

            # Update element state (running average of this element's layer outputs)
            element_layer_counts[element] += 1
            count = element_layer_counts[element]
            if element in element_states:
                # Running (cumulative) mean of this element's layer outputs
                element_states[element] = (
                    element_states[element] * (count - 1) / count
                    + hidden_states.detach() / count
                )
            else:
                element_states[element] = hidden_states.detach()
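
            # After this update, element_states[element] equals the plain mean
            # (1/k) * sum of the detached outputs of the k layers of this element
            # seen so far; detach() keeps the bookkeeping out of the autograd graph.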

        # Final norm
        hidden_states = self.norm(hidden_states)

        # LM Head
        logits = self.lm_head(hidden_states)

        # Loss
        loss = None
        if labels is not None:
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss = nn.functional.cross_entropy(
                shift_logits.view(-1, self.config.vocab_size),
                shift_labels.view(-1),
                ignore_index=-100,
            )

        return {
            "loss": loss,
            "logits": logits,
            "element_states": element_states,
        }

    def count_parameters(self) -> Dict[str, int]:
        """Count parameters by component."""
        counts = {
            "embedding": sum(p.numel() for p in self.embed_tokens.parameters()),
            "lm_head": sum(p.numel() for p in self.lm_head.parameters()),
            "norm": sum(p.numel() for p in self.norm.parameters()),
        }
        attn_total = 0
        moe_total = 0
        generate_total = 0
        overcome_total = 0
        for layer in self.layers:
            attn_total += sum(p.numel() for p in layer.attention.parameters())
            attn_total += sum(p.numel() for p in layer.input_layernorm.parameters())
            attn_total += sum(p.numel() for p in layer.post_attention_layernorm.parameters())
            moe_total += sum(p.numel() for p in layer.moe.experts.parameters())
            moe_total += sum(p.numel() for p in layer.moe.shared_expert.parameters())
            moe_total += sum(p.numel() for p in layer.moe.router.parameters())
            if layer.moe.generate_boost is not None:
                generate_total += sum(p.numel() for p in layer.moe.generate_boost.parameters())
            if layer.moe.overcome_gate is not None:
                overcome_total += sum(p.numel() for p in layer.moe.overcome_gate.parameters())
        counts["attention_layers"] = attn_total
        counts["moe_experts"] = moe_total
        counts["oheng_generate"] = generate_total
        counts["oheng_overcome"] = overcome_total
| counts["total"] = sum(counts.values()) | |
| return counts | |

    def get_layer_map(self) -> List[Dict]:
        """Return human-readable layer map for diagnostics."""
        result = []
        for i, layer in enumerate(self.layers):
            result.append({
                "layer": i,
                "type": layer.layer_type,
                "element": layer.element,
                "element_idx": ELEMENTS.index(layer.element),
                "phase": i % 5,
                "attn_class": layer.attention.__class__.__name__,
            })
        return result
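

# Minimal smoke-test sketch (illustrative only). It assumes AetherNetConfig can be
# constructed with default values; the field names used here (vocab_size, num_layers)
# are the ones already referenced above, but their defaults are not guaranteed.
if __name__ == "__main__":
    config = AetherNetConfig()
    model = AetherNetModel(config)

    # Parameter and layer-map diagnostics
    print(model.count_parameters()["total"])
    for entry in model.get_layer_map()[:5]:
        print(entry)

    # Tiny forward pass with teacher forcing (labels = inputs)
    input_ids = torch.randint(0, config.vocab_size, (2, 16))
    outputs = model(input_ids=input_ids, labels=input_ids)
    print(outputs["logits"].shape, outputs["loss"])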