Upload folder using huggingface_hub

- README.md +129 -0
- config.json +57 -0
- config.py +79 -0
- generation_config.json +4 -0
- model.safetensors +3 -0

README.md
ADDED
@@ -0,0 +1,129 @@
---
license: apache-2.0
base_model:
- Qwen/Qwen3-235B-A22B
tags:
- redhat
- neuralmagic
- qwen
- speculators
- eagle3
---

# Qwen3-235B-A22B-speculator.eagle3

## Model Overview
- **Verifier:** Qwen/Qwen3-235B-A22B
- **Speculative Decoding Algorithm:** EAGLE-3
- **Model Architecture:** Eagle3Speculator
- **Release Date:** 01/15/2026
- **Version:** 1.0
- **Model Developers:** Red Hat

This is a speculator model designed for use with [Qwen/Qwen3-235B-A22B](https://huggingface.co/Qwen/Qwen3-235B-A22B), based on the [EAGLE-3](https://arxiv.org/abs/2503.01840) speculative decoding algorithm.
It was trained using the [speculators](https://github.com/neuralmagic/speculators) library on a combination of the [Magpie-Align/Magpie-Llama-3.1-Pro-300K-Filtered](https://huggingface.co/datasets/Magpie-Align/Magpie-Llama-3.1-Pro-300K-Filtered) dataset and the `train_sft` split of the [HuggingFaceH4/ultrachat_200k](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k) dataset.
This model should be used with the [Qwen/Qwen3-235B-A22B](https://huggingface.co/Qwen/Qwen3-235B-A22B) chat template, specifically through the `/chat/completions` endpoint. It was trained with thinking mode on.

## Use with vLLM

```bash
vllm serve Qwen/Qwen3-235B-A22B \
    -tp 8 \
    --speculative-config '{
        "model": "RedHatAI/Qwen3-235B-A22B-speculator.eagle3",
        "num_speculative_tokens": 3,
        "method": "eagle3"
    }'
```

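Once the server is up, requests should go through the OpenAI-compatible `/chat/completions` route, since plain `/completions` bypasses the chat template the speculator was trained against. A minimal sketch using the `openai` Python client; the prompt and `max_tokens` value are illustrative:

```python
# Minimal sketch: query the vLLM server started above via /chat/completions.
# Assumes `pip install openai`; the API key is unused by a local vLLM server.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="Qwen/Qwen3-235B-A22B",  # the verifier model name served by vLLM
    messages=[{"role": "user", "content": "Summarize speculative decoding in two sentences."}],
    max_tokens=256,
)
print(response.choices[0].message.content)
```
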
## Evaluations

<h3>Use cases</h3>
<table>
  <thead>
    <tr>
      <th>Use Case</th>
      <th>Dataset</th>
      <th>Number of Samples</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>Coding</td>
      <td>HumanEval</td>
      <td>168</td>
    </tr>
    <tr>
      <td>Math Reasoning</td>
      <td>gsm8k</td>
      <td>80</td>
    </tr>
    <tr>
      <td>Text Summarization</td>
      <td>CNN/Daily Mail</td>
      <td>80</td>
    </tr>
  </tbody>
</table>

<h3>Acceptance lengths</h3>
Mean acceptance length (average number of tokens generated per verifier forward pass) for <em>k</em> speculative tokens:
<table>
  <thead>
    <tr>
      <th>Use Case</th>
      <th>k=1</th>
      <th>k=2</th>
      <th>k=3</th>
      <th>k=4</th>
      <th>k=5</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>Coding</td>
      <td>1.77</td>
      <td>2.19</td>
      <td>2.51</td>
      <td>2.72</td>
      <td>2.83</td>
    </tr>
    <tr>
      <td>Math Reasoning</td>
      <td>1.77</td>
      <td>2.33</td>
      <td>2.73</td>
      <td>3.03</td>
      <td>3.24</td>
    </tr>
    <tr>
      <td>Text Summarization</td>
      <td>1.63</td>
      <td>2.00</td>
      <td>2.22</td>
      <td>2.34</td>
      <td>2.40</td>
    </tr>
  </tbody>
</table>

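Reading the table as tokens committed per verifier forward pass gives a quick upper bound on the reduction in verifier passes (draft-model overhead ignored; the figure below uses the math reasoning row):

```python
# Back-of-envelope: acceptance length ~= tokens committed per verifier pass.
acceptance_length = 2.73       # math reasoning at k=3 (table above)
baseline = 1.0                 # tokens per pass without speculation
print(f"~{acceptance_length / baseline:.2f}x fewer verifier passes")
```

Actual end-to-end speedup is lower once the draft model's own forward passes are accounted for.
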
<details> <summary>Details</summary>
<strong>Configuration</strong>

- repetitions: 1
- time per experiment: 10min
- hardware: 8xA100
- vLLM version: 0.11.2
- GuideLLM version: 0.3.0

<strong>Command</strong>

```bash
GUIDELLM__PREFERRED_ROUTE="chat_completions" \
guidellm benchmark \
  --target "http://localhost:8000/v1" \
  --data "RedHatAI/speculator_benchmarks" \
  --data-args '{"data_files": "HumanEval.jsonl"}' \
  --rate-type sweep \
  --max-seconds 600 \
  --output-path "Qwen235B-HumanEval.json"
```

</details>
config.json
ADDED
@@ -0,0 +1,57 @@
{
  "architectures": [
    "Eagle3DraftModel"
  ],
  "auto_map": {
    "": "config.Eagle3SpeculatorConfig"
  },
  "base_model_ep_plan": null,
  "draft_vocab_size": 64000,
  "dtype": "float32",
  "eagle_aux_hidden_state_layer_ids": null,
  "has_no_defaults_at_init": false,
  "norm_before_residual": true,
  "speculators_config": {
    "algorithm": "eagle3",
    "default_proposal_method": "greedy",
    "proposal_methods": [
      {
        "accept_tolerance": 0.0,
        "proposal_type": "greedy",
        "speculative_tokens": 3,
        "verifier_accept_k": 1
      }
    ],
    "verifier": {
      "architectures": [
        "LlamaForCausalLM"
      ],
      "name_or_path": "Qwen/Qwen3-235B-A22B"
    }
  },
  "speculators_model_type": "eagle3",
  "speculators_version": "0.4.0.dev12",
  "target_hidden_size": null,
  "transformer_layer_config": {
    "attention_bias": false,
    "attention_dropout": 0.0,
    "head_dim": 128,
    "hidden_act": "silu",
    "hidden_size": 4096,
    "initializer_range": 0.02,
    "intermediate_size": 12288,
    "max_position_embeddings": 40960,
    "mlp_bias": false,
    "model_type": "llama",
    "num_attention_heads": 64,
    "num_hidden_layers": 1,
    "num_key_value_heads": 4,
    "pretraining_tp": 1,
    "rms_norm_eps": 1e-06,
    "rope_scaling": null,
    "rope_theta": 10000.0,
    "use_cache": true,
    "vocab_size": 151936
  },
  "transformers_version": "4.57.1"
}
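Note the vocabulary asymmetry in this config: `draft_vocab_size` is 64,000 while the verifier's `vocab_size` is 151,936. EAGLE-3 drafts over a reduced vocabulary and maps each drafted id back into the target vocabulary before verification. A hypothetical sketch of that lookup (the real draft-to-target mapping ships as a tensor inside `model.safetensors`; the name `d2t` and the random values here are placeholders):

```python
# Hypothetical sketch of EAGLE-3's draft-to-target vocabulary lookup.
import torch

draft_vocab_size, target_vocab_size = 64_000, 151_936
d2t = torch.randint(0, target_vocab_size, (draft_vocab_size,))  # placeholder mapping

draft_logits = torch.randn(draft_vocab_size)  # one draft decoding step
draft_token = int(draft_logits.argmax())      # greedy proposal in draft vocab
target_token = int(d2t[draft_token])          # id the verifier actually checks
```
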
config.py
ADDED
@@ -0,0 +1,79 @@
from typing import Any, Literal

from pydantic import Field, field_serializer, field_validator
from transformers import AutoConfig, PretrainedConfig
from transformers.models.llama.configuration_llama import LlamaConfig

from speculators import SpeculatorModelConfig

__all__ = [
    "Eagle3SpeculatorConfig",
]


@SpeculatorModelConfig.register("eagle3")
class Eagle3SpeculatorConfig(SpeculatorModelConfig):
    """
    Configuration for EAGLE-3 speculator with vocabulary mapping.

    EAGLE-3 features vocabulary mapping between draft (32K) and target (128K)
    vocabularies, enabling cross-tokenizer speculation.

    :param transformer_layer_config: Configuration for the transformer decoder layer
    :param draft_vocab_size: Size of draft model vocabulary for speculation
    :param norm_before_residual: Apply hidden_norm before storing residual
    """

    speculators_model_type: Literal["eagle3"] = "eagle3"
    architectures: list[str] = Field(
        default_factory=lambda: ["Eagle3Speculator"],
        description="Model architectures that can load these weights",
    )

    transformer_layer_config: PretrainedConfig = Field(
        default_factory=LlamaConfig,
        description="Configuration for the transformer decoder layer",
    )

    draft_vocab_size: int = Field(
        default=32000,
        description="Size of draft model vocabulary for speculation",
    )

    norm_before_residual: bool = Field(
        default=False,
        description="Apply hidden_norm before storing residual",
    )

    target_hidden_size: int | None = Field(
        default=None,
        description="Hidden size of the target model (if different from draft model)",
    )

    eagle_aux_hidden_state_layer_ids: list[int] | None = Field(
        default=None,
        description="Layer IDs of the Eagle auxiliary hidden state layers",
    )

    @property
    def target_vocab_size(self) -> int:
        """Get target vocabulary size from transformer config."""
        return self.transformer_layer_config.vocab_size

    @field_serializer("transformer_layer_config")
    def serialize_transformer_config(self, value: PretrainedConfig) -> dict:
        """Serialize transformer config to dict."""
        return value.to_diff_dict()

    @field_validator("transformer_layer_config", mode="before")
    @classmethod
    def validate_transformer_config(cls, value: Any) -> PretrainedConfig:
        """Validate and convert transformer config."""
        if isinstance(value, dict):
            config_class: type[PretrainedConfig] = LlamaConfig
            if "model_type" in value:
                config_class = AutoConfig.for_model(
                    model_type=value["model_type"]
                ).__class__
            return config_class(**value)
        return value
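A short usage sketch for the class above, assuming the `speculators`, `transformers`, and `pydantic` packages are installed and that the remaining `SpeculatorModelConfig` fields have defaults; the field values are illustrative. The before-mode validator accepts `transformer_layer_config` as a plain dict and resolves the concrete `PretrainedConfig` subclass from its `model_type`:

```python
# Illustrative sketch; `config` is the module added in this commit.
from config import Eagle3SpeculatorConfig

cfg = Eagle3SpeculatorConfig(
    draft_vocab_size=64000,
    norm_before_residual=True,
    transformer_layer_config={
        "model_type": "llama",
        "hidden_size": 4096,
        "num_hidden_layers": 1,
        "vocab_size": 151936,
    },
)
print(type(cfg.transformer_layer_config).__name__)  # LlamaConfig
print(cfg.target_vocab_size)                        # 151936
```
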
generation_config.json
ADDED
@@ -0,0 +1,4 @@
{
  "_from_model_config": true,
  "transformers_version": "4.57.1"
}
model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e5c4263ccf15362c2efe756ac964c2bad9cae2ca99397604af6b895a967e5203
size 2390403048