alexmarques committed
Commit 1fac655 · verified · 1 Parent(s): 0262f96

Upload folder using huggingface_hub

Files changed (5)
  1. README.md +129 -0
  2. config.json +57 -0
  3. config.py +79 -0
  4. generation_config.json +4 -0
  5. model.safetensors +3 -0
README.md ADDED
@@ -0,0 +1,129 @@
+ ---
+ license: apache-2.0
+ base_model:
+ - Qwen/Qwen3-235B-A22B
+ tags:
+ - redhat
+ - neuralmagic
+ - qwen
+ - speculators
+ - eagle3
+ ---
+
+ # Qwen3-235B-A22B-speculator
+
+ ## Model Overview
+ - **Verifier:** Qwen/Qwen3-235B-A22B
+ - **Speculative Decoding Algorithm:** EAGLE-3
+ - **Model Architecture:** Eagle3Speculator
+ - **Release Date:** 01/15/2026
+ - **Version:** 1.0
+ - **Model Developers:** Red Hat
+
+ This is a speculator model designed for use with [Qwen/Qwen3-235B-A22B](https://huggingface.co/Qwen/Qwen3-235B-A22B), based on the [EAGLE-3](https://arxiv.org/abs/2503.01840) speculative decoding algorithm.
+ It was trained using the [speculators](https://github.com/neuralmagic/speculators) library on a combination of the [Magpie-Align/Magpie-Llama-3.1-Pro-300K-Filtered](https://huggingface.co/datasets/Magpie-Align/Magpie-Llama-3.1-Pro-300K-Filtered) dataset and the `train_sft` split of the [HuggingFaceH4/ultrachat_200k](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k) dataset.
+ This model should be used with the [Qwen/Qwen3-235B-A22B](https://huggingface.co/Qwen/Qwen3-235B-A22B) chat template, specifically through the `/chat/completions` endpoint. It was trained with thinking mode on.
+
+ ## Use with vLLM
+
+ ```bash
+ vllm serve Qwen/Qwen3-235B-A22B \
+   -tp 8 \
+   --speculative-config '{
+     "model": "RedHatAI/Qwen3-235B-A22B-speculator.eagle3",
+     "num_speculative_tokens": 3,
+     "method": "eagle3"
+   }'
+ ```
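+
+ Once the server is up, send requests through the OpenAI-compatible `/chat/completions` endpoint so the verifier's chat template is applied. A minimal client sketch (the `openai` package, localhost URL, and prompt are illustrative assumptions, not part of this repository):
+
+ ```python
+ from openai import OpenAI
+
+ # Point the standard OpenAI client at the vLLM server started above.
+ client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
+
+ response = client.chat.completions.create(
+     model="Qwen/Qwen3-235B-A22B",  # the verifier; the speculator accelerates it transparently
+     messages=[{"role": "user", "content": "Explain speculative decoding in two sentences."}],
+ )
+ print(response.choices[0].message.content)
+ ```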
+
+ ## Evaluations
+
+ <h3>Use cases</h3>
+ <table>
+ <thead>
+ <tr>
+ <th>Use Case</th>
+ <th>Dataset</th>
+ <th>Number of Samples</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr>
+ <td>Coding</td>
+ <td>HumanEval</td>
+ <td>168</td>
+ </tr>
+ <tr>
+ <td>Math Reasoning</td>
+ <td>gsm8k</td>
+ <td>80</td>
+ </tr>
+ <tr>
+ <td>Text Summarization</td>
+ <td>CNN/Daily Mail</td>
+ <td>80</td>
+ </tr>
+ </tbody>
+ </table>
+
+ <h3>Acceptance lengths</h3>
+ <table>
+ <thead>
+ <tr>
+ <th>Use Case</th>
+ <th>k=1</th>
+ <th>k=2</th>
+ <th>k=3</th>
+ <th>k=4</th>
+ <th>k=5</th>
+ </tr>
+ </thead>
+ <tbody>
+ <tr>
+ <td>Coding</td>
+ <td>1.77</td>
+ <td>2.19</td>
+ <td>2.51</td>
+ <td>2.72</td>
+ <td>2.83</td>
+ </tr>
+ <tr>
+ <td>Math Reasoning</td>
+ <td>1.77</td>
+ <td>2.33</td>
+ <td>2.73</td>
+ <td>3.03</td>
+ <td>3.24</td>
+ </tr>
+ <tr>
+ <td>Text Summarization</td>
+ <td>1.63</td>
+ <td>2.00</td>
+ <td>2.22</td>
+ <td>2.34</td>
+ <td>2.40</td>
+ </tr>
+ </tbody>
+ </table>
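+
+ Here k is the number of speculative tokens proposed per step (the `num_speculative_tokens` setting above), and the acceptance length is the mean number of tokens emitted per verifier forward pass. Ignoring draft-model overhead, the acceptance length is therefore a rough upper bound on the speedup over standard decoding, which emits one token per pass; a back-of-the-envelope sketch (illustrative, not part of this repository):
+
+ ```python
+ # Mean acceptance lengths at k=3, copied from the table above.
+ acceptance_lengths = {"Coding": 2.51, "Math Reasoning": 2.73, "Text Summarization": 2.22}
+
+ for use_case, tokens_per_pass in acceptance_lengths.items():
+     # Standard decoding emits 1 token per verifier pass, so this ratio is a
+     # rough, overhead-free speedup estimate.
+     print(f"{use_case}: ~{tokens_per_pass:.2f} tokens per verifier pass (vs. 1 baseline)")
+ ```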
+
+ <details> <summary>Details</summary>
+ <strong>Configuration</strong>
+
+ - repetitions: 1
+ - time per experiment: 10min
+ - hardware: 8xA100
+ - vLLM version: 0.11.2
+ - GuideLLM version: 0.3.0
+
+ <strong>Command</strong>
+ ```bash
+ GUIDELLM__PREFERRED_ROUTE="chat_completions" \
+ guidellm benchmark \
+   --target "http://localhost:8000/v1" \
+   --data "RedHatAI/speculator_benchmarks" \
+   --data-args '{"data_files": "HumanEval.jsonl"}' \
+   --rate-type sweep \
+   --max-seconds 600 \
+   --output-path "Qwen235B-HumanEval.json"
+ ```
+
+ </details>
config.json ADDED
@@ -0,0 +1,57 @@
+ {
+   "architectures": [
+     "Eagle3DraftModel"
+   ],
+   "auto_map": {
+     "": "config.Eagle3SpeculatorConfig"
+   },
+   "base_model_ep_plan": null,
+   "draft_vocab_size": 64000,
+   "dtype": "float32",
+   "eagle_aux_hidden_state_layer_ids": null,
+   "has_no_defaults_at_init": false,
+   "norm_before_residual": true,
+   "speculators_config": {
+     "algorithm": "eagle3",
+     "default_proposal_method": "greedy",
+     "proposal_methods": [
+       {
+         "accept_tolerance": 0.0,
+         "proposal_type": "greedy",
+         "speculative_tokens": 3,
+         "verifier_accept_k": 1
+       }
+     ],
+     "verifier": {
+       "architectures": [
+         "LlamaForCausalLM"
+       ],
+       "name_or_path": "Qwen/Qwen3-235B-A22B"
+     }
+   },
+   "speculators_model_type": "eagle3",
+   "speculators_version": "0.4.0.dev12",
+   "target_hidden_size": null,
+   "transformer_layer_config": {
+     "attention_bias": false,
+     "attention_dropout": 0.0,
+     "head_dim": 128,
+     "hidden_act": "silu",
+     "hidden_size": 4096,
+     "initializer_range": 0.02,
+     "intermediate_size": 12288,
+     "max_position_embeddings": 40960,
+     "mlp_bias": false,
+     "model_type": "llama",
+     "num_attention_heads": 64,
+     "num_hidden_layers": 1,
+     "num_key_value_heads": 4,
+     "pretraining_tp": 1,
+     "rms_norm_eps": 1e-06,
+     "rope_scaling": null,
+     "rope_theta": 10000.0,
+     "use_cache": true,
+     "vocab_size": 151936
+   },
+   "transformers_version": "4.57.1"
+ }
config.py ADDED
@@ -0,0 +1,79 @@
+ from typing import Any, Literal
+
+ from pydantic import Field, field_serializer, field_validator
+ from transformers import AutoConfig, PretrainedConfig
+ from transformers.models.llama.configuration_llama import LlamaConfig
+
+ from speculators import SpeculatorModelConfig
+
+ __all__ = [
+     "Eagle3SpeculatorConfig",
+ ]
+
+
+ @SpeculatorModelConfig.register("eagle3")
+ class Eagle3SpeculatorConfig(SpeculatorModelConfig):
+     """
+     Configuration for EAGLE-3 speculator with vocabulary mapping.
+
+     EAGLE-3 features vocabulary mapping between draft (32K) and target (128K)
+     vocabularies, enabling cross-tokenizer speculation.
+
+     :param transformer_layer_config: Configuration for the transformer decoder layer
+     :param draft_vocab_size: Size of draft model vocabulary for speculation
+     :param norm_before_residual: Apply hidden_norm before storing residual
+     """
+
+     speculators_model_type: Literal["eagle3"] = "eagle3"
+     architectures: list[str] = Field(
+         default_factory=lambda: ["Eagle3Speculator"],
+         description="Model architectures that can load these weights",
+     )
+
+     transformer_layer_config: PretrainedConfig = Field(
+         default_factory=LlamaConfig,
+         description="Configuration for the transformer decoder layer",
+     )
+
+     draft_vocab_size: int = Field(
+         default=32000,
+         description="Size of draft model vocabulary for speculation",
+     )
+
+     norm_before_residual: bool = Field(
+         default=False,
+         description="Apply hidden_norm before storing residual",
+     )
+
+     target_hidden_size: int | None = Field(
+         default=None,
+         description="Hidden size of the target model (if different from draft model)",
+     )
+
+     eagle_aux_hidden_state_layer_ids: list[int] | None = Field(
+         default=None,
+         description="Layer IDs of the Eagle auxiliary hidden state layers",
+     )
+
+     @property
+     def target_vocab_size(self) -> int:
+         """Get target vocabulary size from transformer config."""
+         return self.transformer_layer_config.vocab_size
+
+     @field_serializer("transformer_layer_config")
+     def serialize_transformer_config(self, value: PretrainedConfig) -> dict:
+         """Serialize transformer config to dict."""
+         return value.to_diff_dict()
+
+     @field_validator("transformer_layer_config", mode="before")
+     @classmethod
+     def validate_transformer_config(cls, value: Any) -> PretrainedConfig:
+         """Validate and convert transformer config."""
+         if isinstance(value, dict):
+             config_class: type[PretrainedConfig] = LlamaConfig
+             if "model_type" in value:
+                 config_class = AutoConfig.for_model(
+                     model_type=value["model_type"]
+                 ).__class__
+             return config_class(**value)
+         return value
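+
+
+ # Illustrative usage sketch (an assumption for exposition; not exercised
+ # anywhere in this repository): the validator above resolves the concrete
+ # PretrainedConfig subclass from a plain dict via its "model_type" key,
+ # which is how the "transformer_layer_config" block of config.json loads.
+ #
+ #   value = {"model_type": "llama", "hidden_size": 4096, "vocab_size": 151936}
+ #   config_class = AutoConfig.for_model(model_type=value["model_type"]).__class__
+ #   cfg = config_class(**value)  # -> LlamaConfig with those fields set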
generation_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "_from_model_config": true,
+   "transformers_version": "4.57.1"
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e5c4263ccf15362c2efe756ac964c2bad9cae2ca99397604af6b895a967e5203
+ size 2390403048