bezzam HF Staff committed on
Commit
ee453bd
·
verified ·
1 Parent(s): 9f08634

Upload VibeVoiceForConditionalGeneration

Browse files
config.json ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "acoustic_tokenizer_config": {
3
+ "channels": 1,
4
+ "depths": [
5
+ 3,
6
+ 3,
7
+ 3,
8
+ 3,
9
+ 3,
10
+ 3,
11
+ 8
12
+ ],
13
+ "downsampling_ratios": [
14
+ 2,
15
+ 2,
16
+ 4,
17
+ 5,
18
+ 5,
19
+ 8
20
+ ],
21
+ "ffn_expansion": 4,
22
+ "hidden_act": "gelu",
23
+ "hidden_size": 64,
24
+ "initializer_range": 0.01,
25
+ "kernel_size": 7,
26
+ "layer_scale_init_value": 1e-06,
27
+ "model_type": "vibevoice_acoustic_tokenizer",
28
+ "num_filters": 32,
29
+ "rms_norm_eps": 1e-05,
30
+ "vae_std": 0.625,
31
+ "weight_init_value": 0.01
32
+ },
33
+ "architectures": [
34
+ "VibeVoiceForConditionalGeneration"
35
+ ],
36
+ "audio_bos_token_id": 151652,
37
+ "audio_diffusion_token_id": 151654,
38
+ "audio_eos_token_id": 151653,
39
+ "ddpm_beta_schedule": "squaredcos_cap_v2",
40
+ "ddpm_num_inference_steps": 20,
41
+ "ddpm_num_steps": 1000,
42
+ "dtype": "bfloat16",
43
+ "eos_token_id": 151643,
44
+ "frequency_embedding_size": 256,
45
+ "hidden_act": "silu",
46
+ "intermediate_size": 4608,
47
+ "mlp_bias": false,
48
+ "model_type": "vibevoice",
49
+ "num_head_layers": 4,
50
+ "pad_token_id": 151643,
51
+ "prediction_type": "v_prediction",
52
+ "rms_norm_eps": 1e-05,
53
+ "semantic_tokenizer_config": {
54
+ "channels": 1,
55
+ "depths": [
56
+ 3,
57
+ 3,
58
+ 3,
59
+ 3,
60
+ 3,
61
+ 3,
62
+ 8
63
+ ],
64
+ "downsampling_ratios": [
65
+ 2,
66
+ 2,
67
+ 4,
68
+ 5,
69
+ 5,
70
+ 8
71
+ ],
72
+ "ffn_expansion": 4,
73
+ "hidden_act": "gelu",
74
+ "hidden_size": 128,
75
+ "kernel_size": 7,
76
+ "layer_scale_init_value": 1e-06,
77
+ "model_type": "vibevoice_semantic_tokenizer",
78
+ "num_filters": 32,
79
+ "rms_norm_eps": 1e-05,
80
+ "weight_init_value": 0.01
81
+ },
82
+ "text_config": {
83
+ "attention_dropout": 0.0,
84
+ "bos_token_id": null,
85
+ "dtype": "bfloat16",
86
+ "eos_token_id": null,
87
+ "hidden_act": "silu",
88
+ "hidden_size": 1536,
89
+ "initializer_range": 0.02,
90
+ "intermediate_size": 8960,
91
+ "layer_types": [
92
+ "full_attention",
93
+ "full_attention",
94
+ "full_attention",
95
+ "full_attention",
96
+ "full_attention",
97
+ "full_attention",
98
+ "full_attention",
99
+ "full_attention",
100
+ "full_attention",
101
+ "full_attention",
102
+ "full_attention",
103
+ "full_attention",
104
+ "full_attention",
105
+ "full_attention",
106
+ "full_attention",
107
+ "full_attention",
108
+ "full_attention",
109
+ "full_attention",
110
+ "full_attention",
111
+ "full_attention",
112
+ "full_attention",
113
+ "full_attention",
114
+ "full_attention",
115
+ "full_attention",
116
+ "full_attention",
117
+ "full_attention",
118
+ "full_attention",
119
+ "full_attention"
120
+ ],
121
+ "max_position_embeddings": 65536,
122
+ "max_window_layers": 28,
123
+ "model_type": "qwen2",
124
+ "num_attention_heads": 12,
125
+ "num_hidden_layers": 28,
126
+ "num_key_value_heads": 2,
127
+ "pad_token_id": null,
128
+ "rms_norm_eps": 1e-06,
129
+ "rope_parameters": {
130
+ "rope_theta": 1000000.0,
131
+ "rope_type": "default"
132
+ },
133
+ "sliding_window": null,
134
+ "tie_word_embeddings": true,
135
+ "use_cache": true,
136
+ "use_sliding_window": false,
137
+ "vocab_size": 151936
138
+ },
139
+ "tie_word_embeddings": true,
140
+ "transformers_version": "5.2.0.dev0",
141
+ "vocab_size": 151936
142
+ }
generation_config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": false,
3
+ "audio_bos_token_id": 151652,
4
+ "audio_diffusion_id": 151654,
5
+ "audio_eos_token_id": 151653,
6
+ "cfg_scale": 1.3,
7
+ "do_sample": false,
8
+ "eos_token_id": 151643,
9
+ "max_length": 40500,
10
+ "max_new_tokens": 40500,
11
+ "n_diffusion_steps": 10,
12
+ "noise_scheduler_class": "DPMSolverMultistepScheduler",
13
+ "noise_scheduler_config": {
14
+ "beta_schedule": "squaredcos_cap_v2",
15
+ "num_train_timesteps": 1000,
16
+ "prediction_type": "v_prediction"
17
+ },
18
+ "output_attentions": false,
19
+ "output_hidden_states": false,
20
+ "pad_token_id": 151643,
21
+ "transformers_version": "5.2.0.dev0",
22
+ "use_cache": true
23
+ }
model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35417fbbaf0da7abf4402902397d82ef43584b7237951578dbf3aa467695dbb2
3
+ size 1975317780
model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc8666118eb9d4e4b6e9eeae747bed7e8639a768e797dd703b52d2055364a308
3
+ size 1969372304
model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d35690915a1bae48221d757be1c2e41e3f2e395c35cacacf35279203f053c1c
3
+ size 1463517674
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff