amirali1985 commited on
Commit
9f04ca8
·
verified ·
1 Parent(s): 821c2b0

Upload add_sub_sorl_abs16_25K

Browse files
add_sub_sorl_abs16_25K/config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "SorlModelWrapper"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": null,
8
+ "dtype": "float32",
9
+ "eos_token_id": null,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 512,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 2048,
15
+ "layer_types": [
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention"
19
+ ],
20
+ "max_position_embeddings": 128,
21
+ "max_window_layers": 28,
22
+ "model_type": "qwen3",
23
+ "num_attention_heads": 4,
24
+ "num_hidden_layers": 3,
25
+ "num_key_value_heads": 4,
26
+ "pad_token_id": null,
27
+ "rms_norm_eps": 1e-06,
28
+ "rope_parameters": {
29
+ "rope_theta": 10000.0,
30
+ "rope_type": "default"
31
+ },
32
+ "sliding_window": null,
33
+ "tie_word_embeddings": false,
34
+ "transformers_version": "5.5.0",
35
+ "use_cache": true,
36
+ "use_sliding_window": false,
37
+ "vocab_size": 151660
38
+ }
add_sub_sorl_abs16_25K/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "output_attentions": false,
4
+ "output_hidden_states": false,
5
+ "transformers_version": "5.5.0",
6
+ "use_cache": true
7
+ }
add_sub_sorl_abs16_25K/metrics.json ADDED
@@ -0,0 +1,281 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "history": {
3
+ "step": [
4
+ 50,
5
+ 100,
6
+ 150,
7
+ 200,
8
+ 250,
9
+ 300,
10
+ 350,
11
+ 441,
12
+ 491,
13
+ 541,
14
+ 591,
15
+ 641,
16
+ 691,
17
+ 741,
18
+ 832,
19
+ 882,
20
+ 932,
21
+ 982,
22
+ 1032,
23
+ 1082,
24
+ 1132
25
+ ],
26
+ "loss": [
27
+ 6.561774730682373,
28
+ 2.8691329956054688,
29
+ 1.9997152090072632,
30
+ 1.8546234369277954,
31
+ 1.8399007320404053,
32
+ 1.7809630632400513,
33
+ 1.676088571548462,
34
+ 1.3971723318099976,
35
+ 0.9440449476242065,
36
+ 0.565602719783783,
37
+ 0.28183868527412415,
38
+ 0.1783355474472046,
39
+ 0.1595521718263626,
40
+ 0.08057533949613571,
41
+ 0.06170733645558357,
42
+ 0.04374723881483078,
43
+ 0.03371792286634445,
44
+ 0.016453929245471954,
45
+ 0.009380219504237175,
46
+ 0.00803250353783369,
47
+ 0.004880059976130724
48
+ ],
49
+ "base_loss": [
50
+ 6.559733867645264,
51
+ 2.9012436866760254,
52
+ 2.0608508586883545,
53
+ 2.087801218032837,
54
+ 2.148261308670044,
55
+ 2.4102892875671387,
56
+ 2.3063976764678955,
57
+ 2.751943349838257,
58
+ 3.4612622261047363,
59
+ 3.8862569332122803,
60
+ 4.3692498207092285,
61
+ 4.927584171295166,
62
+ 5.282938480377197,
63
+ 5.3806915283203125,
64
+ 5.3356170654296875,
65
+ 5.71320104598999,
66
+ 5.819161415100098,
67
+ 6.070667743682861,
68
+ 6.114338397979736,
69
+ 6.250763893127441,
70
+ 6.165469646453857
71
+ ],
72
+ "traj_loss": [
73
+ 6.561774730682373,
74
+ 2.8691329956054688,
75
+ 1.9997152090072632,
76
+ 1.8546234369277954,
77
+ 1.8399007320404053,
78
+ 1.7809630632400513,
79
+ 1.676088571548462,
80
+ 1.3971723318099976,
81
+ 0.9440449476242065,
82
+ 0.565602719783783,
83
+ 0.28183868527412415,
84
+ 0.1783355474472046,
85
+ 0.1595521718263626,
86
+ 0.08057533949613571,
87
+ 0.06170733645558357,
88
+ 0.04374723881483078,
89
+ 0.03371792286634445,
90
+ 0.016453929245471954,
91
+ 0.009380219504237175,
92
+ 0.00803250353783369,
93
+ 0.004880059976130724
94
+ ],
95
+ "hinge_loss": [
96
+ 0.0,
97
+ 0.0,
98
+ 0.0,
99
+ 0.0,
100
+ 0.0,
101
+ 0.0,
102
+ 0.0,
103
+ 0.0,
104
+ 0.0,
105
+ 0.0,
106
+ 0.0,
107
+ 0.0,
108
+ 0.0,
109
+ 0.0,
110
+ 0.0,
111
+ 0.0,
112
+ 0.0,
113
+ 0.0,
114
+ 0.0,
115
+ 0.0,
116
+ 0.0
117
+ ],
118
+ "masked_traj_loss": [
119
+ 0.0,
120
+ 0.0,
121
+ 0.0,
122
+ 0.0,
123
+ 0.0,
124
+ 0.0,
125
+ 0.0,
126
+ 0.0,
127
+ 0.0,
128
+ 0.0,
129
+ 0.0,
130
+ 0.0,
131
+ 0.0,
132
+ 0.0,
133
+ 0.0,
134
+ 0.0,
135
+ 0.0,
136
+ 0.0,
137
+ 0.0,
138
+ 0.0,
139
+ 0.0
140
+ ],
141
+ "abs_loss": [
142
+ 0.0,
143
+ 0.0,
144
+ 0.0,
145
+ 0.0,
146
+ 0.0,
147
+ 0.0,
148
+ 0.0,
149
+ 0.0,
150
+ 0.0,
151
+ 0.0,
152
+ 0.0,
153
+ 0.0,
154
+ 0.0,
155
+ 0.0,
156
+ 0.0,
157
+ 0.0,
158
+ 0.0,
159
+ 0.0,
160
+ 0.0,
161
+ 0.0,
162
+ 0.0
163
+ ],
164
+ "zipf_loss": [
165
+ 0.0,
166
+ 0.0,
167
+ 0.0,
168
+ 0.0,
169
+ 0.0,
170
+ 0.0,
171
+ 0.0,
172
+ 0.0,
173
+ 0.0,
174
+ 0.0,
175
+ 0.0,
176
+ 0.0,
177
+ 0.0,
178
+ 0.0,
179
+ 0.0,
180
+ 0.0,
181
+ 0.0,
182
+ 0.0,
183
+ 0.0,
184
+ 0.0,
185
+ 0.0
186
+ ],
187
+ "ortho_loss": [
188
+ 0.0,
189
+ 0.0,
190
+ 0.0,
191
+ 0.0,
192
+ 0.0,
193
+ 0.0,
194
+ 0.0,
195
+ 0.0,
196
+ 0.0,
197
+ 0.0,
198
+ 0.0,
199
+ 0.0,
200
+ 0.0,
201
+ 0.0,
202
+ 0.0,
203
+ 0.0,
204
+ 0.0,
205
+ 0.0,
206
+ 0.0,
207
+ 0.0,
208
+ 0.0
209
+ ],
210
+ "anchor_loss": [
211
+ 0.0,
212
+ 0.0,
213
+ 0.0,
214
+ 0.0,
215
+ 0.0,
216
+ 0.0,
217
+ 0.0,
218
+ 0.0,
219
+ 0.0,
220
+ 0.0,
221
+ 0.0,
222
+ 0.0,
223
+ 0.0,
224
+ 0.0,
225
+ 0.0,
226
+ 0.0,
227
+ 0.0,
228
+ 0.0,
229
+ 0.0,
230
+ 0.0,
231
+ 0.0
232
+ ],
233
+ "jacobi_loss": [
234
+ 0.0,
235
+ 0.0,
236
+ 0.0,
237
+ 0.0,
238
+ 0.0,
239
+ 0.0,
240
+ 0.0,
241
+ 0.0,
242
+ 0.0,
243
+ 0.0,
244
+ 0.0,
245
+ 0.0,
246
+ 0.0,
247
+ 0.0,
248
+ 0.0,
249
+ 0.0,
250
+ 0.0,
251
+ 0.0,
252
+ 0.0,
253
+ 0.0,
254
+ 0.0
255
+ ],
256
+ "lr": [
257
+ 7.840000000000001e-05,
258
+ 8e-05,
259
+ 8e-05,
260
+ 8e-05,
261
+ 8e-05,
262
+ 8e-05,
263
+ 8e-05,
264
+ 8e-05,
265
+ 8e-05,
266
+ 8e-05,
267
+ 8e-05,
268
+ 8e-05,
269
+ 8e-05,
270
+ 7.740338379341051e-05,
271
+ 6.281745325022262e-05,
272
+ 5.480320569902048e-05,
273
+ 4.678895814781834e-05,
274
+ 3.8774710596616195e-05,
275
+ 3.076046304541408e-05,
276
+ 2.2746215494211937e-05,
277
+ 1.4731967943009797e-05
278
+ ]
279
+ },
280
+ "final_accuracy": 0.0
281
+ }
add_sub_sorl_abs16_25K/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f00d4a75cbe2471cf1aabaf8a690cd7c1523754fb866293ad51ec72101a8b576
3
+ size 671856320
add_sub_sorl_abs16_25K/train_config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mode": "sorl",
3
+ "ops": "add_sub",
4
+ "n_digits": 6,
5
+ "n_layer": 3,
6
+ "n_head": 4,
7
+ "n_embd": 512,
8
+ "abs_vocab": 16,
9
+ "K": 4,
10
+ "batch_size": 64,
11
+ "num_epochs": 3,
12
+ "dataset_size": 25000,
13
+ "lr": 8e-05,
14
+ "output_dir": "ckpt/r/add_sub_sorl_abs16_25K",
15
+ "device": "cuda",
16
+ "push_to_hub": true,
17
+ "no_wandb": false,
18
+ "n_params": 167887104,
19
+ "run_name": "add_sub_sorl_abs16_25K",
20
+ "git_commit": "9e4530548a98f8c7f5c14930ac4aec4886bb4b1b",
21
+ "timestamp": "2026-04-07T11:08:48.266576",
22
+ "tokenizer": "Qwen/Qwen3-0.6B",
23
+ "dataset_repo": "thoughtworks/arithmetic-sorl-data",
24
+ "dataset_config": "add_sub_6digit",
25
+ "model_repo": "thoughtworks/arithmetic-sorl",
26
+ "trainer_version": "v6",
27
+ "final_accuracy": 0.0
28
+ }