amirali1985 commited on
Commit
4803667
·
verified ·
1 Parent(s): c4845fd

Upload add_sub_sorl_abs16_10K

Browse files
add_sub_sorl_abs16_10K/config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "SorlModelWrapper"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": null,
8
+ "dtype": "float32",
9
+ "eos_token_id": null,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 512,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 2048,
15
+ "layer_types": [
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention"
19
+ ],
20
+ "max_position_embeddings": 128,
21
+ "max_window_layers": 28,
22
+ "model_type": "qwen3",
23
+ "num_attention_heads": 4,
24
+ "num_hidden_layers": 3,
25
+ "num_key_value_heads": 4,
26
+ "pad_token_id": null,
27
+ "rms_norm_eps": 1e-06,
28
+ "rope_parameters": {
29
+ "rope_theta": 10000.0,
30
+ "rope_type": "default"
31
+ },
32
+ "sliding_window": null,
33
+ "tie_word_embeddings": false,
34
+ "transformers_version": "5.5.0",
35
+ "use_cache": true,
36
+ "use_sliding_window": false,
37
+ "vocab_size": 151660
38
+ }
add_sub_sorl_abs16_10K/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "output_attentions": false,
4
+ "output_hidden_states": false,
5
+ "transformers_version": "5.5.0",
6
+ "use_cache": true
7
+ }
add_sub_sorl_abs16_10K/metrics.json ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "history": {
3
+ "step": [
4
+ 50,
5
+ 100,
6
+ 150,
7
+ 207,
8
+ 257,
9
+ 307,
10
+ 364,
11
+ 414,
12
+ 464
13
+ ],
14
+ "loss": [
15
+ 6.49921178817749,
16
+ 2.8570327758789062,
17
+ 1.9880883693695068,
18
+ 1.7676962614059448,
19
+ 1.7946815490722656,
20
+ 1.7100752592086792,
21
+ 1.6409674882888794,
22
+ 1.3928353786468506,
23
+ 1.3056926727294922
24
+ ],
25
+ "base_loss": [
26
+ 6.476277828216553,
27
+ 2.875317096710205,
28
+ 2.0492959022521973,
29
+ 1.9857605695724487,
30
+ 2.1322402954101562,
31
+ 2.219243288040161,
32
+ 2.526505708694458,
33
+ 2.8985116481781006,
34
+ 2.8950657844543457
35
+ ],
36
+ "traj_loss": [
37
+ 6.49921178817749,
38
+ 2.8570327758789062,
39
+ 1.9880883693695068,
40
+ 1.7676962614059448,
41
+ 1.7946815490722656,
42
+ 1.7100752592086792,
43
+ 1.6409674882888794,
44
+ 1.3928353786468506,
45
+ 1.3056926727294922
46
+ ],
47
+ "hinge_loss": [
48
+ 0.0,
49
+ 0.0,
50
+ 0.0,
51
+ 0.0,
52
+ 0.0,
53
+ 0.0,
54
+ 0.0,
55
+ 0.0,
56
+ 0.0
57
+ ],
58
+ "masked_traj_loss": [
59
+ 0.0,
60
+ 0.0,
61
+ 0.0,
62
+ 0.0,
63
+ 0.0,
64
+ 0.0,
65
+ 0.0,
66
+ 0.0,
67
+ 0.0
68
+ ],
69
+ "abs_loss": [
70
+ 0.0,
71
+ 0.0,
72
+ 0.0,
73
+ 0.0,
74
+ 0.0,
75
+ 0.0,
76
+ 0.0,
77
+ 0.0,
78
+ 0.0
79
+ ],
80
+ "zipf_loss": [
81
+ 0.0,
82
+ 0.0,
83
+ 0.0,
84
+ 0.0,
85
+ 0.0,
86
+ 0.0,
87
+ 0.0,
88
+ 0.0,
89
+ 0.0
90
+ ],
91
+ "ortho_loss": [
92
+ 0.0,
93
+ 0.0,
94
+ 0.0,
95
+ 0.0,
96
+ 0.0,
97
+ 0.0,
98
+ 0.0,
99
+ 0.0,
100
+ 0.0
101
+ ],
102
+ "anchor_loss": [
103
+ 0.0,
104
+ 0.0,
105
+ 0.0,
106
+ 0.0,
107
+ 0.0,
108
+ 0.0,
109
+ 0.0,
110
+ 0.0,
111
+ 0.0
112
+ ],
113
+ "jacobi_loss": [
114
+ 0.0,
115
+ 0.0,
116
+ 0.0,
117
+ 0.0,
118
+ 0.0,
119
+ 0.0,
120
+ 0.0,
121
+ 0.0,
122
+ 0.0
123
+ ],
124
+ "lr": [
125
+ 7.840000000000001e-05,
126
+ 8e-05,
127
+ 8e-05,
128
+ 8e-05,
129
+ 8e-05,
130
+ 7.854631828978623e-05,
131
+ 5.4175771971496434e-05,
132
+ 3.279809976247031e-05,
133
+ 1.1420427553444172e-05
134
+ ]
135
+ },
136
+ "final_accuracy": 0.0
137
+ }
add_sub_sorl_abs16_10K/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81393c84db2e4daba7c10b0f19734e733235073619f0b450c5c57d515c313b72
3
+ size 671856320
add_sub_sorl_abs16_10K/train_config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mode": "sorl",
3
+ "ops": "add_sub",
4
+ "n_digits": 6,
5
+ "n_layer": 3,
6
+ "n_head": 4,
7
+ "n_embd": 512,
8
+ "abs_vocab": 16,
9
+ "K": 4,
10
+ "batch_size": 64,
11
+ "num_epochs": 3,
12
+ "dataset_size": 10000,
13
+ "lr": 8e-05,
14
+ "output_dir": "ckpt/r/add_sub_sorl_abs16_10K",
15
+ "device": "cuda",
16
+ "push_to_hub": true,
17
+ "no_wandb": false,
18
+ "n_params": 167887104,
19
+ "run_name": "add_sub_sorl_abs16_10K",
20
+ "git_commit": "9e4530548a98f8c7f5c14930ac4aec4886bb4b1b",
21
+ "timestamp": "2026-04-07T12:28:13.481761",
22
+ "tokenizer": "Qwen/Qwen3-0.6B",
23
+ "dataset_repo": "thoughtworks/arithmetic-sorl-data",
24
+ "dataset_config": "add_sub_6digit",
25
+ "model_repo": "thoughtworks/arithmetic-sorl",
26
+ "trainer_version": "v6",
27
+ "final_accuracy": 0.0
28
+ }