BFS-Search commited on
Commit
9321fb7
·
verified ·
1 Parent(s): 2e80773

Upload model files

Browse files
Files changed (1) hide show
  1. checkpoint-50/trainer_state.json +1084 -0
checkpoint-50/trainer_state.json ADDED
@@ -0,0 +1,1084 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "episode": 1200,
5
+ "epoch": 0.17142857142857143,
6
+ "eval_steps": 500,
7
+ "global_step": 50,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "episode": 24,
14
+ "epoch": 0.0034285714285714284,
15
+ "eps": 3,
16
+ "loss/policy_avg": -0.00039759615901857615,
17
+ "loss/value_avg": 0.001173102529719472,
18
+ "lr": 0.0,
19
+ "objective/entropy": 9.603790283203125,
20
+ "objective/kl": 0.0522441528737545,
21
+ "objective/non_score_reward": -0.0026122077833861113,
22
+ "objective/rlhf_reward": -0.0026122077833861113,
23
+ "objective/scores": 0.0,
24
+ "policy/approxkl_avg": 0.00023563228023704141,
25
+ "policy/clipfrac_avg": 0.00039308174746111035,
26
+ "policy/entropy_avg": 0.17652443051338196,
27
+ "step": 1,
28
+ "val/clipfrac_avg": 0.0,
29
+ "val/num_eos_tokens": 0,
30
+ "val/ratio": 0.9997509717941284,
31
+ "val/ratio_var": 1.331204543930653e-06
32
+ },
33
+ {
34
+ "episode": 48,
35
+ "epoch": 0.006857142857142857,
36
+ "eps": 2,
37
+ "loss/policy_avg": 9.456602856516838e-05,
38
+ "loss/value_avg": 0.0012460823636502028,
39
+ "lr": 3.3333333333333334e-08,
40
+ "objective/entropy": 11.654745101928711,
41
+ "objective/kl": 0.04007173329591751,
42
+ "objective/non_score_reward": -0.002003586385399103,
43
+ "objective/rlhf_reward": -0.002003586385399103,
44
+ "objective/scores": 0.0,
45
+ "policy/approxkl_avg": 0.0005037242081016302,
46
+ "policy/clipfrac_avg": 0.000589622650295496,
47
+ "policy/entropy_avg": 0.22399930655956268,
48
+ "step": 2,
49
+ "val/clipfrac_avg": 0.0,
50
+ "val/num_eos_tokens": 0,
51
+ "val/ratio": 0.999730110168457,
52
+ "val/ratio_var": 1.496027721259452e-06
53
+ },
54
+ {
55
+ "episode": 72,
56
+ "epoch": 0.010285714285714285,
57
+ "eps": 3,
58
+ "loss/policy_avg": 0.0005133129889145494,
59
+ "loss/value_avg": 0.0011833460303023458,
60
+ "lr": 6.666666666666667e-08,
61
+ "objective/entropy": 9.299583435058594,
62
+ "objective/kl": -0.02202693186700344,
63
+ "objective/non_score_reward": 0.0011013468028977513,
64
+ "objective/rlhf_reward": 0.0011013468028977513,
65
+ "objective/scores": 0.0,
66
+ "policy/approxkl_avg": 0.0002756627509370446,
67
+ "policy/clipfrac_avg": 0.00019654087373055518,
68
+ "policy/entropy_avg": 0.16759081184864044,
69
+ "step": 3,
70
+ "val/clipfrac_avg": 0.0,
71
+ "val/num_eos_tokens": 0,
72
+ "val/ratio": 1.0005955696105957,
73
+ "val/ratio_var": 1.5924795206956333e-06
74
+ },
75
+ {
76
+ "episode": 96,
77
+ "epoch": 0.013714285714285714,
78
+ "eps": 3,
79
+ "loss/policy_avg": -0.0007660436676815152,
80
+ "loss/value_avg": 0.0012440377613529563,
81
+ "lr": 1e-07,
82
+ "objective/entropy": 10.092122077941895,
83
+ "objective/kl": -0.001453061937354505,
84
+ "objective/non_score_reward": 7.26534053683281e-05,
85
+ "objective/rlhf_reward": 7.26534053683281e-05,
86
+ "objective/scores": 0.0,
87
+ "policy/approxkl_avg": 0.0004117934440728277,
88
+ "policy/clipfrac_avg": 0.001179245300590992,
89
+ "policy/entropy_avg": 0.18802356719970703,
90
+ "step": 4,
91
+ "val/clipfrac_avg": 0.0,
92
+ "val/num_eos_tokens": 0,
93
+ "val/ratio": 1.0004326105117798,
94
+ "val/ratio_var": 2.240923549834406e-06
95
+ },
96
+ {
97
+ "episode": 120,
98
+ "epoch": 0.017142857142857144,
99
+ "eps": 3,
100
+ "loss/policy_avg": -0.0014500913675874472,
101
+ "loss/value_avg": 0.001122939051128924,
102
+ "lr": 1.3333333333333334e-07,
103
+ "objective/entropy": 10.164737701416016,
104
+ "objective/kl": 0.05467784032225609,
105
+ "objective/non_score_reward": -0.002733892295509577,
106
+ "objective/rlhf_reward": -0.002733892295509577,
107
+ "objective/scores": 0.0,
108
+ "policy/approxkl_avg": 0.0004754411638714373,
109
+ "policy/clipfrac_avg": 0.002358490601181984,
110
+ "policy/entropy_avg": 0.18593449890613556,
111
+ "step": 5,
112
+ "val/clipfrac_avg": 0.0,
113
+ "val/num_eos_tokens": 0,
114
+ "val/ratio": 1.0011851787567139,
115
+ "val/ratio_var": 2.860480663002818e-06
116
+ },
117
+ {
118
+ "episode": 144,
119
+ "epoch": 0.02057142857142857,
120
+ "eps": 3,
121
+ "loss/policy_avg": -0.00038924324326217175,
122
+ "loss/value_avg": 0.0011501125991344452,
123
+ "lr": 1.6666666666666665e-07,
124
+ "objective/entropy": 10.74693489074707,
125
+ "objective/kl": -0.019970744848251343,
126
+ "objective/non_score_reward": 0.0009985374053940177,
127
+ "objective/rlhf_reward": 0.0009985374053940177,
128
+ "objective/scores": 0.0,
129
+ "policy/approxkl_avg": 0.0004121576203033328,
130
+ "policy/clipfrac_avg": 0.0013757861452177167,
131
+ "policy/entropy_avg": 0.20963406562805176,
132
+ "step": 6,
133
+ "val/clipfrac_avg": 0.0,
134
+ "val/num_eos_tokens": 0,
135
+ "val/ratio": 1.0004911422729492,
136
+ "val/ratio_var": 3.83469341613818e-06
137
+ },
138
+ {
139
+ "episode": 168,
140
+ "epoch": 0.024,
141
+ "eps": 3,
142
+ "loss/policy_avg": -0.0006043941248208284,
143
+ "loss/value_avg": 0.0011014372576028109,
144
+ "lr": 2e-07,
145
+ "objective/entropy": 9.983142852783203,
146
+ "objective/kl": -0.023275122046470642,
147
+ "objective/non_score_reward": 0.0011637563584372401,
148
+ "objective/rlhf_reward": 0.0011637563584372401,
149
+ "objective/scores": 0.0,
150
+ "policy/approxkl_avg": 0.00038099882658571005,
151
+ "policy/clipfrac_avg": 0.00039308174746111035,
152
+ "policy/entropy_avg": 0.18453757464885712,
153
+ "step": 7,
154
+ "val/clipfrac_avg": 0.0,
155
+ "val/num_eos_tokens": 0,
156
+ "val/ratio": 0.998975396156311,
157
+ "val/ratio_var": 2.2136255211080424e-06
158
+ },
159
+ {
160
+ "episode": 192,
161
+ "epoch": 0.027428571428571427,
162
+ "eps": 3,
163
+ "loss/policy_avg": -0.0012414506636559963,
164
+ "loss/value_avg": 0.0011209447402507067,
165
+ "lr": 2.3333333333333333e-07,
166
+ "objective/entropy": 10.798426628112793,
167
+ "objective/kl": 0.0037710717879235744,
168
+ "objective/non_score_reward": -0.00018855356029234827,
169
+ "objective/rlhf_reward": -0.00018855356029234827,
170
+ "objective/scores": 0.0,
171
+ "policy/approxkl_avg": 0.00043036130955442786,
172
+ "policy/clipfrac_avg": 0.0021619496401399374,
173
+ "policy/entropy_avg": 0.1988888382911682,
174
+ "step": 8,
175
+ "val/clipfrac_avg": 0.0,
176
+ "val/num_eos_tokens": 0,
177
+ "val/ratio": 0.999695360660553,
178
+ "val/ratio_var": 1.6811103478175937e-06
179
+ },
180
+ {
181
+ "episode": 216,
182
+ "epoch": 0.030857142857142857,
183
+ "eps": 3,
184
+ "loss/policy_avg": -0.0001550057204440236,
185
+ "loss/value_avg": 0.0011778806801885366,
186
+ "lr": 2.6666666666666667e-07,
187
+ "objective/entropy": 11.547513961791992,
188
+ "objective/kl": -0.006700088735669851,
189
+ "objective/non_score_reward": 0.00033500464633107185,
190
+ "objective/rlhf_reward": 0.00033500464633107185,
191
+ "objective/scores": 0.0,
192
+ "policy/approxkl_avg": 0.0003637151967268437,
193
+ "policy/clipfrac_avg": 0.00039308174746111035,
194
+ "policy/entropy_avg": 0.20382140576839447,
195
+ "step": 9,
196
+ "val/clipfrac_avg": 0.0,
197
+ "val/num_eos_tokens": 0,
198
+ "val/ratio": 1.000375509262085,
199
+ "val/ratio_var": 2.181258651035023e-06
200
+ },
201
+ {
202
+ "episode": 240,
203
+ "epoch": 0.03428571428571429,
204
+ "eps": 3,
205
+ "loss/policy_avg": 0.00045341846998780966,
206
+ "loss/value_avg": 0.0011125364108011127,
207
+ "lr": 3e-07,
208
+ "objective/entropy": 10.174510955810547,
209
+ "objective/kl": 0.07684259861707687,
210
+ "objective/non_score_reward": -0.0038421298377215862,
211
+ "objective/rlhf_reward": -0.0038421298377215862,
212
+ "objective/scores": 0.0,
213
+ "policy/approxkl_avg": 0.00037233877810649574,
214
+ "policy/clipfrac_avg": 0.00039308174746111035,
215
+ "policy/entropy_avg": 0.20036275684833527,
216
+ "step": 10,
217
+ "val/clipfrac_avg": 0.0,
218
+ "val/num_eos_tokens": 0,
219
+ "val/ratio": 1.0004992485046387,
220
+ "val/ratio_var": 3.829253728326876e-06
221
+ },
222
+ {
223
+ "episode": 264,
224
+ "epoch": 0.037714285714285714,
225
+ "eps": 4,
226
+ "loss/policy_avg": -0.0006368225440382957,
227
+ "loss/value_avg": 0.0012326778378337622,
228
+ "lr": 3.333333333333333e-07,
229
+ "objective/entropy": 9.915946006774902,
230
+ "objective/kl": 0.08295571058988571,
231
+ "objective/non_score_reward": -0.0041477857157588005,
232
+ "objective/rlhf_reward": -0.0041477857157588005,
233
+ "objective/scores": 0.0,
234
+ "policy/approxkl_avg": 0.0003957168955821544,
235
+ "policy/clipfrac_avg": 0.00019654087373055518,
236
+ "policy/entropy_avg": 0.18829384446144104,
237
+ "step": 11,
238
+ "val/clipfrac_avg": 0.0,
239
+ "val/num_eos_tokens": 0,
240
+ "val/ratio": 1.0000991821289062,
241
+ "val/ratio_var": 2.684702167243813e-06
242
+ },
243
+ {
244
+ "episode": 288,
245
+ "epoch": 0.04114285714285714,
246
+ "eps": 4,
247
+ "loss/policy_avg": -0.00045989686623215675,
248
+ "loss/value_avg": 0.001033363863825798,
249
+ "lr": 3.666666666666666e-07,
250
+ "objective/entropy": 9.914142608642578,
251
+ "objective/kl": 0.001713767647743225,
252
+ "objective/non_score_reward": -8.56887418194674e-05,
253
+ "objective/rlhf_reward": -8.56887418194674e-05,
254
+ "objective/scores": 0.0,
255
+ "policy/approxkl_avg": 0.00041497976053506136,
256
+ "policy/clipfrac_avg": 0.0013757861452177167,
257
+ "policy/entropy_avg": 0.20258067548274994,
258
+ "step": 12,
259
+ "val/clipfrac_avg": 0.0,
260
+ "val/num_eos_tokens": 0,
261
+ "val/ratio": 1.000044345855713,
262
+ "val/ratio_var": 2.319321311006206e-06
263
+ },
264
+ {
265
+ "episode": 312,
266
+ "epoch": 0.044571428571428574,
267
+ "eps": 4,
268
+ "loss/policy_avg": -4.981772508472204e-05,
269
+ "loss/value_avg": 0.0011141800787299871,
270
+ "lr": 4e-07,
271
+ "objective/entropy": 8.326139450073242,
272
+ "objective/kl": 0.03665222227573395,
273
+ "objective/non_score_reward": -0.001832611276768148,
274
+ "objective/rlhf_reward": -0.001832611276768148,
275
+ "objective/scores": 0.0,
276
+ "policy/approxkl_avg": 0.0002924801083281636,
277
+ "policy/clipfrac_avg": 0.00019654087373055518,
278
+ "policy/entropy_avg": 0.15716251730918884,
279
+ "step": 13,
280
+ "val/clipfrac_avg": 0.0,
281
+ "val/num_eos_tokens": 0,
282
+ "val/ratio": 1.0000745058059692,
283
+ "val/ratio_var": 9.655751682657865e-07
284
+ },
285
+ {
286
+ "episode": 336,
287
+ "epoch": 0.048,
288
+ "eps": 4,
289
+ "loss/policy_avg": -0.0008058485109359026,
290
+ "loss/value_avg": 0.0011609160574153066,
291
+ "lr": 4.3333333333333335e-07,
292
+ "objective/entropy": 10.01640510559082,
293
+ "objective/kl": 0.02480611763894558,
294
+ "objective/non_score_reward": -0.001240305951796472,
295
+ "objective/rlhf_reward": -0.001240305951796472,
296
+ "objective/scores": 0.0,
297
+ "policy/approxkl_avg": 0.00044258785783313215,
298
+ "policy/clipfrac_avg": 0.001965408679097891,
299
+ "policy/entropy_avg": 0.19537438452243805,
300
+ "step": 14,
301
+ "val/clipfrac_avg": 0.0,
302
+ "val/num_eos_tokens": 0,
303
+ "val/ratio": 0.9992372989654541,
304
+ "val/ratio_var": 2.064736008833279e-06
305
+ },
306
+ {
307
+ "episode": 360,
308
+ "epoch": 0.05142857142857143,
309
+ "eps": 4,
310
+ "loss/policy_avg": -0.0012431369395926595,
311
+ "loss/value_avg": 0.0011450606398284435,
312
+ "lr": 4.6666666666666666e-07,
313
+ "objective/entropy": 8.250815391540527,
314
+ "objective/kl": 0.026728516444563866,
315
+ "objective/non_score_reward": -0.0013364258920773864,
316
+ "objective/rlhf_reward": -0.0013364258920773864,
317
+ "objective/scores": 0.0,
318
+ "policy/approxkl_avg": 0.00041782623156905174,
319
+ "policy/clipfrac_avg": 0.0027515722904354334,
320
+ "policy/entropy_avg": 0.15008953213691711,
321
+ "step": 15,
322
+ "val/clipfrac_avg": 0.0,
323
+ "val/num_eos_tokens": 0,
324
+ "val/ratio": 0.9994743466377258,
325
+ "val/ratio_var": 1.3626447525894037e-06
326
+ },
327
+ {
328
+ "episode": 384,
329
+ "epoch": 0.054857142857142854,
330
+ "eps": 4,
331
+ "loss/policy_avg": -0.0016457759775221348,
332
+ "loss/value_avg": 0.0012071160599589348,
333
+ "lr": 5e-07,
334
+ "objective/entropy": 9.560142517089844,
335
+ "objective/kl": 0.04084904119372368,
336
+ "objective/non_score_reward": -0.0020424523390829563,
337
+ "objective/rlhf_reward": -0.0020424523390829563,
338
+ "objective/scores": 0.0,
339
+ "policy/approxkl_avg": 0.00030206821975298226,
340
+ "policy/clipfrac_avg": 0.000589622650295496,
341
+ "policy/entropy_avg": 0.19114547967910767,
342
+ "step": 16,
343
+ "val/clipfrac_avg": 0.0,
344
+ "val/num_eos_tokens": 0,
345
+ "val/ratio": 0.9999871253967285,
346
+ "val/ratio_var": 1.3688204489881173e-06
347
+ },
348
+ {
349
+ "episode": 408,
350
+ "epoch": 0.05828571428571429,
351
+ "eps": 4,
352
+ "loss/policy_avg": 0.00015279045328497887,
353
+ "loss/value_avg": 0.0011765253730118275,
354
+ "lr": 5.333333333333333e-07,
355
+ "objective/entropy": 10.473526000976562,
356
+ "objective/kl": 0.012600630521774292,
357
+ "objective/non_score_reward": -0.0006300313398241997,
358
+ "objective/rlhf_reward": -0.0006300313398241997,
359
+ "objective/scores": 0.0,
360
+ "policy/approxkl_avg": 0.000512410537339747,
361
+ "policy/clipfrac_avg": 0.0009827043395489454,
362
+ "policy/entropy_avg": 0.1867837905883789,
363
+ "step": 17,
364
+ "val/clipfrac_avg": 0.0,
365
+ "val/num_eos_tokens": 0,
366
+ "val/ratio": 1.0001708269119263,
367
+ "val/ratio_var": 3.6231228932592785e-06
368
+ },
369
+ {
370
+ "episode": 432,
371
+ "epoch": 0.061714285714285715,
372
+ "eps": 4,
373
+ "loss/policy_avg": -0.0015998110175132751,
374
+ "loss/value_avg": 0.0011272402480244637,
375
+ "lr": 5.666666666666666e-07,
376
+ "objective/entropy": 9.374881744384766,
377
+ "objective/kl": -0.022168656811118126,
378
+ "objective/non_score_reward": 0.001108432887122035,
379
+ "objective/rlhf_reward": 0.001108432887122035,
380
+ "objective/scores": 0.0,
381
+ "policy/approxkl_avg": 0.0003468349459581077,
382
+ "policy/clipfrac_avg": 0.00019654087373055518,
383
+ "policy/entropy_avg": 0.17824786901474,
384
+ "step": 18,
385
+ "val/clipfrac_avg": 0.0,
386
+ "val/num_eos_tokens": 0,
387
+ "val/ratio": 1.0005429983139038,
388
+ "val/ratio_var": 1.015850784824579e-06
389
+ },
390
+ {
391
+ "episode": 456,
392
+ "epoch": 0.06514285714285714,
393
+ "eps": 4,
394
+ "loss/policy_avg": 0.0005391768645495176,
395
+ "loss/value_avg": 0.0010727113112807274,
396
+ "lr": 6e-07,
397
+ "objective/entropy": 9.847137451171875,
398
+ "objective/kl": 0.10544447600841522,
399
+ "objective/non_score_reward": -0.0052722240798175335,
400
+ "objective/rlhf_reward": -0.0052722240798175335,
401
+ "objective/scores": 0.0,
402
+ "policy/approxkl_avg": 0.0003828281769528985,
403
+ "policy/clipfrac_avg": 0.000589622650295496,
404
+ "policy/entropy_avg": 0.191776305437088,
405
+ "step": 19,
406
+ "val/clipfrac_avg": 0.0,
407
+ "val/num_eos_tokens": 0,
408
+ "val/ratio": 0.9989888072013855,
409
+ "val/ratio_var": 1.1592291002671118e-06
410
+ },
411
+ {
412
+ "episode": 480,
413
+ "epoch": 0.06857142857142857,
414
+ "eps": 4,
415
+ "loss/policy_avg": 0.0004999454831704497,
416
+ "loss/value_avg": 0.0011963938595727086,
417
+ "lr": 6.333333333333332e-07,
418
+ "objective/entropy": 9.980831146240234,
419
+ "objective/kl": 0.009486228227615356,
420
+ "objective/non_score_reward": -0.0004743114986922592,
421
+ "objective/rlhf_reward": -0.0004743114986922592,
422
+ "objective/scores": 0.0,
423
+ "policy/approxkl_avg": 0.0003429501666687429,
424
+ "policy/clipfrac_avg": 0.0,
425
+ "policy/entropy_avg": 0.18964967131614685,
426
+ "step": 20,
427
+ "val/clipfrac_avg": 0.0,
428
+ "val/num_eos_tokens": 0,
429
+ "val/ratio": 0.9999833703041077,
430
+ "val/ratio_var": 3.019130190295982e-06
431
+ },
432
+ {
433
+ "episode": 504,
434
+ "epoch": 0.072,
435
+ "eps": 4,
436
+ "loss/policy_avg": -0.0006457410054281354,
437
+ "loss/value_avg": 0.0011750897392630577,
438
+ "lr": 6.666666666666666e-07,
439
+ "objective/entropy": 8.674556732177734,
440
+ "objective/kl": 0.16487348079681396,
441
+ "objective/non_score_reward": -0.008243675343692303,
442
+ "objective/rlhf_reward": -0.008243675343692303,
443
+ "objective/scores": 0.0,
444
+ "policy/approxkl_avg": 0.00034728588070720434,
445
+ "policy/clipfrac_avg": 0.0007861634949222207,
446
+ "policy/entropy_avg": 0.17806154489517212,
447
+ "step": 21,
448
+ "val/clipfrac_avg": 0.0,
449
+ "val/num_eos_tokens": 0,
450
+ "val/ratio": 0.9990400671958923,
451
+ "val/ratio_var": 1.2348247082627495e-06
452
+ },
453
+ {
454
+ "episode": 528,
455
+ "epoch": 0.07542857142857143,
456
+ "eps": 4,
457
+ "loss/policy_avg": -0.0015686429105699062,
458
+ "loss/value_avg": 0.001164754037745297,
459
+ "lr": 7e-07,
460
+ "objective/entropy": 10.79302978515625,
461
+ "objective/kl": -0.08593261241912842,
462
+ "objective/non_score_reward": 0.004296630620956421,
463
+ "objective/rlhf_reward": 0.004296630620956421,
464
+ "objective/scores": 0.0,
465
+ "policy/approxkl_avg": 0.00043445732444524765,
466
+ "policy/clipfrac_avg": 0.001768867950886488,
467
+ "policy/entropy_avg": 0.1922873705625534,
468
+ "step": 22,
469
+ "val/clipfrac_avg": 0.0,
470
+ "val/num_eos_tokens": 0,
471
+ "val/ratio": 1.0014443397521973,
472
+ "val/ratio_var": 3.1963945730240084e-06
473
+ },
474
+ {
475
+ "episode": 552,
476
+ "epoch": 0.07885714285714286,
477
+ "eps": 4,
478
+ "loss/policy_avg": -0.0007012488786131144,
479
+ "loss/value_avg": 0.0011740096379071474,
480
+ "lr": 7.333333333333332e-07,
481
+ "objective/entropy": 9.420793533325195,
482
+ "objective/kl": 0.13005897402763367,
483
+ "objective/non_score_reward": -0.006502948235720396,
484
+ "objective/rlhf_reward": -0.006502948235720396,
485
+ "objective/scores": 0.0,
486
+ "policy/approxkl_avg": 0.000359610392479226,
487
+ "policy/clipfrac_avg": 0.0013757860288023949,
488
+ "policy/entropy_avg": 0.1827276051044464,
489
+ "step": 23,
490
+ "val/clipfrac_avg": 0.0,
491
+ "val/num_eos_tokens": 0,
492
+ "val/ratio": 0.9988819360733032,
493
+ "val/ratio_var": 1.4016917475601076e-06
494
+ },
495
+ {
496
+ "episode": 576,
497
+ "epoch": 0.08228571428571428,
498
+ "eps": 4,
499
+ "loss/policy_avg": -0.0009957049041986465,
500
+ "loss/value_avg": 0.001171375741250813,
501
+ "lr": 7.666666666666667e-07,
502
+ "objective/entropy": 12.150233268737793,
503
+ "objective/kl": 0.11499027907848358,
504
+ "objective/non_score_reward": -0.005749514326453209,
505
+ "objective/rlhf_reward": -0.005749514326453209,
506
+ "objective/scores": 0.0,
507
+ "policy/approxkl_avg": 0.0012159445323050022,
508
+ "policy/clipfrac_avg": 0.004323899280279875,
509
+ "policy/entropy_avg": 0.22310155630111694,
510
+ "step": 24,
511
+ "val/clipfrac_avg": 0.0,
512
+ "val/num_eos_tokens": 0,
513
+ "val/ratio": 0.9989429712295532,
514
+ "val/ratio_var": 1.6364158454962308e-06
515
+ },
516
+ {
517
+ "episode": 600,
518
+ "epoch": 0.08571428571428572,
519
+ "eps": 4,
520
+ "loss/policy_avg": 5.847762804478407e-05,
521
+ "loss/value_avg": 0.0012743598781526089,
522
+ "lr": 8e-07,
523
+ "objective/entropy": 10.221760749816895,
524
+ "objective/kl": -0.07403554022312164,
525
+ "objective/non_score_reward": 0.0037017769645899534,
526
+ "objective/rlhf_reward": 0.0037017769645899534,
527
+ "objective/scores": 0.0,
528
+ "policy/approxkl_avg": 0.00038175773806869984,
529
+ "policy/clipfrac_avg": 0.00039308174746111035,
530
+ "policy/entropy_avg": 0.1923346221446991,
531
+ "step": 25,
532
+ "val/clipfrac_avg": 0.0,
533
+ "val/num_eos_tokens": 0,
534
+ "val/ratio": 1.0002760887145996,
535
+ "val/ratio_var": 2.4850394311215496e-06
536
+ },
537
+ {
538
+ "episode": 624,
539
+ "epoch": 0.08914285714285715,
540
+ "eps": 4,
541
+ "loss/policy_avg": 0.0005327528342604637,
542
+ "loss/value_avg": 0.001223960192874074,
543
+ "lr": 8.333333333333333e-07,
544
+ "objective/entropy": 9.961271286010742,
545
+ "objective/kl": -0.03506360575556755,
546
+ "objective/non_score_reward": 0.0017531800549477339,
547
+ "objective/rlhf_reward": 0.0017531800549477339,
548
+ "objective/scores": 0.0,
549
+ "policy/approxkl_avg": 0.0003690699813887477,
550
+ "policy/clipfrac_avg": 0.00019654087373055518,
551
+ "policy/entropy_avg": 0.19056138396263123,
552
+ "step": 26,
553
+ "val/clipfrac_avg": 0.0,
554
+ "val/num_eos_tokens": 0,
555
+ "val/ratio": 0.9998593926429749,
556
+ "val/ratio_var": 2.2398712644644547e-06
557
+ },
558
+ {
559
+ "episode": 648,
560
+ "epoch": 0.09257142857142857,
561
+ "eps": 4,
562
+ "loss/policy_avg": -0.0012908531352877617,
563
+ "loss/value_avg": 0.0011609859066084027,
564
+ "lr": 8.666666666666667e-07,
565
+ "objective/entropy": 10.788581848144531,
566
+ "objective/kl": -0.035950906574726105,
567
+ "objective/non_score_reward": 0.0017975454684346914,
568
+ "objective/rlhf_reward": 0.0017975454684346914,
569
+ "objective/scores": 0.0,
570
+ "policy/approxkl_avg": 0.00042478417162783444,
571
+ "policy/clipfrac_avg": 0.001179245300590992,
572
+ "policy/entropy_avg": 0.20873138308525085,
573
+ "step": 27,
574
+ "val/clipfrac_avg": 0.0,
575
+ "val/num_eos_tokens": 0,
576
+ "val/ratio": 0.9998935461044312,
577
+ "val/ratio_var": 2.4417051918135257e-06
578
+ },
579
+ {
580
+ "episode": 672,
581
+ "epoch": 0.096,
582
+ "eps": 4,
583
+ "loss/policy_avg": 0.000542065070476383,
584
+ "loss/value_avg": 0.0012408210895955563,
585
+ "lr": 9e-07,
586
+ "objective/entropy": 8.87173080444336,
587
+ "objective/kl": -0.033091746270656586,
588
+ "objective/non_score_reward": 0.0016545872204005718,
589
+ "objective/rlhf_reward": 0.0016545872204005718,
590
+ "objective/scores": 0.0,
591
+ "policy/approxkl_avg": 0.00035692937672138214,
592
+ "policy/clipfrac_avg": 0.0,
593
+ "policy/entropy_avg": 0.17584078013896942,
594
+ "step": 28,
595
+ "val/clipfrac_avg": 0.0,
596
+ "val/num_eos_tokens": 0,
597
+ "val/ratio": 0.9999797940254211,
598
+ "val/ratio_var": 2.1010207547078608e-06
599
+ },
600
+ {
601
+ "episode": 696,
602
+ "epoch": 0.09942857142857142,
603
+ "eps": 3,
604
+ "loss/policy_avg": 0.00025542714865878224,
605
+ "loss/value_avg": 0.0009407824254594743,
606
+ "lr": 9.333333333333333e-07,
607
+ "objective/entropy": 10.091318130493164,
608
+ "objective/kl": -0.0042562782764434814,
609
+ "objective/non_score_reward": 0.00021281388762872666,
610
+ "objective/rlhf_reward": 0.00021281388762872666,
611
+ "objective/scores": 0.0,
612
+ "policy/approxkl_avg": 0.0004177533555775881,
613
+ "policy/clipfrac_avg": 0.000589622650295496,
614
+ "policy/entropy_avg": 0.20332473516464233,
615
+ "step": 29,
616
+ "val/clipfrac_avg": 0.0,
617
+ "val/num_eos_tokens": 0,
618
+ "val/ratio": 1.0008416175842285,
619
+ "val/ratio_var": 4.070296654390404e-06
620
+ },
621
+ {
622
+ "episode": 720,
623
+ "epoch": 0.10285714285714286,
624
+ "eps": 3,
625
+ "loss/policy_avg": -0.0018287734128534794,
626
+ "loss/value_avg": 0.0010721104918047786,
627
+ "lr": 9.666666666666666e-07,
628
+ "objective/entropy": 11.005254745483398,
629
+ "objective/kl": 0.05638129264116287,
630
+ "objective/non_score_reward": -0.002819064538925886,
631
+ "objective/rlhf_reward": -0.002819064538925886,
632
+ "objective/scores": 0.0,
633
+ "policy/approxkl_avg": 0.00047117556096054614,
634
+ "policy/clipfrac_avg": 0.0015723269898444414,
635
+ "policy/entropy_avg": 0.20323297381401062,
636
+ "step": 30,
637
+ "val/clipfrac_avg": 0.0,
638
+ "val/num_eos_tokens": 0,
639
+ "val/ratio": 0.9995233416557312,
640
+ "val/ratio_var": 2.1699356693716254e-06
641
+ },
642
+ {
643
+ "episode": 744,
644
+ "epoch": 0.10628571428571429,
645
+ "eps": 3,
646
+ "loss/policy_avg": -0.0013574458425864577,
647
+ "loss/value_avg": 0.0011641676537692547,
648
+ "lr": 1e-06,
649
+ "objective/entropy": 11.450809478759766,
650
+ "objective/kl": -0.010124157182872295,
651
+ "objective/non_score_reward": 0.000506207812577486,
652
+ "objective/rlhf_reward": 0.000506207812577486,
653
+ "objective/scores": 0.0,
654
+ "policy/approxkl_avg": 0.0004096939228475094,
655
+ "policy/clipfrac_avg": 0.00039308174746111035,
656
+ "policy/entropy_avg": 0.2194271683692932,
657
+ "step": 31,
658
+ "val/clipfrac_avg": 0.0,
659
+ "val/num_eos_tokens": 0,
660
+ "val/ratio": 1.000313401222229,
661
+ "val/ratio_var": 2.678104010556126e-06
662
+ },
663
+ {
664
+ "episode": 768,
665
+ "epoch": 0.10971428571428571,
666
+ "eps": 3,
667
+ "loss/policy_avg": -0.0002802757080644369,
668
+ "loss/value_avg": 0.001182440435513854,
669
+ "lr": 9.999676499856762e-07,
670
+ "objective/entropy": 9.76666259765625,
671
+ "objective/kl": 0.06829522550106049,
672
+ "objective/non_score_reward": -0.0034147612750530243,
673
+ "objective/rlhf_reward": -0.0034147612750530243,
674
+ "objective/scores": 0.0,
675
+ "policy/approxkl_avg": 0.00030742710805498064,
676
+ "policy/clipfrac_avg": 0.00039308174746111035,
677
+ "policy/entropy_avg": 0.19262900948524475,
678
+ "step": 32,
679
+ "val/clipfrac_avg": 0.0,
680
+ "val/num_eos_tokens": 0,
681
+ "val/ratio": 1.0005744695663452,
682
+ "val/ratio_var": 2.010586285905447e-06
683
+ },
684
+ {
685
+ "episode": 792,
686
+ "epoch": 0.11314285714285714,
687
+ "eps": 3,
688
+ "loss/policy_avg": -4.241405986249447e-05,
689
+ "loss/value_avg": 0.001140992040745914,
690
+ "lr": 9.998706045939205e-07,
691
+ "objective/entropy": 9.192793846130371,
692
+ "objective/kl": 0.011741682887077332,
693
+ "objective/non_score_reward": -0.0005870835739187896,
694
+ "objective/rlhf_reward": -0.0005870835739187896,
695
+ "objective/scores": 0.0,
696
+ "policy/approxkl_avg": 0.0003442139131948352,
697
+ "policy/clipfrac_avg": 0.0007861634949222207,
698
+ "policy/entropy_avg": 0.18409989774227142,
699
+ "step": 33,
700
+ "val/clipfrac_avg": 0.0,
701
+ "val/num_eos_tokens": 0,
702
+ "val/ratio": 1.0005278587341309,
703
+ "val/ratio_var": 2.262753923787386e-06
704
+ },
705
+ {
706
+ "episode": 816,
707
+ "epoch": 0.11657142857142858,
708
+ "eps": 3,
709
+ "loss/policy_avg": -0.0008663232438266277,
710
+ "loss/value_avg": 0.001219075871631503,
711
+ "lr": 9.997088777777095e-07,
712
+ "objective/entropy": 9.554176330566406,
713
+ "objective/kl": 0.09085823595523834,
714
+ "objective/non_score_reward": -0.004542911425232887,
715
+ "objective/rlhf_reward": -0.004542911425232887,
716
+ "objective/scores": 0.0,
717
+ "policy/approxkl_avg": 0.0003868684871122241,
718
+ "policy/clipfrac_avg": 0.0009827043395489454,
719
+ "policy/entropy_avg": 0.2021941840648651,
720
+ "step": 34,
721
+ "val/clipfrac_avg": 0.0,
722
+ "val/num_eos_tokens": 0,
723
+ "val/ratio": 0.9998293519020081,
724
+ "val/ratio_var": 1.1972449556196807e-06
725
+ },
726
+ {
727
+ "episode": 840,
728
+ "epoch": 0.12,
729
+ "eps": 3,
730
+ "loss/policy_avg": -1.8059727153740823e-05,
731
+ "loss/value_avg": 0.00110139069147408,
732
+ "lr": 9.994824927897762e-07,
733
+ "objective/entropy": 10.6250638961792,
734
+ "objective/kl": -0.057124651968479156,
735
+ "objective/non_score_reward": 0.002856232225894928,
736
+ "objective/rlhf_reward": 0.002856232225894928,
737
+ "objective/scores": 0.0,
738
+ "policy/approxkl_avg": 0.00036990485386922956,
739
+ "policy/clipfrac_avg": 0.00039308174746111035,
740
+ "policy/entropy_avg": 0.19656652212142944,
741
+ "step": 35,
742
+ "val/clipfrac_avg": 0.0,
743
+ "val/num_eos_tokens": 0,
744
+ "val/ratio": 1.000251054763794,
745
+ "val/ratio_var": 3.1737645258544944e-06
746
+ },
747
+ {
748
+ "episode": 864,
749
+ "epoch": 0.12342857142857143,
750
+ "eps": 3,
751
+ "loss/policy_avg": 0.0005211837124079466,
752
+ "loss/value_avg": 0.001098247361369431,
753
+ "lr": 9.99191482179265e-07,
754
+ "objective/entropy": 11.432059288024902,
755
+ "objective/kl": 0.03775228559970856,
756
+ "objective/non_score_reward": -0.001887614605948329,
757
+ "objective/rlhf_reward": -0.001887614605948329,
758
+ "objective/scores": 0.0,
759
+ "policy/approxkl_avg": 0.0015855329111218452,
760
+ "policy/clipfrac_avg": 0.001965408679097891,
761
+ "policy/entropy_avg": 0.2018502950668335,
762
+ "step": 36,
763
+ "val/clipfrac_avg": 0.0,
764
+ "val/num_eos_tokens": 0,
765
+ "val/ratio": 0.9998794794082642,
766
+ "val/ratio_var": 2.63856213678082e-06
767
+ },
768
+ {
769
+ "episode": 888,
770
+ "epoch": 0.12685714285714286,
771
+ "eps": 3,
772
+ "loss/policy_avg": -0.0003723090048879385,
773
+ "loss/value_avg": 0.0011761891655623913,
774
+ "lr": 9.988358877870534e-07,
775
+ "objective/entropy": 13.564403533935547,
776
+ "objective/kl": 0.14848901331424713,
777
+ "objective/non_score_reward": -0.007424450945109129,
778
+ "objective/rlhf_reward": -0.007424450945109129,
779
+ "objective/scores": 0.0,
780
+ "policy/approxkl_avg": 0.0005069933249615133,
781
+ "policy/clipfrac_avg": 0.000589622650295496,
782
+ "policy/entropy_avg": 0.21795369684696198,
783
+ "step": 37,
784
+ "val/clipfrac_avg": 0.0,
785
+ "val/num_eos_tokens": 0,
786
+ "val/ratio": 0.9994122982025146,
787
+ "val/ratio_var": 3.516516017043614e-06
788
+ },
789
+ {
790
+ "episode": 912,
791
+ "epoch": 0.13028571428571428,
792
+ "eps": 3,
793
+ "loss/policy_avg": -0.0005580224096775055,
794
+ "loss/value_avg": 0.0010930404532700777,
795
+ "lr": 9.984157607397357e-07,
796
+ "objective/entropy": 12.46867561340332,
797
+ "objective/kl": 0.04366648197174072,
798
+ "objective/non_score_reward": -0.00218332395888865,
799
+ "objective/rlhf_reward": -0.00218332395888865,
800
+ "objective/scores": 0.0,
801
+ "policy/approxkl_avg": 0.000503862276673317,
802
+ "policy/clipfrac_avg": 0.000589622650295496,
803
+ "policy/entropy_avg": 0.2015783190727234,
804
+ "step": 38,
805
+ "val/clipfrac_avg": 0.0,
806
+ "val/num_eos_tokens": 0,
807
+ "val/ratio": 1.0004000663757324,
808
+ "val/ratio_var": 6.4164978539338335e-06
809
+ },
810
+ {
811
+ "episode": 936,
812
+ "epoch": 0.1337142857142857,
813
+ "eps": 3,
814
+ "loss/policy_avg": -0.0002338151098228991,
815
+ "loss/value_avg": 0.0011000724043697119,
816
+ "lr": 9.979311614422718e-07,
817
+ "objective/entropy": 11.266685485839844,
818
+ "objective/kl": 0.10951797664165497,
819
+ "objective/non_score_reward": -0.005475898738950491,
820
+ "objective/rlhf_reward": -0.005475898738950491,
821
+ "objective/scores": 0.0,
822
+ "policy/approxkl_avg": 0.0004268661141395569,
823
+ "policy/clipfrac_avg": 0.000589622650295496,
824
+ "policy/entropy_avg": 0.21669596433639526,
825
+ "step": 39,
826
+ "val/clipfrac_avg": 0.0,
827
+ "val/num_eos_tokens": 0,
828
+ "val/ratio": 0.9992050528526306,
829
+ "val/ratio_var": 1.6559835103180376e-06
830
+ },
831
+ {
832
+ "episode": 960,
833
+ "epoch": 0.13714285714285715,
834
+ "eps": 3,
835
+ "loss/policy_avg": -0.0007765925256535411,
836
+ "loss/value_avg": 0.001088449265807867,
837
+ "lr": 9.973821595693026e-07,
838
+ "objective/entropy": 12.338579177856445,
839
+ "objective/kl": 0.06209728866815567,
840
+ "objective/non_score_reward": -0.0031048643868416548,
841
+ "objective/rlhf_reward": -0.0031048643868416548,
842
+ "objective/scores": 0.0,
843
+ "policy/approxkl_avg": 0.0005026735016144812,
844
+ "policy/clipfrac_avg": 0.001179245300590992,
845
+ "policy/entropy_avg": 0.2244863510131836,
846
+ "step": 40,
847
+ "val/clipfrac_avg": 0.0,
848
+ "val/num_eos_tokens": 0,
849
+ "val/ratio": 0.9999690651893616,
850
+ "val/ratio_var": 5.0828230087063275e-06
851
+ },
852
+ {
853
+ "episode": 984,
854
+ "epoch": 0.14057142857142857,
855
+ "eps": 3,
856
+ "loss/policy_avg": 0.00013598555233329535,
857
+ "loss/value_avg": 0.001187118818052113,
858
+ "lr": 9.967688340551327e-07,
859
+ "objective/entropy": 9.387279510498047,
860
+ "objective/kl": -0.033916082233190536,
861
+ "objective/non_score_reward": 0.0016958042979240417,
862
+ "objective/rlhf_reward": 0.0016958042979240417,
863
+ "objective/scores": 0.0,
864
+ "policy/approxkl_avg": 0.0004025290545541793,
865
+ "policy/clipfrac_avg": 0.0013757861452177167,
866
+ "policy/entropy_avg": 0.19263054430484772,
867
+ "step": 41,
868
+ "val/clipfrac_avg": 0.0,
869
+ "val/num_eos_tokens": 0,
870
+ "val/ratio": 1.0011463165283203,
871
+ "val/ratio_var": 3.6780629670829512e-06
872
+ },
873
+ {
874
+ "episode": 1008,
875
+ "epoch": 0.144,
876
+ "eps": 3,
877
+ "loss/policy_avg": -0.001398796564899385,
878
+ "loss/value_avg": 0.0012791166082024574,
879
+ "lr": 9.960912730823802e-07,
880
+ "objective/entropy": 9.443194389343262,
881
+ "objective/kl": 0.040594689548015594,
882
+ "objective/non_score_reward": -0.0020297346636652946,
883
+ "objective/rlhf_reward": -0.0020297346636652946,
884
+ "objective/scores": 0.0,
885
+ "policy/approxkl_avg": 0.00038655230309814215,
886
+ "policy/clipfrac_avg": 0.0009827043395489454,
887
+ "policy/entropy_avg": 0.17716416716575623,
888
+ "step": 42,
889
+ "val/clipfrac_avg": 0.0,
890
+ "val/num_eos_tokens": 0,
891
+ "val/ratio": 0.9991838335990906,
892
+ "val/ratio_var": 1.4638164884672733e-06
893
+ },
894
+ {
895
+ "episode": 1032,
896
+ "epoch": 0.14742857142857144,
897
+ "eps": 3,
898
+ "loss/policy_avg": -0.0008904861751943827,
899
+ "loss/value_avg": 0.0012651337310671806,
900
+ "lr": 9.953495740692994e-07,
901
+ "objective/entropy": 8.784332275390625,
902
+ "objective/kl": 0.048220910131931305,
903
+ "objective/non_score_reward": -0.0024110455997288227,
904
+ "objective/rlhf_reward": -0.0024110455997288227,
905
+ "objective/scores": 0.0,
906
+ "policy/approxkl_avg": 0.00036320951767265797,
907
+ "policy/clipfrac_avg": 0.00039308174746111035,
908
+ "policy/entropy_avg": 0.19501766562461853,
909
+ "step": 43,
910
+ "val/clipfrac_avg": 0.0,
911
+ "val/num_eos_tokens": 0,
912
+ "val/ratio": 0.9992011785507202,
913
+ "val/ratio_var": 7.953665885906958e-07
914
+ },
915
+ {
916
+ "episode": 1056,
917
+ "epoch": 0.15085714285714286,
918
+ "eps": 3,
919
+ "loss/policy_avg": -0.00040724524296820164,
920
+ "loss/value_avg": 0.0012381336418911815,
921
+ "lr": 9.945438436557734e-07,
922
+ "objective/entropy": 10.724536895751953,
923
+ "objective/kl": 0.09735743701457977,
924
+ "objective/non_score_reward": -0.004867871757596731,
925
+ "objective/rlhf_reward": -0.004867871757596731,
926
+ "objective/scores": 0.0,
927
+ "policy/approxkl_avg": 0.0004834880237467587,
928
+ "policy/clipfrac_avg": 0.0009827043395489454,
929
+ "policy/entropy_avg": 0.19912828505039215,
930
+ "step": 44,
931
+ "val/clipfrac_avg": 0.0,
932
+ "val/num_eos_tokens": 0,
933
+ "val/ratio": 0.9993612766265869,
934
+ "val/ratio_var": 3.948975518142106e-06
935
+ },
936
+ {
937
+ "episode": 1080,
938
+ "epoch": 0.15428571428571428,
939
+ "eps": 3,
940
+ "loss/policy_avg": -0.0007694043451920152,
941
+ "loss/value_avg": 0.0012855801032856107,
942
+ "lr": 9.93674197687982e-07,
943
+ "objective/entropy": 10.774508476257324,
944
+ "objective/kl": 0.036663156002759933,
945
+ "objective/non_score_reward": -0.0018331576138734818,
946
+ "objective/rlhf_reward": -0.0018331576138734818,
947
+ "objective/scores": 0.0,
948
+ "policy/approxkl_avg": 0.0003691558085847646,
949
+ "policy/clipfrac_avg": 0.0007861634949222207,
950
+ "policy/entropy_avg": 0.19582445919513702,
951
+ "step": 45,
952
+ "val/clipfrac_avg": 0.0,
953
+ "val/num_eos_tokens": 0,
954
+ "val/ratio": 1.0001473426818848,
955
+ "val/ratio_var": 3.0011931357876165e-06
956
+ },
957
+ {
958
+ "episode": 1104,
959
+ "epoch": 0.15771428571428572,
960
+ "eps": 3,
961
+ "loss/policy_avg": -0.0010882224887609482,
962
+ "loss/value_avg": 0.0011191555531695485,
963
+ "lr": 9.927407612017446e-07,
964
+ "objective/entropy": 8.94318962097168,
965
+ "objective/kl": 0.036568351089954376,
966
+ "objective/non_score_reward": -0.0018284174147993326,
967
+ "objective/rlhf_reward": -0.0018284174147993326,
968
+ "objective/scores": 0.0,
969
+ "policy/approxkl_avg": 0.00030608323868364096,
970
+ "policy/clipfrac_avg": 0.000589622650295496,
971
+ "policy/entropy_avg": 0.18505753576755524,
972
+ "step": 46,
973
+ "val/clipfrac_avg": 0.0,
974
+ "val/num_eos_tokens": 0,
975
+ "val/ratio": 0.9989262819290161,
976
+ "val/ratio_var": 2.5265023850806756e-06
977
+ },
978
+ {
979
+ "episode": 1128,
980
+ "epoch": 0.16114285714285714,
981
+ "eps": 3,
982
+ "loss/policy_avg": 6.927899084985256e-05,
983
+ "loss/value_avg": 0.0010387528454884887,
984
+ "lr": 9.91743668404545e-07,
985
+ "objective/entropy": 13.284429550170898,
986
+ "objective/kl": 0.0841490626335144,
987
+ "objective/non_score_reward": -0.00420745275914669,
988
+ "objective/rlhf_reward": -0.00420745275914669,
989
+ "objective/scores": 0.0,
990
+ "policy/approxkl_avg": 0.0005432004109025002,
991
+ "policy/clipfrac_avg": 0.0011792451841756701,
992
+ "policy/entropy_avg": 0.2116156369447708,
993
+ "step": 47,
994
+ "val/clipfrac_avg": 0.0,
995
+ "val/num_eos_tokens": 0,
996
+ "val/ratio": 0.9999104738235474,
997
+ "val/ratio_var": 2.7350765776645858e-06
998
+ },
999
+ {
1000
+ "episode": 1152,
1001
+ "epoch": 0.16457142857142856,
1002
+ "eps": 3,
1003
+ "loss/policy_avg": -0.00019468856044113636,
1004
+ "loss/value_avg": 0.0011783144436776638,
1005
+ "lr": 9.906830626562331e-07,
1006
+ "objective/entropy": 11.494680404663086,
1007
+ "objective/kl": 0.04910100996494293,
1008
+ "objective/non_score_reward": -0.0024550508242100477,
1009
+ "objective/rlhf_reward": -0.0024550508242100477,
1010
+ "objective/scores": 0.0,
1011
+ "policy/approxkl_avg": 0.00043969100806862116,
1012
+ "policy/clipfrac_avg": 0.001179245300590992,
1013
+ "policy/entropy_avg": 0.23010241985321045,
1014
+ "step": 48,
1015
+ "val/clipfrac_avg": 0.0,
1016
+ "val/num_eos_tokens": 0,
1017
+ "val/ratio": 1.0009230375289917,
1018
+ "val/ratio_var": 1.998189873120282e-06
1019
+ },
1020
+ {
1021
+ "episode": 1176,
1022
+ "epoch": 0.168,
1023
+ "eps": 3,
1024
+ "loss/policy_avg": -0.0006489114603027701,
1025
+ "loss/value_avg": 0.001087805489078164,
1026
+ "lr": 9.89559096448414e-07,
1027
+ "objective/entropy": 11.260568618774414,
1028
+ "objective/kl": 0.0814318135380745,
1029
+ "objective/non_score_reward": -0.004071590956300497,
1030
+ "objective/rlhf_reward": -0.004071590956300497,
1031
+ "objective/scores": 0.0,
1032
+ "policy/approxkl_avg": 0.00044973386684432626,
1033
+ "policy/clipfrac_avg": 0.0007861634949222207,
1034
+ "policy/entropy_avg": 0.1947825849056244,
1035
+ "step": 49,
1036
+ "val/clipfrac_avg": 0.0,
1037
+ "val/num_eos_tokens": 0,
1038
+ "val/ratio": 1.0001856088638306,
1039
+ "val/ratio_var": 8.380131362173415e-07
1040
+ },
1041
+ {
1042
+ "episode": 1200,
1043
+ "epoch": 0.17142857142857143,
1044
+ "eps": 3,
1045
+ "loss/policy_avg": -0.0005568858468905091,
1046
+ "loss/value_avg": 0.0010799263836815953,
1047
+ "lr": 9.883719313825227e-07,
1048
+ "objective/entropy": 12.4381742477417,
1049
+ "objective/kl": 0.00899545382708311,
1050
+ "objective/non_score_reward": -0.00044977269135415554,
1051
+ "objective/rlhf_reward": -0.00044977269135415554,
1052
+ "objective/scores": 0.0,
1053
+ "policy/approxkl_avg": 0.0008998862467706203,
1054
+ "policy/clipfrac_avg": 0.00039308174746111035,
1055
+ "policy/entropy_avg": 0.2059018909931183,
1056
+ "step": 50,
1057
+ "val/clipfrac_avg": 0.0,
1058
+ "val/num_eos_tokens": 0,
1059
+ "val/ratio": 0.9994922280311584,
1060
+ "val/ratio_var": 4.479263679968426e-06
1061
+ }
1062
+ ],
1063
+ "logging_steps": 1,
1064
+ "max_steps": 292,
1065
+ "num_input_tokens_seen": 0,
1066
+ "num_train_epochs": 1.0,
1067
+ "save_steps": 50,
1068
+ "stateful_callbacks": {
1069
+ "TrainerControl": {
1070
+ "args": {
1071
+ "should_epoch_stop": false,
1072
+ "should_evaluate": false,
1073
+ "should_log": true,
1074
+ "should_save": true,
1075
+ "should_training_stop": false
1076
+ },
1077
+ "attributes": {}
1078
+ }
1079
+ },
1080
+ "total_flos": 0,
1081
+ "train_batch_size": null,
1082
+ "trial_name": null,
1083
+ "trial_params": null
1084
+ }