aaronkaplan committed
Commit 0982c6d · 1 Parent(s): d787fb4

initial release v3
adapter_config.json ADDED
@@ -0,0 +1,27 @@
+ {
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "models/meta-llama_Meta-Llama-3-8B",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 256,
+ "lora_dropout": 0.05,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 128,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "v_proj",
+ "q_proj"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_rslora": false
+ }
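
The adapter above trains only the q_proj and v_proj attention projections, with rank r = 128 and lora_alpha = 256 (an effective scaling of alpha/r = 2). A minimal loading sketch using the PEFT library; the base-model id and adapter path are placeholders, not taken from this repository:

# Sketch: apply this LoRA adapter to the Llama-3-8B base model.
# The model id and adapter directory below are assumptions; adjust to local paths.
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
# Directory containing adapter_config.json and adapter_model.bin:
model = PeftModel.from_pretrained(base, "./")

inputs = tokenizer("The following report describes", return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output[0], skip_special_tokens=True))
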
adapter_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5055b89b8fe192656f23926b31901bbb81d0bb979f6566f1d692fffc5998b01f
+ size 218150282
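
adapter_model.bin is stored as a Git LFS pointer; the oid above is the SHA-256 of the actual ~218 MB weight file. A standard-library sketch for verifying a downloaded copy against that digest:

import hashlib

# Compare the local file's SHA-256 to the oid from the LFS pointer above.
EXPECTED = "5055b89b8fe192656f23926b31901bbb81d0bb979f6566f1d692fffc5998b01f"

h = hashlib.sha256()
with open("adapter_model.bin", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # read in 1 MiB chunks
        h.update(chunk)
assert h.hexdigest() == EXPECTED, "checksum mismatch: pointer file or corrupt download"
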
training_graph.json ADDED
@@ -0,0 +1,956 @@
+ [
+ {
+ "current_steps": 63,
+ "loss": 2.8305,
+ "learning_rate": 0.0005,
+ "epoch": 0.02549800796812749
+ },
+ {
+ "current_steps": 127,
+ "loss": 2.7784,
+ "learning_rate": 0.001,
+ "epoch": 0.05099601593625498
+ },
+ {
+ "current_steps": 191,
+ "loss": 2.6623,
+ "learning_rate": 0.0009974226804123712,
+ "epoch": 0.07649402390438247
+ },
+ {
+ "current_steps": 255,
+ "loss": 2.6442,
+ "learning_rate": 0.0009948453608247423,
+ "epoch": 0.10199203187250996
+ },
+ {
+ "current_steps": 319,
+ "loss": 2.822,
+ "learning_rate": 0.0009922680412371135,
+ "epoch": 0.12749003984063745
+ },
+ {
+ "current_steps": 383,
+ "loss": 2.6869,
+ "learning_rate": 0.0009896907216494846,
+ "epoch": 0.15298804780876493
+ },
+ {
+ "current_steps": 447,
+ "loss": 2.5865,
+ "learning_rate": 0.0009871134020618558,
+ "epoch": 0.17848605577689244
+ },
+ {
+ "current_steps": 511,
+ "loss": 2.6222,
+ "learning_rate": 0.000984536082474227,
+ "epoch": 0.20398406374501993
+ },
+ {
+ "current_steps": 575,
+ "loss": 2.5421,
+ "learning_rate": 0.0009819587628865979,
+ "epoch": 0.2294820717131474
+ },
+ {
+ "current_steps": 639,
+ "loss": 2.5766,
+ "learning_rate": 0.000979381443298969,
+ "epoch": 0.2549800796812749
+ },
+ {
+ "current_steps": 703,
+ "loss": 2.5266,
+ "learning_rate": 0.0009768041237113402,
+ "epoch": 0.2804780876494024
+ },
+ {
+ "current_steps": 767,
+ "loss": 2.5239,
+ "learning_rate": 0.0009742268041237113,
+ "epoch": 0.30597609561752986
+ },
+ {
+ "current_steps": 831,
+ "loss": 2.4799,
+ "learning_rate": 0.0009716494845360825,
+ "epoch": 0.3314741035856574
+ },
+ {
+ "current_steps": 895,
+ "loss": 2.5332,
+ "learning_rate": 0.0009690721649484536,
+ "epoch": 0.3569721115537849
+ },
+ {
+ "current_steps": 959,
+ "loss": 2.4283,
+ "learning_rate": 0.0009664948453608248,
+ "epoch": 0.38247011952191234
+ },
+ {
+ "current_steps": 1023,
+ "loss": 2.4595,
+ "learning_rate": 0.0009639175257731959,
+ "epoch": 0.40796812749003986
+ },
+ {
+ "current_steps": 1087,
+ "loss": 2.4821,
+ "learning_rate": 0.0009613402061855671,
+ "epoch": 0.4334661354581673
+ },
+ {
+ "current_steps": 1151,
+ "loss": 2.4473,
+ "learning_rate": 0.0009587628865979382,
+ "epoch": 0.4589641434262948
+ },
+ {
+ "current_steps": 1215,
+ "loss": 2.4614,
+ "learning_rate": 0.0009561855670103094,
+ "epoch": 0.48446215139442234
+ },
+ {
+ "current_steps": 1279,
+ "loss": 2.4037,
+ "learning_rate": 0.0009536082474226805,
+ "epoch": 0.5099601593625498
+ },
+ {
+ "current_steps": 1343,
+ "loss": 2.4243,
+ "learning_rate": 0.0009510309278350515,
+ "epoch": 0.5354581673306773
+ },
+ {
+ "current_steps": 1407,
+ "loss": 2.3534,
+ "learning_rate": 0.0009484536082474226,
+ "epoch": 0.5609561752988048
+ },
+ {
+ "current_steps": 1471,
+ "loss": 2.406,
+ "learning_rate": 0.0009458762886597938,
+ "epoch": 0.5864541832669322
+ },
+ {
+ "current_steps": 1535,
+ "loss": 2.483,
+ "learning_rate": 0.0009432989690721649,
+ "epoch": 0.6119521912350597
+ },
+ {
+ "current_steps": 1599,
+ "loss": 2.4313,
+ "learning_rate": 0.0009407216494845361,
+ "epoch": 0.6374501992031872
+ },
+ {
+ "current_steps": 1663,
+ "loss": 2.4101,
+ "learning_rate": 0.0009381443298969072,
+ "epoch": 0.6629482071713148
+ },
+ {
+ "current_steps": 1727,
+ "loss": 2.3847,
+ "learning_rate": 0.0009355670103092784,
+ "epoch": 0.6884462151394423
+ },
+ {
+ "current_steps": 1791,
+ "loss": 2.3194,
+ "learning_rate": 0.0009329896907216495,
+ "epoch": 0.7139442231075698
+ },
+ {
+ "current_steps": 1855,
+ "loss": 2.3921,
+ "learning_rate": 0.0009304123711340207,
+ "epoch": 0.7394422310756972
+ },
+ {
+ "current_steps": 1919,
+ "loss": 2.4158,
+ "learning_rate": 0.0009278350515463918,
+ "epoch": 0.7649402390438247
+ },
+ {
+ "current_steps": 1983,
+ "loss": 2.3134,
+ "learning_rate": 0.000925257731958763,
+ "epoch": 0.7904382470119522
+ },
+ {
+ "current_steps": 2047,
+ "loss": 2.3514,
+ "learning_rate": 0.0009226804123711341,
+ "epoch": 0.8159362549800797
+ },
+ {
+ "current_steps": 2111,
+ "loss": 2.3011,
+ "learning_rate": 0.0009201030927835051,
+ "epoch": 0.8414342629482072
+ },
+ {
+ "current_steps": 2175,
+ "loss": 2.4016,
+ "learning_rate": 0.0009175257731958762,
+ "epoch": 0.8669322709163346
+ },
+ {
+ "current_steps": 2239,
+ "loss": 2.3209,
+ "learning_rate": 0.0009149484536082474,
+ "epoch": 0.8924302788844621
+ },
+ {
+ "current_steps": 2303,
+ "loss": 2.3672,
+ "learning_rate": 0.0009123711340206185,
+ "epoch": 0.9179282868525896
+ },
+ {
+ "current_steps": 2367,
+ "loss": 2.3597,
+ "learning_rate": 0.0009097938144329897,
+ "epoch": 0.9434262948207172
+ },
+ {
+ "current_steps": 2431,
+ "loss": 2.3619,
+ "learning_rate": 0.0009072164948453608,
+ "epoch": 0.9689243027888447
+ },
+ {
+ "current_steps": 2495,
+ "loss": 2.3521,
+ "learning_rate": 0.000904639175257732,
+ "epoch": 0.9944223107569721
+ },
+ {
+ "current_steps": 2545,
+ "loss": 2.2467,
+ "learning_rate": 0.0009020618556701031,
+ "epoch": 1.0199203187250996
+ },
+ {
+ "current_steps": 2609,
+ "loss": 2.2082,
+ "learning_rate": 0.0008994845360824743,
+ "epoch": 1.045418326693227
+ },
+ {
+ "current_steps": 2673,
+ "loss": 2.2161,
+ "learning_rate": 0.0008969072164948454,
+ "epoch": 1.0709163346613546
+ },
+ {
+ "current_steps": 2737,
+ "loss": 2.1961,
+ "learning_rate": 0.0008943298969072166,
+ "epoch": 1.0964143426294821
+ },
+ {
+ "current_steps": 2801,
+ "loss": 2.215,
+ "learning_rate": 0.0008917525773195877,
+ "epoch": 1.1219123505976096
+ },
+ {
+ "current_steps": 2865,
+ "loss": 2.1951,
+ "learning_rate": 0.0008891752577319587,
+ "epoch": 1.1474103585657371
+ },
+ {
+ "current_steps": 2929,
+ "loss": 2.1599,
+ "learning_rate": 0.0008865979381443298,
+ "epoch": 1.1729083665338647
+ },
+ {
+ "current_steps": 2993,
+ "loss": 2.1511,
+ "learning_rate": 0.000884020618556701,
+ "epoch": 1.198406374501992
+ },
+ {
+ "current_steps": 3057,
+ "loss": 2.0713,
+ "learning_rate": 0.0008814432989690721,
+ "epoch": 1.2239043824701195
+ },
+ {
+ "current_steps": 3121,
+ "loss": 2.1312,
+ "learning_rate": 0.0008788659793814433,
+ "epoch": 1.249402390438247
+ },
+ {
+ "current_steps": 3185,
+ "loss": 2.1442,
+ "learning_rate": 0.0008762886597938144,
+ "epoch": 1.2749003984063745
+ },
+ {
+ "current_steps": 3249,
+ "loss": 2.141,
+ "learning_rate": 0.0008737113402061856,
+ "epoch": 1.300398406374502
+ },
+ {
+ "current_steps": 3313,
+ "loss": 2.1281,
+ "learning_rate": 0.0008711340206185567,
+ "epoch": 1.3258964143426295
+ },
+ {
+ "current_steps": 3377,
+ "loss": 2.1373,
+ "learning_rate": 0.0008685567010309279,
+ "epoch": 1.351394422310757
+ },
+ {
+ "current_steps": 3441,
+ "loss": 2.1111,
+ "learning_rate": 0.000865979381443299,
+ "epoch": 1.3768924302788845
+ },
+ {
+ "current_steps": 3505,
+ "loss": 2.0973,
+ "learning_rate": 0.0008634020618556702,
+ "epoch": 1.402390438247012
+ },
+ {
+ "current_steps": 3569,
+ "loss": 2.1299,
+ "learning_rate": 0.0008608247422680414,
+ "epoch": 1.4278884462151393
+ },
+ {
+ "current_steps": 3633,
+ "loss": 2.0971,
+ "learning_rate": 0.0008582474226804123,
+ "epoch": 1.453386454183267
+ },
+ {
+ "current_steps": 3697,
+ "loss": 2.0334,
+ "learning_rate": 0.0008556701030927834,
+ "epoch": 1.4788844621513944
+ },
+ {
+ "current_steps": 3761,
+ "loss": 2.0659,
+ "learning_rate": 0.0008530927835051546,
+ "epoch": 1.5043824701195219
+ },
+ {
+ "current_steps": 3825,
+ "loss": 2.0648,
+ "learning_rate": 0.0008505154639175257,
+ "epoch": 1.5298804780876494
+ },
+ {
+ "current_steps": 3889,
+ "loss": 2.0629,
+ "learning_rate": 0.0008479381443298969,
+ "epoch": 1.5553784860557769
+ },
+ {
+ "current_steps": 3953,
+ "loss": 2.0136,
+ "learning_rate": 0.000845360824742268,
+ "epoch": 1.5808764940239044
+ },
+ {
+ "current_steps": 4017,
+ "loss": 2.0692,
+ "learning_rate": 0.0008427835051546392,
+ "epoch": 1.606374501992032
+ },
+ {
+ "current_steps": 4081,
+ "loss": 2.1114,
+ "learning_rate": 0.0008402061855670104,
+ "epoch": 1.6318725099601594
+ },
+ {
+ "current_steps": 4145,
+ "loss": 2.048,
+ "learning_rate": 0.0008376288659793815,
+ "epoch": 1.6573705179282867
+ },
+ {
+ "current_steps": 4209,
+ "loss": 2.1087,
+ "learning_rate": 0.0008350515463917527,
+ "epoch": 1.6828685258964144
+ },
+ {
+ "current_steps": 4273,
+ "loss": 2.0474,
+ "learning_rate": 0.0008324742268041238,
+ "epoch": 1.7083665338645417
+ },
+ {
+ "current_steps": 4337,
+ "loss": 2.0699,
+ "learning_rate": 0.000829896907216495,
+ "epoch": 1.7338645418326695
+ },
+ {
+ "current_steps": 4401,
+ "loss": 2.0901,
+ "learning_rate": 0.0008273195876288659,
+ "epoch": 1.7593625498007968
+ },
+ {
+ "current_steps": 4465,
+ "loss": 2.018,
+ "learning_rate": 0.000824742268041237,
+ "epoch": 1.7848605577689243
+ },
+ {
+ "current_steps": 4529,
+ "loss": 2.039,
+ "learning_rate": 0.0008221649484536082,
+ "epoch": 1.8103585657370518
+ },
+ {
+ "current_steps": 4593,
+ "loss": 2.0168,
+ "learning_rate": 0.0008195876288659793,
+ "epoch": 1.8358565737051793
+ },
+ {
+ "current_steps": 4657,
+ "loss": 1.9646,
+ "learning_rate": 0.0008170103092783505,
+ "epoch": 1.8613545816733068
+ },
+ {
+ "current_steps": 4721,
+ "loss": 1.9875,
+ "learning_rate": 0.0008144329896907217,
+ "epoch": 1.886852589641434
+ },
+ {
+ "current_steps": 4785,
+ "loss": 2.0373,
+ "learning_rate": 0.0008118556701030928,
+ "epoch": 1.9123505976095618
+ },
+ {
+ "current_steps": 4849,
+ "loss": 1.9158,
+ "learning_rate": 0.000809278350515464,
+ "epoch": 1.9378486055776891
+ },
+ {
+ "current_steps": 4913,
+ "loss": 1.9173,
+ "learning_rate": 0.0008067010309278351,
+ "epoch": 1.9633466135458169
+ },
+ {
+ "current_steps": 4977,
+ "loss": 1.9941,
+ "learning_rate": 0.0008041237113402063,
+ "epoch": 1.9888446215139441
+ },
+ {
+ "current_steps": 5027,
+ "loss": 1.8231,
+ "learning_rate": 0.0008015463917525774,
+ "epoch": 2.014342629482072
+ },
+ {
+ "current_steps": 5091,
+ "loss": 1.8018,
+ "learning_rate": 0.0007989690721649486,
+ "epoch": 2.039840637450199
+ },
+ {
+ "current_steps": 5155,
+ "loss": 1.7098,
+ "learning_rate": 0.0007963917525773195,
+ "epoch": 2.065338645418327
+ },
+ {
+ "current_steps": 5219,
+ "loss": 1.7214,
+ "learning_rate": 0.0007938144329896907,
+ "epoch": 2.090836653386454
+ },
+ {
+ "current_steps": 5283,
+ "loss": 1.8029,
+ "learning_rate": 0.0007912371134020618,
+ "epoch": 2.1163346613545815
+ },
+ {
+ "current_steps": 5347,
+ "loss": 1.71,
+ "learning_rate": 0.000788659793814433,
+ "epoch": 2.141832669322709
+ },
+ {
+ "current_steps": 5411,
+ "loss": 1.725,
+ "learning_rate": 0.0007860824742268041,
+ "epoch": 2.1673306772908365
+ },
+ {
+ "current_steps": 5475,
+ "loss": 1.729,
+ "learning_rate": 0.0007835051546391753,
+ "epoch": 2.1928286852589642
+ },
+ {
+ "current_steps": 5539,
+ "loss": 1.7139,
+ "learning_rate": 0.0007809278350515464,
+ "epoch": 2.2183266932270915
+ },
+ {
+ "current_steps": 5603,
+ "loss": 1.6588,
+ "learning_rate": 0.0007783505154639176,
+ "epoch": 2.2438247011952193
+ },
+ {
+ "current_steps": 5667,
+ "loss": 1.7179,
+ "learning_rate": 0.0007757731958762887,
+ "epoch": 2.2693227091633466
+ },
+ {
+ "current_steps": 5731,
+ "loss": 1.7024,
+ "learning_rate": 0.0007731958762886599,
+ "epoch": 2.2948207171314743
+ },
+ {
+ "current_steps": 5795,
+ "loss": 1.7205,
+ "learning_rate": 0.000770618556701031,
+ "epoch": 2.3203187250996016
+ },
+ {
+ "current_steps": 5859,
+ "loss": 1.7094,
+ "learning_rate": 0.0007680412371134022,
+ "epoch": 2.3458167330677293
+ },
+ {
+ "current_steps": 5923,
+ "loss": 1.7152,
+ "learning_rate": 0.0007654639175257731,
+ "epoch": 2.3713147410358566
+ },
+ {
+ "current_steps": 5987,
+ "loss": 1.7285,
+ "learning_rate": 0.0007628865979381443,
+ "epoch": 2.396812749003984
+ },
+ {
+ "current_steps": 6051,
+ "loss": 1.6915,
+ "learning_rate": 0.0007603092783505154,
+ "epoch": 2.4223107569721116
+ },
+ {
+ "current_steps": 6115,
+ "loss": 1.6536,
+ "learning_rate": 0.0007577319587628866,
+ "epoch": 2.447808764940239
+ },
+ {
+ "current_steps": 6179,
+ "loss": 1.6924,
+ "learning_rate": 0.0007551546391752577,
+ "epoch": 2.4733067729083666
+ },
+ {
+ "current_steps": 6243,
+ "loss": 1.6518,
+ "learning_rate": 0.0007525773195876289,
+ "epoch": 2.498804780876494
+ },
+ {
+ "current_steps": 6307,
+ "loss": 1.6765,
+ "learning_rate": 0.00075,
+ "epoch": 2.5243027888446217
+ },
+ {
+ "current_steps": 6371,
+ "loss": 1.6662,
+ "learning_rate": 0.0007474226804123712,
+ "epoch": 2.549800796812749
+ },
+ {
+ "current_steps": 6435,
+ "loss": 1.6179,
+ "learning_rate": 0.0007448453608247423,
+ "epoch": 2.5752988047808767
+ },
+ {
+ "current_steps": 6499,
+ "loss": 1.6485,
+ "learning_rate": 0.0007422680412371135,
+ "epoch": 2.600796812749004
+ },
+ {
+ "current_steps": 6563,
+ "loss": 1.7025,
+ "learning_rate": 0.0007396907216494846,
+ "epoch": 2.6262948207171313
+ },
+ {
+ "current_steps": 6627,
+ "loss": 1.6723,
+ "learning_rate": 0.0007371134020618558,
+ "epoch": 2.651792828685259
+ },
+ {
+ "current_steps": 6691,
+ "loss": 1.7608,
+ "learning_rate": 0.0007345360824742269,
+ "epoch": 2.6772908366533863
+ },
+ {
+ "current_steps": 6755,
+ "loss": 1.6388,
+ "learning_rate": 0.0007319587628865979,
+ "epoch": 2.702788844621514
+ },
+ {
+ "current_steps": 6819,
+ "loss": 1.6381,
+ "learning_rate": 0.000729381443298969,
+ "epoch": 2.7282868525896413
+ },
+ {
+ "current_steps": 6883,
+ "loss": 1.6857,
+ "learning_rate": 0.0007268041237113402,
+ "epoch": 2.753784860557769
+ },
+ {
+ "current_steps": 6947,
+ "loss": 1.6556,
+ "learning_rate": 0.0007242268041237113,
+ "epoch": 2.7792828685258963
+ },
+ {
+ "current_steps": 7011,
+ "loss": 1.6446,
+ "learning_rate": 0.0007216494845360825,
+ "epoch": 2.804780876494024
+ },
+ {
+ "current_steps": 7075,
+ "loss": 1.593,
+ "learning_rate": 0.0007190721649484536,
+ "epoch": 2.8302788844621514
+ },
+ {
+ "current_steps": 7139,
+ "loss": 1.6836,
+ "learning_rate": 0.0007164948453608248,
+ "epoch": 2.8557768924302787
+ },
+ {
+ "current_steps": 7203,
+ "loss": 1.5774,
+ "learning_rate": 0.0007139175257731959,
+ "epoch": 2.8812749003984064
+ },
+ {
+ "current_steps": 7267,
+ "loss": 1.6251,
+ "learning_rate": 0.0007113402061855671,
+ "epoch": 2.906772908366534
+ },
+ {
+ "current_steps": 7331,
+ "loss": 1.6695,
+ "learning_rate": 0.0007087628865979382,
+ "epoch": 2.9322709163346614
+ },
+ {
+ "current_steps": 7395,
+ "loss": 1.6645,
+ "learning_rate": 0.0007061855670103094,
+ "epoch": 2.9577689243027887
+ },
+ {
+ "current_steps": 7459,
+ "loss": 1.6589,
+ "learning_rate": 0.0007036082474226805,
+ "epoch": 2.9832669322709164
+ },
+ {
+ "current_steps": 7509,
+ "loss": 1.4876,
+ "learning_rate": 0.0007010309278350515,
+ "epoch": 3.0087649402390437
+ },
+ {
+ "current_steps": 7573,
+ "loss": 1.3509,
+ "learning_rate": 0.0006984536082474226,
+ "epoch": 3.0342629482071715
+ },
+ {
+ "current_steps": 7637,
+ "loss": 1.3058,
+ "learning_rate": 0.0006958762886597938,
+ "epoch": 3.0597609561752988
+ },
+ {
+ "current_steps": 7701,
+ "loss": 1.4409,
+ "learning_rate": 0.0006932989690721649,
+ "epoch": 3.0852589641434265
+ },
+ {
+ "current_steps": 7765,
+ "loss": 1.3829,
+ "learning_rate": 0.0006907216494845361,
+ "epoch": 3.1107569721115538
+ },
+ {
+ "current_steps": 7829,
+ "loss": 1.3513,
+ "learning_rate": 0.0006881443298969072,
+ "epoch": 3.1362549800796815
+ },
+ {
+ "current_steps": 7893,
+ "loss": 1.3223,
+ "learning_rate": 0.0006855670103092784,
+ "epoch": 3.161752988047809
+ },
+ {
+ "current_steps": 7957,
+ "loss": 1.2705,
+ "learning_rate": 0.0006829896907216495,
+ "epoch": 3.187250996015936
+ },
+ {
+ "current_steps": 8021,
+ "loss": 1.3133,
+ "learning_rate": 0.0006804123711340207,
+ "epoch": 3.212749003984064
+ },
+ {
+ "current_steps": 8085,
+ "loss": 1.3229,
+ "learning_rate": 0.0006778350515463918,
+ "epoch": 3.238247011952191
+ },
+ {
+ "current_steps": 8149,
+ "loss": 1.3097,
+ "learning_rate": 0.000675257731958763,
+ "epoch": 3.263745019920319
+ },
+ {
+ "current_steps": 8213,
+ "loss": 1.2961,
+ "learning_rate": 0.0006726804123711341,
+ "epoch": 3.289243027888446
+ },
+ {
+ "current_steps": 8277,
+ "loss": 1.3048,
+ "learning_rate": 0.0006701030927835051,
+ "epoch": 3.314741035856574
+ },
+ {
+ "current_steps": 8341,
+ "loss": 1.2909,
+ "learning_rate": 0.0006675257731958762,
+ "epoch": 3.340239043824701
+ },
+ {
+ "current_steps": 8405,
+ "loss": 1.3333,
+ "learning_rate": 0.0006649484536082474,
+ "epoch": 3.365737051792829
+ },
+ {
+ "current_steps": 8469,
+ "loss": 1.2552,
+ "learning_rate": 0.0006623711340206185,
+ "epoch": 3.391235059760956
+ },
+ {
+ "current_steps": 8533,
+ "loss": 1.306,
+ "learning_rate": 0.0006597938144329897,
+ "epoch": 3.4167330677290835
+ },
+ {
+ "current_steps": 8597,
+ "loss": 1.2382,
+ "learning_rate": 0.0006572164948453608,
+ "epoch": 3.442231075697211
+ },
+ {
+ "current_steps": 8661,
+ "loss": 1.2718,
+ "learning_rate": 0.000654639175257732,
+ "epoch": 3.4677290836653385
+ },
+ {
+ "current_steps": 8725,
+ "loss": 1.2348,
+ "learning_rate": 0.0006520618556701031,
+ "epoch": 3.4932270916334662
+ },
+ {
+ "current_steps": 8789,
+ "loss": 1.2724,
+ "learning_rate": 0.0006494845360824743,
+ "epoch": 3.5187250996015935
+ },
+ {
+ "current_steps": 8853,
+ "loss": 1.244,
+ "learning_rate": 0.0006469072164948454,
+ "epoch": 3.5442231075697213
+ },
+ {
+ "current_steps": 8917,
+ "loss": 1.2948,
+ "learning_rate": 0.0006443298969072166,
+ "epoch": 3.5697211155378485
+ },
+ {
+ "current_steps": 8981,
+ "loss": 1.2063,
+ "learning_rate": 0.0006417525773195877,
+ "epoch": 3.5952191235059763
+ },
+ {
+ "current_steps": 9045,
+ "loss": 1.2656,
+ "learning_rate": 0.0006391752577319587,
+ "epoch": 3.6207171314741036
+ },
+ {
+ "current_steps": 9109,
+ "loss": 1.25,
+ "learning_rate": 0.0006365979381443298,
+ "epoch": 3.646215139442231
+ },
+ {
+ "current_steps": 9173,
+ "loss": 1.3042,
+ "learning_rate": 0.000634020618556701,
+ "epoch": 3.6717131474103586
+ },
+ {
+ "current_steps": 9237,
+ "loss": 1.2612,
+ "learning_rate": 0.0006314432989690721,
+ "epoch": 3.6972111553784863
+ },
+ {
+ "current_steps": 9301,
+ "loss": 1.2516,
+ "learning_rate": 0.0006288659793814433,
+ "epoch": 3.7227091633466136
+ },
+ {
+ "current_steps": 9365,
+ "loss": 1.2572,
+ "learning_rate": 0.0006262886597938144,
+ "epoch": 3.748207171314741
+ },
+ {
+ "current_steps": 9429,
+ "loss": 1.2525,
+ "learning_rate": 0.0006237113402061856,
+ "epoch": 3.7737051792828686
+ },
+ {
+ "current_steps": 9493,
+ "loss": 1.2509,
+ "learning_rate": 0.0006211340206185567,
+ "epoch": 3.799203187250996
+ },
+ {
+ "current_steps": 9557,
+ "loss": 1.2467,
+ "learning_rate": 0.0006185567010309279,
+ "epoch": 3.8247011952191237
+ },
+ {
+ "current_steps": 9621,
+ "loss": 1.2375,
+ "learning_rate": 0.000615979381443299,
+ "epoch": 3.850199203187251
+ },
+ {
+ "current_steps": 9685,
+ "loss": 1.2406,
+ "learning_rate": 0.0006134020618556702,
+ "epoch": 3.8756972111553782
+ },
+ {
+ "current_steps": 9749,
+ "loss": 1.2517,
+ "learning_rate": 0.0006108247422680413,
+ "epoch": 3.901195219123506
+ },
+ {
+ "current_steps": 9813,
+ "loss": 1.2533,
+ "learning_rate": 0.0006082474226804123,
+ "epoch": 3.9266932270916337
+ },
+ {
+ "current_steps": 9877,
+ "loss": 1.256,
+ "learning_rate": 0.0006056701030927834,
+ "epoch": 3.952191235059761
+ },
+ {
+ "current_steps": 9941,
+ "loss": 1.2826,
+ "learning_rate": 0.0006030927835051546,
+ "epoch": 3.9776892430278883
+ },
+ {
+ "current_steps": 9991,
+ "loss": 1.226,
+ "learning_rate": 0.0006005154639175257,
+ "epoch": 4.003187250996016
+ },
+ {
+ "current_steps": 10055,
+ "loss": 0.9512,
+ "learning_rate": 0.0005979381443298969,
+ "epoch": 4.028685258964144
+ },
+ {
+ "current_steps": 10055,
+ "loss": 0.9512,
+ "learning_rate": 0.0005979381443298969,
+ "epoch": 4.028685258964144
+ }
+ ]
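
training_graph.json is a flat list of logging records (step, loss, learning rate, epoch): loss falls from about 2.83 to 0.95 over roughly four epochs while the linear schedule decays the learning rate from its 1e-3 peak (the first record, at 0.0005, is still inside the 128-step warmup). A small sketch for re-plotting it, assuming matplotlib is available:

import json
import matplotlib.pyplot as plt

# Re-plot the loss curve from the records above
# (training_graph.png presumably shows the same data).
with open("training_graph.json") as f:
    records = json.load(f)

steps = [r["current_steps"] for r in records]
plt.plot(steps, [r["loss"] for r in records])
plt.xlabel("step")
plt.ylabel("training loss")
plt.savefig("loss_curve.png")  # hypothetical output name
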
training_graph.png ADDED
training_log.json ADDED
@@ -0,0 +1,19 @@
+ {
+ "base_model_name": "meta-llama_Meta-Llama-3-8B",
+ "base_model_class": "LlamaForCausalLM",
+ "base_loaded_in_4bit": false,
+ "base_loaded_in_8bit": false,
+ "projections": "q, v",
+ "loss": 0.9512,
+ "grad_norm": 0.6213752031326294,
+ "learning_rate": 0.0005979381443298969,
+ "epoch": 4.028685258964144,
+ "current_steps": 10055,
+ "current_steps_adjusted": 10055,
+ "epoch_adjusted": 4.028685258964144,
+ "train_runtime": 3864.6216,
+ "train_samples_per_second": 25.977,
+ "train_steps_per_second": 0.101,
+ "total_flos": 4.69608070712918e+17,
+ "train_loss": 1.8759732170950008
+ }
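
The throughput figures are internally consistent: samples seen can be computed either from runtime × samples/s or from optimizer steps × the effective batch size (micro_batch_size 4 × grad_accumulation 64 = 256, per training_parameters.json below). A quick check with the values copied from the log above:

runtime_s = 3864.6216
samples_per_s = 25.977
steps_per_s = 0.101
effective_batch = 4 * 64  # micro_batch_size * grad_accumulation

print(runtime_s * samples_per_s)                  # ~100,391 samples
print(runtime_s * steps_per_s * effective_batch)  # ~99,924 samples; agreement within ~0.5%
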
training_parameters.json ADDED
@@ -0,0 +1,37 @@
+ {
+ "lora_name": "lora-meta-llama_Meta-Llama-3-8B-16bit-ORKL-cleaned_juergen-v3_LR_1e-3",
+ "always_override": true,
+ "save_steps": 722,
+ "micro_batch_size": 4,
+ "batch_size": 0,
+ "epochs": 10,
+ "learning_rate": "1e-3",
+ "lr_scheduler_type": "linear",
+ "lora_rank": 128,
+ "lora_alpha": 256,
+ "lora_dropout": 0.05,
+ "cutoff_len": 256,
+ "dataset": "None",
+ "eval_dataset": "None",
+ "format": "None",
+ "eval_steps": 100,
+ "raw_text_file": "orkl-cleaned-juergen",
+ "higher_rank_limit": false,
+ "warmup_steps": 128,
+ "optimizer": "adamw_torch",
+ "hard_cut_string": "\\n\\n\\n",
+ "train_only_after": "",
+ "stop_at_loss": 1,
+ "add_eos_token": false,
+ "min_chars": 3,
+ "report_to": "wandb",
+ "precize_slicing_overlap": true,
+ "add_eos_token_type": "Every Block",
+ "save_steps_under_loss": 1.8,
+ "add_bos_token": true,
+ "training_projection": "q-v",
+ "sliding_window": false,
+ "warmup_ratio": 0,
+ "grad_accumulation": 64,
+ "neft_noise_alpha": 0
+ }
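
With batch_size set to 0, the effective batch is micro_batch_size × grad_accumulation = 4 × 64 = 256 sequences of up to cutoff_len 256 tokens. Although epochs is set to 10, the log above ends at epoch ≈ 4.03 with loss 0.9512, consistent with the stop_at_loss threshold of 1 being crossed. For reference, a sketch of the same LoRA hyperparameters expressed as a peft LoraConfig, mirroring adapter_config.json above:

from peft import LoraConfig

# Rank, alpha, dropout, and the q-v projection choice match the values above.
config = LoraConfig(
    r=128,
    lora_alpha=256,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
)
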
training_prompt.json ADDED
@@ -0,0 +1,3 @@
+ {
+ "template_type": "raw_text"
+ }