PEFT
Safetensors
parikshitmukh's picture
Upload folder using huggingface_hub
a4f4685 verified
{
"best_metric": 1.3700028657913208,
"best_model_checkpoint": "/app/finetuned_weights/checkpoint-800",
"epoch": 0.4591434105746467,
"eval_steps": 100,
"global_step": 800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005739292632183084,
"grad_norm": 0.43322139978408813,
"learning_rate": 0.0002,
"loss": 2.051,
"mean_token_accuracy": 0.6683308390900493,
"step": 10
},
{
"epoch": 0.011478585264366167,
"grad_norm": 0.40237197279930115,
"learning_rate": 0.0002,
"loss": 1.7549,
"mean_token_accuracy": 0.6822433151304722,
"step": 20
},
{
"epoch": 0.01721787789654925,
"grad_norm": 0.43123266100883484,
"learning_rate": 0.0002,
"loss": 1.5103,
"mean_token_accuracy": 0.7066726513206959,
"step": 30
},
{
"epoch": 0.022957170528732335,
"grad_norm": 0.16466361284255981,
"learning_rate": 0.0002,
"loss": 1.4444,
"mean_token_accuracy": 0.7159745823591948,
"step": 40
},
{
"epoch": 0.02869646316091542,
"grad_norm": 0.3186506927013397,
"learning_rate": 0.0002,
"loss": 1.4808,
"mean_token_accuracy": 0.7143966030329466,
"step": 50
},
{
"epoch": 0.0344357557930985,
"grad_norm": 0.49696969985961914,
"learning_rate": 0.0002,
"loss": 1.5015,
"mean_token_accuracy": 0.7038091894239187,
"step": 60
},
{
"epoch": 0.040175048425281586,
"grad_norm": 0.2998158037662506,
"learning_rate": 0.0002,
"loss": 1.345,
"mean_token_accuracy": 0.7281523209065199,
"step": 70
},
{
"epoch": 0.04591434105746467,
"grad_norm": 0.2879635989665985,
"learning_rate": 0.0002,
"loss": 1.4034,
"mean_token_accuracy": 0.7154877178370953,
"step": 80
},
{
"epoch": 0.05165363368964775,
"grad_norm": 0.16925716400146484,
"learning_rate": 0.0002,
"loss": 1.6293,
"mean_token_accuracy": 0.6787833951413631,
"step": 90
},
{
"epoch": 0.05739292632183084,
"grad_norm": 0.2507581412792206,
"learning_rate": 0.0002,
"loss": 1.3949,
"mean_token_accuracy": 0.7227997560054064,
"step": 100
},
{
"epoch": 0.05739292632183084,
"eval_loss": 1.4279941320419312,
"eval_mean_token_accuracy": 0.7138351793184543,
"eval_runtime": 4534.8411,
"eval_samples_per_second": 0.768,
"eval_steps_per_second": 0.384,
"step": 100
},
{
"epoch": 0.06313221895401391,
"grad_norm": 0.24556811153888702,
"learning_rate": 0.0002,
"loss": 1.3832,
"mean_token_accuracy": 0.7172763034701347,
"step": 110
},
{
"epoch": 0.068871511586197,
"grad_norm": 0.24670149385929108,
"learning_rate": 0.0002,
"loss": 1.4245,
"mean_token_accuracy": 0.7150326510891318,
"step": 120
},
{
"epoch": 0.07461080421838008,
"grad_norm": 0.27052056789398193,
"learning_rate": 0.0002,
"loss": 1.2266,
"mean_token_accuracy": 0.7490846037864685,
"step": 130
},
{
"epoch": 0.08035009685056317,
"grad_norm": 0.2287702113389969,
"learning_rate": 0.0002,
"loss": 1.4237,
"mean_token_accuracy": 0.7114990293979645,
"step": 140
},
{
"epoch": 0.08608938948274625,
"grad_norm": 0.2251950353384018,
"learning_rate": 0.0002,
"loss": 1.4939,
"mean_token_accuracy": 0.7060536827892065,
"step": 150
},
{
"epoch": 0.09182868211492934,
"grad_norm": 0.2458341121673584,
"learning_rate": 0.0002,
"loss": 1.4142,
"mean_token_accuracy": 0.7157001797109842,
"step": 160
},
{
"epoch": 0.09756797474711242,
"grad_norm": 0.1824209988117218,
"learning_rate": 0.0002,
"loss": 1.2918,
"mean_token_accuracy": 0.7333439949899911,
"step": 170
},
{
"epoch": 0.1033072673792955,
"grad_norm": 0.23465971648693085,
"learning_rate": 0.0002,
"loss": 1.3675,
"mean_token_accuracy": 0.7236015398055315,
"step": 180
},
{
"epoch": 0.10904656001147858,
"grad_norm": 0.2167435586452484,
"learning_rate": 0.0002,
"loss": 1.4979,
"mean_token_accuracy": 0.7010378727689386,
"step": 190
},
{
"epoch": 0.11478585264366167,
"grad_norm": 0.24258023500442505,
"learning_rate": 0.0002,
"loss": 1.3437,
"mean_token_accuracy": 0.7294836457818746,
"step": 200
},
{
"epoch": 0.11478585264366167,
"eval_loss": 1.402819037437439,
"eval_mean_token_accuracy": 0.716681150906536,
"eval_runtime": 4414.3312,
"eval_samples_per_second": 0.789,
"eval_steps_per_second": 0.395,
"step": 200
},
{
"epoch": 0.12052514527584475,
"grad_norm": 0.24051423370838165,
"learning_rate": 0.0002,
"loss": 1.3826,
"mean_token_accuracy": 0.7158944692462683,
"step": 210
},
{
"epoch": 0.12626443790802783,
"grad_norm": 0.25226420164108276,
"learning_rate": 0.0002,
"loss": 1.2985,
"mean_token_accuracy": 0.733695725724101,
"step": 220
},
{
"epoch": 0.13200373054021092,
"grad_norm": 0.2106948047876358,
"learning_rate": 0.0002,
"loss": 1.3394,
"mean_token_accuracy": 0.7303649850189686,
"step": 230
},
{
"epoch": 0.137743023172394,
"grad_norm": 0.21339824795722961,
"learning_rate": 0.0002,
"loss": 1.3752,
"mean_token_accuracy": 0.7253331538289786,
"step": 240
},
{
"epoch": 0.1434823158045771,
"grad_norm": 0.2484087496995926,
"learning_rate": 0.0002,
"loss": 1.4659,
"mean_token_accuracy": 0.7037245020270347,
"step": 250
},
{
"epoch": 0.14922160843676016,
"grad_norm": 0.24411025643348694,
"learning_rate": 0.0002,
"loss": 1.3957,
"mean_token_accuracy": 0.7138343520462513,
"step": 260
},
{
"epoch": 0.15496090106894325,
"grad_norm": 0.2551439702510834,
"learning_rate": 0.0002,
"loss": 1.3914,
"mean_token_accuracy": 0.713682159781456,
"step": 270
},
{
"epoch": 0.16070019370112634,
"grad_norm": 0.258771151304245,
"learning_rate": 0.0002,
"loss": 1.4153,
"mean_token_accuracy": 0.7100905137136578,
"step": 280
},
{
"epoch": 0.16643948633330943,
"grad_norm": 0.20730619132518768,
"learning_rate": 0.0002,
"loss": 1.3201,
"mean_token_accuracy": 0.733038941025734,
"step": 290
},
{
"epoch": 0.1721787789654925,
"grad_norm": 0.17945091426372528,
"learning_rate": 0.0002,
"loss": 1.5173,
"mean_token_accuracy": 0.6967695135623216,
"step": 300
},
{
"epoch": 0.1721787789654925,
"eval_loss": 1.3912384510040283,
"eval_mean_token_accuracy": 0.7176746699191754,
"eval_runtime": 4400.1017,
"eval_samples_per_second": 0.792,
"eval_steps_per_second": 0.396,
"step": 300
},
{
"epoch": 0.1779180715976756,
"grad_norm": 0.25128230452537537,
"learning_rate": 0.0002,
"loss": 1.4181,
"mean_token_accuracy": 0.715370923653245,
"step": 310
},
{
"epoch": 0.18365736422985868,
"grad_norm": 0.22883553802967072,
"learning_rate": 0.0002,
"loss": 1.4089,
"mean_token_accuracy": 0.718804694339633,
"step": 320
},
{
"epoch": 0.18939665686204174,
"grad_norm": 0.2196984440088272,
"learning_rate": 0.0002,
"loss": 1.2547,
"mean_token_accuracy": 0.7411983285099268,
"step": 330
},
{
"epoch": 0.19513594949422483,
"grad_norm": 0.374326229095459,
"learning_rate": 0.0002,
"loss": 1.552,
"mean_token_accuracy": 0.6875970430672169,
"step": 340
},
{
"epoch": 0.20087524212640792,
"grad_norm": 0.6579405665397644,
"learning_rate": 0.0002,
"loss": 1.3655,
"mean_token_accuracy": 0.7201284021139145,
"step": 350
},
{
"epoch": 0.206614534758591,
"grad_norm": 0.2102547287940979,
"learning_rate": 0.0002,
"loss": 1.3867,
"mean_token_accuracy": 0.7191131260246039,
"step": 360
},
{
"epoch": 0.21235382739077407,
"grad_norm": 0.26832231879234314,
"learning_rate": 0.0002,
"loss": 1.4089,
"mean_token_accuracy": 0.7181429363787174,
"step": 370
},
{
"epoch": 0.21809312002295717,
"grad_norm": 0.25602883100509644,
"learning_rate": 0.0002,
"loss": 1.3422,
"mean_token_accuracy": 0.7277179971337319,
"step": 380
},
{
"epoch": 0.22383241265514026,
"grad_norm": 0.2577485144138336,
"learning_rate": 0.0002,
"loss": 1.4422,
"mean_token_accuracy": 0.7109258253127336,
"step": 390
},
{
"epoch": 0.22957170528732335,
"grad_norm": 0.2750665247440338,
"learning_rate": 0.0002,
"loss": 1.2633,
"mean_token_accuracy": 0.7383943419903517,
"step": 400
},
{
"epoch": 0.22957170528732335,
"eval_loss": 1.38496732711792,
"eval_mean_token_accuracy": 0.7186302011869997,
"eval_runtime": 4400.5889,
"eval_samples_per_second": 0.792,
"eval_steps_per_second": 0.396,
"step": 400
},
{
"epoch": 0.2353109979195064,
"grad_norm": 0.23557531833648682,
"learning_rate": 0.0002,
"loss": 1.3842,
"mean_token_accuracy": 0.7180797912180423,
"step": 410
},
{
"epoch": 0.2410502905516895,
"grad_norm": 0.2660968601703644,
"learning_rate": 0.0002,
"loss": 1.401,
"mean_token_accuracy": 0.7145325090736151,
"step": 420
},
{
"epoch": 0.2467895831838726,
"grad_norm": 0.2272387444972992,
"learning_rate": 0.0002,
"loss": 1.4956,
"mean_token_accuracy": 0.6978571161627769,
"step": 430
},
{
"epoch": 0.25252887581605565,
"grad_norm": 0.2202438861131668,
"learning_rate": 0.0002,
"loss": 1.3757,
"mean_token_accuracy": 0.7206571504473687,
"step": 440
},
{
"epoch": 0.25826816844823874,
"grad_norm": 0.24659469723701477,
"learning_rate": 0.0002,
"loss": 1.388,
"mean_token_accuracy": 0.7190396279096604,
"step": 450
},
{
"epoch": 0.26400746108042183,
"grad_norm": 0.20384320616722107,
"learning_rate": 0.0002,
"loss": 1.2807,
"mean_token_accuracy": 0.7384566117078066,
"step": 460
},
{
"epoch": 0.2697467537126049,
"grad_norm": 0.2716342806816101,
"learning_rate": 0.0002,
"loss": 1.3157,
"mean_token_accuracy": 0.7307378999888897,
"step": 470
},
{
"epoch": 0.275486046344788,
"grad_norm": 0.2534655034542084,
"learning_rate": 0.0002,
"loss": 1.3959,
"mean_token_accuracy": 0.7178040158003569,
"step": 480
},
{
"epoch": 0.2812253389769711,
"grad_norm": 0.21825498342514038,
"learning_rate": 0.0002,
"loss": 1.387,
"mean_token_accuracy": 0.7204662635922432,
"step": 490
},
{
"epoch": 0.2869646316091542,
"grad_norm": 0.2534162402153015,
"learning_rate": 0.0002,
"loss": 1.2776,
"mean_token_accuracy": 0.7423365503549576,
"step": 500
},
{
"epoch": 0.2869646316091542,
"eval_loss": 1.3787877559661865,
"eval_mean_token_accuracy": 0.7194652643548437,
"eval_runtime": 7761.4408,
"eval_samples_per_second": 0.449,
"eval_steps_per_second": 0.225,
"step": 500
},
{
"epoch": 0.29270392424133723,
"grad_norm": 0.23998941481113434,
"learning_rate": 0.0002,
"loss": 1.2277,
"mean_token_accuracy": 0.7432656295597553,
"step": 510
},
{
"epoch": 0.2984432168735203,
"grad_norm": 0.23271049559116364,
"learning_rate": 0.0002,
"loss": 1.2776,
"mean_token_accuracy": 0.7337239418178797,
"step": 520
},
{
"epoch": 0.3041825095057034,
"grad_norm": 0.2755042016506195,
"learning_rate": 0.0002,
"loss": 1.4288,
"mean_token_accuracy": 0.7133219081908464,
"step": 530
},
{
"epoch": 0.3099218021378865,
"grad_norm": 0.21231453120708466,
"learning_rate": 0.0002,
"loss": 1.4229,
"mean_token_accuracy": 0.7119937628507614,
"step": 540
},
{
"epoch": 0.3156610947700696,
"grad_norm": 0.2159433215856552,
"learning_rate": 0.0002,
"loss": 1.3043,
"mean_token_accuracy": 0.7337923284620047,
"step": 550
},
{
"epoch": 0.3214003874022527,
"grad_norm": 0.238509863615036,
"learning_rate": 0.0002,
"loss": 1.3429,
"mean_token_accuracy": 0.7277234088629484,
"step": 560
},
{
"epoch": 0.3271396800344358,
"grad_norm": 0.27093520760536194,
"learning_rate": 0.0002,
"loss": 1.3089,
"mean_token_accuracy": 0.7319676581770181,
"step": 570
},
{
"epoch": 0.33287897266661887,
"grad_norm": 0.21662364900112152,
"learning_rate": 0.0002,
"loss": 1.3448,
"mean_token_accuracy": 0.72448665574193,
"step": 580
},
{
"epoch": 0.3386182652988019,
"grad_norm": 0.25919026136398315,
"learning_rate": 0.0002,
"loss": 1.3413,
"mean_token_accuracy": 0.7278214626014232,
"step": 590
},
{
"epoch": 0.344357557930985,
"grad_norm": 0.2097223550081253,
"learning_rate": 0.0002,
"loss": 1.3812,
"mean_token_accuracy": 0.7174393549561501,
"step": 600
},
{
"epoch": 0.344357557930985,
"eval_loss": 1.3745734691619873,
"eval_mean_token_accuracy": 0.7201109681998733,
"eval_runtime": 6592.6776,
"eval_samples_per_second": 0.529,
"eval_steps_per_second": 0.264,
"step": 600
},
{
"epoch": 0.3500968505631681,
"grad_norm": 0.27309486269950867,
"learning_rate": 0.0002,
"loss": 1.3764,
"mean_token_accuracy": 0.7236971091479063,
"step": 610
},
{
"epoch": 0.3558361431953512,
"grad_norm": 0.2800423204898834,
"learning_rate": 0.0002,
"loss": 1.4141,
"mean_token_accuracy": 0.7132530447095633,
"step": 620
},
{
"epoch": 0.36157543582753426,
"grad_norm": 0.32200849056243896,
"learning_rate": 0.0002,
"loss": 1.5291,
"mean_token_accuracy": 0.6955816943198443,
"step": 630
},
{
"epoch": 0.36731472845971735,
"grad_norm": 0.1762777417898178,
"learning_rate": 0.0002,
"loss": 1.2729,
"mean_token_accuracy": 0.7379875779151917,
"step": 640
},
{
"epoch": 0.37305402109190045,
"grad_norm": 0.27259498834609985,
"learning_rate": 0.0002,
"loss": 1.3315,
"mean_token_accuracy": 0.7254374325275421,
"step": 650
},
{
"epoch": 0.3787933137240835,
"grad_norm": 0.3148305118083954,
"learning_rate": 0.0002,
"loss": 1.4009,
"mean_token_accuracy": 0.7169190965592861,
"step": 660
},
{
"epoch": 0.38453260635626657,
"grad_norm": 0.2222924679517746,
"learning_rate": 0.0002,
"loss": 1.4422,
"mean_token_accuracy": 0.7057445451617241,
"step": 670
},
{
"epoch": 0.39027189898844966,
"grad_norm": 0.30782487988471985,
"learning_rate": 0.0002,
"loss": 1.3916,
"mean_token_accuracy": 0.7196735937148333,
"step": 680
},
{
"epoch": 0.39601119162063275,
"grad_norm": 0.24766255915164948,
"learning_rate": 0.0002,
"loss": 1.3191,
"mean_token_accuracy": 0.7313117351382971,
"step": 690
},
{
"epoch": 0.40175048425281584,
"grad_norm": 0.26929622888565063,
"learning_rate": 0.0002,
"loss": 1.3379,
"mean_token_accuracy": 0.7244091514497996,
"step": 700
},
{
"epoch": 0.40175048425281584,
"eval_loss": 1.370943307876587,
"eval_mean_token_accuracy": 0.720665924306649,
"eval_runtime": 4504.2557,
"eval_samples_per_second": 0.774,
"eval_steps_per_second": 0.387,
"step": 700
},
{
"epoch": 0.40748977688499893,
"grad_norm": 0.22724460065364838,
"learning_rate": 0.0002,
"loss": 1.3541,
"mean_token_accuracy": 0.7258114762604236,
"step": 710
},
{
"epoch": 0.413229069517182,
"grad_norm": 0.22957713901996613,
"learning_rate": 0.0002,
"loss": 1.3855,
"mean_token_accuracy": 0.7198538523167372,
"step": 720
},
{
"epoch": 0.4189683621493651,
"grad_norm": 0.23907890915870667,
"learning_rate": 0.0002,
"loss": 1.427,
"mean_token_accuracy": 0.7128315325826406,
"step": 730
},
{
"epoch": 0.42470765478154815,
"grad_norm": 0.2534020245075226,
"learning_rate": 0.0002,
"loss": 1.3615,
"mean_token_accuracy": 0.7198588822036982,
"step": 740
},
{
"epoch": 0.43044694741373124,
"grad_norm": 0.24605919420719147,
"learning_rate": 0.0002,
"loss": 1.3972,
"mean_token_accuracy": 0.7115087192505598,
"step": 750
},
{
"epoch": 0.43618624004591433,
"grad_norm": 0.2243734747171402,
"learning_rate": 0.0002,
"loss": 1.2523,
"mean_token_accuracy": 0.7429927971214056,
"step": 760
},
{
"epoch": 0.4419255326780974,
"grad_norm": 0.230802983045578,
"learning_rate": 0.0002,
"loss": 1.4611,
"mean_token_accuracy": 0.7064661320298911,
"step": 770
},
{
"epoch": 0.4476648253102805,
"grad_norm": 0.23596514761447906,
"learning_rate": 0.0002,
"loss": 1.3961,
"mean_token_accuracy": 0.7153716452419758,
"step": 780
},
{
"epoch": 0.4534041179424636,
"grad_norm": 0.29981374740600586,
"learning_rate": 0.0002,
"loss": 1.3467,
"mean_token_accuracy": 0.722964895889163,
"step": 790
},
{
"epoch": 0.4591434105746467,
"grad_norm": 0.25450626015663147,
"learning_rate": 0.0002,
"loss": 1.3928,
"mean_token_accuracy": 0.7192676767706871,
"step": 800
},
{
"epoch": 0.4591434105746467,
"eval_loss": 1.3700028657913208,
"eval_mean_token_accuracy": 0.720344717953647,
"eval_runtime": 5677.4803,
"eval_samples_per_second": 0.614,
"eval_steps_per_second": 0.307,
"step": 800
}
],
"logging_steps": 10,
"max_steps": 3484,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.3950907424144466e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}